In [1]:
# Import functions from modules in the spac package
from spac.data_utils import *
from spac.transformations import *
from spac.visualization import *
from spac.spatial_analysis import *

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Step 1: Load CSV files
# List every CSV to load; this example uses a single synthetic file.
file_names = ['SPAC_Synthetic_Example.csv']
# load_csv_files logs a summary of each file it reads; the object it returns
# is used as a pandas DataFrame in the steps that follow.
data = load_csv_files(file_names)

2024-08-08 20:05:43,951 - INFO - CSVs are converted into dataframes and combined into a list!
2024-08-08 20:05:43,952 - INFO - Total of 1 dataframes in the list.
2024-08-08 20:05:43,953 - INFO - File name: SPAC_Synthetic_Example.csv
2024-08-08 20:05:43,954 - INFO - Info: 
2024-08-08 20:05:43,966 - INFO - None
2024-08-08 20:05:43,967 - INFO - Description: 
2024-08-08 20:05:43,991 - INFO -               XMin         YMin         XMax         YMax  Marker 1 Intensity  \
count  4000.000000  4000.000000  4000.000000  4000.000000         4000.000000   
mean     15.023263    27.503914    35.001047    47.485636           54.968939   
std      10.075772    22.520211    10.067823    22.526520           45.013052   
min       2.010295     1.830543    21.849440    22.012755            6.807492   
25%       4.968221     5.011522    24.988489    24.982587            9.966993   
50%      15.295660    27.223661    34.973920    47.365072           54.940240   
75%      25.078264    50.011596    45.0361

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   file_name           4000 non-null   object 
 1   XMin                4000 non-null   float64
 2   YMin                4000 non-null   float64
 3   XMax                4000 non-null   float64
 4   YMax                4000 non-null   float64
 5   Marker 1 Intensity  4000 non-null   float64
 6   Marker 2 Intensity  4000 non-null   float64
 7   Marker 3 Intensity  4000 non-null   float64
 8   Immuno Cells        4000 non-null   int64  
 9   Normal Cells        4000 non-null   int64  
 10  Cancer Cells        4000 non-null   int64  
 11  loaded_file_name    4000 non-null   object 
dtypes: float64(7), int64(3), object(2)
memory usage: 375.1+ KB


In [3]:
# Step 2: Split the loaded table into one subset per file name
# The column below identifies which slide/file each row came from.
slide_identifier_column = 'file_name'
unique_file_names = data[slide_identifier_column].unique()

# Collect one subset per distinct identifier, in the order unique() returns them.
subsets = []
for file_name in unique_file_names:
    print(f'Subsetting data for {file_name}')
    subset = select_values(
        data,
        annotation=slide_identifier_column,
        values=[file_name],
    )
    subsets.append(subset)

2024-08-08 20:05:44,012 - INFO - Summary of returned dataset: 1000 cells match the selected labels.
2024-08-08 20:05:44,015 - INFO - Summary of returned dataset: 1000 cells match the selected labels.
2024-08-08 20:05:44,018 - INFO - Summary of returned dataset: 1000 cells match the selected labels.
2024-08-08 20:05:44,020 - INFO - Summary of returned dataset: 1000 cells match the selected labels.


Subsetting data for Halo_Synthetic_Example_1
Subsetting data for Halo_Synthetic_Example_2
Subsetting data for Halo_Synthetic_Example_3
Subsetting data for Halo_Synthetic_Example_4


In [4]:
# Step 3: Append new annotations to each subset
# One dictionary of labels per subset, listed in the same order as `subsets`
# (i.e. the order in which the file names were subset in the previous step).
annotations_for_file = [
    {'slide': '1', 'animal': 'A'},
    {'slide': '2', 'animal': 'B'},
    {'slide': '3', 'animal': 'C'},
    {'slide': '4', 'animal': 'D'},
]

# Fail up front if labels and subsets are out of step, instead of hitting an
# IndexError part-way through the loop (too few labels) or silently leaving
# labels unused (too many labels).
if len(annotations_for_file) != len(subsets):
    raise ValueError(
        f'Expected {len(subsets)} annotation dictionaries (one per subset), '
        f'got {len(annotations_for_file)}.'
    )

for i, subset in enumerate(subsets):
    subset = append_annotation(subset, annotations_for_file[i])
    # Store the returned DataFrame back into the list. Rebinding the loop
    # variable alone only keeps the labels if append_annotation happens to
    # modify its argument in place.
    subsets[i] = subset


In [5]:
# Step 4: Combine the subsets back into a single DataFrame
combined_data = combine_dfs(subsets)
# Print the column summary to confirm that the annotation columns added to
# the subsets ('slide', 'animal') are present in the combined table.
combined_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   file_name           4000 non-null   object 
 1   XMin                4000 non-null   float64
 2   YMin                4000 non-null   float64
 3   XMax                4000 non-null   float64
 4   YMax                4000 non-null   float64
 5   Marker 1 Intensity  4000 non-null   float64
 6   Marker 2 Intensity  4000 non-null   float64
 7   Marker 3 Intensity  4000 non-null   float64
 8   Immuno Cells        4000 non-null   int64  
 9   Normal Cells        4000 non-null   int64  
 10  Cancer Cells        4000 non-null   int64  
 11  loaded_file_name    4000 non-null   object 
 12  slide               4000 non-null   object 
 13  animal              4000 non-null   object 
dtypes: float64(7), int64(3), object(4)
memory usage: 437.6+ KB


In [6]:
# Step 5: Convert binary columns to a categorical column
# Indicator columns to collapse (stored as int64; presumably 0/1 flags — confirm).
binary_columns = ['Immuno Cells', 'Normal Cells', 'Cancer Cells']
# Name of the single label column that bin2cat adds to the table.
new_annotation = 'cell_type'
# NOTE: this reassigns combined_data, so re-running the cell passes the
# already-converted table back into bin2cat.
combined_data = bin2cat(combined_data, binary_columns, new_annotation)
combined_data.info()

Immuno Cells
Normal Cells
Cancer Cells
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   file_name           4000 non-null   object 
 1   XMin                4000 non-null   float64
 2   YMin                4000 non-null   float64
 3   XMax                4000 non-null   float64
 4   YMax                4000 non-null   float64
 5   Marker 1 Intensity  4000 non-null   float64
 6   Marker 2 Intensity  4000 non-null   float64
 7   Marker 3 Intensity  4000 non-null   float64
 8   Immuno Cells        4000 non-null   int64  
 9   Normal Cells        4000 non-null   int64  
 10  Cancer Cells        4000 non-null   int64  
 11  loaded_file_name    4000 non-null   object 
 12  slide               4000 non-null   object 
 13  animal              4000 non-null   object 
 14  cell_type           4000 non-null   object 
dtypes: float64(7), i

In [7]:
# Step 6: Calculate centroids for each cell
# Bounding-box columns of each cell. The min and max names must refer to
# different columns: passing 'XMin' for both x bounds would leave 'XMax'
# out of the X centroid calculation entirely.
x_min_col = 'XMin'  # replace with your actual column name
x_max_col = 'XMax'  # replace with your actual column name
y_min_col = 'YMin'  # replace with your actual column name
y_max_col = 'YMax'  # replace with your actual column name

# Names of the centroid columns to create; later steps read these variables
# to locate the cell coordinates.
x_centroid = 'X_Centroid'
y_centroid = 'Y_Centroid'

combined_data = calculate_centroid(
    combined_data,
    x_min=x_min_col,
    x_max=x_max_col,
    y_min=y_min_col,
    y_max=y_max_col,
    new_x=x_centroid,
    new_y=y_centroid
)

# Print the column summary to confirm the two centroid columns were added.
combined_data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   file_name           4000 non-null   object 
 1   XMin                4000 non-null   float64
 2   YMin                4000 non-null   float64
 3   XMax                4000 non-null   float64
 4   YMax                4000 non-null   float64
 5   Marker 1 Intensity  4000 non-null   float64
 6   Marker 2 Intensity  4000 non-null   float64
 7   Marker 3 Intensity  4000 non-null   float64
 8   Immuno Cells        4000 non-null   int64  
 9   Normal Cells        4000 non-null   int64  
 10  Cancer Cells        4000 non-null   int64  
 11  loaded_file_name    4000 non-null   object 
 12  slide               4000 non-null   object 
 13  animal              4000 non-null   object 
 14  cell_type           4000 non-null   object 
 15  X_Centroid          4000 non-null   float64
 16  Y_Cent

In [8]:
# Step 7: Select and prepare columns for analysis
# Every column whose name begins with 'Marker' is used as an intensity feature.
intensity_columns = [
    name for name in combined_data.columns if name.startswith('Marker')
]

# Annotation columns to carry over, and the coordinate columns (the centroid
# column names defined in the centroid step).
annotation_columns = ['slide', 'animal', 'cell_type']
x_col = x_centroid
y_col = y_centroid

# Ingest the data into an AnnData object
adata = ingest_cells(
    dataframe=combined_data,
    regex_str=intensity_columns,
    x_col=x_col,
    y_col=y_col,
    annotation=annotation_columns,
)

print(adata)

Marker 1 Intensity
Marker 2 Intensity
Marker 3 Intensity
AnnData object with n_obs × n_vars = 4000 × 3
    obs: 'slide', 'animal', 'cell_type'
    obsm: 'spatial'


