# Scyan histocytometry Images

In [1]:
import scyan
import pandas as pd
import os
import matplotlib
import numpy as np
import anndata
%matplotlib inline
import re 
from pathlib import Path
## Run w/ scyan conda environment 

  from .autonotebook import tqdm as notebook_tqdm
Global seed set to 0


I want to automate the simple clustering of the histocytometry images, both to free up time for Colin and Jayashree and to reduce delays when iterating on results.

## Concatenating data files together
I will have to make a different combined data frame for each group of four proteins.

In [None]:
## Root directory that is searched recursively for per-image mask CSVs.
CCR4_image_dir = "/stor/scratch/Ehrlich/Users/John/CCR4/T_cell_zone"

## Sort the matches so experiment codes are assigned in the same order on every
## run (rglob yields files in filesystem order, which is not guaranteed stable).
csv_paths = sorted(Path(CCR4_image_dir).rglob('*_mask.csv*'))
if not csv_paths:
    ## Fail early with a clear message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError(f"No '*_mask.csv*' files found under {CCR4_image_dir}")

## One data frame per CSV, each tagged with the path of the file it came from.
# (alternative label: re.sub(".*/sub_", "", str(csv_path)))
mask_df_list = [
    pd.read_csv(csv_path).assign(experiment=str(csv_path))
    for csv_path in csv_paths
]

## ignore_index gives every row a unique label; otherwise each file's 0..n-1
## row index repeats in the combined frame.
combined_mask_df = pd.concat(mask_df_list, axis=0, ignore_index=True)

## Replace the path strings with integer codes.
## uniques is what I'll need to identify which cluster is which experiment later
codes, uniques = pd.factorize(combined_mask_df.experiment)
combined_mask_df.experiment = codes

## Make sure the output directory exists before writing into it.
output_dir = os.path.join(CCR4_image_dir, "data")
os.makedirs(output_dir, exist_ok=True)

combined_mask_path = os.path.join(output_dir, "combined_mask_df.csv")
combined_mask_df.to_csv(combined_mask_path, index=False)

## Lookup table from integer code back to source path; factorize numbers the
## codes 0..len(uniques)-1 in the same order as `uniques`.
experiment_dummy = pd.DataFrame({"experiment": np.arange(len(uniques)),
                                 "exp_name": uniques})
experiment_dummy.to_csv(os.path.join(output_dir, "experiment_dummy.csv"), index=False)

## Convert df to adata

In [None]:
## Wrap the combined per-cell table in an AnnData object.
## NOTE(review): every column of combined_mask_df goes into X here, so this
## presumably requires all columns to be numeric — confirm against the CSV schema.
adata = anndata.AnnData(combined_mask_df)

## Move the per-cell metadata columns (cell ID, centroid coordinates, experiment code) into .obs
adata.obs = adata[:, ["CellID", "X_centroid", "Y_centroid", "experiment"]].to_df()

## Keep only the marker channels in X. Subsetting returns a view of the parent
## object; .copy() turns it into an independent AnnData so that later in-place
## preprocessing does not have to trigger an implicit copy of the view.
adata = adata[:, ["CD8", "CD4", "B220"]].copy()

print(f"Created anndata object with {adata.n_obs} cells and {adata.n_vars} markers.\n\n-> The markers names are: {', '.join(adata.var_names)}\n-> The non-marker names are: {', '.join(adata.obs.columns)}")

In [None]:
## TODO: try running scyan on both the scale_asinh_scale-normalized data and the data transformed the way the scyan authors recommend,
## then compare which works better. I'm not sure how the normalization affects the scyan input; I should read their paper in more depth.

In [None]:
## Preprocess the marker matrix: apply a transformation, then scale.
is_cytof = False

## Pick the transformation by data type:
##   - asinh is recommended for CyTOF data
##   - auto_logicle is recommended for flow or spectral flow
##     (some transformation designed for flow: https://pubmed.ncbi.nlm.nih.gov/16604519/)
transform_fn = (
    scyan.preprocess.asinh_transform
    if is_cytof
    else scyan.preprocess.auto_logicle_transform
)
transform_fn(adata)

## The flowsom paper uses asinh for low number of markers (~7)
scyan.preprocess.scale(adata)