In [1]:
import os
import glob 
import numpy as np
import pandas as pd 
import scanpy as sc
import seaborn as sns

In [2]:
AuthorMetadata = pd.read_csv('/node200data/18parkky/datasets/data/public/processed_data/Immunotherapy_CRC_Chen_et_al/AuthorProcessedData/GSE236581_CRC-ICB_metadata.txt.gz', sep=' ')
AuthorMetadata.head(2)

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,Ident,Patient,Treatment,Tissue,MajorCellType,SubCellType
CRC01-N-I_AAACGGGTCGTTACGA,SeuratProject,2110,777,CRC01-N-I,P01,I,Normal,Epi,c91_Epi_Tumor
CRC01-N-I_AACACGTTCCTTCAAT,SeuratProject,1857,1303,CRC01-N-I,P01,I,Normal,Epi,c87_Goblet_MUC2


In [3]:
RunMetadata = pd.read_csv('/node200data/18parkky/datasets/data/public/FASTQ/Immunotherapy_CRC_Chen_et_al/metadata/Chen_et_al.MSI.run_meta.organized.tsv', sep='\t')
dict_HRRid_to_metadata = { tup.Accession : [tup.SampleID, tup.PatientID, tup.TreatmentStage, tup.BiopsySite, tup.TMB, tup.Response ] for tup in RunMetadata.itertuples() }

In [4]:
dict_Identiifer_to_CellType = { tup.Index : [tup.MajorCellType, tup.SubCellType ] for tup in AuthorMetadata.itertuples() } 

## Select epithelial cells from each .h5ad file and save to disk

### Tumor

#### (1) Label author annotation to each h5ad

In [46]:
list_DIR_cellranger_tumor = [ D for D in glob.glob('/node200data/18parkky/datasets/data/public/BAM/Immunotherapy_CRC_Chen_et_al/MSI/Tumor/*') if os.path.isdir(D)==True ]

for DIR_cellranger_tumor_out in list_DIR_cellranger_tumor:
    PATH_scrublet_out = f'{DIR_cellranger_tumor_out}/outs/filtered_feature_bc_matrix.scrublet.h5'
    
    if os.path.exists(f'{DIR_cellranger_tumor_out}/outs/filtered_feature_bc_matrix.scrublet.CellTypeLabeled.h5'): continue 
    
    HRR_accession = DIR_cellranger_tumor_out.split("/")[-1]
    
    adata_e = sc.read_h5ad(PATH_scrublet_out)
    
    if 'Identifier' in adata_e.obs.columns:
        pass 
    else:
        adata_e.obs['Accession'] = HRR_accession
        adata_e.obs['SampleID'] = dict_HRRid_to_metadata[HRR_accession][0]
        adata_e.obs['PatientID'] = dict_HRRid_to_metadata[HRR_accession][1]
        adata_e.obs['TreatmentStage'] = dict_HRRid_to_metadata[HRR_accession][2]
        adata_e.obs['BiopsySite'] = dict_HRRid_to_metadata[HRR_accession][3]
        adata_e.obs['TMB'] = dict_HRRid_to_metadata[HRR_accession][4]
        adata_e.obs['Response'] = dict_HRRid_to_metadata[HRR_accession][5]
        adata_e.obs = adata_e.obs[['Accession', 'SampleID', 'PatientID', 'TreatmentStage', 'BiopsySite', 'TMB', 'Response', 'doublet_score', 'predicted_doublet']]
        adata_e.write(PATH_scrublet_out)
            
    adata_e.obs['Identifier'] = [ f'{tup.SampleID.replace("P", "CRC")}_{tup.Index.split("-")[0]}' for tup in adata_e.obs.itertuples() ]
    for Identifier in adata_e.obs['Identifier']:
        try: dict_Identiifer_to_CellType[Identifier]
        except KeyError: dict_Identiifer_to_CellType[Identifier] = [ None, None ]
    adata_e.obs['Author_MajorCellType'] = [ dict_Identiifer_to_CellType[tup.Identifier][0] for tup in adata_e.obs.itertuples()  ]
    adata_e.obs['Author_SubCellType'] = [ dict_Identiifer_to_CellType[tup.Identifier][1] for tup in adata_e.obs.itertuples()  ]
    adata_e.write( f'{DIR_cellranger_tumor_out}/outs/filtered_feature_bc_matrix.scrublet.CellTypeLabeled.h5' )
    # adata_e[adata_e.obs['Author_MajorCellType']=='Epi'].write( f'{DIR_cellranger_tumor_out}/outs/filtered_feature_bc_matrix.scrublet.CellTypeLabeled.Epi.h5' )

#### (2) Merge author cell-type annotated h5ad

In [48]:
adata_merged = list()
for DIR_cellranger_tumor_out in list_DIR_cellranger_tumor:
    PATH_celltype_annotated_adata = f'{DIR_cellranger_tumor_out}/outs/filtered_feature_bc_matrix.scrublet.CellTypeLabeled.h5'
    if os.path.exists(PATH_celltype_annotated_adata) == False:
        print(DIR_cellranger_tumor_out)
        raise ValueError
    adata_merged.append( sc.read_h5ad(PATH_celltype_annotated_adata) )

adata_merged = sc.concat( adata_merged )

  utils.warn_names_duplicates("obs")


In [50]:
adata_merged.write('/node200data/18parkky/datasets/data/public/processed_data/Immunotherapy_CRC_Chen_et_al/Tumor.scrublet.CellTypeLabeled.h5ad')
adata_merged[adata_merged.obs['Author_MajorCellType']=='Epi'].write('/node200data/18parkky/datasets/data/public/processed_data/Immunotherapy_CRC_Chen_et_al/Tumor.scrublet.CellTypeLabeled.Epi.h5ad')

### NAT

#### (1) Label author annotation to each h5ad

In [44]:
list_DIR_cellranger_NAT = [ D for D in glob.glob('/node200data/18parkky/datasets/data/public/BAM/Immunotherapy_CRC_Chen_et_al/MSI/Adjacent_normal_tissue/*') if os.path.isdir(D)==True ]

for DIR_cellranger_NAT_out in list_DIR_cellranger_NAT:
    PATH_scrublet_out = f'{DIR_cellranger_NAT_out}/outs/filtered_feature_bc_matrix.scrublet.h5'
    HRR_accession = DIR_cellranger_NAT_out.split("/")[-1]

    if os.path.exists(f'{DIR_cellranger_NAT_out}/outs/filtered_feature_bc_matrix.scrublet.CellTypeLabeled.h5'): 
        # print(f'Cell-type labeled h5ad already exists for :{HRR_accession}')
        continue 
    else:
        print(f'{HRR_accession}')
    
    try:
        adata_e = sc.read_h5ad(PATH_scrublet_out)
    except:
        print(f"h5ad could not be opened for: {HRR_accession}")
        raise ValueError
    
    if 'Identifier' in adata_e.obs.columns:
        pass 
    else:
        adata_e.obs['Accession'] = HRR_accession
        adata_e.obs['SampleID'] = dict_HRRid_to_metadata[HRR_accession][0]
        adata_e.obs['PatientID'] = dict_HRRid_to_metadata[HRR_accession][1]
        adata_e.obs['TreatmentStage'] = dict_HRRid_to_metadata[HRR_accession][2]
        adata_e.obs['BiopsySite'] = dict_HRRid_to_metadata[HRR_accession][3]
        adata_e.obs['TMB'] = dict_HRRid_to_metadata[HRR_accession][4]
        adata_e.obs['Response'] = dict_HRRid_to_metadata[HRR_accession][5]
        adata_e.obs = adata_e.obs[['Accession', 'SampleID', 'PatientID', 'TreatmentStage', 'BiopsySite', 'TMB', 'Response', 'doublet_score', 'predicted_doublet']]
        adata_e.write(PATH_scrublet_out)
            
    adata_e.obs['Identifier'] = [ f'{tup.SampleID.replace("P", "CRC")}_{tup.Index.split("-")[0]}' for tup in adata_e.obs.itertuples() ]
    for Identifier in adata_e.obs['Identifier']:
        try: dict_Identiifer_to_CellType[Identifier]
        except KeyError: dict_Identiifer_to_CellType[Identifier] = [ None, None ]
    adata_e.obs['Author_MajorCellType'] = [ dict_Identiifer_to_CellType[tup.Identifier][0] for tup in adata_e.obs.itertuples()  ]
    adata_e.obs['Author_SubCellType'] = [ dict_Identiifer_to_CellType[tup.Identifier][1] for tup in adata_e.obs.itertuples()  ]
    adata_e.write( f'{DIR_cellranger_NAT_out}/outs/filtered_feature_bc_matrix.scrublet.CellTypeLabeled.h5' )

HRR1373951


#### (2) Merge author cell-type annotated h5ad

In [59]:
adata_merged = list()
for DIR_cellranger_NAT_out in list_DIR_cellranger_NAT:
    PATH_celltype_annotated_adata = f'{DIR_cellranger_NAT_out}/outs/filtered_feature_bc_matrix.scrublet.CellTypeLabeled.h5'
    if os.path.exists(PATH_celltype_annotated_adata) == False:
        print(DIR_cellranger_NAT_out)
        continue
    
    adata_merged.append( sc.read_h5ad(PATH_celltype_annotated_adata) )

adata_merged = sc.concat( adata_merged )

/node200data/18parkky/datasets/data/public/BAM/Immunotherapy_CRC_Chen_et_al/MSI/Adjacent_normal_tissue/HRR1373951


  utils.warn_names_duplicates("obs")


In [None]:
adata_merged.write('/node200data/18parkky/datasets/data/public/processed_data/Immunotherapy_CRC_Chen_et_al/NAT.scrublet.CellTypeLabeled.h5ad')
adata_merged[adata_merged.obs['Author_MajorCellType']=='Epi'].write('/node200data/18parkky/datasets/data/public/processed_data/Immunotherapy_CRC_Chen_et_al/NAT.scrublet.CellTypeLabeled.Epi.h5ad')

## Merge h5ad from each dataset and match .obs columns and save to disk

In [28]:
NAT_Chen     = sc.read_h5ad('/node200data/18parkky/datasets/data/public/processed_data/Immunotherapy_CRC_Chen_et_al/NAT.scrublet.CellTypeLabeled.h5ad')
Tumor_Chen   = sc.read_h5ad('/node200data/18parkky/datasets/data/public/processed_data/Immunotherapy_CRC_Chen_et_al/Tumor.scrublet.CellTypeLabeled.h5ad')

adata_Chen = sc.concat( [NAT_Chen, Tumor_Chen] )
adata_Chen.obs_names_make_unique()

  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")


In [29]:
adata_Joanito_epi = sc.read_h5ad('/node200data/18parkky/datasets/data/public/processed_data/Joanito_et_al/raw_h5ad/MSI/SplitByCellType/Joanito.Epithelial.h5ad')
adata_Joanito_NonEpi = sc.read_h5ad('/node200data/18parkky/datasets/data/public/processed_data/Joanito_et_al/raw_h5ad/MSI/SplitByCellType/Joanito.NonEpithelial.h5ad')

adata_Joanito = sc.concat( [adata_Joanito_epi, adata_Joanito_NonEpi] )
adata_Joanito.obs_names_make_unique()

  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")


### Match columns between the two dataset .obs

In [30]:
adata_Chen.obs.rename({'HRR_id' : 'Accession'}, axis=1, inplace=True)
adata_Chen.obs['Author_MajorCellType'] = adata_Chen.obs['Author_MajorCellType'].replace( {'Epi' : 'Epithelial', 'Mye' : 'Myeloid'} )

  adata_Chen.obs['Author_MajorCellType'] = adata_Chen.obs['Author_MajorCellType'].replace( {'Epi' : 'Epithelial', 'Mye' : 'Myeloid'} )


In [31]:
adata_Joanito.obs.rename({'Sample' : 'Accession', 
                              "sampleID" : 'SampleID',
                              'patientID' : 'PatientID', 
                              'AuthorCellType' : 'Author_MajorCellType', 
                              'CB_wPatient_BiopsySite' : 'Identifier'}, axis=1, inplace=True)

adata_Joanito.obs['TreatmentStage'] = 'n/a'
adata_Joanito.obs['TMB'] = 'n/a'
adata_Joanito.obs['Response'] = 'n/a'
adata_Joanito.obs['Author_SubCellType'] = 'n/a'

adata_Joanito.obs = adata_Joanito.obs[ ['Accession', 'SampleID', 'PatientID', 'TreatmentStage', 'BiopsySite',
       'TMB', 'Response', 'doublet_score', 'predicted_doublet', 'Identifier',
       'Author_MajorCellType', 'Author_SubCellType'] ].copy()

adata_Joanito.obs['BiopsySite'] = adata_Joanito.obs['BiopsySite'].replace( {'Normal' : 'Adjacent normal tissue' } )
adata_Joanito.obs.index = adata_Joanito.obs['Identifier']

  adata_Joanito.obs['BiopsySite'] = adata_Joanito.obs['BiopsySite'].replace( {'Normal' : 'Adjacent normal tissue' } )


In [36]:
adata_Chen.shape

(2112895, 36601)

In [35]:
adata_Joanito.shape

(58411, 36601)

In [32]:
adata_merged = sc.concat( [ adata_Chen, adata_Joanito] )

In [33]:
adata_merged.write('/node200data/18parkky/datasets/data/public/processed_data/CRC_MSI_intensity_analysis/h5ad/adata.merged_raw.h5ad')

In [34]:
adata_merged[adata_merged.obs['Author_MajorCellType']=='Epithelial'].write('/node200data/18parkky/datasets/data/public/processed_data/CRC_MSI_intensity_analysis/h5ad/adata.merged_raw.epi.h5ad')