In [None]:
''' 
The raw sequencing data used in this study are available from the authors upon valid request.
The steps below describe how we processed the raw sequencing files once they were downloaded.

After receiving the raw sequencing files, we used CellRanger (8.0.1) to obtain gene expression data and barcoded BAM files.
Then, we ran Scrublet for *each* sample (which is best practice according to Scrublet), then merged the resulting h5ad files into one.
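Note that for the Chen et al. dataset, each sample has multiple FASTQ files, all of which must be supplied (grouped by sample ID) when processing that sample.
Below are examples of the command lines used (first with a single FASTQ directory, then with multiple FASTQ directories):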
    ~/cellranger-8.0.1/bin/cellranger count --id {id} --transcriptome ~/refdata-gex-GRCh38-2020-A --fastqs {DIR_TO_FASTQ} --create-bam true
    ~/cellranger-8.0.1/bin/cellranger count --id {id} --transcriptome ~/refdata-gex-GRCh38-2020-A \
            --fastqs {DIR_TO_FASTQ_1},{DIR_TO_FASTQ_2},{DIR_TO_FASTQ_3},{DIR_TO_FASTQ_4} \
            --create-bam true --nosecondary
    
Then, as we will see below, 
    (1) the resulting h5ad files will be preprocessed using the standard Scanpy workflow,
    (2) relevant metadata (e.g., patient, author-provided cell type, etc.) will be appended to the Scanpy object, and
    (3) MSI profile information will be appended to the Scanpy object.
'''

In [10]:
# ---------------------------------------------------------------------------
# Paths used throughout the notebook.
# Every location is derived from DIRECTORY_PUBLIC, so relocating the dataset
# tree only requires editing that single line.
# ---------------------------------------------------------------------------
DIRECTORY_PUBLIC    = '/node200data/18parkky/datasets/data/public'
DIRECTORY_PROCESSED = f'{DIRECTORY_PUBLIC}/processed_data'
DIRECTORY_FASTQ     = f'{DIRECTORY_PUBLIC}/FASTQ'
DIRECTORY_BAM       = f'{DIRECTORY_PUBLIC}/BAM'

# Per-dataset roots inside the processed-data tree
DIRECTORY_KINKER      = f'{DIRECTORY_PROCESSED}/Kinker_et_al'
DIRECTORY_CHEN        = f'{DIRECTORY_PROCESSED}/Immunotherapy_CRC_Chen_et_al'
DIRECTORY_JOANITO     = f'{DIRECTORY_PROCESSED}/Joanito_et_al'
DIRECTORY_JOANITO_SG1 = f'{DIRECTORY_JOANITO}/CRC-SG1'

# Where this notebook writes its results
DIRECTORY_OUT = f'{DIRECTORY_PROCESSED}/CRC_MSI_intensity_analysis_clean_data'

# Input AnnData objects (h5ad)
PATH_TO_KINKER_ADATA    = f'{DIRECTORY_KINKER}/CPM_data.metalabeled.h5ad'
PATH_TO_CHEN_ADATA      = f'{DIRECTORY_CHEN}/1_raw_h5ad/Chen.MSI.raw.h5ad'
PATH_TO_JOANITO_ADATA   = f'{DIRECTORY_JOANITO}/1_raw_h5ad/Joanito.raw.h5ad'

# Chen et al. metadata
PATH_TO_CHEN_METADATA   = f'{DIRECTORY_CHEN}/AuthorProcessedData/GSE236581_CRC-ICB_metadata.txt.gz'
PATH_TO_CHEN_METADATA2  = f'{DIRECTORY_FASTQ}/Immunotherapy_CRC_Chen_et_al/metadata/Chen_et_al.MSI.CRC.woSD.run_meta.organized.tsv'

# Kinker et al. metadata and merged NanoMnT allele table
PATH_TO_KINKER_METADATA = f'{DIRECTORY_KINKER}/Metadata.txt'
PATH_TO_KINKER_NANOMNT  = f'{DIRECTORY_KINKER}/AlleleTable.merged.tsv.gz'

# Joanito et al. metadata
PATH_TO_JOANITO_METADATA_EPI    = f'{DIRECTORY_JOANITO_SG1}/synapse-metadata/Epithelial_metadata.csv'
PATH_TO_JOANITO_METADATA_NONEPI = f'{DIRECTORY_JOANITO_SG1}/synapse-metadata/NonEpithelial_metadata.csv'
PATH_TO_JOANITO_METADATA2       = f'{DIRECTORY_JOANITO_SG1}/synapse-metadata/patient_clinical_information.csv'
PATH_TO_JOANITO_SG1_METADATA    = f'{DIRECTORY_JOANITO_SG1}/EGAD00001008555-metadata/organized_metadata.tsv'
PATH_TO_JOANITO_KUL_METADATA    = f'{DIRECTORY_JOANITO}/KUL/organized_metadata.tsv'

# CellRanger output roots (one sub-directory per sample)
DIRECTORY_CHEN_CELLRANGER_OUT       = f'{DIRECTORY_BAM}/Immunotherapy_CRC_Chen_et_al/MSI/CellRangerOut'
DIRECTORY_JOANITO_CELLRANGER_OUT    = f'{DIRECTORY_BAM}/Joanito_et_al/CellRangerOut'
DIRECTORY_JOANITO_CELLRANGER_OUT    = '/node200data/18parkky/datasets/data/public/BAM/Joanito_et_al/CellRangerOut'

In [11]:
import pickle
import numpy as np 
import pandas as pd
import scanpy as sc

In [28]:
def preprocessScanpy(adata, batch_key, random_state=42,
                     min_genes=300, min_cells=10, n_top_genes=2000,
                     n_neighbors=15, n_pcs=40):
    """Run the QC -> normalise -> HVG -> scale -> PCA -> neighbours -> UMAP workflow.

    Parameters
    ----------
    adata : AnnData
        Object to preprocess. The filtering, normalisation and log steps are
        called with scanpy's default in-place behaviour, so the object passed
        in is modified; use the returned object afterwards.
    batch_key : str
        Column of ``adata.obs`` passed to ``sc.pp.highly_variable_genes``.
    random_state : int, default 42
        Seed passed to ``sc.tl.umap``.
    min_genes, min_cells : int
        Thresholds passed to ``sc.pp.filter_cells`` / ``sc.pp.filter_genes``.
    n_top_genes : int
        Number of highly variable genes to keep.
    n_neighbors, n_pcs : int
        Passed to ``sc.pp.neighbors``.

    Returns
    -------
    AnnData
        A new object restricted to the highly variable genes, with PCA,
        neighbour graph and UMAP computed. ``.raw`` holds the snapshot taken
        after filtering and before normalisation.
    """
    adata.obs_names_make_unique()
    adata.var_names_make_unique()

    sc.pp.filter_cells(adata, min_genes=min_genes)
    sc.pp.filter_genes(adata, min_cells=min_cells)

    # Snapshot taken after filtering but before normalisation / log-transform.
    adata.raw = adata.copy()
    sc.pp.normalize_total(adata)
    sc.pp.log1p(adata)

    sc.pp.highly_variable_genes(adata, n_top_genes=n_top_genes, batch_key=batch_key)
    # Materialise the subset: slicing an AnnData yields a view, and scaling a
    # view forces an implicit copy accompanied by a warning.
    adata = adata[:, adata.var.highly_variable].copy()

    sc.pp.scale(adata, max_value=10)
    sc.tl.pca(adata, svd_solver="arpack")

    sc.pp.neighbors(adata, n_neighbors=n_neighbors, n_pcs=n_pcs)
    sc.tl.umap(adata, random_state=random_state)

    return adata

''' 
The following function assumes you ran NanoMnT getAlleleTable on each sample's possorted_genome_bam.bam and saved the result as {DIRECTORY_TO_CELLRANGER_OUT}/{SampleID}/outs/possorted_genome_bam.STR_allele_table.tsv.
'''
def _flankingQuality(left_flanking_seq, right_flanking_seq):
    """Return 'Good' if the joined flanking sequences contain no '*' and no lowercase character, else 'Poor'."""
    both_flankings = f'{left_flanking_seq}{right_flanking_seq}'
    if '*' in both_flankings or both_flankings.upper() != both_flankings:
        return 'Poor'
    return 'Good'


def preprocessNanoMnT( set_SampleIDs, DIRECTORY_TO_CELLRANGER_OUT, repeat_units=('A', 'T'), max_reference_allele=24 ):
    """Filter each sample's NanoMnT allele table and return all samples concatenated.

    For every sample ID the table
    ``{DIRECTORY_TO_CELLRANGER_OUT}/{SampleID}/outs/possorted_genome_bam.STR_allele_table.tsv``
    is read, filtered, written next to the input with the suffix
    ``.preprocessed.tsv``, and collected.

    Parameters
    ----------
    set_SampleIDs : iterable
        Sample IDs (names of the per-sample CellRanger output directories).
    DIRECTORY_TO_CELLRANGER_OUT : str
        Directory containing one sub-directory per sample.
    repeat_units : iterable of str, default ('A', 'T')
        Values of ``repeat_unit`` to keep.
    max_reference_allele : int, default 24
        Rows with ``reference_STR_allele`` above this value are discarded.

    Returns
    -------
    pandas.DataFrame
        Concatenated filtered tables (fresh 0..n-1 index) with the added
        columns ``SampleID``, ``flanking_quality``, ``diff``
        (``read_STR_allele - reference_STR_allele``) and ``Identifier``
        (``"{SampleID}-{CB}"``).

    Raises
    ------
    ValueError
        If ``set_SampleIDs`` yields no sample.
    """
    results = list()

    for SampleID in set_SampleIDs:

        DIRECTORY_sample_outs = f'{DIRECTORY_TO_CELLRANGER_OUT}/{SampleID}/outs'
        PATH_alleleTable = f'{DIRECTORY_sample_outs}/possorted_genome_bam.STR_allele_table.tsv'
        PATH_processed_alleleTable = f'{DIRECTORY_sample_outs}/possorted_genome_bam.STR_allele_table.preprocessed.tsv'

        AlleleTable = pd.read_csv(PATH_alleleTable, sep='\t')
        AlleleTable['SampleID'] = SampleID

        ### 1. Filter out low-quality flankings (e.g., indels within flankings)
        # Iterating the two columns directly avoids building a namedtuple per row.
        AlleleTable['flanking_quality'] = [
            _flankingQuality(left, right)
            for left, right in zip(AlleleTable['left_flanking_seq'], AlleleTable['right_flanking_seq'])
        ]
        AlleleTable = AlleleTable[AlleleTable['flanking_quality'] == 'Good']

        ### 2. Keep only the requested repeat units (by default A/T, i.e. G/C repeats are removed)
        AlleleTable = AlleleTable[AlleleTable['repeat_unit'].isin(list(repeat_units))]

        ### 3. Drop every row that has a missing value in ANY column
        # (this is what removes reads lacking a CB or UMI, but it is not limited to those columns)
        AlleleTable = AlleleTable.dropna()

        ### 4. Discard loci whose reference allele exceeds the cutoff
        AlleleTable = AlleleTable[AlleleTable['reference_STR_allele'] <= max_reference_allele].copy()

        AlleleTable['diff'] = AlleleTable['read_STR_allele'] - AlleleTable['reference_STR_allele']
        AlleleTable.to_csv(PATH_processed_alleleTable, sep='\t', index=False)
        results.append(AlleleTable)

    if not results:
        # pd.concat([]) would raise an uninformative "No objects to concatenate".
        raise ValueError('preprocessNanoMnT received no sample IDs; nothing to process.')

    results = pd.concat(results, ignore_index=True)
    results['Identifier'] = [
        f'{SampleID}-{CB}' for SampleID, CB in zip(results['SampleID'], results['CB'])
    ]

    return results

def saveWithPickle(obj, PATH_out, filename="saveWithPickle"):
    """Pickle `obj` into `{PATH_out}/{filename}.pickle` using the highest available protocol."""
    destination = f'{PATH_out}/{filename}.pickle'
    with open(destination, 'wb') as pickle_file:
        pickle.Pickler(pickle_file, protocol=pickle.HIGHEST_PROTOCOL).dump(obj)

def loadFromPickle(dir_pickle):
    """Return the object stored in the pickle file located at `dir_pickle`."""
    # NOTE(review): unpickling can execute arbitrary code; only load files you trust.
    with open(dir_pickle, 'rb') as pickle_file:
        return pickle.Unpickler(pickle_file).load()

In [47]:
# Load the three AnnData objects: Kinker et al (k), Chen et al (c), Joanito et al (j).
adata_k, adata_c, adata_j = (
    sc.read_h5ad(path_to_h5ad)
    for path_to_h5ad in (PATH_TO_KINKER_ADATA, PATH_TO_CHEN_ADATA, PATH_TO_JOANITO_ADATA)
)

  utils.warn_names_duplicates("obs")


## [0] Doublet removal

In [48]:
# Doublet removal
def _dropHighDoubletScores(adata):
    """Keep cells whose doublet_score is <= mean + 1 standard deviation of that dataset's scores.

    Returns the filtered copy together with the threshold that was applied.
    """
    scores = adata.obs['doublet_score']
    threshold = np.mean( scores ) + np.std( scores )
    return adata[scores <= threshold].copy(), threshold

# The threshold is computed separately for each dataset.
adata_j, doublet_score_threshold = _dropHighDoubletScores(adata_j)
adata_c, doublet_score_threshold = _dropHighDoubletScores(adata_c)

  utils.warn_names_duplicates("obs")


## [1] Appending metadata

In [14]:
# Kinker et al - Cell line, Pooling ID, Type of cancer
metadata_k = pd.read_csv(PATH_TO_KINKER_METADATA, sep='\t')

# NAME (matched against adata_k.obs['CB']) -> [Cell_line, Pool_ID, Cancer_type]
dict_CB_to_MetaData = {
    name: [cell_line, pool_id, cancer_type]
    for name, cell_line, pool_id, cancer_type in zip(
        metadata_k['NAME'], metadata_k['Cell_line'], metadata_k['Pool_ID'], metadata_k['Cancer_type']
    )
}

# Barcodes missing from the metadata get an all-None record so the lookups below cannot fail.
for CB in adata_k.obs['CB']:
    dict_CB_to_MetaData.setdefault(CB, [None, None, None])

records_k = [ dict_CB_to_MetaData[CB] for CB in adata_k.obs['CB'] ]
adata_k.obs['CellLine']   = [ record[0] for record in records_k ]
adata_k.obs['PoolID']     = [ record[1] for record in records_k ]
adata_k.obs['CancerType'] = [ record[2] for record in records_k ]

# Identifier = part of CB before the first '-', joined with the pool ID
adata_k.obs['Identifier'] = [
    f'{CB.split("-")[0]}-{PoolID}'
    for CB, PoolID in zip(adata_k.obs['CB'], adata_k.obs['PoolID'])
]

  metadata_k = pd.read_csv(PATH_TO_KINKER_METADATA, sep='\t')


In [49]:
# Chen et al - Unique identifier, Author-annotated cell type (broad-lvl1 and specific-lvl2)
metadata_c = pd.read_csv(PATH_TO_CHEN_METADATA, sep=' ')

# Build "<SampleID>-<barcode>" keys on the author metadata ...
metadata_c['CB']       = [ index_label.rsplit("_", 1)[-1] for index_label in metadata_c.index ]
metadata_c['SampleID'] = [ ident.replace('CRC', 'P') for ident in metadata_c['Ident'] ]
metadata_c['Identifier'] = [   # Identifier unique for each cell
    f'{sample_id}-{barcode}'
    for sample_id, barcode in zip(metadata_c['SampleID'], metadata_c['CB'])
]

# ... and on the AnnData object (obs name truncated at its first '-').
adata_c.obs['Identifier'] = [
    f'{sample_id}-{obs_name.split("-", 1)[0]}'
    for obs_name, sample_id in zip(adata_c.obs.index, adata_c.obs['SampleID'])
]

dict_Identifier_to_Celltypes = {
    identifier: [major, sub]
    for identifier, major, sub in zip(
        metadata_c['Identifier'], metadata_c['MajorCellType'], metadata_c['SubCellType']
    )
}

# Cells without an author annotation are labelled 'N/A' and removed below.
col_MajorCellType, col_SubCellType = list(), list()
for identifier in adata_c.obs['Identifier']:
    major, sub = dict_Identifier_to_Celltypes.get(identifier, ('N/A', 'N/A'))
    col_MajorCellType.append( major )
    col_SubCellType.append( sub )

adata_c.obs['Author_CellType_lvl_1'] = col_MajorCellType
adata_c.obs['Author_CellType_lvl_2'] = col_SubCellType

has_author_annotation_c = adata_c.obs['Author_CellType_lvl_1'] != 'N/A'
adata_c = adata_c[has_author_annotation_c].copy()

In [50]:
# Joanito et al - Unique identifier, Author-annotated cell type (one level only)
metadata_j_epi    = pd.read_csv(PATH_TO_JOANITO_METADATA_EPI)
metadata_j_nonepi = pd.read_csv(PATH_TO_JOANITO_METADATA_NONEPI)

# Stack both author tables, keep only patients present in the AnnData object,
# and expose 'sample.origin' under the name 'BiopsySite'.
patients_in_adata_j = set(adata_j.obs['PatientID'])
metadata_j = pd.concat([ metadata_j_epi, metadata_j_nonepi ])
metadata_j = metadata_j.loc[metadata_j['patient.ID'].isin( patients_in_adata_j )].copy()
metadata_j = metadata_j.rename(columns={'sample.origin' : 'BiopsySite'})

# Barcode = text after the last '_' of cell.ID, truncated at its first '-'
metadata_j['CB'] = [
    cell_id.rsplit("_", 1)[-1].split("-", 1)[0] for cell_id in metadata_j['cell.ID']
]

# Values found in this mapping are replaced by the mapped sample ID; all others are kept as-is.
dict_MUXID_to_SampleID = {
    'MUX8579': 'XHC102',
    'MUX8580': 'XHC103',
    'MUX8581': 'XHC104',
    'MUX8582': 'XHC105',
    'MUX8583': 'XHC106',
    'MUX8584': 'XHC107',

    'MUX8815': 'XHC129',
    'MUX8816': 'XHC130',
    'MUX8817': 'XHC131',
    'MUX9005': 'XHC134',
    'MUX9006': 'XHC135',
    'MUX9007': 'XHC136',
    'MUX9008': 'XHC137',
    'MUX9009': 'XHC138',
    'MUX9010': 'XHC139',
    'MUX9322': 'XHC154',
    'MUX9380': 'XHC166',

    'MUX9011': 'XHC140',
}

# NOTE(review): the source column is selected by position (5th column of metadata_j),
# not by name - confirm which column of the author metadata this is.
col_SampleID = [
    dict_MUXID_to_SampleID.get(raw_sample_label, raw_sample_label)
    for raw_sample_label in metadata_j.iloc[:, 4]
]

metadata_j['SampleID'] = col_SampleID
metadata_j['Identifier'] = [
    f'{sample_id}-{barcode}'
    for sample_id, barcode in zip(metadata_j['SampleID'], metadata_j['CB'])
]

adata_j.obs['Identifier'] = [
    f'{sample_id}-{obs_name.split("-", 1)[0]}'
    for obs_name, sample_id in zip(adata_j.obs.index, adata_j.obs['SampleID'])
]

# NOTE(review): the cell-type column is also positional (6th column from the end,
# counted after CB / SampleID / Identifier were appended) - confirm its name.
dict_Identifier_to_Celltypes = dict(zip(metadata_j['Identifier'], metadata_j.iloc[:, -6]))

# Cells without an author annotation are labelled 'N/A' and removed below.
col_MajorCellType = [
    dict_Identifier_to_Celltypes.get(identifier, 'N/A')
    for identifier in adata_j.obs['Identifier']
]

adata_j.obs['Author_CellType_lvl_1'] = col_MajorCellType
adata_j.obs['Author_CellType_lvl_2'] = 'N/A'

has_author_annotation_j = adata_j.obs['Author_CellType_lvl_1'] != 'N/A'
adata_j = adata_j[has_author_annotation_j].copy()

  utils.warn_names_duplicates("obs")


## [2] Scanpy Preprocessing

In [54]:
# Run the shared Scanpy workflow per dataset, batching HVG selection by sample.
# The two datasets use different UMAP seeds (0 for Joanito, 42 for Chen).
adata_j = preprocessScanpy(adata=adata_j, batch_key='SampleID', random_state=0)
adata_c = preprocessScanpy(adata=adata_c, batch_key='SampleID', random_state=42)

  view_to_actual(adata)


## [3] Appending NanoMnT results

### Chen et al

In [56]:
# [1] Collect NanoMnT results and store into dictionary
metadata_c2 = pd.read_csv(PATH_TO_CHEN_METADATA2, sep='\t')

# Only samples whose TreatmentStage is 'Pre' are processed.
pretreatment_SampleIDs_c = set(metadata_c2.loc[metadata_c2['TreatmentStage'] == 'Pre', 'SampleID'])
NanoMnT_results_c = preprocessNanoMnT( pretreatment_SampleIDs_c, DIRECTORY_CHEN_CELLRANGER_OUT )

# Write the merged table, then read it back from disk.
NanoMnT_results_c.to_csv(f'{DIRECTORY_OUT}/NanoMnT.AlleleTable.Chen_et_al.tsv.gz', sep='\t', index=False, compression='gzip')
NanoMnT_results_c = pd.read_csv(f'{DIRECTORY_OUT}/NanoMnT.AlleleTable.Chen_et_al.tsv.gz', sep='\t')

# Identifier -> [mean(diff), std(diff), number of rows]
dict_Identifier_to_MSprofile_c = dict()

for Identifier, diffs_of_cell in NanoMnT_results_c.groupby("Identifier")['diff']:
    diffs_of_cell = diffs_of_cell.dropna()
    if len(diffs_of_cell) == 0:
        continue
    dict_Identifier_to_MSprofile_c[Identifier] = [ np.mean(diffs_of_cell), np.std(diffs_of_cell), len(diffs_of_cell) ]

# Cells with no NanoMnT rows get an all-zero profile so later lookups cannot fail.
for Identifier in adata_c.obs['Identifier']:
    dict_Identifier_to_MSprofile_c.setdefault(Identifier, [0, 0, 0])

  NanoMnT_results_c = pd.read_csv(f'{DIRECTORY_OUT}/NanoMnT.AlleleTable.Chen_et_al.tsv.gz', sep='\t')


In [57]:
# [2] Overlay microsatellite information to Scanpy object
# One dictionary lookup per cell; each profile is [mean diff, std diff, row count].
MSprofiles_c = [ dict_Identifier_to_MSprofile_c[Identifier] for Identifier in adata_c.obs['Identifier'] ]

adata_c.obs['AvgSTRDiff'] = [ profile[0] for profile in MSprofiles_c ]
adata_c.obs['StdSTRDiff'] = [ profile[1] for profile in MSprofiles_c ]
adata_c.obs['NumSTRLoci'] = [ profile[2] for profile in MSprofiles_c ]

# MSI score = -(mean diff) x (std diff)
adata_c.obs['MSI_score']  = -1 * adata_c.obs['AvgSTRDiff'] * adata_c.obs['StdSTRDiff']

### Joanito et al

In [58]:
# [1] Collect NanoMnT results and store into dictionary
# The merged allele table is cached on disk. If the cache is missing (e.g. a fresh
# checkout), it is rebuilt from the per-sample CellRanger outputs instead of failing.
PATH_NANOMNT_RESULTS_J = f'{DIRECTORY_OUT}/NanoMnT.AlleleTable.Joanito_et_al.tsv.gz'
try:
    NanoMnT_results_j = pd.read_csv(PATH_NANOMNT_RESULTS_J, sep='\t')
except FileNotFoundError:
    NanoMnT_results_j = preprocessNanoMnT( set(adata_j.obs['SampleID']), DIRECTORY_JOANITO_CELLRANGER_OUT )
    NanoMnT_results_j.to_csv(PATH_NANOMNT_RESULTS_J, sep='\t', index=False, compression='gzip')
    # Read back from disk so both code paths yield a table parsed the same way.
    NanoMnT_results_j = pd.read_csv(PATH_NANOMNT_RESULTS_J, sep='\t')

# Identifier -> [mean(diff), std(diff), number of rows]
dict_Identifier_to_MSprofile_j = dict()

for Identifier, edf in NanoMnT_results_j.groupby("Identifier"):
    edf_o = edf['diff'].dropna()
    if len(edf_o) > 0:
        dict_Identifier_to_MSprofile_j[Identifier] = [ np.mean(edf_o), np.std(edf_o), len(edf_o) ]

# Cells with no NanoMnT rows get an all-zero profile so later lookups cannot fail.
for Identifier in adata_j.obs['Identifier']:
    dict_Identifier_to_MSprofile_j.setdefault(Identifier, [0, 0, 0])

In [59]:
# [2] Overlay microsatellite information to Scanpy object
# One dictionary lookup per cell; each profile is [mean diff, std diff, row count].
MSprofiles_j = [ dict_Identifier_to_MSprofile_j[Identifier] for Identifier in adata_j.obs['Identifier'] ]

adata_j.obs['AvgSTRDiff'] = [ profile[0] for profile in MSprofiles_j ]
adata_j.obs['StdSTRDiff'] = [ profile[1] for profile in MSprofiles_j ]
adata_j.obs['NumSTRLoci'] = [ profile[2] for profile in MSprofiles_j ]

# MSI score = -(mean diff) x (std diff)
adata_j.obs['MSI_score']  = -1 * adata_j.obs['AvgSTRDiff'] * adata_j.obs['StdSTRDiff']

### Kinker et al

In [33]:
# [1] Collect NanoMnT results and store into dictionary
''' 
Unlike Joanito et al. and Chen et al. datasets, you don't need to run CellRanger for Kinker et al. dataset.
Instead, BAM files are available for download at: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE157220 
You can directly run NanoMnT getAlleleTable on these BAM files and merge them. 
'''
NanoMnT_results_k = pd.read_csv(PATH_TO_KINKER_NANOMNT, sep='\t')

# Pool ID = second '_'-separated field of pool_id; Identifier = CB up to its first '-', plus the pool ID
NanoMnT_results_k['PoolID'] = [ str(pool_label.split('_')[1]) for pool_label in NanoMnT_results_k['pool_id'] ]
NanoMnT_results_k['Identifier'] = [
    f'{barcode.split("-")[0]}-{pool}'
    for barcode, pool in zip(NanoMnT_results_k['CB'], NanoMnT_results_k['PoolID'])
]

# NOTE(review): the table is written BEFORE the filters below are applied and before
# 'diff' is computed - confirm this is intended.
NanoMnT_results_k.to_csv(f'{DIRECTORY_OUT}/NanoMnT.AlleleTable.Kinker_et_al.tsv.gz', sep='\t', index=False, compression='gzip')

# Flankings containing '*' or any lowercase character are marked 'Poor'.
col_flanking_quality = [
    'Poor' if ('*' in both_flankings or both_flankings.upper() != both_flankings) else 'Good'
    for both_flankings in (
        f'{left}{right}'
        for left, right in zip(NanoMnT_results_k['left_flanking_seq'], NanoMnT_results_k['right_flanking_seq'])
    )
]
NanoMnT_results_k['flanking_quality'] = col_flanking_quality

# Filters, applied in order: good flankings -> A/T repeats -> no missing values -> reference allele <= 24
NanoMnT_results_k = NanoMnT_results_k.loc[NanoMnT_results_k['flanking_quality'] == 'Good'].copy()
NanoMnT_results_k = NanoMnT_results_k.loc[NanoMnT_results_k['repeat_unit'].isin(['A', 'T'])].copy()
NanoMnT_results_k = NanoMnT_results_k.dropna()
NanoMnT_results_k = NanoMnT_results_k.loc[NanoMnT_results_k['reference_STR_allele'] <= 24].copy()

NanoMnT_results_k['diff'] = NanoMnT_results_k['read_STR_allele'] - NanoMnT_results_k['reference_STR_allele']
NanoMnT_results_k = NanoMnT_results_k.reset_index(drop=True)

# Identifier -> [mean(diff), std(diff), number of rows]
dict_Identifier_to_MSprofile_k = dict()

for Identifier, diffs_of_cell in NanoMnT_results_k.groupby("Identifier")['diff']:
    diffs_of_cell = diffs_of_cell.dropna()
    if len(diffs_of_cell) == 0:
        continue
    dict_Identifier_to_MSprofile_k[Identifier] = [ np.mean(diffs_of_cell), np.std(diffs_of_cell), len(diffs_of_cell) ]

# Cells with no NanoMnT rows get an all-zero profile so later lookups cannot fail.
for Identifier in adata_k.obs['Identifier']:
    dict_Identifier_to_MSprofile_k.setdefault(Identifier, [0, 0, 0])

In [35]:
# [2] Overlay microsatellite information to Scanpy object
# One dictionary lookup per cell; each profile is [mean diff, std diff, row count].
MSprofiles_k = [ dict_Identifier_to_MSprofile_k[Identifier] for Identifier in adata_k.obs['Identifier'] ]

adata_k.obs['AvgSTRDiff'] = [ profile[0] for profile in MSprofiles_k ]
adata_k.obs['StdSTRDiff'] = [ profile[1] for profile in MSprofiles_k ]
adata_k.obs['NumSTRLoci'] = [ profile[2] for profile in MSprofiles_k ]

# MSI score = -(mean diff) x (std diff)
adata_k.obs['MSI_score']  = -1 * adata_k.obs['AvgSTRDiff'] * adata_k.obs['StdSTRDiff']

### Save results

In [60]:
# Persist the three annotated AnnData objects (Kinker, Chen, Joanito) to DIRECTORY_OUT.
for adata_to_save, PATH_h5ad_out in (
    (adata_k, f'{DIRECTORY_OUT}/adata_k.preprocessed.NanoMnT.h5ad'),
    (adata_c, f'{DIRECTORY_OUT}/adata_c.preprocessed.NanoMnT.h5ad'),
    (adata_j, f'{DIRECTORY_OUT}/adata_j.preprocessed.NanoMnT.h5ad'),
):
    adata_to_save.write(PATH_h5ad_out)