## Get cancer EMT cells for CRC
Score multiple pan-cancer EMT signatures. Convert the scores to ranks and compute the median rank per cell.
Cancer EMT cells are the cells within the 10% of cells with the smallest median ranks.

In [None]:
from IPython.display import display, HTML
# Inject CSS so that the notebook container uses 90% of the page width.
display(HTML("<style>.container { width:90% !important; }</style>"))

In [None]:
import os 
import sys

sys.path.append('../../..')

import pandas as pd
import numpy as np
import scanpy as sc
import matplotlib.pyplot as plt
from matplotlib_venn import venn2

sys.path.append('../../..')
from data.load_data import load_datasets
from data.constants import BASE_PATH_DATA, BASE_PATH_EXPERIMENTS

from signaturescoring import score_signature
from signaturescoring.utils.utils import get_mean_and_variance_gene_expression, check_signature_genes

In [None]:
# scanpy logging verbosity.
sc.settings.verbosity = 2

# NOTE(review): `pl_size` is not referenced in the visible part of this notebook.
pl_size = 6

# When True, the `if save:` branches below write figures and CSV files to disk.
save = True

# Dataset key: passed to `load_datasets` and used as the name of the results sub-folder.
dataset = 'crc'

In [None]:
# All outputs of this notebook are written below BASE_PATH_EXPERIMENTS,
# in a sub-folder named after the dataset.
storing_path = os.path.join(BASE_PATH_EXPERIMENTS, f'EMT_signature_scoring_case_study/{dataset}')
directory_missing = not os.path.exists(storing_path)
if directory_missing:
    # Create the folder (including parents) and log that it is new.
    os.makedirs(storing_path)
    sc.logging.info(f'Creating new directory to store the results.')

Load preprocessed data

In [None]:
adata = load_datasets(dataset, preprocessed=True, norm_method='mean')
# Make sure adata.uns['log1p']['base'] exists and is None, whether or not a
# 'log1p' entry came with the loaded data.
# NOTE(review): presumably required by scanpy routines that look up this key -- confirm.
adata.uns.setdefault('log1p', {})['base'] = None

In [None]:
# Number of cells per annotated cell type, ordered by cell-type name.
adata.obs.celltype.value_counts().sort_index()

In [None]:
# PCA -> neighborhood graph -> UMAP embedding (all with scanpy defaults);
# the embedding is used by the sc.pl.umap plots below.
sc.tl.pca(adata)
sc.pp.neighbors(adata)
sc.tl.umap(adata)

In [None]:
# Figure defaults: font type 42 for PDF output, Arial sans-serif at size 14.
plt.rcParams.update({'pdf.fonttype':42, 'font.family':'sans-serif', 'font.sans-serif':'Arial', 'font.size':14})

# UMAP of all cells, one panel (stacked in a single column) per annotation column.
sc.pl.umap(adata, color=['sample_id','malignant_key', 'celltype', 'malignant_celltype', 'full_trans_celltype'], ncols=1)

### Select cancer EMT cells and store barcodes

In [None]:
import glob
import json

def get_sig_from_emtome_sig_file(filepath):
    """Read the gene list of one EMTome signature text file.

    The first two lines of the file are skipped. From every remaining line the
    line terminator and the first character are removed (presumably a leading
    delimiter written by the EMTome export -- TODO confirm against the files).

    Args:
        filepath: Path to the signature text file.

    Returns:
        list of str: one entry per non-empty remaining line, in file order.

    Raises:
        FileNotFoundError: if `filepath` does not exist (raised by `open`).
    """
    with open(filepath, 'r') as f:
        # Drop the two leading non-gene lines.
        gene_lines = f.readlines()[2:]

    # Remove only the newline before dropping the leading character: slicing
    # with [1:-1] would cut the last letter off a gene that sits on a final
    # line without a trailing newline.
    genes = [line.rstrip('\n')[1:] for line in gene_lines]

    # Blank lines would otherwise end up as empty gene names.
    return [gene for gene in genes if gene]

## define base path to emt signatures
base_path_emt_signatures = os.path.join(BASE_PATH_DATA,'annotations/emt')

# Pan-cancer EMTome signature files, in a deterministic (sorted) order.
pan_cancer_emt_sig_files = sorted(glob.glob(base_path_emt_signatures+"/sigs_from_emtome/pan_cancer/*.txt"))

# Map signature name (file name up to its first '.') -> gene list read from the file.
# os.path.basename is used instead of splitting on '/' so that the name is
# also extracted correctly when the OS path separator is not '/'.
pan_cancer_emt_sigs = {
    os.path.basename(sig_file).split('.')[0]: get_sig_from_emtome_sig_file(sig_file)
    for sig_file in pan_cancer_emt_sig_files
}

# Add the hallmark EMT gene set (the 'geneSymbols' entry of the gene-set JSON file).
with open(base_path_emt_signatures+'/HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION.v7.5.1.json', 'r') as f:
    hemt = json.load(f)

pan_cancer_emt_sigs['hallmark_emt'] = hemt['HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION']['geneSymbols']

In [None]:
# Gene modules table: one column per module, padded with NaN where a module
# has fewer genes than the longest one (hence the dropna below).
GM_B_22 = pd.read_csv(base_path_emt_signatures+'/gene_modules_from_Barkley_et_al_2022.csv')

mes_gm = GM_B_22['Mesenchymal'].dropna().tolist()
pEMT_gm = GM_B_22['pEMT'].dropna().tolist()
cEMT = GM_B_22['cEMT'].dropna().tolist()
# NOTE(review): 'pEMT.1' is presumably a second CSV column that is also named
# 'pEMT' (pandas de-duplicates repeated headers with a '.1' suffix) -- confirm.
pEMT = GM_B_22['pEMT.1'].dropna().tolist()

In [None]:
# Add the pEMT gene module (from the Barkley et al. 2022 table) to the signatures that will be scored.
pan_cancer_emt_sigs['pEMT_gm'] = pEMT_gm

In [None]:
# Names of all signatures that will be scored.
pan_cancer_emt_sigs.keys()

In [None]:
# Gene expression statistics, computed once and passed to every score_signature call below (df_mean_var).
df_mean_var = get_mean_and_variance_gene_expression(adata, estim_var=False)

In [None]:
# Score every EMT signature on all cells. The signature name is used as
# `score_name`; the same names are later used to look the scores up in `.obs`.
for sig_name, sig_genes in pan_cancer_emt_sigs.items():
    score_signature(
        method="adjusted_neighborhood_scoring",
        adata=adata,
        gene_list=sig_genes,
        ctrl_size=100,
        df_mean_var=df_mean_var,
        score_name=sig_name,
    )

In [None]:
plt.rcParams.update({'pdf.fonttype':42, 'font.family':'sans-serif', 'font.sans-serif':'Arial', 'font.size':14})

# UMAP restricted to malignant cells: the annotation columns followed by one
# panel per EMT signature score.
umap_colors = ['sample_id','malignant_key', 'celltype', 'malignant_celltype', 'full_trans_celltype']+list(pan_cancer_emt_sigs.keys())
with plt.rc_context({'figure.figsize':(10,8)}):
    fig = sc.pl.umap(adata[adata.obs.malignant_key =='malignant'],
                     color=umap_colors,
                     ncols=4,
                     return_fig=True)
    if save:
        curr_path = os.path.join(storing_path, 'cancer_emt_barcode_selection')
        # exist_ok=True replaces the racy exists()-then-mkdir() pair and also
        # creates missing parent folders.
        os.makedirs(curr_path, exist_ok=True)
        fig.savefig(os.path.join(curr_path, 'mal_cells_umap_emt_sigs.png'), dpi=600)

In [None]:
# The signature names double as the names of the score columns.
score_names_pan_cancer = list(pan_cancer_emt_sigs.keys())

In [None]:
# Independent copy of the malignant cells only; the ranking below is done within this subset.
cancer_cells  = adata[adata.obs.malignant_key=='malignant'].copy()

In [None]:
# Will hold one Series of per-cell ranks for each signature score.
cell_ranks_for_score_names =[]

In [None]:
# Rank the malignant cells separately for every signature score:
# rank 1 = highest score (ascending=False); cells with a missing score are
# ranked last (na_option='bottom').
# The list is rebuilt from scratch so that re-running this cell does not
# append a second copy of every rank column.
cell_ranks_for_score_names = [
    cancer_cells.obs[col].rank(na_option='bottom', ascending=False)
    for col in score_names_pan_cancer
]

In [None]:
# One column of ranks per signature, with the rows aligned on the cell index.
ranked_cells = pd.concat(cell_ranks_for_score_names, axis=1)

In [None]:
# Reorder the cells by their median rank across signatures (smallest median rank first).
sorted_ranks = ranked_cells.loc[ranked_cells.median(axis=1).sort_values().index,:]

In [None]:
plt.rcParams.update({'pdf.fonttype':42, 'font.family':'sans-serif', 'font.sans-serif':'Arial', 'font.size':14})

# Median rank per cell, computed once and reused for the histogram and the cut-off line.
median_ranks = sorted_ranks.median(axis=1)
median_ranks.hist(bins=100)
# Red line: 10% quantile of the median ranks.
plt.axvline(median_ranks.quantile(0.1), c='r', label='quantile 0.1')
plt.title('Distribution median ranks.')
# Without a legend the label given to axvline is never displayed.
plt.legend()
if save:
    curr_path = os.path.join(storing_path, 'cancer_emt_barcode_selection')
    # exist_ok=True replaces the racy exists()-then-mkdir() pair.
    os.makedirs(curr_path, exist_ok=True)
    plt.savefig(os.path.join(curr_path, 'distr_median_ranks.png'), dpi=600)

In [None]:
# Median rank of every malignant cell across all signatures.
sorted_ranks_median = sorted_ranks.median(axis=1)

In [None]:
# Cancer EMT cells: cells whose median rank is at or below the 10% quantile of all median ranks.
barcodes_cancer_emt = sorted_ranks_median[sorted_ranks_median <= sorted_ranks_median.quantile(0.1)].index.tolist()

In [None]:
## store the cancer emt cell barcodes as a csv file inside storing_path
if save:
    pd.Series(barcodes_cancer_emt).to_csv(os.path.join(storing_path, 'barcodes_cancer_emt.csv'))

In [None]:
# New annotation column: the cell types as plain strings, with the two cell
# types of interest renamed for display.
celltype_display_names = {'Epi': 'Malignant', 'Fibro': 'Fibroblast'}
adata.obs['celltype_emt'] = adata.obs['celltype'].astype(str).replace(celltype_display_names)

In [None]:
# Relabel the selected barcodes. This runs while the column still holds plain
# strings; it is converted to a categorical in the next cell.
adata.obs.loc[barcodes_cancer_emt, 'celltype_emt'] = 'Malignant with EMT'

In [None]:
# Convert the finished labels to a categorical column.
adata.obs['celltype_emt'] = adata.obs['celltype_emt'].astype('category')

In [None]:
# Number of cells per label, including the new 'Malignant with EMT' group.
adata.obs['celltype_emt'].value_counts().sort_index()

In [None]:
plt.rcParams.update({'pdf.fonttype':42, 'font.family':'sans-serif', 'font.sans-serif':'Arial', 'font.size':14})

# UMAP restricted to the 'Epi' and 'Fibro' cell types: one panel per EMT
# signature score, followed by the sample and cell-type annotations.
epi_and_fibro = adata[adata.obs.celltype.isin(['Epi', 'Fibro'])]
with plt.rc_context({'figure.figsize':(10,8)}):
    umap_fig = sc.pl.umap(epi_and_fibro,
                          color=score_names_pan_cancer+['sample_id', 'celltype', 'celltype_emt'],
                          return_fig=True,
                          cmap='viridis')
    if save:
        curr_path = os.path.join(storing_path, 'cancer_emt_barcode_selection')
        # exist_ok=True replaces the racy exists()-then-mkdir() pair.
        os.makedirs(curr_path, exist_ok=True)
        umap_fig.savefig(os.path.join(curr_path, 'mal_n_caf_cells_umap_emt_sigs.png'), dpi=600)

In [None]:
# NOTE(review): the same conversion is already done in an earlier cell; this repeats it.
adata.obs.celltype_emt = adata.obs.celltype_emt.astype('category') 

In [None]:
# Sample and label of every cell; input for the cross-tabulations below.
tmp = adata.obs[['sample_id','celltype_emt']]

In [None]:
# Samples x labels table; normalize="index" turns every row into fractions of that sample's cells.
cross_tab_prop = pd.crosstab(index=tmp['sample_id'],
                             columns=tmp['celltype_emt'],
                             normalize="index")

In [None]:
# Fixed column order for the plot and tables: the two malignant groups and
# fibroblasts first, then the remaining cell types.
cols = ['Malignant with EMT','Malignant', 'Fibroblast', 'B', 'DC', 
        'Endo',  'Granulo', 'ILC', 'Macro', 'Mast', 'Mono', 'NK', 
        'Peri', 'Plasma', 'Schwann', 'SmoothMuscle', 'TCD4', 'TCD8', 'TZBTB16', 'Tgd']
# Selecting by label raises a KeyError if a listed cell type is missing from the table.
cross_tab_prop = cross_tab_prop[cols]

In [None]:
plt.rcParams.update({'pdf.fonttype':42, 'font.family':'sans-serif', 'font.sans-serif':'Arial', 'font.size':14})

# Stacked bar chart: one bar per sample, split by the fraction of each label.
cross_tab_prop.plot(kind='bar',
                    stacked=True,
                    colormap='tab20',
                    figsize=(15, 8))
# Legend placed to the right of the axes.
plt.legend(loc='center left',bbox_to_anchor=(1.0, 0.5),ncol=1)
plt.xlabel("Sample ID")
plt.ylabel("Proportions")
plt.tight_layout()

if save:
    curr_path = os.path.join(storing_path, 'proportions')
    # exist_ok=True replaces the racy exists()-then-mkdir() pair.
    os.makedirs(curr_path, exist_ok=True)
    plt.savefig(os.path.join(curr_path, 'distribution_celltypes.svg'))
    cross_tab_prop.to_csv(os.path.join(curr_path, 'proportions_celltype.csv'))

In [None]:
plt.rcParams.update({'pdf.fonttype':42, 'font.family':'sans-serif', 'font.sans-serif':'Arial', 'font.size':14})

# Absolute number of cells per sample and label, in the column order given by `cols`.
prop_counts = pd.crosstab(index=tmp['sample_id'],
                          columns=tmp['celltype_emt'])
prop_counts = prop_counts[cols]

if save:
    curr_path = os.path.join(storing_path, 'proportions')
    # exist_ok=True replaces the racy exists()-then-mkdir() pair.
    os.makedirs(curr_path, exist_ok=True)
    prop_counts.to_csv(os.path.join(curr_path, 'counts_celltype.csv'))

In [None]:
# The original expression ended in an unterminated string literal (a syntax error).
# It is completed to the directory from which the cells below read the EMT
# signature files.
# NOTE(review): the intended path is an assumption -- confirm or delete this cell.
os.path.join(BASE_PATH_DATA, 'annotations/emt')

### Score crc cells for ESOPHAG_CANCER_EMT signature

In [None]:
# Each signature CSV is read and its second column (position 1) is taken as the gene list.
# NOTE(review): presumably the first column is a saved index -- confirm against the files.
ESOPHAG_CANCER_EMT_SIGNATURE_1 = (
    pd.read_csv(os.path.join(BASE_PATH_DATA, 'annotations/emt/ESOPHAG_CANCER_EMT_SIGNATURE_1.csv'))
    .iloc[:, 1]
    .tolist()
)

ESOPHAG_CANCER_EMT_SIGNATURE_2 = (
    pd.read_csv(os.path.join(BASE_PATH_DATA, 'annotations/emt/ESOPHAG_CANCER_EMT_SIGNATURE_2.csv'))
    .iloc[:, 1]
    .tolist()
)

In [None]:
def _read_emt_signature(signature_name):
    """Return the second column of annotations/emt/<signature_name>.csv as a list."""
    csv_path = os.path.join(BASE_PATH_DATA, f'annotations/emt/{signature_name}.csv')
    return pd.read_csv(csv_path).iloc[:, 1].tolist()


# Signatures stored under the LUNG1/LUNG2 ESCC2 file names.
LUNG1_ESCC2_CANCER_EMT_SIGNATURE_1 = _read_emt_signature('LUNG1_ESCC2_CANCER_EMT_SIGNATURE_1')
LUNG1_ESCC2_CANCER_EMT_SIGNATURE_2 = _read_emt_signature('LUNG1_ESCC2_CANCER_EMT_SIGNATURE_2')
LUNG2_ESCC2_CANCER_EMT_SIGNATURE_1 = _read_emt_signature('LUNG2_ESCC2_CANCER_EMT_SIGNATURE_1')
LUNG2_ESCC2_CANCER_EMT_SIGNATURE_2 = _read_emt_signature('LUNG2_ESCC2_CANCER_EMT_SIGNATURE_2')

# Signatures stored under the "notrefined" file names.
LUNG1_notrefined_ESCC2_CANCER_EMT_SIGNATURE_1 = _read_emt_signature('LUNG1_notrefined_ESCC2_CANCER_EMT_SIGNATURE_1')
LUNG1_notrefined_ESCC2_CANCER_EMT_SIGNATURE_2 = _read_emt_signature('LUNG1_notrefined_ESCC2_CANCER_EMT_SIGNATURE_2')
LUNG1_notrefined_ESCC2_CANCER_EMT_SIGNATURE_3 = _read_emt_signature('LUNG1_notrefined_ESCC2_CANCER_EMT_SIGNATURE_3')

In [None]:
# Hallmark EMT gene set: the 'geneSymbols' entry of the gene-set JSON file.
with open(os.path.join(BASE_PATH_DATA, 'annotations/emt/HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION.v7.5.1.json'), 'r') as f:
    hemt = json.load(f)
hallmark_emt = hemt['HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION']['geneSymbols']

In [None]:
# Gene modules table (same file that is read in the barcode-selection part above).
GM_B_22 = pd.read_csv(os.path.join(BASE_PATH_DATA,'annotations/emt/gene_modules_from_Barkley_et_al_2022.csv'))

In [None]:
# Gene lists of four module columns; dropna removes the NaN padding of
# columns that hold fewer genes than the longest one.
mes_gm = GM_B_22['Mesenchymal'].dropna().tolist()
pEMT_gm = GM_B_22['pEMT'].dropna().tolist()
cEMT = GM_B_22['cEMT'].dropna().tolist()
# NOTE(review): 'pEMT.1' is presumably a second CSV column that is also named
# 'pEMT' (pandas de-duplicates repeated headers with a '.1' suffix) -- confirm.
pEMT = GM_B_22['pEMT.1'].dropna().tolist()

In [None]:
## Refined lung signature on ESCC
# NOTE(review): in the visible notebook this set is only used for the Venn
# diagram and in commented-out scoring code.
new_sig = {'AGRN',
 'ANXA3',
 'AREG',
 'BMP2',
 'C3orf52',
 'CAV1',
 'CDCP1',
 'CRIP2',
 'CTSV',
 'CXCL14',
 'DCBLD2',
 'DFNA5',
 'DSG2',
 'ERBB2',
 'ERO1A',
 'FBXO2',
 'FGD6',
 'FLNA',
 'FLNB',
 'FOSL1',
 'ITGA2',
 'ITGA3',
 'ITGB4',
 'KRT14',
 'LAMA5',
 'LAMB3',
 'LAMC2',
 'MET',
 'MLLT11',
 'MT2A',
 'NRG1',
 'PHLDA2',
 'PKP3',
 'PLEK2',
 'PLOD3',
 'PLXNB2',
 'PPP1R14B',
 'PPP1R14C',
 'RAMP1',
 'RP11-670E13.6',
 'S100A10',
 'S100A2',
 'SEMA3C',
 'SEMA4B',
 'SERINC2',
 'SERPINE1',
 'SERPINE2',
 'SFN',
 'SH2D3A',
 'SLC2A1',
 'SPECC1',
 'TGFBI',
 'TNC',
 'TNFRSF12A',
 'TNNT1',
 'TNS4',
 'UBE2C',
 'UCHL1',
 'UPP1',
 'WDR66'}

In [None]:
# Venn diagram of the overlap between LUNG1_ESCC2_CANCER_EMT_SIGNATURE_1 and new_sig.
venn2(
subsets= (set(LUNG1_ESCC2_CANCER_EMT_SIGNATURE_1), set(new_sig)))

In [None]:
# Number of genes in new_sig.
len(new_sig)

In [None]:
# NOTE(review): in the visible notebook this set is only used by len() and in
# commented-out scoring code; its origin is not documented here.
some_sig = {'AGRN',
 'ANXA3',
 'AREG',
 'BMP2',
 'C3orf52',
 'CAV1',
 'CDCP1',
 'COL27A1',
 'COTL1',
 'CRIP2',
 'CTSV',
 'CXCL14',
 'DCBLD2',
 'DFNA5',
 'DSG2',
 'ERBB2',
 'EREG',
 'ERO1A',
 'FBXO2',
 'FCMR',
 'FGD6',
 'FLNA',
 'FMNL3',
 'FOSL1',
 'IL4I1',
 'ITGA2',
 'ITGA3',
 'ITGB4',
 'KLHL35',
 'KRT14',
 'LAMB3',
 'LAMC2',
 'MCAM',
 'MET',
 'MLLT11',
 'MT2A',
 'NRG1',
 'PHLDA2',
 'PKP3',
 'PLEK2',
 'PLOD3',
 'PLXNB2',
 'PPP1R14B',
 'PPP1R14C',
 'RAC2',
 'RAMP1',
 'S100A10',
 'S100A2',
 'SEMA3C',
 'SEMA4B',
 'SERINC2',
 'SERPINA1',
 'SERPINE1',
 'SERPINE2',
 'SFN',
 'SH2D3A',
 'SLC2A1',
 'SPECC1',
 'TGFBI',
 'TNC',
 'TNFRSF12A',
 'TNNT1',
 'TNS4',
 'UBE2C',
 'UCHL1',
 'WDR66',
 'ZBED2'}

In [None]:
# Number of genes in some_sig.
len(some_sig)

In [None]:
# NOTE(review): `orig_adata` is not defined in the visible part of this notebook
# (the cells above only create `adata`) -- confirm where it comes from before running.
# Gene expression statistics, computed once and passed to the scoring calls below.
df_mean_var = get_mean_and_variance_gene_expression(orig_adata, estim_var=False)

In [None]:
# NOTE(review): `orig_adata` is not defined in the visible part of this notebook -- confirm its origin.
# (score name, gene list) pairs to score; the commented entries are currently disabled.
signatures_to_score = [
    ('ESOPHAG_CANCER_EMT_SIGNATURE_1', ESOPHAG_CANCER_EMT_SIGNATURE_1),
    ('ESOPHAG_CANCER_EMT_SIGNATURE_2', ESOPHAG_CANCER_EMT_SIGNATURE_2),
    # ('LUNG1_ESCC2_CANCER_EMT_SIGNATURE_1', LUNG1_ESCC2_CANCER_EMT_SIGNATURE_1),
    # ('LUNG1_ESCC2_CANCER_EMT_SIGNATURE_2', LUNG1_ESCC2_CANCER_EMT_SIGNATURE_2),
    # ('LUNG2_ESCC2_CANCER_EMT_SIGNATURE_1', LUNG2_ESCC2_CANCER_EMT_SIGNATURE_1),
    # ('LUNG2_ESCC2_CANCER_EMT_SIGNATURE_2', LUNG2_ESCC2_CANCER_EMT_SIGNATURE_2),
    ('LUNG1_notrefined_ESCC2_CANCER_EMT_SIGNATURE_1', LUNG1_notrefined_ESCC2_CANCER_EMT_SIGNATURE_1),
    ('LUNG1_notrefined_ESCC2_CANCER_EMT_SIGNATURE_2', LUNG1_notrefined_ESCC2_CANCER_EMT_SIGNATURE_2),
    ('LUNG1_notrefined_ESCC2_CANCER_EMT_SIGNATURE_3', LUNG1_notrefined_ESCC2_CANCER_EMT_SIGNATURE_3),
    ('hallmark_emt', hallmark_emt),
    ('pEMT_gm', pEMT_gm),
    # ('new_sig', new_sig), ('some_sig', some_sig)
]

for scored_name, scored_genes in signatures_to_score:
    score_signature(
        method="adjusted_neighborhood_scoring",
        adata=orig_adata,
        gene_list=scored_genes,
        ctrl_size=100,
        df_mean_var=df_mean_var,
        score_name=scored_name,
    )

In [None]:
# Binary ground truth: 'Epi with EMT' stays, every other label becomes 'Rest'.
# NOTE(review): the first part of this notebook assigns the labels
# 'Malignant with EMT' / 'Malignant' / 'Fibroblast', not 'Epi with EMT' /
# 'Epi' / 'Fibro' -- confirm that `orig_adata` really carries the labels used here.
gt = orig_adata.obs.celltype_emt.astype(str)
gt = gt.where(gt == 'Epi with EMT', 'Rest')

In [None]:
# Number of cells per celltype_emt label in orig_adata.
orig_adata.obs.celltype_emt.value_counts()

In [None]:
# Barcode lists for the two restricted comparisons:
# 'Epi with EMT' vs. 'Fibro' cells, and 'Epi with EMT' vs. 'Epi' cells.
emt_labels = orig_adata.obs.celltype_emt
is_caf_or_cancer_emt = emt_labels.isin(['Epi with EMT','Fibro' ])
is_cancer_or_cancer_emt = emt_labels.isin(['Epi with EMT','Epi' ])

caf_and_cancer_emt = emt_labels.index[is_caf_or_cancer_emt.to_numpy()].tolist()
cancer_and_cancer_emt = emt_labels.index[is_cancer_or_cancer_emt.to_numpy()].tolist()

In [None]:
# Number of cells in the 'Epi with EMT' + 'Fibro' subset.
len(caf_and_cancer_emt)

In [None]:
# Number of cells in the 'Epi with EMT' + 'Epi' subset.
len(cancer_and_cancer_emt)

In [None]:
# Names of the scored signatures, in the order in which they are evaluated below.
evaluated_signatures = [
    ('ESOPHAG_CANCER_EMT_SIGNATURE_1', ESOPHAG_CANCER_EMT_SIGNATURE_1),
    ('ESOPHAG_CANCER_EMT_SIGNATURE_2', ESOPHAG_CANCER_EMT_SIGNATURE_2),
    ('LUNG1_notrefined_ESCC2_CANCER_EMT_SIGNATURE_1', LUNG1_notrefined_ESCC2_CANCER_EMT_SIGNATURE_1),
    ('LUNG1_notrefined_ESCC2_CANCER_EMT_SIGNATURE_2', LUNG1_notrefined_ESCC2_CANCER_EMT_SIGNATURE_2),
    ('LUNG1_notrefined_ESCC2_CANCER_EMT_SIGNATURE_3', LUNG1_notrefined_ESCC2_CANCER_EMT_SIGNATURE_3),
    ('hallmark_emt', hallmark_emt),
    ('pEMT_gm', pEMT_gm),
]
sig_names = [name for name, _genes in evaluated_signatures]
           

In [None]:
from sklearn.metrics import precision_recall_curve, auc


def _aucpr(labels, scores):
    """Area under the precision-recall curve with 'Epi with EMT' as the positive class."""
    precision, recall, _ = precision_recall_curve(labels, scores, pos_label='Epi with EMT')
    return auc(recall, precision)


# Only these groups get a score histogram in the figure.
shown_celltypes = ['Epi with EMT','Epi','Fibro']

# One figure per signature: score histograms of the three groups, with the
# AUCPR of three comparisons in the title.
for score_name in sig_names:
    plt.figure(figsize=(10,8))

    scores = orig_adata.obs[score_name]

    # 'Epi with EMT' vs. all other cells, vs. 'Fibro' cells only, vs. 'Epi' cells only.
    lr_auc = _aucpr(gt, scores)
    lr_auc_caf_and_emt = _aucpr(gt[caf_and_cancer_emt], scores.loc[caf_and_cancer_emt])
    lr_auc_cancer_and_emt = _aucpr(gt[cancer_and_cancer_emt], scores.loc[cancer_and_cancer_emt])

    for celltype, cells in orig_adata.obs.groupby('celltype_emt'):
        if celltype in shown_celltypes:
            cells[score_name].hist(bins=100, density=True, alpha=0.5, label=celltype)

    title_lines = [
        score_name,
        'AUCPR cancer EMT vs. rest '+str(np.round(lr_auc, decimals=3)),
        'AUCPR cancer EMT vs. cafs '+str(np.round(lr_auc_caf_and_emt, decimals=3)),
        'AUCPR cancer EMT vs. cancer '+str(np.round(lr_auc_cancer_and_emt, decimals=3)),
    ]
    plt.title('\n'.join(title_lines), fontsize=16)
    plt.legend()
    #plt.ylim([0,20])
    plt.show()

In [None]:
# One violin plot per signature: score distribution for every celltype_emt label.
for sig_name in sig_names:
    sc.pl.violin(orig_adata, keys=sig_name, groupby='celltype_emt', rotation=90)

In [None]:
# with plt.rc_context({'figure.figsize':(10,8)}):
#     sc.pl.violin(orig_adata, keys=['new_sig','pEMT_gm'], groupby='celltype_emt', rotation=90)