## Testing different LUAD and ESCC cancer EMT signatures

In [None]:
from IPython.display import display, HTML

# Inject a CSS rule into the notebook page so elements with class `container`
# use 95% of the available width.
notebook_width_css = "<style>.container { width:95% !important; }</style>"
display(HTML(notebook_width_css))

In [None]:
import os 
import sys

import pandas as pd
import numpy as np
import scanpy as sc
import matplotlib.pyplot as plt

sys.path.append('../../..')
from data.load_data import load_datasets
from data.constants import BASE_PATH_DATA, BASE_PATH_EXPERIMENTS

from signaturescoring import score_signature
from signaturescoring.utils.utils import get_mean_and_variance_gene_expression

In [None]:
# Global matplotlib styling for every figure in this notebook.
# NOTE(review): fonttype 42 is presumably chosen so text in exported PDFs stays
# editable (TrueType embedding) - confirm against the matplotlib rcParams docs.
plt.rcParams.update({
    'pdf.fonttype': 42,
    'font.family': 'sans-serif',
    'font.sans-serif': 'Arial',
    'font.size': 14,
})

sc.settings.verbosity = 2

# Dataset to analyse; also used as the sub-folder name for the barcode files.
dataset = 'luad_xing'
pl_size = 6  # not referenced in the cells visible here

# Root folder of the EMT case study and the dataset-specific folder below it.
base_path_emt_exp = os.path.join(BASE_PATH_EXPERIMENTS, 'EMT_signature_scoring_case_study')
base_path_barcodes = os.path.join(base_path_emt_exp, dataset)

save = False  # not referenced in the cells visible here

Load preprocessed dataset 

In [None]:
# Load the preprocessed dataset (mean normalisation variant).
orig_adata = load_datasets(dataset, preprocessed=True, norm_method='mean')

# Guarantee that uns['log1p'] exists and that its 'base' entry is None.
# NOTE(review): presumably a workaround for downstream code that looks up
# uns['log1p']['base'] - confirm which call needs it.
if 'log1p' not in orig_adata.uns:
    orig_adata.uns['log1p'] = {}
orig_adata.uns['log1p']['base'] = None

In [None]:
# Show how many cells each annotated cell type has before filtering. A bare
# expression is only rendered when it is the last statement of a notebook
# cell, so the counts are printed explicitly instead of being discarded.
print(orig_adata.obs.celltype.value_counts())

# Drop all granulocytes and continue with a copy of the remaining cells.
orig_adata = orig_adata[orig_adata.obs.celltype != 'Granulocytes'].copy()

Load cancer EMT barcodes

In [None]:
# Read the barcodes of cancer cells annotated as undergoing EMT. The barcodes
# are taken from the CSV column labelled '0'.
cancer_emt_barcode_file = os.path.join(base_path_barcodes, 'barcodes_cancer_emt.csv')
cancer_emt_barcode_table = pd.read_csv(cancer_emt_barcode_file)

barcodes_cancer_emt_cells = cancer_emt_barcode_table['0']
barcodes_cancer_emt_cells.name = 'cancer_emt_cells'

In [None]:
# Barcodes of all fibroblasts followed by the barcodes of the cancer EMT cells.
is_fibroblast = orig_adata.obs['celltype'] == 'Fibroblast'
fibroblast_barcodes = orig_adata.obs.loc[is_fibroblast].index.tolist()

barcodes_caf_emt_mes_cells = pd.Series(
    fibroblast_barcodes + barcodes_cancer_emt_cells.tolist()
)

In [None]:
# Start 'celltype_broad' as a string-typed copy of the original annotation so
# that new labels can be written into it afterwards.
orig_adata.obs['celltype_broad'] = orig_adata.obs['celltype'].astype(str)

In [None]:
# Boolean mask over all cells: True where the barcode is neither a fibroblast
# nor a cancer EMT barcode.
cells_not_cafs_and_cancer_emt = ~orig_adata.obs.index.isin(barcodes_caf_emt_mes_cells)

In [None]:
# Relabel 'celltype_broad' with single .loc assignments. The previous chained
# form obs[col][rows] = value writes into an intermediate Series, which raises
# SettingWithCopyWarning and leaves .obs untouched under pandas Copy-on-Write.

# Cancer cells listed in the EMT barcode file get their own label.
orig_adata.obs.loc[barcodes_cancer_emt_cells.tolist(), 'celltype_broad'] = 'Malignant with EMT'

# Every cell that is neither fibroblast, cancer EMT nor malignant becomes 'rest'.
rest_mask = cells_not_cafs_and_cancer_emt & (orig_adata.obs['celltype'] != 'Malignant')
orig_adata.obs.loc[rest_mask, 'celltype_broad'] = 'rest'

orig_adata.obs['celltype_broad'].value_counts().sort_index()

In [None]:
# 'celltype_broader' starts as a string-typed copy of the original annotation;
# unlike 'celltype_broad', no cells are collapsed into 'rest' in the cells
# visible here.
orig_adata.obs['celltype_broader'] = orig_adata.obs['celltype'].astype('str')

In [None]:
# Carry the two malignant labels over from 'celltype_broad' into
# 'celltype_broader', in the same order as before.
for malignant_label in ('Malignant with EMT', 'Malignant'):
    has_label = orig_adata.obs['celltype_broad'] == malignant_label
    orig_adata.obs.loc[has_label, 'celltype_broader'] = malignant_label

orig_adata.obs['celltype_broader'].value_counts().sort_index()

### Test existing unioned signatures  

In [None]:
# Maps signature name -> list of genes (a dict, despite the name).
sig_list = dict()

In [None]:
# Folders holding the signature CSV files. Note that the LUNG1_ESCC2 union
# signatures are stored below the 'escc' folder.
escc_sig_dir = os.path.join(base_path_emt_exp, 'escc', 'dataset_specific_emt_sig')
luad_sig_dir = os.path.join(base_path_emt_exp, 'luad_xing', 'dataset_specific_emt_sig')
union_sig_dir = os.path.join(base_path_emt_exp, 'escc', 'union_emt_sigs')

# For every signature, read the CSV and store its second column (position 1)
# as the gene list under the signature name.

# Dataset-specific ESCC signatures.
ESOPHAG_CANCER_EMT_SIGNATURE_1 = pd.read_csv(
    os.path.join(escc_sig_dir, 'ESOPHAG_CANCER_EMT_SIGNATURE_1.csv')
)
sig_list['ESOPHAG_CANCER_EMT_SIGNATURE_1'] = ESOPHAG_CANCER_EMT_SIGNATURE_1.iloc[:, 1].tolist()

ESOPHAG_CANCER_EMT_SIGNATURE_2 = pd.read_csv(
    os.path.join(escc_sig_dir, 'ESOPHAG_CANCER_EMT_SIGNATURE_2.csv')
)
sig_list['ESOPHAG_CANCER_EMT_SIGNATURE_2'] = ESOPHAG_CANCER_EMT_SIGNATURE_2.iloc[:, 1].tolist()

# Dataset-specific LUAD signatures.
LUNG_CANCER_EMT_SIGNATURE_1 = pd.read_csv(
    os.path.join(luad_sig_dir, 'LUNG_CANCER_EMT_SIGNATURE_1.csv')
)
sig_list['LUNG_CANCER_EMT_SIGNATURE_1'] = LUNG_CANCER_EMT_SIGNATURE_1.iloc[:, 1].tolist()

LUNG_CANCER_EMT_SIGNATURE_2 = pd.read_csv(
    os.path.join(luad_sig_dir, 'LUNG_CANCER_EMT_SIGNATURE_2.csv')
)
sig_list['LUNG_CANCER_EMT_SIGNATURE_2'] = LUNG_CANCER_EMT_SIGNATURE_2.iloc[:, 1].tolist()

# Union signatures.
LUNG1_ESCC2_CANCER_EMT_SIGNATURE_1 = pd.read_csv(
    os.path.join(union_sig_dir, 'LUNG1_ESCC2_CANCER_EMT_SIGNATURE_1.csv')
)
sig_list['LUNG1_ESCC2_CANCER_EMT_SIGNATURE_1'] = LUNG1_ESCC2_CANCER_EMT_SIGNATURE_1.iloc[:, 1].tolist()

LUNG1_ESCC2_CANCER_EMT_SIGNATURE_2 = pd.read_csv(
    os.path.join(union_sig_dir, 'LUNG1_ESCC2_CANCER_EMT_SIGNATURE_2.csv')
)
sig_list['LUNG1_ESCC2_CANCER_EMT_SIGNATURE_2'] = LUNG1_ESCC2_CANCER_EMT_SIGNATURE_2.iloc[:, 1].tolist()

LUNG1_ESCC2_CANCER_EMT_SIGNATURE_3 = pd.read_csv(
    os.path.join(union_sig_dir, 'LUNG1_ESCC2_CANCER_EMT_SIGNATURE_3.csv')
)
sig_list['LUNG1_ESCC2_CANCER_EMT_SIGNATURE_3'] = LUNG1_ESCC2_CANCER_EMT_SIGNATURE_3.iloc[:, 1].tolist()



In [None]:
# Compute the gene-expression statistics table once, so the same table can be
# passed to every scoring call instead of being recomputed per signature.
df_mean_var = get_mean_and_variance_gene_expression(
    orig_adata,
    estim_var=False,
)

In [None]:
# Score every loaded signature on the dataset with the same method and settings.
# Later cells read orig_adata.obs[<signature name>], so the scores are
# presumably written to .obs under `score_name` - confirm in score_signature.
for signature_name, signature_genes in sig_list.items():
    print(signature_name, len(signature_genes))
    score_signature(
        adata=orig_adata,
        gene_list=signature_genes,
        method="adjusted_neighborhood_scoring",
        ctrl_size=100,
        df_mean_var=df_mean_var,
        score_name=signature_name,
    )
    print()

In [None]:
# Binary ground truth per cell: 'Malignant with EMT' is kept as the positive
# label and every other cell type is collapsed into 'Rest'.
gt = orig_adata.obs['celltype_broader'].copy().astype(str)

is_cancer_emt = gt == 'Malignant with EMT'
gt[~is_cancer_emt] = 'Rest'

In [None]:
# Number of cells per 'celltype_broader' label, ordered by label.
orig_adata.obs['celltype_broader'].value_counts().sort_index()

In [None]:
# Barcode subsets for the two restricted comparisons:
# cancer EMT cells vs. fibroblasts, and cancer EMT cells vs. other malignant cells.
broader_labels = orig_adata.obs['celltype_broader']

in_caf_or_emt = broader_labels.isin(['Malignant with EMT', 'Fibroblast'])
caf_and_cancer_emt = broader_labels[in_caf_or_emt].index.tolist()

in_cancer_or_emt = broader_labels.isin(['Malignant with EMT', 'Malignant'])
cancer_and_cancer_emt = broader_labels[in_cancer_or_emt].index.tolist()

In [None]:
# Sizes of the two evaluation subsets, as a (CAF + EMT, cancer + EMT) tuple.
tuple(len(barcodes) for barcodes in (caf_and_cancer_emt, cancer_and_cancer_emt))

In [None]:
# Re-apply the plot styling, draw a histogram of one signature's scores and
# close the figure again straight away.
plt.rcParams.update({
    'pdf.fonttype': 42,
    'font.family': 'sans-serif',
    'font.sans-serif': 'Arial',
    'font.size': 14,
})
orig_adata.obs['ESOPHAG_CANCER_EMT_SIGNATURE_1'].hist()
plt.close()

In [None]:
from sklearn.metrics import precision_recall_curve, auc

plt.rcParams.update({
    'pdf.fonttype': 42,
    'font.family': 'sans-serif',
    'font.sans-serif': 'Arial',
    'font.size': 14,
})

# Label treated as the positive class in every precision-recall curve.
pos_lbl = 'Malignant with EMT'


def _aucpr(labels, scores, positive_label):
    """Return the area under the precision-recall curve of `scores` vs. `labels`."""
    precision, recall, _ = precision_recall_curve(labels, scores, pos_label=positive_label)
    return auc(recall, precision)


# One figure per signature: per-cell-type score histograms, with the AUCPR of
# three comparisons (all cells, CAF subset, malignant subset) in the title.
for score_name in sig_list:
    plt.figure(figsize=(10, 8))

    signature_scores = orig_adata.obs[score_name]

    lr_auc = _aucpr(gt, signature_scores, pos_lbl)
    lr_auc_caf_and_emt = _aucpr(
        gt[caf_and_cancer_emt], signature_scores.loc[caf_and_cancer_emt], pos_lbl
    )
    lr_auc_cancer_and_emt = _aucpr(
        gt[cancer_and_cancer_emt], signature_scores.loc[cancer_and_cancer_emt], pos_lbl
    )

    # Overlay one semi-transparent density histogram per cell type.
    for group_label, group_obs in orig_adata.obs.groupby('celltype_broader'):
        group_obs[score_name].hist(bins=100, density=True, alpha=0.5, label=group_label)

    title_parts = (
        score_name,
        '\nAUCPR Malignant with EMT vs. Rest ', str(np.round(lr_auc, decimals=3)),
        '\nAUCPR Malignant with EMT vs. CAFS ', str(np.round(lr_auc_caf_and_emt, decimals=3)),
        '\nAUCPR Malignant with EMT vs. Malignant ', str(np.round(lr_auc_cancer_and_emt, decimals=3)),
    )
    plt.title(''.join(title_parts), fontsize=16)
    plt.legend()
    plt.ylim([0, 20])
    plt.tight_layout()
    plt.show()

In [None]:
# Violin plots of every signature score, grouped by 'celltype_broader'.
# show=False defers rendering to the explicit plt.show() below.
sc.pl.violin(
    orig_adata,
    keys=list(sig_list),
    groupby='celltype_broader',
    rotation=90,
    show=False,
)
plt.show()