## Establishment of a LUAD-specific cancer EMT signature

In [None]:
# Inject CSS so that the notebook container uses 95% of the available width.
from IPython.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [None]:
import os 
import sys

import pandas as pd
import numpy as np
import scanpy as sc
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
import seaborn as sns
from matplotlib.pyplot import rc_context
from tqdm import tqdm
from statsmodels.stats.multitest import multipletests
from scipy.stats import mannwhitneyu
from matplotlib_venn import venn2
import json

sys.path.append('../../..')
from data.load_data import load_datasets
from data.constants import BASE_PATH_EXPERIMENTS, BASE_PATH_DATA

from signaturescoring import score_signature
from signaturescoring.utils.utils import get_mean_and_variance_gene_expression, check_signature_genes

In [None]:
# Global matplotlib defaults used by all figures of this notebook.
plt.rcParams.update({'pdf.fonttype':42, 'font.family':'sans-serif', 'font.sans-serif':'Arial', 'font.size':14})

sc.settings.verbosity = 2

pl_size = 6
dataset = 'luad_xing'

# Input folders (EMT annotations, per-dataset experiment data) and the output folder of this notebook.
base_path_emt_signatures = os.path.join(BASE_PATH_DATA, f'annotations/emt')
base_path_barcodes = os.path.join(BASE_PATH_EXPERIMENTS, f'EMT_signature_scoring_case_study/{dataset}')
storing_path = os.path.join(base_path_barcodes, 'dataset_specific_emt_sig')

# Create the output folder on first use.
if not os.path.exists(storing_path):
    os.makedirs(storing_path)
    sc.logging.info(f'Creating new storing folder at {storing_path}')

# When True, figures and the final signatures are written below `storing_path`.
save = True 

Load preprocessed dataset 

In [None]:
orig_adata = load_datasets(dataset, preprocessed=True, norm_method='mean')
# Guarantee that uns['log1p'] exists and that its 'base' entry is None.
if 'log1p' not in orig_adata.uns_keys():
    orig_adata.uns['log1p'] = {}
orig_adata.uns['log1p']['base'] = None

In [None]:
orig_adata.obs.celltype.value_counts()
# Remove every cell annotated as granulocyte and keep an independent copy of the remainder.
is_not_granulocyte = orig_adata.obs.celltype != 'Granulocytes'
orig_adata = orig_adata[is_not_granulocyte].copy()

Load cancer EMT barcodes

In [None]:
# The barcodes are stored in the column named '0' of the CSV file.
barcodes_cancer_emt_cells = (
    pd.read_csv(os.path.join(base_path_barcodes, 'barcodes_cancer_emt.csv'))['0']
    .rename('cancer_emt_cells')
)

In [None]:
# Barcodes of all fibroblasts followed by the barcodes of the malignant EMT cells.
fibroblast_barcodes = orig_adata.obs.index[orig_adata.obs.celltype == 'Fibroblast'].to_list()
barcodes_caf_emt_mes_cells = pd.Series(fibroblast_barcodes + barcodes_cancer_emt_cells.to_list())

In [None]:
# Start from the original annotation as plain strings so that new labels can be assigned freely.
orig_adata.obs['celltype_broad'] = orig_adata.obs['celltype'].astype(str)

In [None]:
# Boolean mask: True for cells that are neither fibroblasts nor malignant EMT cells.
cells_not_cafs_and_cancer_emt = ~orig_adata.obs.index.isin(barcodes_caf_emt_mes_cells)

In [None]:
# Use a single .loc assignment instead of chained indexing (obs[col][rows] = ...):
# chained assignment may write to a temporary object and is a no-op under pandas Copy-on-Write.
orig_adata.obs.loc[barcodes_cancer_emt_cells.tolist(), 'celltype_broad'] = 'Malignant with EMT'
# All cells that are neither CAF / malignant EMT cells nor annotated as 'Malignant' are collapsed into 'rest'.
non_malignant_rest = cells_not_cafs_and_cancer_emt & (orig_adata.obs['celltype'] != 'Malignant').to_numpy()
orig_adata.obs.loc[non_malignant_rest, 'celltype_broad'] = 'rest'
orig_adata.obs['celltype_broad'].value_counts().sort_index()

In [None]:
# String copy of the original annotation; the malignant labels are refined in the next step.
orig_adata.obs['celltype_broader'] = orig_adata.obs['celltype'].astype('str')

In [None]:
# Carry the two malignant labels of 'celltype_broad' over to 'celltype_broader'.
for malignant_label in ('Malignant with EMT', 'Malignant'):
    has_label = orig_adata.obs['celltype_broad'] == malignant_label
    orig_adata.obs.loc[has_label, 'celltype_broader'] = malignant_label
orig_adata.obs['celltype_broader'].value_counts().sort_index()

Prepare UMAPs

In [None]:
# Embedding pipeline with scanpy defaults: PCA -> neighbourhood graph -> UMAP.
sc.tl.pca(orig_adata)
sc.pp.neighbors(orig_adata)
sc.tl.umap(orig_adata)

### Score for ESOPHAG_CANCER_EMT signature 

In [None]:
# The ESCC signatures live in the sibling 'escc' folder, two levels above `storing_path`.
escc_signature_dir = os.path.join(os.path.dirname(os.path.dirname(storing_path)),
                                  'escc', 'dataset_specific_emt_sig')

# The gene symbols are stored in the second column of each file.
ESOPHAG_CANCER_EMT_SIGNATURE_1 = pd.read_csv(
    os.path.join(escc_signature_dir, 'ESOPHAG_CANCER_EMT_SIGNATURE_1.csv')
).iloc[:, 1].tolist()

ESOPHAG_CANCER_EMT_SIGNATURE_2 = pd.read_csv(
    os.path.join(escc_signature_dir, 'ESOPHAG_CANCER_EMT_SIGNATURE_2.csv')
).iloc[:, 1].tolist()
len(ESOPHAG_CANCER_EMT_SIGNATURE_1), len(ESOPHAG_CANCER_EMT_SIGNATURE_2)

In [None]:
# Read the hallmark EMT gene set and keep only its gene symbols.
hallmark_emt_file = os.path.join(base_path_emt_signatures, 'HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION.v7.5.1.json')
with open(hallmark_emt_file, 'r') as f:
    hemt = json.load(f)
hallmark_emt = hemt['HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION']['geneSymbols']

In [None]:
GM_B_22 = pd.read_csv(os.path.join(base_path_emt_signatures,'gene_modules_from_Barkley_et_al_2022.csv'))

In [None]:
# One gene list per module column; NaN entries are dropped.
mes_gm = GM_B_22['Mesenchymal'].dropna().tolist()

pEMT_gm = GM_B_22['pEMT'].dropna().tolist()

cEMT = GM_B_22['cEMT'].dropna().tolist()

# NOTE(review): 'pEMT.1' is presumably a second column named 'pEMT' in the CSV - confirm.
pEMT = GM_B_22['pEMT.1'].dropna().tolist()

In [None]:
df_mean_var = get_mean_and_variance_gene_expression(orig_adata, estim_var=False)

In [None]:
# Score name (column written to orig_adata.obs) -> genes of the signature.
reference_signatures = {
    'ESOPHAG_CANCER_EMT_SIGNATURE_1': ESOPHAG_CANCER_EMT_SIGNATURE_1,
    'ESOPHAG_CANCER_EMT_SIGNATURE_2': ESOPHAG_CANCER_EMT_SIGNATURE_2,
    'hallmark_emt': hallmark_emt,
    'pEMT_gm': pEMT_gm,
    'cEMT': cEMT,
    'mes_gm': mes_gm,
}
for sig_name, sig_genes in reference_signatures.items():
    # The precomputed gene statistics are passed to every scoring call.
    score_signature(
        method="adjusted_neighborhood_scoring",
        adata=orig_adata,
        gene_list=sig_genes,
        ctrl_size=100,
        df_mean_var=df_mean_var,
        score_name=sig_name,
    )

In [None]:
# Binary ground truth: 'Malignant with EMT' stays, every other label becomes 'Rest'.
gt = orig_adata.obs.celltype_broader.astype(str)
gt = gt.where(gt == 'Malignant with EMT', 'Rest')

In [None]:
orig_adata.obs.celltype_broader.value_counts().sort_index()

In [None]:
# Barcodes of the three cell subsets on which the scores are evaluated.
broader_labels = orig_adata.obs.celltype_broader
caf_and_cancer_emt = broader_labels.index[broader_labels.isin(['Malignant with EMT', 'Fibroblast'])].tolist()
cancer_and_cancer_emt = broader_labels.index[broader_labels.isin(['Malignant with EMT', 'Malignant'])].tolist()
# Everything except fibroblasts and non-EMT malignant cells.
cancer_emt_and_rest = broader_labels.index[~broader_labels.isin(['Fibroblast', 'Malignant'])].tolist()

In [None]:
len(caf_and_cancer_emt), len(cancer_and_cancer_emt), len(cancer_emt_and_rest)

In [None]:
plt.rcParams.update({'pdf.fonttype':42, 'font.family':'sans-serif', 'font.sans-serif':'Arial', 'font.size':14})
orig_adata.obs.ESOPHAG_CANCER_EMT_SIGNATURE_1.hist()
plt.close()

In [None]:
from sklearn.metrics import precision_recall_curve, auc

plt.rcParams.update({'pdf.fonttype':42, 'font.family':'sans-serif', 'font.sans-serif':'Arial', 'font.size':14})

# Title label -> barcodes of the subset on which 'Malignant with EMT' is the positive class.
aucpr_subsets = {
    'Rest': cancer_emt_and_rest,
    'CAFS': caf_and_cancer_emt,
    'Malignant': cancer_and_cancer_emt,
}

for score_name in ['ESOPHAG_CANCER_EMT_SIGNATURE_1','ESOPHAG_CANCER_EMT_SIGNATURE_2','hallmark_emt','pEMT_gm','cEMT','mes_gm']:
    plt.figure(figsize=(10,8))

    # One title line per subset with the area under the precision-recall curve.
    figure_title = score_name
    for subset_label, subset_cells in aucpr_subsets.items():
        precision, recall, _ = precision_recall_curve(
            gt[subset_cells],
            orig_adata.obs.loc[subset_cells, score_name],
            pos_label='Malignant with EMT',
        )
        figure_title += f'\nAUCPR Malignant with EMT vs. {subset_label} ' + str(np.round(auc(recall, precision), decimals=3))

    # Overlay the score histogram of every cell type.
    for celltype_label, celltype_obs in orig_adata.obs.groupby('celltype_broader'):
        celltype_obs[score_name].hist(bins=100, density=True, alpha=0.5, label=celltype_label)

    plt.title(figure_title, fontsize=16)
    plt.legend()
    plt.ylim([0,20])
    plt.tight_layout()
    if save:
        curr_path = os.path.join(storing_path, 'plots')
        os.makedirs(curr_path, exist_ok=True)
        plt.savefig(os.path.join(curr_path, f'dist_scores_{score_name}.png'), dpi=600)
    plt.show()

In [None]:
# Violin plots of all reference signature scores, grouped by cell type.
violin_keys = [
    'ESOPHAG_CANCER_EMT_SIGNATURE_1',
    'ESOPHAG_CANCER_EMT_SIGNATURE_2',
    'hallmark_emt',
    'pEMT_gm',
    'cEMT',
    'mes_gm',
]
sc.pl.violin(orig_adata, keys=violin_keys, groupby='celltype_broader', rotation=90, show=False)
if save:
    curr_path = os.path.join(storing_path, 'plots')
    os.makedirs(curr_path, exist_ok=True)
    plt.savefig(os.path.join(curr_path, 'escc_halmark_scores.svg'), dpi=600)
plt.show()

### Find the cancer EMT signature

In [None]:
cancer_emt_CAFS = orig_adata[orig_adata.obs.celltype_broader.isin(['Malignant with EMT','Fibroblast'])].copy()

In [None]:
cancer_emt_cancer = orig_adata[orig_adata.obs.celltype_broader.isin(['Malignant with EMT','Malignant'])].copy()

In [None]:
# Same Wilcoxon test (with tie correction) on both two-group subsets.
for two_group_adata in (cancer_emt_CAFS, cancer_emt_cancer):
    sc.tl.rank_genes_groups(two_group_adata, groupby='celltype_broader', method='wilcoxon', tie_correct=True)

In [None]:
# Genes ranked for 'Malignant with EMT' in each comparison. Note the different
# fold-change thresholds: log2fc_min=2 versus fibroblasts, log2fc_min=1 versus malignant cells.
cancer_emt_CAFS_dge = sc.get.rank_genes_groups_df(cancer_emt_CAFS, group='Malignant with EMT', log2fc_min=2, pval_cutoff=0.001)
cancer_emt_cancer_dge = sc.get.rank_genes_groups_df(cancer_emt_cancer, group='Malignant with EMT', log2fc_min=1, pval_cutoff=0.001)

In [None]:
from matplotlib_venn import venn3

In [None]:
# Overlap of the two DGE gene sets with the second ESCC signature.
venn_sets = {
    'cancer_emt_CAFS_dge': set(cancer_emt_CAFS_dge.names.tolist()),
    'cancer_emt_cancer_dge': set(cancer_emt_cancer_dge.names.tolist()),
    'ESOPHAG_CANCER_EMT_SIGNATURE_2': set(ESOPHAG_CANCER_EMT_SIGNATURE_2),
}
venn3(subsets=tuple(venn_sets.values()), set_labels=tuple(venn_sets.keys()))
if save:
    curr_path = os.path.join(storing_path, 'plots')
    os.makedirs(curr_path, exist_ok=True)
    plt.savefig(os.path.join(curr_path, 'venn_dgex_genes_mal_emt_vs_caf_mal_1.png'), dpi=600)

In [None]:
# Overlap of the two DGE gene sets with the pEMT gene module.
venn_sets = {
    'cancer_emt_CAFS_dge': set(cancer_emt_CAFS_dge.names.tolist()),
    'cancer_emt_cancer_dge': set(cancer_emt_cancer_dge.names.tolist()),
    'pEMT_gm': set(pEMT_gm),
}
venn3(subsets=tuple(venn_sets.values()), set_labels=tuple(venn_sets.keys()))
if save:
    curr_path = os.path.join(storing_path, 'plots')
    os.makedirs(curr_path, exist_ok=True)
    plt.savefig(os.path.join(curr_path, 'venn_dgex_genes_mal_emt_vs_caf_mal_2.png'), dpi=600)

In [None]:
# Genes ranked for 'Malignant with EMT' in BOTH comparisons (vs. fibroblasts and vs. malignant cells).
genes_dge_cancer_emt_vs_cafs_and_cancer = set(cancer_emt_CAFS_dge.names) & set(cancer_emt_cancer_dge.names)

score_signature(
    method="adjusted_neighborhood_scoring",
    adata=orig_adata,
    gene_list=genes_dge_cancer_emt_vs_cafs_and_cancer,
    ctrl_size=100,
    score_name='genes_dge_cancer_emt_vs_cafs_and_cancer',
)


In [None]:
# Compare the new DGE-based score with the ESCC signature and the pEMT module.
violin_keys = ['ESOPHAG_CANCER_EMT_SIGNATURE_2', 'pEMT_gm', 'genes_dge_cancer_emt_vs_cafs_and_cancer']
sc.pl.violin(orig_adata, keys=violin_keys, groupby='celltype_broader', rotation=90, show=False)
if save:
    curr_path = os.path.join(storing_path, 'plots')
    os.makedirs(curr_path, exist_ok=True)
    plt.savefig(os.path.join(curr_path, 'escc_pemt_scores_gdex_genes_a1.svg'), dpi=600)
plt.show()

In [None]:
# Rank genes of every cell type against the reference group 'Malignant with EMT' (Wilcoxon, tie-corrected).
sc.tl.rank_genes_groups(orig_adata, groupby='celltype_broader', reference='Malignant with EMT', method='wilcoxon', tie_correct=True)

In [None]:
genes_dge_cancer_emt_vs_cafs_and_cancer_refined = genes_dge_cancer_emt_vs_cafs_and_cancer.copy()

In [None]:
# Remove from the refined signature every gene that is ranked for another cell type
# (log2fc_min=1, pval_cutoff=0.001) in the comparison stored on orig_adata.
for celltype_label, _ in orig_adata.obs.groupby(by='celltype_broader'):
    if celltype_label != 'Malignant with EMT':
        genes_ranked_for_celltype = sc.get.rank_genes_groups_df(
            orig_adata, group=celltype_label, log2fc_min=1, pval_cutoff=0.001
        )
        genes_dge_cancer_emt_vs_cafs_and_cancer_refined.difference_update(genes_ranked_for_celltype.names.tolist())

In [None]:
len(genes_dge_cancer_emt_vs_cafs_and_cancer_refined)

In [None]:
# Score all cells for the refined gene set; the result is named after the gene set.
score_signature(
    method="adjusted_neighborhood_scoring",
    adata=orig_adata,
    gene_list=genes_dge_cancer_emt_vs_cafs_and_cancer_refined,
    ctrl_size=100,
    score_name='genes_dge_cancer_emt_vs_cafs_and_cancer_refined',
)

In [None]:
# Same comparison as before, now including the refined DGE-based score.
violin_keys = [
    'ESOPHAG_CANCER_EMT_SIGNATURE_2',
    'pEMT_gm',
    'genes_dge_cancer_emt_vs_cafs_and_cancer',
    'genes_dge_cancer_emt_vs_cafs_and_cancer_refined',
]
sc.pl.violin(orig_adata, keys=violin_keys, groupby='celltype_broader', rotation=90, show=False)
if save:
    curr_path = os.path.join(storing_path, 'plots')
    os.makedirs(curr_path, exist_ok=True)
    plt.savefig(os.path.join(curr_path, 'escc_pemt_scores_gdex_genes_a2.svg'), dpi=600)
plt.show()

In [None]:
from sklearn.metrics import precision_recall_curve, auc

# Title label -> barcodes of the subset on which 'Malignant with EMT' is the positive class.
aucpr_subsets = {
    'Rest': cancer_emt_and_rest,
    'CAFS': caf_and_cancer_emt,
    'Malignant': cancer_and_cancer_emt,
}

for score_name in ['ESOPHAG_CANCER_EMT_SIGNATURE_1','ESOPHAG_CANCER_EMT_SIGNATURE_2','pEMT_gm','genes_dge_cancer_emt_vs_cafs_and_cancer', 'genes_dge_cancer_emt_vs_cafs_and_cancer_refined']:
    plt.figure(figsize=(10,8))

    # One title line per subset with the area under the precision-recall curve.
    figure_title = score_name
    for subset_label, subset_cells in aucpr_subsets.items():
        precision, recall, _ = precision_recall_curve(
            gt[subset_cells],
            orig_adata.obs.loc[subset_cells, score_name],
            pos_label='Malignant with EMT',
        )
        figure_title += f'\nAUCPR Malignant with EMT vs. {subset_label} ' + str(np.round(auc(recall, precision), decimals=3))

    # Overlay the score histogram of every cell type.
    for celltype_label, celltype_obs in orig_adata.obs.groupby('celltype_broader'):
        celltype_obs[score_name].hist(bins=100, density=True, alpha=0.5, label=celltype_label)

    plt.title(figure_title, fontsize=16)
    plt.legend()
    plt.ylim([0,20])
    plt.tight_layout()
    if save:
        curr_path = os.path.join(storing_path, 'plots')
        os.makedirs(curr_path, exist_ok=True)
        plt.savefig(os.path.join(curr_path, f'dist_scores_{score_name}.png'), dpi=600)
    plt.show()

#### Refine the DGEX gene set further

In [None]:
cells_epithelial_malignant = orig_adata[orig_adata.obs.celltype_broader.isin(['Epithelial','Malignant', 'Malignant with EMT'])].copy()

In [None]:
# Keep only cells whose refined signature score is at least 0.05.
# .copy() turns the subset into an independent AnnData: the following rank_genes_groups call
# stores its results on the object, which should not happen on a view of the previous object.
cells_epithelial_malignant = cells_epithelial_malignant[cells_epithelial_malignant.obs.genes_dge_cancer_emt_vs_cafs_and_cancer_refined>=0.05].copy()

In [None]:
cells_epithelial_malignant.obs.celltype_broader.value_counts()

In [None]:
sc.tl.rank_genes_groups(cells_epithelial_malignant, groupby='celltype_broader', method='wilcoxon', tie_correct=True)

In [None]:
dge_genes_mal_emt_vs_epithelial_and_mal_epithelial = sc.get.rank_genes_groups_df(cells_epithelial_malignant, group='Malignant with EMT', log2fc_min=2, pval_cutoff=0.001)

In [None]:
# Overlap of the epithelial/malignant DGE genes with the refined signature.
venn_sets = {
    'dge_genes_mal_emt_vs_epithelial_and_mal_epithelial': set(dge_genes_mal_emt_vs_epithelial_and_mal_epithelial.names.tolist()),
    'genes_dge_cancer_emt_vs_cafs_and_cancer_refined': set(genes_dge_cancer_emt_vs_cafs_and_cancer_refined),
}
venn2(subsets=tuple(venn_sets.values()), set_labels=tuple(venn_sets.keys()))

In [None]:
# Candidate genes: found in the epithelial/malignant comparison but not yet part of the refined signature.
genes_to_potentially_add = set(dge_genes_mal_emt_vs_epithelial_and_mal_epithelial.names) - set(genes_dge_cancer_emt_vs_cafs_and_cancer_refined)

In [None]:
# Convert the expression matrix to CSC format.
# NOTE(review): presumably done to speed up the per-gene (column) slicing below - confirm.
orig_adata.X = orig_adata.X.tocsc()

In [None]:
def get_scores_for_all_sig_genes(adata, sig_genes,ctrl_size = 100):
    """Score every signature gene individually against its own control genes.

    Genes are sorted by their mean expression. For each signature gene, the window of
    ``ctrl_size`` consecutive non-signature genes whose rolling mean is closest to the
    mean of the signature gene is used as control set. The per-cell score of a gene is
    its expression minus the mean expression of its control genes, min-max scaled to
    [0, 1] over all cells (NaN if the unscaled score is constant).

    Parameters
    ----------
    adata
        Annotated data matrix; ``adata.obs`` must contain the columns ``celltype``,
        ``celltype_broad`` and ``celltype_broader``. ``adata.X`` may be sparse or dense.
    sig_genes
        Signature genes; they are filtered with ``check_signature_genes``.
    ctrl_size
        Number of control genes per signature gene.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``adata.obs_names`` with one ``<gene>_score`` column per gene
        (alphabetically sorted), ``final_score_mean`` and ``final_score_median``
        (both computed over the gene score columns only) and the three cell-type columns.

    Raises
    ------
    ValueError
        If no signature gene remains after ``check_signature_genes``.
    """
    gene_list = check_signature_genes(adata.var_names, sig_genes)
    if len(gene_list) == 0:
        raise ValueError('None of the signature genes can be scored on the given adata.')

    df_mean_var = get_mean_and_variance_gene_expression(adata,
                                                    estim_var=False)
    gene_means = df_mean_var['mean'].copy()

    # computation of neighboring genes around each signature gene
    sorted_gene_means = gene_means.sort_values()
    ref_genes_means = sorted_gene_means[~sorted_gene_means.index.isin(gene_list)]

    # use sliding window to compute for each window the mean
    rolled = ref_genes_means.rolling(ctrl_size, closed='right').mean()

    control_genes = []
    for sig_gene in gene_list:
        curr_sig_avg = sorted_gene_means.loc[sig_gene]
        # position of the window (identified by its last gene) with the closest average expression
        min_val_idx = np.argmin(((rolled - curr_sig_avg).abs()))
        sig_gene_ctrl_genes = rolled.iloc[(min_val_idx - ctrl_size + 1):min_val_idx + 1]
        control_genes.append(list(sig_gene_ctrl_genes.index))

    list_scores_per_sig_genes = []
    for sig_gene, ctrl_genes in zip(gene_list,control_genes):
        gene_expr = adata[:,sig_gene].X
        if hasattr(gene_expr, 'toarray'):
            gene_expr = gene_expr.toarray()
        # Flatten both operands to 1-D so that the subtraction is element-wise for
        # sparse as well as dense X (a dense (n, 1) - (n,) would broadcast to (n, n)).
        gene_expr = np.asarray(gene_expr).reshape(-1)
        ctrl_mean = np.asarray(adata[:,ctrl_genes].X.mean(axis=1)).reshape(-1)

        curr_score = gene_expr - ctrl_mean
        curr_score = (curr_score-curr_score.min())/(curr_score.max()-curr_score.min())
        list_scores_per_sig_genes.append(pd.DataFrame
                                         (curr_score,
                                          index=adata.obs_names,
                                          columns=[sig_gene+'_score']))
    df_signature_scores = pd.concat(list_scores_per_sig_genes, axis=1)

    score_columns = sorted(df_signature_scores.columns.tolist())
    df_signature_scores=df_signature_scores.reindex(columns=score_columns)
    # Aggregate over the gene score columns only; previously the median was taken after
    # 'final_score_mean' had been appended and therefore included the mean as an extra value.
    df_signature_scores['final_score_mean'] = df_signature_scores[score_columns].mean(axis=1)
    df_signature_scores['final_score_median'] = df_signature_scores[score_columns].median(axis=1)
    df_signature_scores['celltype'] = adata.obs['celltype']
    df_signature_scores['celltype_broad'] = adata.obs['celltype_broad']
    df_signature_scores['celltype_broader'] = adata.obs['celltype_broader']
    return df_signature_scores

In [None]:
from pandas.api.types import is_numeric_dtype

def plot_heatmap_with_celltype_anno(df, label_col, palette="tab10", bbox_to_anchor=(0.16,0.79),cat_title = 'celltypes',apply_tanh=False):
    """Plot a heatmap of all numeric columns of ``df`` with a colour annotation per row.

    Rows are sorted by ``label_col`` and then by ``final_score_mean``; neither rows nor
    columns are clustered.

    Parameters
    ----------
    df
        Data frame containing ``label_col``, ``final_score_mean`` and the numeric
        columns to plot.
    label_col
        Column whose values (converted to strings) define the row colours and the legend.
    palette
        Seaborn palette; one colour per category is drawn from it.
    bbox_to_anchor
        Anchor of the category legend in figure coordinates.
    cat_title
        Title of the category legend.
    apply_tanh
        If True, ``np.tanh`` is applied to the values before plotting.

    Raises
    ------
    KeyError
        If ``label_col`` is not a column of ``df``.
    """
    if label_col not in df:
        raise KeyError(f'labelcol={label_col} is not a column of df')

    # Work on string labels throughout so that colour lookup, counts and legend text agree.
    labels = df[label_col].astype(str)

    # Request exactly one colour per category; the default palette length would silently
    # drop categories beyond it and make the legend lookup fail.
    categories_in_order = labels.unique()
    lut = dict(zip(categories_in_order, sns.color_palette(palette, n_colors=len(categories_in_order))))

    row_colors = labels.map(lut)

    counts = dict(labels.value_counts())

    handles = [Patch(color=lut[category],label=category+' (%i)'%counts[category]) for category in sorted(counts)]

    tmp = df.sort_values(by=[label_col,'final_score_mean'])
    tmp = tmp[[x for x in tmp.columns if (x!= label_col) and  (is_numeric_dtype(tmp[x]))]]
    if apply_tanh:
        tmp = np.tanh(tmp)
    g = sns.clustermap(tmp,
                   row_colors=row_colors,
                   row_cluster=False,
                   col_cluster=False,
                   figsize=(50,30),
                   cmap="viridis",
                   cbar_pos=(0.1, .1, .03, .6))
    # The legend now honours the `cat_title` and `bbox_to_anchor` arguments (their defaults
    # equal the values that were hard-coded before).
    g.fig.legend(handles=handles,title=cat_title,bbox_to_anchor=bbox_to_anchor,loc='center right',bbox_transform=g.fig.transFigure,borderaxespad=0.,fontsize=18,title_fontsize=20,ncol=1)
    # NOTE(review): the title always names the hallmark_emt signature, independent of the data passed in.
    g.ax_heatmap.set_title('Scored for each gene in hallmark_emt signature.', fontsize=22)

In [None]:
genes_to_potentially_add_scores = get_scores_for_all_sig_genes(orig_adata, list(genes_to_potentially_add))

In [None]:
genes = []
pvals_cancer_emt_caf = []
pvals_cancer_emt_cancer = []
pvals_cancer_emt_rest = []
pvals_cancer_emt_epi = []
pvals_caf_rest = []

# Cell-type masks do not depend on the gene, so they are built once.
celltype_of_cell = genes_to_potentially_add_scores['celltype_broader']
is_caf = celltype_of_cell == 'Fibroblast'
is_mal_emt = celltype_of_cell == 'Malignant with EMT'
is_mal_non_emt = celltype_of_cell == 'Malignant'
is_epi = celltype_of_cell == 'Epithelial'
is_rest = ~celltype_of_cell.isin(['Epithelial','Malignant', 'Malignant with EMT', 'Fibroblast'])

for col in tqdm(genes_to_potentially_add_scores.columns):
    # Only per-gene score columns are tested: skip the aggregated 'final_*' and non-numeric columns.
    if ('final' in col) or not is_numeric_dtype(genes_to_potentially_add_scores[col]):
        continue

    gene_scores = genes_to_potentially_add_scores[col]
    mal_emt_scores = gene_scores[is_mal_emt]

    genes.append(col)
    # One-sided tests: are the scores in 'Malignant with EMT' cells greater than in the other group?
    pvals_cancer_emt_caf.append(mannwhitneyu(mal_emt_scores, gene_scores[is_caf], alternative= 'greater').pvalue)
    pvals_cancer_emt_cancer.append(mannwhitneyu(mal_emt_scores, gene_scores[is_mal_non_emt], alternative= 'greater').pvalue)
    pvals_cancer_emt_epi.append(mannwhitneyu(mal_emt_scores, gene_scores[is_epi], alternative= 'greater').pvalue)
    pvals_cancer_emt_rest.append(mannwhitneyu(mal_emt_scores, gene_scores[is_rest], alternative= 'greater').pvalue)

In [None]:
# Multiple-testing correction ('fdr_bh') of the EMT-vs-CAF p-values; element [0] flags the rejected hypotheses.
rejected_vs_cafs = multipletests(pvals_cancer_emt_caf, alpha=1e-5, method='fdr_bh')[0].tolist()
# Strip only the trailing '_score' suffix: splitting at the first '_' would truncate
# gene symbols that contain an underscore themselves.
gene_cancer_emt_sig_larger_cafs = [col[:-len('_score')] for col, is_rejected in zip(genes, rejected_vs_cafs) if is_rejected]
gene_cancer_emt_sig_larger_cafs

In [None]:
# Multiple-testing correction ('fdr_bh') of the EMT-vs-malignant p-values; element [0] flags the rejected hypotheses.
rejected_vs_cancer = multipletests(pvals_cancer_emt_cancer, alpha=1e-5, method='fdr_bh')[0].tolist()
# Strip only the trailing '_score' suffix: splitting at the first '_' would truncate
# gene symbols that contain an underscore themselves.
gene_cancer_emt_sig_larger_cancer = [col[:-len('_score')] for col, is_rejected in zip(genes, rejected_vs_cancer) if is_rejected]
gene_cancer_emt_sig_larger_cancer

In [None]:
# Multiple-testing correction ('fdr_bh') of the EMT-vs-epithelial p-values; element [0] flags the rejected hypotheses.
rejected_vs_epi = multipletests(pvals_cancer_emt_epi, alpha=1e-5, method='fdr_bh')[0].tolist()
# Strip only the trailing '_score' suffix: splitting at the first '_' would truncate
# gene symbols that contain an underscore themselves.
gene_cancer_emt_sig_larger_epi = [col[:-len('_score')] for col, is_rejected in zip(genes, rejected_vs_epi) if is_rejected]
gene_cancer_emt_sig_larger_epi

In [None]:
# Multiple-testing correction ('fdr_bh') of the EMT-vs-rest p-values; element [0] flags the rejected hypotheses.
rejected_vs_rest = multipletests(pvals_cancer_emt_rest, alpha=1e-5, method='fdr_bh')[0].tolist()
# Strip only the trailing '_score' suffix: splitting at the first '_' would truncate
# gene symbols that contain an underscore themselves.
gene_cancer_emt_sig_larger_rest = [col[:-len('_score')] for col, is_rejected in zip(genes, rejected_vs_rest) if is_rejected]
gene_cancer_emt_sig_larger_rest

In [None]:
# Overlap of the candidates that are significant versus CAFs and versus malignant cells.
venn_sets = {
    'gene_cancer_emt_sig_larger_cafs': set(gene_cancer_emt_sig_larger_cafs),
    'gene_cancer_emt_sig_larger_cancer': set(gene_cancer_emt_sig_larger_cancer),
}
venn2(subsets=tuple(venn_sets.values()), set_labels=tuple(venn_sets.keys()))
if save:
    curr_path = os.path.join(storing_path, 'plots')
    os.makedirs(curr_path, exist_ok=True)
    plt.savefig(os.path.join(curr_path, 'venn_dgex_genes_refinement_1.png'), dpi=600)

In [None]:
# Same overlap, extended by the candidates that are significant versus epithelial cells.
venn_sets = {
    'gene_cancer_emt_sig_larger_cafs': set(gene_cancer_emt_sig_larger_cafs),
    'gene_cancer_emt_sig_larger_cancer': set(gene_cancer_emt_sig_larger_cancer),
    'gene_cancer_emt_sig_larger_epi': set(gene_cancer_emt_sig_larger_epi),
}
venn3(subsets=tuple(venn_sets.values()), set_labels=tuple(venn_sets.keys()))
if save:
    curr_path = os.path.join(storing_path, 'plots')
    os.makedirs(curr_path, exist_ok=True)
    plt.savefig(os.path.join(curr_path, 'venn_dgex_genes_refinement_2.png'), dpi=600)

In [None]:
# Second signature: the refined signature extended by all candidate genes that are
# significant versus CAFs AND versus non-EMT malignant cells.
# (A stricter alternative would additionally intersect with gene_cancer_emt_sig_larger_rest
# and gene_cancer_emt_sig_larger_epi before taking the union.)
confirmed_candidates = set(gene_cancer_emt_sig_larger_cafs) & set(gene_cancer_emt_sig_larger_cancer)
genes_dge_cancer_emt_vs_cafs_and_cancer_refined_2 = confirmed_candidates | set(genes_dge_cancer_emt_vs_cafs_and_cancer_refined)

In [None]:
# Score all cells for the extended gene set; the result is named after the gene set.
score_signature(
    method="adjusted_neighborhood_scoring",
    adata=orig_adata,
    gene_list=genes_dge_cancer_emt_vs_cafs_and_cancer_refined_2,
    ctrl_size=100,
    score_name='genes_dge_cancer_emt_vs_cafs_and_cancer_refined_2',
)

In [None]:
from sklearn.metrics import precision_recall_curve, auc

# Title label -> barcodes of the subset on which 'Malignant with EMT' is the positive class.
aucpr_subsets = {
    'Rest': cancer_emt_and_rest,
    'CAFS': caf_and_cancer_emt,
    'Malignant': cancer_and_cancer_emt,
}

for score_name in ['ESOPHAG_CANCER_EMT_SIGNATURE_1','ESOPHAG_CANCER_EMT_SIGNATURE_2','pEMT_gm','genes_dge_cancer_emt_vs_cafs_and_cancer_refined','genes_dge_cancer_emt_vs_cafs_and_cancer_refined_2']:
    plt.figure(figsize=(10,8))

    # One title line per subset with the area under the precision-recall curve.
    figure_title = score_name
    for subset_label, subset_cells in aucpr_subsets.items():
        precision, recall, _ = precision_recall_curve(
            gt[subset_cells],
            orig_adata.obs.loc[subset_cells, score_name],
            pos_label='Malignant with EMT',
        )
        figure_title += f'\nAUCPR Malignant with EMT vs. {subset_label} ' + str(np.round(auc(recall, precision), decimals=3))

    # Overlay the score histogram of every cell type.
    for celltype_label, celltype_obs in orig_adata.obs.groupby('celltype_broader'):
        celltype_obs[score_name].hist(bins=100, density=True, alpha=0.5, label=celltype_label)

    plt.title(figure_title, fontsize=16)
    plt.legend()
    plt.ylim([0,20])
    plt.tight_layout()
    if save:
        curr_path = os.path.join(storing_path, 'plots')
        os.makedirs(curr_path, exist_ok=True)
        plt.savefig(os.path.join(curr_path, f'dist_scores_{score_name}.png'), dpi=600)
    plt.show()

In [None]:
# UMAP panels: one per signature score plus the cell-type annotation.
umap_color_keys = [
    'ESOPHAG_CANCER_EMT_SIGNATURE_1',
    'ESOPHAG_CANCER_EMT_SIGNATURE_2',
    'hallmark_emt',
    'pEMT_gm',
    'genes_dge_cancer_emt_vs_cafs_and_cancer_refined',
    'genes_dge_cancer_emt_vs_cafs_and_cancer_refined_2',
    'celltype_broader',
]
with rc_context({'figure.figsize': (8,8)}):
    umap_celltypes = sc.pl.umap(
        orig_adata,
        color=umap_color_keys,
        ncols=3,
        return_fig=True,
        color_map='viridis',
    )
    if save:
        curr_path = os.path.join(storing_path, 'plots')
        os.makedirs(curr_path, exist_ok=True)
        umap_celltypes.savefig(os.path.join(curr_path, 'umap_celltypes.png'), dpi=600)

## Store the cancer-specific signatures

In [None]:
len(genes_dge_cancer_emt_vs_cafs_and_cancer_refined),len(genes_dge_cancer_emt_vs_cafs_and_cancer_refined_2)

In [None]:
if save:
    # Sort the gene sets before writing: set iteration order is not stable between runs,
    # so the unsorted files would differ in row order on every execution.
    pd.Series(sorted(genes_dge_cancer_emt_vs_cafs_and_cancer_refined)).to_csv(os.path.join(storing_path, 'LUNG_CANCER_EMT_SIGNATURE_1.csv'))
    pd.Series(sorted(genes_dge_cancer_emt_vs_cafs_and_cancer_refined_2)).to_csv(os.path.join(storing_path, 'LUNG_CANCER_EMT_SIGNATURE_2.csv'))