## Union LUAD-specific cancer EMT signature and ESCC-specific cancer EMT signature
Combine the two cancer-type-specific cancer EMT signatures and refine the union again on ESCC.
At the end, add all genes found in LUAD but not available in ESCC to the signature. We do this because those genes might be important for EMT in other cancer datasets.

In [None]:
from IPython.display import display, HTML
# Inject CSS that widens the notebook's `.container` element to 95% width.
display(HTML("<style>.container { width:95% !important; }</style>"))

In [None]:
import os
import sys
import pandas as pd
import numpy as np
import scanpy as sc
import json 
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
import seaborn as sns
from tqdm import tqdm
from statsmodels.stats.multitest import multipletests
from scipy.stats import mannwhitneyu
from matplotlib_venn import venn3, venn2

sys.path.append('../../..')
from data.load_data import load_datasets
from data.constants import BASE_PATH_DATA, BASE_PATH_EXPERIMENTS

from signaturescoring import score_signature
from signaturescoring.utils.utils import get_mean_and_variance_gene_expression, check_signature_genes

In [None]:
# Root folder of the EMT case-study experiments and folder with the EMT gene-set annotations.
base_path_emt_exp = os.path.join(BASE_PATH_EXPERIMENTS,'EMT_signature_scoring_case_study')
base_path_emt_signatures = os.path.join(BASE_PATH_DATA, 'annotations/emt')

# Output folder for everything this notebook writes (plots and signature CSVs).
storing_path = os.path.join(base_path_emt_exp, 'escc', 'union_emt_sigs')

# Create the output folder on first use and log that it was created.
if not os.path.exists(storing_path):
    os.makedirs(storing_path)
    sc.logging.info(f'Creating new storing folder at {storing_path}')

# Global switch: the plotting and storing cells only write to disk when this is True.
save = True

Get barcodes for ESCC cancer EMT cells and preprocessed ESCC data.

In [None]:
# Load the preprocessed ESCC dataset (norm_method='mean').
escc_data = load_datasets('escc', preprocessed=True, norm_method='mean')
# Ensure uns['log1p']['base'] exists and is None: an existing 'log1p' entry only
# gets its 'base' overwritten, otherwise a fresh entry is created.
# NOTE(review): presumably a workaround for scanpy functions that read
# uns['log1p']['base'] -- confirm.
if 'log1p' in escc_data.uns_keys():
    escc_data.uns['log1p']['base'] = None
else:
    escc_data.uns['log1p'] = {'base': None}

In [None]:
# Barcodes of the ESCC cancer cells annotated as expressing EMT; they are stored
# in the CSV column named '0'.
barcodes_cancer_emt = pd.read_csv(
    os.path.join(base_path_emt_exp, 'escc', 'barcodes_cancer_emt.csv')
)['0'].tolist()

Define cell type annotation to distinguish cancer cells expressing EMT and cancer cells not expressing EMT

In [None]:
# Start the refined annotation as a plain-string copy of the original cell types,
# so that new labels can be written into it in the next cell.
escc_data.obs['celltype_broader'] = escc_data.obs['celltype'].astype(str)

In [None]:
# Cells listed in barcodes_cancer_emt become 'Epi with Mes' ...
escc_data.obs.loc[barcodes_cancer_emt, 'celltype_broader'] = 'Epi with Mes'
# ... and every remaining 'Epi' cell becomes 'Epi wo Mes'.
escc_data.obs.loc[
    (escc_data.obs['celltype'] == 'Epi') & ~escc_data.obs.index.isin(barcodes_cancer_emt),
    'celltype_broader',
] = 'Epi wo Mes'

In [None]:
# Convert the refined annotation to a categorical column (it is used as the
# `groupby` key in the plots below).
escc_data.obs['celltype_broader'] = escc_data.obs['celltype_broader'].astype('category')

Get ESCC- and LUAD-specific cancer EMT signatures.

In [None]:
# ESCC-specific cancer EMT signatures. The gene names are taken from the second
# CSV column (the first presumably holds the index written on export).
ESOPHAG_CANCER_EMT_SIGNATURE_1 = pd.read_csv(
    os.path.join(base_path_emt_exp, 'escc', 'dataset_specific_emt_sig', 'ESOPHAG_CANCER_EMT_SIGNATURE_1.csv')
).iloc[:, 1].tolist()
ESOPHAG_CANCER_EMT_SIGNATURE_2 = pd.read_csv(
    os.path.join(base_path_emt_exp, 'escc', 'dataset_specific_emt_sig', 'ESOPHAG_CANCER_EMT_SIGNATURE_2.csv')
).iloc[:, 1].tolist()

In [None]:
# LUAD-specific cancer EMT signatures. The gene names are taken from the second
# CSV column (the first presumably holds the index written on export).
LUNG_CANCER_EMT_SIGNATURE_1 = pd.read_csv(
    os.path.join(base_path_emt_exp, 'luad_xing', 'dataset_specific_emt_sig', 'LUNG_CANCER_EMT_SIGNATURE_1.csv')
).iloc[:, 1].tolist()

LUNG_CANCER_EMT_SIGNATURE_2 = pd.read_csv(
    os.path.join(base_path_emt_exp, 'luad_xing', 'dataset_specific_emt_sig', 'LUNG_CANCER_EMT_SIGNATURE_2.csv')
).iloc[:, 1].tolist()

In [None]:
# Load the hallmark EMT gene set from JSON; its gene symbols are read further
# down via hemt['HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION']['geneSymbols'].
with open(os.path.join(base_path_emt_signatures,'HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION.v7.5.1.json'), 'r') as f:
    hemt = json.load(f)

In [None]:
# Table of gene modules (file name refers to Barkley et al. 2022); each column
# holds the genes of one module.
GM_B_22 = pd.read_csv(os.path.join(base_path_emt_signatures,'gene_modules_from_Barkley_et_al_2022.csv'))

In [None]:
# Extract individual gene modules as plain lists; the columns have different
# lengths, so the NaN padding is dropped first.
mes_gm = GM_B_22['Mesenchymal'].dropna().tolist()

pEMT_gm = GM_B_22['pEMT'].dropna().tolist()

cEMT = GM_B_22['cEMT'].dropna().tolist()

# 'pEMT.1' is a second column of the table, distinct from 'pEMT'.
pEMT = GM_B_22['pEMT.1'].dropna().tolist()

In [None]:
# Plot styling used throughout the notebook (Arial, font size 14, pdf.fonttype 42).
plt.rcParams.update({'pdf.fonttype':42, 'font.family':'sans-serif', 'font.sans-serif':'Arial', 'font.size':14})

# Per-gene expression statistics of the ESCC data, computed once here and passed
# to every score_signature call below as `df_mean_var`.
escc_df_mean_var = get_mean_and_variance_gene_expression(escc_data, estim_var=False,show_plots=True)

In [None]:
# (score name, gene list) pairs to score on the ESCC data. The score name is
# later used as the column name in escc_data.obs.
sig_list=[
    ('ESOPHAG_CANCER_EMT_SIGNATURE_1', ESOPHAG_CANCER_EMT_SIGNATURE_1),
    ('ESOPHAG_CANCER_EMT_SIGNATURE_2', ESOPHAG_CANCER_EMT_SIGNATURE_2),
    ('LUNG_CANCER_EMT_SIGNATURE_1', LUNG_CANCER_EMT_SIGNATURE_1),
    ('LUNG_CANCER_EMT_SIGNATURE_2', LUNG_CANCER_EMT_SIGNATURE_2),
    ('pEMT_gm',pEMT_gm),
    ('hallmark_emt',hemt['HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION']['geneSymbols'])
]


In [None]:
# Score every signature on the ESCC data; the results are read back from
# escc_data.obs[<score name>] in the evaluation cells below.
for sig_name, sig_genes in sig_list:
    score_signature(
        method="adjusted_neighborhood_scoring",
        adata=escc_data,
        gene_list=sig_genes,
        ctrl_size=100,
        df_mean_var=escc_df_mean_var,
        score_name=sig_name,
    )

In [None]:
from sklearn.metrics import precision_recall_curve, auc

In [None]:
# Ground-truth labels for the precision-recall evaluation, as plain strings so
# that they can be relabelled freely.
gt = escc_data.obs['celltype_broader'].astype(str)

In [None]:
# Binarise the ground truth: 'Epi with Mes' is the positive class, everything else 'Rest'.
gt[gt!='Epi with Mes']= 'Rest'

In [None]:
# Barcode subsets for the restricted evaluations:
# EMT cancer cells together with fibroblasts ...
caf_and_cancer_emt = escc_data.obs.index[
    escc_data.obs['celltype_broader'].isin(['Epi with Mes', 'Fibroblasts'])
].tolist()
# ... and EMT cancer cells together with non-EMT cancer cells.
cancer_and_cancer_emt = escc_data.obs.index[
    escc_data.obs['celltype_broader'].isin(['Epi with Mes', 'Epi wo Mes'])
].tolist()

In [None]:
# Names of all scored signatures, in the order of sig_list.
sig_names = [name for name, _genes in sig_list]

In [None]:
plt.rcParams.update({'pdf.fonttype':42, 'font.family':'sans-serif', 'font.sans-serif':'Arial', 'font.size':14})

# For every signature: compute the area under the precision-recall curve for
# 'Epi with Mes' against three reference groups and plot the per-cell-type
# score histograms with the three AUCPR values in the title.
pos_lbl = 'Epi with Mes'
for score_name in sig_names:
    plt.figure(figsize=(10, 8))
    scores = escc_data.obs[score_name]

    # Positive class vs. all remaining cells.
    prec, rec, _ = precision_recall_curve(gt, scores, pos_label=pos_lbl)
    lr_auc = auc(rec, prec)

    # Positive class vs. fibroblasts only.
    prec, rec, _ = precision_recall_curve(gt[caf_and_cancer_emt], scores.loc[caf_and_cancer_emt], pos_label=pos_lbl)
    lr_auc_caf_and_emt = auc(rec, prec)

    # Positive class vs. cancer cells without EMT only.
    prec, rec, _ = precision_recall_curve(gt[cancer_and_cancer_emt], scores.loc[cancer_and_cancer_emt], pos_label=pos_lbl)
    lr_auc_cancer_and_emt = auc(rec, prec)

    # One semi-transparent histogram per cell type.
    for celltype, celltype_obs in escc_data.obs.groupby('celltype_broader'):
        celltype_obs[score_name].hist(bins=100, density=True, alpha=0.5, label=celltype)

    plt.title(
        ''.join([
            score_name,
            '\nAUCPR Malignant with EMT vs. Rest ', str(np.round(lr_auc, decimals=3)),
            '\nAUCPR Malignant with EMT vs. CAFS ', str(np.round(lr_auc_caf_and_emt, decimals=3)),
            '\nAUCPR Malignant with EMT vs. Malignant ', str(np.round(lr_auc_cancer_and_emt, decimals=3)),
        ]),
        fontsize=16,
    )
    plt.legend()
    plt.ylim([0, 20])
    plt.tight_layout()
    if save:
        curr_path = os.path.join(storing_path, 'plots')
        if not os.path.exists(curr_path):
            os.makedirs(curr_path)
        plt.savefig(os.path.join(curr_path, f'dist_scores_{score_name}.png'), dpi=600)
    plt.show()

In [None]:
plt.rcParams.update({'pdf.fonttype':42, 'font.family':'sans-serif', 'font.sans-serif':'Arial', 'font.size':14})

# Violin plots of all signature scores grouped by the refined cell-type
# annotation. show=False keeps the figure open so it can be saved below.
sc.pl.violin(escc_data, groupby='celltype_broader',keys=sig_names, rotation=90, show=False)
if save:
    curr_path = os.path.join(storing_path, 'plots')
    os.makedirs(curr_path, exist_ok=True)
    # The extension determines the output format; the previous '.p' extension is
    # not a format matplotlib can write, so use '.png' like the other plots.
    plt.savefig(os.path.join(curr_path, 'ds_specific_score_distr.png'), dpi=600)
plt.show()

### HELPER FUNCTIONS

In [None]:
def get_scores_for_all_sig_genes(adata, sig_genes,ctrl_size = 100):
    """Score every signature gene individually against mean-matched control genes.

    For each signature gene, ``ctrl_size`` control genes with a similar average
    expression are picked from the non-signature genes. The per-cell score of
    the gene is its expression minus the mean expression of its control genes,
    min-max scaled over all cells.

    Parameters
    ----------
    adata
        Annotated data object. Must provide ``var_names``, ``obs_names``, ``X``
        and the ``obs`` columns ``'celltype'`` and ``'celltype_broader'``.
    sig_genes
        Candidate signature genes; passed through ``check_signature_genes``
        together with ``adata.var_names``.
    ctrl_size
        Number of control genes per signature gene.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``adata.obs_names``. Contains one ``'<gene>_score'`` column
        per signature gene (sorted by column name), ``'final_score_mean'`` and
        ``'final_score_median'`` (row-wise aggregates over the gene score
        columns only), and the ``'celltype'`` / ``'celltype_broader'``
        annotations copied from ``adata.obs``.
    """

    def _as_1d(values):
        # Flatten sparse matrices, np.matrix objects and (n, 1) arrays to 1-D,
        # so that the subtraction below works for sparse and dense X alike.
        if hasattr(values, 'toarray'):
            values = values.toarray()
        return np.asarray(values).ravel()

    gene_list = check_signature_genes(adata.var_names, sig_genes)
    df_mean_var = get_mean_and_variance_gene_expression(adata, estim_var=False)

    # Genes ordered by average expression; signature genes are excluded from
    # the pool of potential control genes.
    sorted_gene_means = df_mean_var['mean'].sort_values()
    ref_genes_means = sorted_gene_means[~sorted_gene_means.index.isin(gene_list)]

    # Mean of every window of ctrl_size consecutive reference genes. The first
    # ctrl_size - 1 entries are NaN because their window is incomplete.
    rolled = ref_genes_means.rolling(ctrl_size, closed='right').mean()

    list_scores_per_sig_genes = []
    for sig_gene in gene_list:
        # Select the window whose mean is closest to the gene's own mean; the
        # window ends (inclusive) at position min_val_idx.
        curr_sig_avg = sorted_gene_means.loc[sig_gene]
        min_val_idx = np.argmin((rolled - curr_sig_avg).abs())
        ctrl_genes = list(rolled.iloc[(min_val_idx - ctrl_size + 1):min_val_idx + 1].index)

        curr_score = _as_1d(adata[:, sig_gene].X) - _as_1d(adata[:, ctrl_genes].X.mean(axis=1))
        # Min-max scale to [0, 1]; a gene with a constant raw score yields NaN.
        curr_score = (curr_score - curr_score.min()) / (curr_score.max() - curr_score.min())
        list_scores_per_sig_genes.append(
            pd.Series(curr_score, index=adata.obs_names, name=sig_gene + '_score')
        )

    df_signature_scores = pd.concat(list_scores_per_sig_genes, axis=1)
    score_columns = sorted(df_signature_scores.columns.tolist())
    df_signature_scores = df_signature_scores.reindex(columns=score_columns)

    # Aggregate over the gene score columns only. Computing the median after
    # adding 'final_score_mean' would let the mean column leak into the median.
    df_signature_scores['final_score_mean'] = df_signature_scores[score_columns].mean(axis=1)
    df_signature_scores['final_score_median'] = df_signature_scores[score_columns].median(axis=1)
    df_signature_scores['celltype'] = adata.obs['celltype']
    df_signature_scores['celltype_broader'] = adata.obs['celltype_broader']
    return df_signature_scores

In [None]:
from pandas.api.types import is_numeric_dtype

def plot_heatmap_with_celltype_anno(df, label_col, palette="tab10", bbox_to_anchor=(0.16,0.79),cat_title = 'celltypes',apply_tanh=False):
    """Plot a heatmap of all numeric columns of ``df`` with a row colour bar.

    Rows are sorted by ``label_col`` and then by ``'final_score_mean'``; no
    row or column clustering is applied. A legend lists every category of
    ``label_col`` together with its number of rows.

    Parameters
    ----------
    df
        DataFrame with the numeric columns to plot, the column ``label_col``
        and a column ``'final_score_mean'`` (used for sorting).
    label_col
        Column of ``df`` that provides the row annotation.
    palette
        Seaborn palette (name or list of colours) used for the categories.
    bbox_to_anchor
        Anchor of the legend in figure coordinates.
    cat_title
        Title of the legend.
    apply_tanh
        If True, ``np.tanh`` is applied to the values before plotting.

    Raises
    ------
    KeyError
        If ``label_col`` is not a column of ``df``.
    """
    if label_col not in df:
        raise KeyError(f'labelcol={label_col} is not a column of df')

    # Work on string labels throughout so that the colour lookup, the counts
    # and the legend text share the same keys. This also ignores unused
    # categories of a categorical column.
    labels = df[label_col].astype(str)
    categories = labels.unique()

    # Request exactly one colour per category; seaborn cycles the palette when
    # there are more categories than colours instead of leaving some unmapped.
    lut = dict(zip(categories, sns.color_palette(palette, n_colors=len(categories))))
    row_colors = labels.map(lut)

    counts = labels.value_counts()
    handles = [
        Patch(color=lut[category], label=category + ' (%i)' % counts[category])
        for category in sorted(categories)
    ]

    tmp = df.sort_values(by=[label_col, 'final_score_mean'])
    tmp = tmp[[x for x in tmp.columns if (x != label_col) and (is_numeric_dtype(tmp[x]))]]
    if apply_tanh:
        tmp = np.tanh(tmp)
    g = sns.clustermap(tmp,
                       row_colors=row_colors,
                       row_cluster=False,
                       col_cluster=False,
                       figsize=(50, 30),
                       cmap="viridis",
                       cbar_pos=(0.1, .1, .03, .6))
    # Honour the caller-supplied legend title and position (their defaults
    # equal the values that used to be hard-coded here).
    g.fig.legend(handles=handles,
                 title=cat_title,
                 bbox_to_anchor=bbox_to_anchor,
                 loc='center right',
                 bbox_transform=g.fig.transFigure,
                 borderaxespad=0.,
                 fontsize=18,
                 title_fontsize=20,
                 ncol=1)
    g.ax_heatmap.set_title(f'Scored for each gene in hallmark_emt signature.', fontsize=22)

In [None]:
def get_pvals(genes_to_potentially_add_scores):
    """Run one-sided Mann-Whitney U tests for every per-gene score column.

    Only numeric columns whose name does not contain ``'final'`` are tested.
    Cells are grouped by the ``'celltype_broader'`` column into
    ``'Epi with Mes'`` (cancer cells with EMT), ``'Epi wo Mes'`` (cancer cells
    without EMT), ``'Fibroblasts'`` (CAFs) and all remaining cells (rest).
    Every test uses ``alternative='greater'`` for the first group named.

    Parameters
    ----------
    genes_to_potentially_add_scores
        DataFrame with one score column per gene and a ``'celltype_broader'``
        column.

    Returns
    -------
    tuple of lists, all aligned with ``genes``
        ``genes`` (tested column names), ``pvals_cancer_emt_caf`` (EMT cancer
        vs. CAFs), ``pvals_cancer_emt_cancer`` (EMT cancer vs. non-EMT cancer),
        ``pvals_cancer_emt_rest`` (EMT cancer vs. rest) and ``pvals_caf_rest``
        (CAFs vs. rest).
    """
    # The group masks do not depend on the column, so build them once.
    celltypes = genes_to_potentially_add_scores['celltype_broader']
    is_caf = celltypes == 'Fibroblasts'
    is_mal_emt = celltypes == 'Epi with Mes'
    is_mal_non_emt = celltypes == 'Epi wo Mes'
    is_rest = ~celltypes.isin(['Epi with Mes', 'Epi wo Mes', 'Fibroblasts'])

    genes = []
    pvals_cancer_emt_caf = []
    pvals_cancer_emt_cancer = []
    pvals_cancer_emt_rest = []
    pvals_caf_rest = []
    for col in tqdm(genes_to_potentially_add_scores.columns):
        # Skip annotation columns and the aggregated 'final_*' scores.
        if not is_numeric_dtype(genes_to_potentially_add_scores[col]) or 'final' in col:
            continue

        scores = genes_to_potentially_add_scores[col]
        caf_scores = scores[is_caf]
        mal_emt_scores = scores[is_mal_emt]
        mal_non_emt_scores = scores[is_mal_non_emt]
        rest_scores = scores[is_rest]

        genes.append(col)
        pvals_cancer_emt_caf.append(mannwhitneyu(mal_emt_scores, caf_scores, alternative='greater').pvalue)
        pvals_cancer_emt_cancer.append(mannwhitneyu(mal_emt_scores, mal_non_emt_scores, alternative='greater').pvalue)
        pvals_cancer_emt_rest.append(mannwhitneyu(mal_emt_scores, rest_scores, alternative='greater').pvalue)
        # This list used to be returned empty. It is filled following the same
        # naming pattern as the other lists (first group tested as greater).
        pvals_caf_rest.append(mannwhitneyu(caf_scores, rest_scores, alternative='greater').pvalue)
    return genes, pvals_cancer_emt_caf, pvals_cancer_emt_cancer, pvals_cancer_emt_rest, pvals_caf_rest

### Refine LUNG 1 unioned with ESCC 2

In [None]:
# Union of the LUAD signature 1 and the ESCC signature 2; this is the candidate
# gene set that is refined on ESCC below.
sig_lung_refined_and_esophag = set(LUNG_CANCER_EMT_SIGNATURE_1).union(ESOPHAG_CANCER_EMT_SIGNATURE_2)

In [None]:
# Number of candidate genes in the union.
len(sig_lung_refined_and_esophag)

In [None]:
# Switch X to CSC layout; the per-gene scoring helper slices X by gene (column).
# Assumes X is a scipy sparse matrix (it must provide .tocsc()).
escc_data.X = escc_data.X.tocsc()

In [None]:
# Per-cell score for every candidate gene of the union (default ctrl_size).
genes_to_potentially_add_scores = get_scores_for_all_sig_genes(escc_data, list(sig_lung_refined_and_esophag))

In [None]:
# One-sided Mann-Whitney U p-values per candidate gene; `genes` holds the tested
# score column names ('<gene>_score').
genes, pvals_cancer_emt_caf, pvals_cancer_emt_cancer, pvals_cancer_emt_rest, pvals_caf_rest = get_pvals(genes_to_potentially_add_scores)

In [None]:
# Genes scoring significantly higher in EMT cancer cells than in CAFs
# (Benjamini-Hochberg FDR, alpha=1e-5). The column names are '<gene>_score';
# only that last suffix is stripped, so gene names containing an underscore
# are kept intact.
gene_cancer_emt_sig_larger_cafs = [
    col.rsplit('_', 1)[0]
    for col, rejected in zip(genes, multipletests(pvals_cancer_emt_caf, alpha=1e-5, method='fdr_bh')[0])
    if rejected
]

In [None]:
# Genes scoring significantly higher in EMT cancer cells than in non-EMT cancer
# cells (Benjamini-Hochberg FDR, alpha=1e-5). The column names are
# '<gene>_score'; only that last suffix is stripped, so gene names containing
# an underscore are kept intact.
gene_cancer_emt_sig_larger_cancer = [
    col.rsplit('_', 1)[0]
    for col, rejected in zip(genes, multipletests(pvals_cancer_emt_cancer, alpha=1e-5, method='fdr_bh')[0])
    if rejected
]

In [None]:
# Genes scoring significantly higher in EMT cancer cells than in all remaining
# cell types (Benjamini-Hochberg FDR, alpha=1e-5). The column names are
# '<gene>_score'; only that last suffix is stripped, so gene names containing
# an underscore are kept intact.
gene_cancer_emt_sig_larger_rest = [
    col.rsplit('_', 1)[0]
    for col, rejected in zip(genes, multipletests(pvals_cancer_emt_rest, alpha=1e-5, method='fdr_bh')[0])
    if rejected
]

In [None]:
plt.rcParams.update({'pdf.fonttype':42, 'font.family':'sans-serif', 'font.sans-serif':'Arial', 'font.size':14})

# Overlap of the three sets of significant genes (vs. CAFs, vs. non-EMT cancer
# cells, vs. rest).
venn3(
subsets=(
    set(gene_cancer_emt_sig_larger_cafs),
    set(gene_cancer_emt_sig_larger_cancer),
    set(gene_cancer_emt_sig_larger_rest)
),
set_labels=(
    'gene_cancer_emt_sig_larger_cafs',
    'gene_cancer_emt_sig_larger_cancer',
    'gene_cancer_emt_sig_larger_rest'
)
)

In [None]:
# Refined signature: genes that are significant in all three comparisons.
new_sig = (
    set(gene_cancer_emt_sig_larger_cafs)
    & set(gene_cancer_emt_sig_larger_cancer)
    & set(gene_cancer_emt_sig_larger_rest)
)

In [None]:
# Score the unrefined union of LUAD signature 1 and ESCC signature 2 ...
score_signature(method="adjusted_neighborhood_scoring",
                    adata=escc_data,
                    gene_list= set(LUNG_CANCER_EMT_SIGNATURE_1).union(ESOPHAG_CANCER_EMT_SIGNATURE_2),
                    ctrl_size=100,
                    df_mean_var = escc_df_mean_var,
                    score_name='LUNG_CANCER_EMT_SIGNATURE_1.union(ESOPHAG_CANCER_EMT_SIGNATURE_2)')

# ... and the refined signature (genes significant in all three comparisons).
score_signature(method="adjusted_neighborhood_scoring",
                    adata=escc_data,
                    gene_list= new_sig,
                    ctrl_size=100,
                    df_mean_var = escc_df_mean_var,
                    score_name='new_sig')


In [None]:
## Add genes to signature from ESOPHAG_CANCER_EMT_SIGNATURE_2, that were not present in LUAD dataset (see notebooks/EMT_scoring_experiments/LUAD/find_cancer_emt_signature_LUAD.ipynb)
# The genes are hard-coded here. Note that new_sig_2 is a list, not a set.
new_sig_2 = list(new_sig)+['CYP27B1', 'IL1A', 'MMP10', 'CTD-2357A8.3', 'L1CAM', 'KRT14', 'KRT81', 'RTTN']

In [None]:
## Add genes to signature from LUNG_CANCER_EMT_SIGNATURE_1, that are not present in ESCC dataset
# The genes are hard-coded here and are not recomputed from the loaded data.
gene_in_lung_not_in_escc = {'C2orf54', 'RP11-462G2.1', 'SEMA3B', 'TRIM29', 'RP11-519G16.5', 'IVL', 'SPRR2D', 'SYT8', 'CTXN1', 'PRSS8', 'VSTM2L', 'BCYRN1', 'MYLPF', 'TMEM61', 'SNCG', 'COBL', 'SCEL', 'VSIG1', 'MUC21', 'LCN2', 'RHBDL2', 'SPINK1', 'CRLF1', 'SPNS2', 'LA16c-431H6.6', 'MUC3A', 'RNF39', 'CDH3', 'PRR36', 'PITX1', 'EVPL', 'DNAH2', 'KDR', 'KISS1', 'NPIPB11', 'ABHD11-AS1', 'ADGRF1', 'C6orf132', 'RP6-65G23.3', 'TNNC2', 'BAIAP3', 'DNAH11', 'MISP', 'WNT7B', 'SHC2', 'TMC5', 'PHACTR3', 'RP3-340N1.2', 'LONRF2', 'SLC6A14', 'METTL7B', 'CX3CL1', 'RP11-431K24.1', 'MB', 'TMPRSS4', 'NOXO1', 'GPR39', 'PPL', 'CBLC', 'B3GNT3', 'EDN2', 'FGFBP1', 'SAA2', 'RP11-350J20.12', 'LAMA3', 'PAEP', 'ABCA13', 'WFDC3', 'IL37', 'CARD10', 'FAM83A', 'AC023590.1', 'HHLA2', 'MUC16'}

In [None]:

# Third variant: new_sig_2 plus the LUAD genes listed in gene_in_lung_not_in_escc.
new_sig_3 = set(new_sig_2).union(gene_in_lung_not_in_escc)

In [None]:
# Score the second signature variant (new_sig_2) on the ESCC data.
score_signature(method="adjusted_neighborhood_scoring",
                    adata=escc_data,
                    gene_list= new_sig_2,
                    ctrl_size=100,
                    df_mean_var = escc_df_mean_var,
                    score_name='new_sig_2')

In [None]:
# Score the third signature variant (new_sig_3) on the ESCC data.
score_signature(method="adjusted_neighborhood_scoring",
                    adata=escc_data,
                    gene_list= new_sig_3,
                    ctrl_size=100,
                    df_mean_var = escc_df_mean_var,
                    score_name='new_sig_3')

In [None]:
# Keep all signatures of sig_list except its last two entries. The list is
# rebuilt from sig_list (instead of trimming sig_names in place) so that
# re-running this cell does not strip further entries.
sig_names = [x[0] for x in sig_list][:-2]
sig_names

In [None]:
# Earlier variant of the line below, kept for reference:
# sig_names = sig_names + ['LUNG_CANCER_EMT_SIGNATURE_1_refined.union(ESOPHAG_CANCER_EMT_SIGNATURE_2)','new_sig']
# Append the newly computed scores. NOTE: re-running this cell appends the names again.
sig_names = sig_names + ['LUNG_CANCER_EMT_SIGNATURE_1.union(ESOPHAG_CANCER_EMT_SIGNATURE_2)','new_sig','new_sig_2','new_sig_3']

In [None]:
plt.rcParams.update({'pdf.fonttype':42, 'font.family':'sans-serif', 'font.sans-serif':'Arial', 'font.size':14})

# One violin plot per signature score, grouped by the refined cell-type annotation.
for sig_name in sig_names:
    sc.pl.violin(escc_data, groupby='celltype_broader',keys=sig_name, rotation=90)

In [None]:
plt.rcParams.update({'pdf.fonttype':42, 'font.family':'sans-serif', 'font.sans-serif':'Arial', 'font.size':14})

# For every signature (now including the unioned and refined ones): compute the
# area under the precision-recall curve for 'Epi with Mes' against three
# reference groups and plot the per-cell-type score histograms.
pos_lbl = 'Epi with Mes'
for score_name in sig_names:
    plt.figure(figsize=(10, 8))
    scores = escc_data.obs[score_name]

    # Positive class vs. all remaining cells.
    prec, rec, _ = precision_recall_curve(gt, scores, pos_label=pos_lbl)
    lr_auc = auc(rec, prec)

    # Positive class vs. fibroblasts only.
    prec, rec, _ = precision_recall_curve(gt[caf_and_cancer_emt], scores.loc[caf_and_cancer_emt], pos_label=pos_lbl)
    lr_auc_caf_and_emt = auc(rec, prec)

    # Positive class vs. cancer cells without EMT only.
    prec, rec, _ = precision_recall_curve(gt[cancer_and_cancer_emt], scores.loc[cancer_and_cancer_emt], pos_label=pos_lbl)
    lr_auc_cancer_and_emt = auc(rec, prec)

    # One semi-transparent histogram per cell type.
    for celltype, celltype_obs in escc_data.obs.groupby('celltype_broader'):
        celltype_obs[score_name].hist(bins=100, density=True, alpha=0.5, label=celltype)

    plt.title(
        ''.join([
            score_name,
            '\nAUCPR Malignant with EMT vs. Rest ', str(np.round(lr_auc, decimals=3)),
            '\nAUCPR Malignant with EMT vs. CAFS ', str(np.round(lr_auc_caf_and_emt, decimals=3)),
            '\nAUCPR Malignant with EMT vs. Malignant ', str(np.round(lr_auc_cancer_and_emt, decimals=3)),
        ]),
        fontsize=16,
    )
    plt.legend()
    plt.ylim([0, 20])
    plt.tight_layout()
    if save:
        curr_path = os.path.join(storing_path, 'plots')
        if not os.path.exists(curr_path):
            os.makedirs(curr_path)
        plt.savefig(os.path.join(curr_path, f'dist_scores_{score_name}.png'), dpi=600)
    plt.show()

### Store signatures 

In [None]:
# Sizes of the three signature variants.
len(new_sig), len(new_sig_2), len(new_sig_3)

In [None]:
# Write the three signature variants to the output folder. The genes are sorted
# first: set iteration order is not stable across Python sessions, so unsorted
# output would differ in row order from run to run.
if save:
    pd.Series(sorted(new_sig)).to_csv(os.path.join(storing_path, 'LUNG1_ESCC2_CANCER_EMT_SIGNATURE_1.csv'))
    pd.Series(sorted(new_sig_2)).to_csv(os.path.join(storing_path, 'LUNG1_ESCC2_CANCER_EMT_SIGNATURE_2.csv'))
    pd.Series(sorted(new_sig_3)).to_csv(os.path.join(storing_path, 'LUNG1_ESCC2_CANCER_EMT_SIGNATURE_3.csv'))

## OLD

In [None]:
# Display the output folder used by this notebook.
storing_path

In [None]:
# pd.Series(list(new_sig)).to_csv('..../data/annotations/emt/LUNG1_ESCC2_CANCER_EMT_SIGNATURE_1.csv')
# pd.Series(list(new_sig)).to_csv('..../data/annotations/emt/LUNG1_notrefined_ESCC2_CANCER_EMT_SIGNATURE_1.csv')

In [None]:
# Overlap between the refined signature and the ESCC signature 2.
venn2(
    subsets=(
        new_sig,
        set(ESOPHAG_CANCER_EMT_SIGNATURE_2)
    ),
    set_labels=(
        'new_sig',
        'ESOPHAG_CANCER_EMT_SIGNATURE_2'
    )
)

In [None]:
# Genes of the ESCC signature 2 that did not make it into the refined signature.
set(ESOPHAG_CANCER_EMT_SIGNATURE_2).difference(new_sig)

In [None]:
# NOTE: this cell is in the "OLD" section and rebinds new_sig_2 to a different
# gene list than the one defined and stored further up. The result is a list.
new_sig_2 = list(new_sig)+['CYP27B1','KCNMA1', 'L1CAM', 'SMOC1', 'ZBED2']

In [None]:
# NOTE(review): writes unconditionally (not guarded by `save`), and the
# '..../' path prefix looks like a redacted placeholder -- confirm the target
# location before running.
pd.Series(list(new_sig_2)).to_csv('..../data/annotations/emt/LUNG1_notrefined_ESCC2_CANCER_EMT_SIGNATURE_2.csv')

In [None]:
# NOTE: this cell is in the "OLD" section and rebinds gene_in_lung_not_in_escc
# to a different hard-coded gene set than the one defined further up.
gene_in_lung_not_in_escc = {'ADGRF1', 'TMEM61', 'NPIPB11', 'GPR39', 'LCN2', 'BAIAP2L2', 'RHBDL2', 'CTXN1', 'ZNF385C', 'RP11-105N14.1', 'FAM83A', 'LA16c-431H6.6', 'NPTXR', 'CBLC', 'SPNS2', 'AP006285.2', 'EML6', 'NIPAL1', 'RP11-431K24.1', 'SLC6A14', 'RP11-196G18.3', 'SLC29A4', 'TRIM29', 'C6orf132', 'OSMR-AS1', 'EDN2', 'CARD10', 'C19orf33', 'RP11-380N8.7', 'C2orf54', 'MUC16', 'TMC5', 'DNAH11', 'CX3CL1', 'BCAS1', 'MISP', 'RP11-519G16.5', 'METTL7B', 'BAIAP3', 'COBL', 'U73166.2', 'WNT7B', 'SNCG', 'EPHX3', 'XKR9', 'RP11-10C24.3', 'RP11-326C3.2', 'LTK', 'PRSS8', 'EVPL', 'LAMA3', 'RP11-539G18.3', 'KISS1', 'NANOS1', 'MYRF', 'TMEM92', 'PDE4C', 'USP43', 'CDH3', 'PLEKHG6', 'BCYRN1', 'PRSS27', 'TMPRSS4', 'ITPKA', 'B3GNT3', 'RP11-81A22.5', 'PPL', 'DOC2B', 'SYT12', 'KDR', 'RP11-350J20.12', 'RP11-672L10.6', 'RP6-65G23.3', 'NOXO1', 'SPINK1', 'VSTM2L', 'UNC5CL', 'KISS1R', 'MB', 'KRT80', 'PGBD5', 'ITIH4', 'TMC7'}

In [None]:
# Overlap between the LUAD signature 1 and the genes available in the ESCC data.
venn2(
    subsets=(
        set(LUNG_CANCER_EMT_SIGNATURE_1),
        set(escc_data.var_names)
    ),
    set_labels=(
        'LUNG_CANCER_EMT_SIGNATURE_1',
        'escc_data.var_names'
    )
)

In [None]:
# new_sig_2 is built as a list in this notebook and lists have no .union(), so
# convert it to a set before merging in the LUAD-only genes.
# NOTE(review): writes unconditionally (not guarded by `save`), and the '....'
# path prefix looks like a redacted placeholder -- confirm the target location.
pd.Series(list(set(new_sig_2).union(gene_in_lung_not_in_escc))).to_csv('....data/annotations/emt/LUNG1_notrefined_ESCC2_CANCER_EMT_SIGNATURE_3.csv')