## Get cancer EMT cells for LUAD (Xing et al.)
Score multiple pan-cancer EMT signatures. Convert scores to ranks and compute the median rank per cell.
Cancer EMT cells are the cells within the 10% of cells with the smallest median ranks.

In [None]:
from IPython.display import display, HTML
# Widen the notebook container to 90% of the available width.
wide_container_css = "<style>.container { width:90% !important; }</style>"
display(HTML(wide_container_css))

In [None]:
import os 
import sys

import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt

sys.path.append('../../..')
from data.load_data import load_datasets
from data.constants import BASE_PATH_EXPERIMENTS, BASE_PATH_DATA

from signaturescoring import score_signature
from signaturescoring.utils.utils import get_mean_and_variance_gene_expression, check_signature_genes

In [None]:
# Notebook-wide configuration.
dataset = 'luad_xing'  # dataset identifier; used for loading the data and for the output path

save = True  # when True, figures and tables are written to disk

pl_size = 6

sc.settings.verbosity = 2

In [None]:
# Root directory that collects all outputs of this case study for the chosen dataset.
storing_path = os.path.join(BASE_PATH_EXPERIMENTS, f'EMT_signature_scoring_case_study/{dataset}')
if not os.path.exists(storing_path):
    # exist_ok=True keeps the creation safe if the directory appears between
    # the check above and this call (e.g. a parallel notebook run).
    os.makedirs(storing_path, exist_ok=True)
    sc.logging.info('Creating new directory to store the results.')

Load preprocessed data

In [None]:
# Load the preprocessed dataset (norm_method='mean').
adata = load_datasets(dataset, preprocessed=True, norm_method='mean')

# Make sure adata.uns['log1p'] exists and has its 'base' entry set to None.
if 'log1p' not in adata.uns_keys():
    adata.uns['log1p'] = {'base': None}
else:
    adata.uns['log1p']['base'] = None

In [None]:
# Number of cells per annotated cell type, ordered by cell type name.
adata.obs['celltype'].value_counts().sort_index()

In [None]:
# Remove all cells annotated as 'Granulocytes' and keep a copy of the remainder.
not_granulocyte = adata.obs['celltype'] != 'Granulocytes'
adata = adata[not_granulocyte].copy()

In [None]:
# Run PCA, neighbour-graph construction and UMAP, in this order, with scanpy's defaults.
for embedding_step in (sc.tl.pca, sc.pp.neighbors, sc.tl.umap):
    embedding_step(adata)

In [None]:
# Global matplotlib font settings used for the figures.
plt.rcParams.update({
    'pdf.fonttype': 42,
    'font.family': 'sans-serif',
    'font.sans-serif': 'Arial',
    'font.size': 14,
})

# One UMAP panel per metadata column, stacked in a single column.
overview_keys = ['sample_id', 'malignant_key', 'celltype', 'cell_subtype']
sc.pl.umap(adata, color=overview_keys, ncols=1)

### Select cancer EMT cells

In [None]:
import glob
import json

def get_sig_from_emtome_sig_file(filepath):
    """Read the entries of an EMTome signature text file.

    The first two lines of the file are skipped (presumably header lines --
    verify against the file format). From every remaining line the line
    terminator and the first character are removed.

    Args:
        filepath: Path to the signature text file.

    Returns:
        A list with one string per line after the two skipped lines.

    Raises:
        AssertionError: If ``filepath`` does not exist.
    """
    assert os.path.exists(filepath), f'Signature file not found: {filepath}'
    with open(filepath, 'r') as f:
        entries = f.readlines()[2:]

    # Strip the newline explicitly instead of slicing off the last character:
    # a final line without a trailing newline would otherwise lose the last
    # character of its entry.
    return [line.rstrip('\n')[1:] for line in entries]

## define base path to emt signatures
base_path_emt_signatures = os.path.join(BASE_PATH_DATA, 'annotations/emt')


def _load_emtome_signatures(subfolder):
    """Return {file name up to the first dot: entries} for every *.txt file in sigs_from_emtome/<subfolder>."""
    sig_files = sorted(glob.glob(base_path_emt_signatures + f"/sigs_from_emtome/{subfolder}/*.txt"))
    # basename() rather than split('/') so the key is the bare file name
    # regardless of the path separator glob returns.
    return {os.path.basename(sig_file).split('.')[0]: get_sig_from_emtome_sig_file(sig_file)
            for sig_file in sig_files}


luad_specfic_emt_sigs = _load_emtome_signatures('luad')
pan_cancer_emt_sigs = _load_emtome_signatures('pan_cancer')

# Add the hallmark EMT gene set, read from its JSON file, to the pan-cancer signatures.
with open(base_path_emt_signatures+'/HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION.v7.5.1.json', 'r') as f:
    hemt = json.load(f)

pan_cancer_emt_sigs['hallmark_emt'] = hemt['HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION']['geneSymbols']

In [None]:
# Gene-module table (the file name refers to Barkley et al. 2022).
GM_B_22 = pd.read_csv(base_path_emt_signatures+'/gene_modules_from_Barkley_et_al_2022.csv')

# One gene list per column of interest; missing entries are dropped.
mes_gm = GM_B_22['Mesenchymal'].dropna().tolist()
pEMT_gm = GM_B_22['pEMT'].dropna().tolist()
cEMT = GM_B_22['cEMT'].dropna().tolist()
pEMT = GM_B_22['pEMT.1'].dropna().tolist()

In [None]:
# Register the pEMT gene module as an additional pan-cancer signature.
pan_cancer_emt_sigs.update(pEMT_gm=pEMT_gm)

In [None]:
# Display the names of all pan-cancer signatures collected so far.
pan_cancer_emt_sigs.keys()

In [None]:
# Expression statistics computed once and handed to every score_signature call below
# (presumably per-gene mean and variance, judging by the helper's name -- confirm in signaturescoring).
df_mean_var = get_mean_and_variance_gene_expression(adata, estim_var=False)

In [None]:
# Score every LUAD-specific signature; the signature name is used as score_name.
luad_scoring_options = dict(
    method="adjusted_neighborhood_scoring",
    adata=adata,
    ctrl_size=100,
    df_mean_var=df_mean_var,
)
for sig_name, sig_genes in luad_specfic_emt_sigs.items():
    score_signature(gene_list=sig_genes, score_name=sig_name, **luad_scoring_options)

In [None]:
# Score every pan-cancer signature; the signature name is used as score_name.
pan_cancer_scoring_options = dict(
    method="adjusted_neighborhood_scoring",
    adata=adata,
    ctrl_size=100,
    df_mean_var=df_mean_var,
)
for sig_name, sig_genes in pan_cancer_emt_sigs.items():
    score_signature(gene_list=sig_genes, score_name=sig_name, **pan_cancer_scoring_options)

In [None]:
plt.rcParams.update({'pdf.fonttype':42, 'font.family':'sans-serif', 'font.sans-serif':'Arial', 'font.size':14})

# Colour the UMAP of the malignant cells by metadata and by every EMT signature score.
malignant_umap_keys = (
    ['sample_id', 'malignant_key', 'celltype', 'cell_subtype_clusters', 'cell_subtype']
    + list(luad_specfic_emt_sigs.keys())
    + list(pan_cancer_emt_sigs.keys())
)

with plt.rc_context({'figure.figsize':(10,8)}):
    fig = sc.pl.umap(adata[adata.obs.malignant_key =='malignant'],
               color=malignant_umap_keys,
               ncols=4,
               return_fig=True)
    if save:
        curr_path = os.path.join(storing_path, 'cancer_emt_barcode_selection')
        # makedirs(exist_ok=True) avoids the check-then-create race and also
        # creates missing parent directories.
        os.makedirs(curr_path, exist_ok=True)
        fig.savefig(os.path.join(curr_path, 'mal_cells_umap_emt_sigs.png'), dpi=600)

In [None]:
# Names of the pan-cancer scores, in the insertion order of the signature dict.
score_names_pan_cancer = [*pan_cancer_emt_sigs]

In [None]:
# Copy of the cells whose malignant_key equals 'malignant'.
is_malignant = adata.obs['malignant_key'] == 'malignant'
cancer_cells = adata[is_malignant].copy()

In [None]:
# Will hold one rank Series per pan-cancer score.
cell_ranks_for_score_names =[]

In [None]:
# Rank the malignant cells for every pan-cancer score: the highest score gets
# the smallest rank and missing scores are ranked last.
# The list is rebuilt from scratch (not appended to) so that re-running this
# cell cannot add every rank column a second time.
cell_ranks_for_score_names = [
    cancer_cells.obs[col].rank(na_option='bottom', ascending=False)
    for col in score_names_pan_cancer
]

In [None]:
# Combine the per-score rank Series into one table: one row per cell, one column per score.
ranked_cells = pd.concat(cell_ranks_for_score_names, axis='columns')

In [None]:
# Order the cells by their median rank across all scores (smallest median first).
median_rank_order = ranked_cells.median(axis=1).sort_values().index
sorted_ranks = ranked_cells.loc[median_rank_order, :]

In [None]:
# Median rank per cell across the scores, computed once and reused below.
median_ranks = sorted_ranks.median(axis=1)
median_ranks.hist(bins=100)
# Red line: the 0.1 quantile of the median ranks.
plt.axvline(median_ranks.quantile(0.1), c='r', label='quantile 0.1')
plt.legend()  # without a legend the label passed to axvline is never drawn
plt.title('Distribution median ranks.')
if save:
    curr_path = os.path.join(storing_path, 'cancer_emt_barcode_selection')
    # makedirs(exist_ok=True) avoids the check-then-create race and also
    # creates missing parent directories.
    os.makedirs(curr_path, exist_ok=True)
    plt.savefig(os.path.join(curr_path, 'distr_median_ranks.png'), dpi=600)

In [None]:
# Median rank of every cell across all pan-cancer scores.
sorted_ranks_median = sorted_ranks.median(axis='columns')

In [None]:
# Cancer EMT cells: cells whose median rank is at or below the 0.1 quantile.
emt_rank_cutoff = sorted_ranks_median.quantile(0.1)
has_low_median_rank = sorted_ranks_median <= emt_rank_cutoff
barcodes_cancer_emt = sorted_ranks_median[has_low_median_rank].index.tolist()

In [None]:
## store the cancer emt cell barcodes as a CSV file below storing_path
if save:
    barcode_file = os.path.join(storing_path, 'barcodes_cancer_emt.csv')
    pd.Series(barcodes_cancer_emt).to_csv(barcode_file)

In [None]:
# Cell type annotation of the cells flagged as malignant.
cancer_cells.obs['celltype'].value_counts()

In [None]:
# Start from a string copy of the cell type annotation so that a new label can be assigned.
adata.obs['celltype_emt'] = adata.obs['celltype'].copy().astype(str)

In [None]:
# Relabel the selected barcodes; all other cells keep their original cell type.
adata.obs.loc[barcodes_cancer_emt, 'celltype_emt'] = 'Malignant with EMT'

In [None]:
# Convert the label column to categorical dtype.
adata.obs['celltype_emt'] = adata.obs['celltype_emt'].astype('category')

In [None]:
# Number of cells per label, including the new 'Malignant with EMT' label.
adata.obs.celltype_emt.value_counts().sort_index()

In [None]:
plt.rcParams.update({'pdf.fonttype':42, 'font.family':'sans-serif', 'font.sans-serif':'Arial', 'font.size':14})

# UMAP restricted to malignant cells and fibroblasts, coloured by the
# pan-cancer scores and by sample / cell type labels.
mal_n_caf_keys = score_names_pan_cancer + ['sample_id', 'celltype', 'celltype_emt']

with plt.rc_context({'figure.figsize':(10,8)}):
    umap_fig = sc.pl.umap(adata[adata.obs.celltype.isin(['Malignant', 'Fibroblast'])],
                          color=mal_n_caf_keys,
                          return_fig=True,
                          cmap='viridis')
    if save:
        curr_path = os.path.join(storing_path, 'cancer_emt_barcode_selection')
        # makedirs(exist_ok=True) avoids the check-then-create race and also
        # creates missing parent directories.
        os.makedirs(curr_path, exist_ok=True)
        umap_fig.savefig(os.path.join(curr_path, 'mal_n_caf_cells_umap_emt_sigs.png'), dpi=600)

In [None]:
# Ensure the label column is categorical (NOTE(review): the same conversion already happens in an earlier cell).
adata.obs.celltype_emt = adata.obs.celltype_emt.astype('category') 

In [None]:
# Sample and label columns needed for the per-sample composition tables below.
tmp = adata.obs[['sample_id','celltype_emt']]

In [None]:
# Per-sample composition: rows are samples, columns are labels, each row is normalised to sum to 1.
cross_tab_prop = pd.crosstab(tmp['sample_id'], tmp['celltype_emt'], normalize="index")

In [None]:
# Fixed column order for the plot and the exported table.
celltype_display_order = [
    'Malignant with EMT', 'Malignant', 'Fibroblast',
    'B_cell', 'Dendritic', 'Endothelial', 'Epithelial',
    'Erythroblast', 'Macrophage', 'Mast', 'Monocyte', 'NK_cell', 'T_cell',
]
cross_tab_prop = cross_tab_prop[celltype_display_order]

In [None]:
plt.rcParams.update({'pdf.fonttype':42, 'font.family':'sans-serif', 'font.sans-serif':'Arial', 'font.size':14})

# Stacked bar chart with one bar per row of cross_tab_prop.
cross_tab_prop.plot(kind='bar',
                    stacked=True,
                    colormap='tab20',
                    figsize=(10, 6))
# Place the legend outside the axes, to the right of the bars.
plt.legend(loc='center left',bbox_to_anchor=(1.0, 0.5),ncol=1)
plt.xlabel("Sample ID")
plt.ylabel("Proportions")
plt.tight_layout()

if save:
    curr_path = os.path.join(storing_path, 'proportions')
    # makedirs(exist_ok=True) avoids the check-then-create race and also
    # creates missing parent directories.
    os.makedirs(curr_path, exist_ok=True)
    plt.savefig(os.path.join(curr_path, 'distribution_celltypes.svg'))
    cross_tab_prop.to_csv(os.path.join(curr_path, 'proportions_celltype.csv'))

In [None]:
# Absolute number of cells per sample (rows) and label (columns).
prop_counts = pd.crosstab(index=tmp['sample_id'],
                          columns=tmp['celltype_emt'])
# Fixed column order for the exported table.
prop_counts = prop_counts[['Malignant with EMT', 'Malignant', 'Fibroblast',
                           'B_cell', 'Dendritic', 'Endothelial', 'Epithelial',
                           'Erythroblast', 'Macrophage', 'Mast', 'Monocyte', 'NK_cell', 'T_cell']]
if save:
    curr_path = os.path.join(storing_path, 'proportions')
    # makedirs(exist_ok=True) avoids the check-then-create race and also
    # creates missing parent directories.
    os.makedirs(curr_path, exist_ok=True)
    prop_counts.to_csv(os.path.join(curr_path, 'counts_celltype.csv'))