## Experiment: influence of noise in gene expression signatures on scoring (ESCC)
In this experiment we evaluated the robustness of gene signature scoring methods to noise in gene expression signatures on ESCC. We started with the shortest base signatures that were able to discriminate malignant vs. non-malignant cells with an AUCROC value of $0.9$. We then iteratively added random genes with $-0.25\le \mathrm{logFC} \le 0.25$, i.e., non-relevant genes.

The scripts to run the experiments can be found in the `experiments/signature_noise_addition_experiments` folder. The experiments evaluate the AUCROC for malignant vs. non-malignant cells and store the performance files.

In [None]:
# Widen the notebook container to 90% of the browser width.
# `display` and `HTML` come from the public `IPython.display` module; importing
# `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

In [None]:
import glob
import os
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import sys
sys.path.append('../..')
from data.constants import BASE_PATH_EXPERIMENTS

In [None]:
# Global matplotlib defaults for every figure drawn in this notebook.
plt.rcParams.update({
    'pdf.fonttype': 42,  # Type 42 (TrueType) fonts in PDF output
    'font.family': 'sans-serif',
    'font.sans-serif': 'Arial',
    'font.size': 14,
})

In [None]:
## define path to store visualizations
# The figures are saved directly in this directory; the AUCROC performance
# files are read from its `AUC_performances` subfolder.
storing_path = os.path.join(BASE_PATH_EXPERIMENTS, 'signature_noise_addition_experiments/esophag')
# Echo the resolved path as the cell output.
storing_path

In [None]:
# When True, the final figure is written to `storing_path` as SVG and PNG.
save = True

In [None]:
# Variant of the experiment to evaluate. It selects the matching AUC files and
# the divisor used to rescale the UCell x-axis values.
method = 'median'  # methods in ['mean', 'median']

# Divisor applied to `added_non_relevant_genes` of the UCell rows, per method.
# NOTE(review): presumably the base-signature length of the respective run -- confirm.
_ucell_factor_by_method = {'mean': 169, 'median': 81}
if method not in _ucell_factor_by_method:
    # Previously every value other than 'mean' silently fell back to 81.
    raise ValueError(
        f"method must be one of {sorted(_ucell_factor_by_method)}, got {method!r}"
    )
factor_ucell = _ucell_factor_by_method[method]

# Divisor applied to all non-UCell scoring methods.
factor_rest = 4  # the same for 'mean' or 'median'

## Evaluate AUCs for runs

In [None]:
## define path to AUCROC performance files
base_path = storing_path
list_performance_files = glob.glob(os.path.join(base_path, 'AUC_performances/AUC_*.csv'))
# Keep only the files of the selected method. The method string is matched
# against the file name rather than the full path, so a parent directory that
# happens to contain the method string cannot let every file through.
list_performance_files = [
    fn for fn in list_performance_files if method in os.path.basename(fn)
]

In [None]:
# Put the file paths in lexicographic order so the tables are always loaded in
# the same order, then echo the list for inspection.
list_performance_files = sorted(list_performance_files)
list_performance_files

In [None]:
# Load every per-run AUC table and stack them into one long DataFrame.
if not list_performance_files:
    # pd.concat([]) would fail with the less helpful "No objects to concatenate".
    raise ValueError(
        'No AUC performance files found; check `storing_path` and the selected `method`.'
    )
list_performances = [pd.read_csv(x) for x in list_performance_files]
# Every file is read with its own 0-based row labels; renumber the rows so the
# combined table has a unique index.
all_performances = pd.concat(list_performances, ignore_index=True)
# 'Unnamed: 0' is presumably the index column that was saved with each CSV;
# tolerate files that were written without it.
all_performances = all_performances.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
# Show the combined performance table for a quick visual check.
all_performances

In [None]:
# Drop the last three '_'-separated tokens of every scoring-method name and
# keep only the leading part (e.g. 'a_b_x_y_z' -> 'a_b').
all_performances['scoring_method'] = [
    '_'.join(raw_name.split('_')[:-3])
    for raw_name in all_performances['scoring_method']
]
all_performances

In [None]:
# Express the number of added genes of the UCell rows as a multiple of
# `factor_ucell`.
# The column is cast to float first: writing fractional values into an integer
# column through .loc relies on silent upcasting, which recent pandas deprecates.
# NOTE: the update is in place, so running this cell twice divides twice.
all_performances['added_non_relevant_genes'] = (
    all_performances['added_non_relevant_genes'].astype(float)
)
is_ucell = all_performances['scoring_method'] == 'ucell_scoring'
all_performances.loc[is_ucell, 'added_non_relevant_genes'] = (
    all_performances.loc[is_ucell, 'added_non_relevant_genes'] / factor_ucell
)

In [None]:
# Express the number of added genes of all non-UCell rows as a multiple of
# `factor_rest`.
# The column is cast to float first: writing fractional values into an integer
# column through .loc relies on silent upcasting, which recent pandas deprecates.
# NOTE: the update is in place, so running this cell twice divides twice.
all_performances['added_non_relevant_genes'] = (
    all_performances['added_non_relevant_genes'].astype(float)
)
is_not_ucell = all_performances['scoring_method'] != 'ucell_scoring'
all_performances.loc[is_not_ucell, 'added_non_relevant_genes'] = (
    all_performances.loc[is_not_ucell, 'added_non_relevant_genes'] / factor_rest
)

In [None]:
# perf_ucell = all_performances[all_performances.scoring_method=='ucell_scoring'].copy().reset_index()
# perf_ucell = perf_ucell[perf_ucell.added_non_relevant_genes<=22]
# perf_not_ucell = all_performances[all_performances.scoring_method.isin(['ucell_scoring','neighborhood_scoring','corrected_scanpy_scoring'])==False].copy().reset_index()
# perf_not_ucell = perf_not_ucell[perf_not_ucell.added_non_relevant_genes<51].reset_index()
# all_performances = pd.concat([perf_ucell,perf_not_ucell], axis=0 )

In [None]:
#all_performances = all_performances[all_performances.added_non_relevant_genes<41]

In [None]:
# Count the rows available per scoring method.
all_performances.scoring_method.value_counts()

In [None]:
import warnings

# Short display labels for the raw scoring-method identifiers.
method_labels = {
    'adjusted_neighborhood_scoring': 'ANS',
    'jasmine_lh_scoring': 'Jasmine_LH',
    'jasmine_or_scoring': 'Jasmine_OR',
    'scanpy_scoring': 'Scanpy',
    'tirosh_ag_scoring': 'Tirosh_AG',
    'tirosh_lvg_scoring': 'Tirosh_LVG',
    'tirosh_scoring': 'Tirosh',
    'ucell_scoring': 'UCell',
}

# Series.map replaces every name that is missing from the dict with NaN, and
# such rows match none of the plotted labels. Report them instead of losing
# them silently; this also flags an accidental second run of this cell.
unmapped_methods = sorted(
    set(all_performances['scoring_method'].dropna()) - set(method_labels)
)
if unmapped_methods:
    warnings.warn(
        f'No display label for scoring methods {unmapped_methods}; '
        'their rows get NaN as scoring_method.'
    )

all_performances['scoring_method'] = all_performances['scoring_method'].map(method_labels)

In [None]:
# Print, for every amount of added noise, the per-method mean of the numeric columns.
# numeric_only=True keeps the aggregation working when the table also carries
# text columns, which pandas >= 2.0 no longer drops automatically.
mean_performances = (
    all_performances
    .groupby(['scoring_method', 'added_non_relevant_genes'])
    .mean(numeric_only=True)
    .reset_index()
)
# groupby yields the noise levels in ascending order, so no pre-sort is needed.
for _, level_rows in mean_performances.groupby('added_non_relevant_genes'):
    print(level_rows.sort_values(by='scoring_method'))

In [None]:
# Display labels of the scoring methods, in the order they are plotted.
sc_names = [
    'ANS',
    'Tirosh',
    'Tirosh_AG',
    'Tirosh_LVG',
    'Scanpy',
    'Jasmine_LH',
    'Jasmine_OR',
    'UCell',
]

In [None]:
# Mean of every numeric column per (scoring method, noise level).
# numeric_only=True keeps the aggregation working when the table also carries
# text columns, which pandas >= 2.0 no longer drops automatically.
tmp = (
    all_performances
    .groupby(['scoring_method', 'added_non_relevant_genes'])
    .mean(numeric_only=True)
    .reset_index()
)

In [None]:
# Restricted to the rows whose mean AUC is <= 0.9, report per scoring method
# the smallest noise level and the lowest mean AUC.
below_threshold = tmp[tmp['AUC'] <= 0.9].groupby('scoring_method')
below_threshold['added_non_relevant_genes'].min(), below_threshold['AUC'].min()

In [None]:
# Plot the AUCROC against the amount of added noise, one line per scoring method.
with plt.rc_context({'figure.figsize': (16, 8)}):
    # Vertical guide lines.
    # NOTE(review): the meaning of the positions 3, 13 and 24 is not stated here.
    for x_guide in (3, 13, 24):
        plt.axvline(x_guide, ls='--', alpha=0.4, c='grey')

    # Horizontal reference levels for the AUCROC, listed in the legend.
    for auc_level, line_colour in ((0.99, 'r'), (0.95, 'orange'), (0.9, 'g')):
        plt.axhline(auc_level, c=line_colour, ls=':', alpha=0.7, label=f'AUCROC {auc_level:.2f}')

    # One seaborn line per display label, drawn in the order given by sc_names.
    for method_label in sc_names:
        method_rows = all_performances[all_performances['scoring_method'] == method_label]
        sns.lineplot(data=method_rows, x='added_non_relevant_genes', y='AUC', label=method_label)

    label_size = 16
    plt.legend(fontsize=label_size)
    plt.title('Robustness of scoring methods to signatures containing non-informative genes', fontsize=18)
    plt.xticks(np.arange(0, 41), fontsize=label_size)
    plt.xlabel('Factor of non-informative genes added to base signature', fontsize=label_size)
    plt.ylabel('AUCROC', fontsize=label_size)
    plt.yticks(fontsize=label_size)

    # Write the figure next to the experiment results before showing it.
    if save:
        figure_stem = os.path.join(storing_path, f'evaluation_{method}')
        plt.savefig(f'{figure_stem}.svg', format='svg')
        plt.savefig(f'{figure_stem}.png', format='png', dpi=300)
    plt.show()