In [None]:
import os
import sys
import pandas as pd 
import glob
import seaborn as sns
import matplotlib.pyplot as plt

sys.path.append('../..')
from data.constants import BASE_PATH_EXPERIMENTS

# Global figure style: Type 42 fonts in PDF exports, Arial at 14 pt.
plt.rcParams.update({
    'pdf.fonttype': 42,
    'font.family': 'sans-serif',
    'font.sans-serif': 'Arial',
    'font.size': 14,
})


In [None]:
plt.rcParams.update({'pdf.fonttype':42, 'font.family':'sans-serif', 'font.sans-serif':'Arial', 'font.size':14})

# Root folder of the noise-addition experiment results.
# The sub-folder must be a *relative* component: os.path.join discards every
# earlier component when a later one is absolute, so a leading "/" silently
# dropped BASE_PATH_EXPERIMENTS and pointed at the filesystem root instead.
base_path = os.path.join(BASE_PATH_EXPERIMENTS, "signature_noise_addition_experiments/")

# Display names of the scoring methods, in the order used for the plot legend.
sc_names = [
    'ANS',
    'Seurat',
    'Seurat_AG',
    'Seurat_LVG',
    'Scanpy',
    'Jasmine_LH',
    'Jasmine_OR',
    'UCell',
]

# Thresholds that are encoded in the result file names.
# Set either one to None to select the files produced without that filter.
max_abs_log2fc = 0.5
min_pval = 0.01

# File-name suffix keyed by (log2fc filter disabled?, p-value filter disabled?).
_suffix_by_disabled_filters = {
    (True, True): 'noise_genes_all_remaining',
    (True, False): f'noise_genes_min_pval_{min_pval}',
    (False, True): f'noise_genes_max_abs_log2fc_{max_abs_log2fc}',
    (False, False): f'noise_genes_max_abs_log2fc_{max_abs_log2fc}_min_pval_{min_pval}',
}
suffix = _suffix_by_disabled_filters[(max_abs_log2fc is None, min_pval is None)]

# Result file name -> scoring method display name.
name_map = {
    f'AUCROC_20sims_adjusted_neighborhood_scoring_{suffix}.csv': 'ANS',
    f'AUCROC_20sims_jasmine_scoring_lh_{suffix}.csv': 'Jasmine_LH',
    f'AUCROC_20sims_jasmine_scoring_or_{suffix}.csv': 'Jasmine_OR',
    f'AUCROC_20sims_scanpy_scoring_{suffix}.csv': 'Scanpy',
    f'AUCROC_20sims_seurat_ag_scoring_{suffix}.csv': 'Seurat_AG',
    f'AUCROC_20sims_seurat_lvg_scoring_{suffix}.csv': 'Seurat_LVG',
    f'AUCROC_20sims_seurat_scoring_{suffix}.csv': 'Seurat',
    f'AUCROC_20sims_ucell_scoring_{suffix}.csv': 'UCell',
}

def _melt_metric(df, metric):
    """Reshape the per-simulation columns of one metric into long format.

    Selects ``purity`` plus every column whose name contains ``metric`` and
    melts them into the columns ``purity``, ``simulation`` and ``metric``.
    Purity is rescaled to a percentage of the largest purity in the table.

    Assumes metric columns are named ``<metric>_<simulation index>[...]``:
    the second underscore-separated token must parse as an int.
    """
    metric_cols = [col for col in df.columns if metric in col]
    long_df = df[['purity'] + metric_cols].melt(
        id_vars=['purity'],
        var_name='simulation',
        value_name=metric,
    )
    long_df['simulation'] = long_df['simulation'].apply(lambda col: int(col.split('_')[1]))
    long_df['purity'] = (long_df['purity'] / long_df['purity'].max()) * 100
    return long_df


def load_results(list_fns, method_names=None):
    """Load AUC result CSVs and return long-format AUCROC and AUCPR tables.

    Parameters
    ----------
    list_fns : iterable of str
        Paths of result CSV files. Each file needs a ``purity`` column and
        ``AUCROC_<sim>`` / ``AUCPR_<sim>`` columns.
    method_names : dict, optional
        Maps a file's base name to the scoring-method label. Defaults to the
        notebook-level ``name_map``.

    Returns
    -------
    (DataFrame, DataFrame)
        AUCROC and AUCPR tables with the columns ``purity`` (in percent),
        ``simulation``, the metric value and ``scoring_method``.

    Raises
    ------
    ValueError
        If ``list_fns`` is empty (e.g. the glob pattern matched nothing).
    KeyError
        If a file's base name is missing from the name mapping.
    """
    mapping = name_map if method_names is None else method_names

    dfs_roc, dfs_pr = [], []
    for fn in list_fns:
        df = pd.read_csv(fn)
        method = mapping[os.path.basename(fn)]

        # Each metric is normalised from its own table; the previous version
        # rescaled the AUCPR purity from the AUCROC frame by mistake.
        for metric, collected in (('AUCROC', dfs_roc), ('AUCPR', dfs_pr)):
            long_df = _melt_metric(df, metric)
            long_df['scoring_method'] = method
            collected.append(long_df)

    if not dfs_roc:
        raise ValueError('No result files to load: list_fns is empty.')

    return (
        pd.concat(dfs_roc, axis=0).reset_index(drop=True),
        pd.concat(dfs_pr, axis=0).reset_index(drop=True),
    )


def draw_lines(df, auc_col='AUCROC', dataset='crc', figsize=(12,5), auc_threshold=0.9):
    """Plot AUC against the percentage of noise genes, one line per scoring method.

    Parameters
    ----------
    df : DataFrame
        Long-format results with the columns ``scoring_method``, ``auc_col``
        and either ``pct_noise`` or ``purity`` (in percent).
    auc_col : str
        Name of the metric column plotted on the y-axis.
    dataset : str
        Dataset name, shown upper-cased in the title.
    figsize : tuple
        Figure size passed to matplotlib.
    auc_threshold : float
        AUC level marked with a horizontal line and used for the noise cutoff.

    Returns
    -------
    matplotlib.figure.Figure
        The figure holding the plot.
    """
    if 'pct_noise' not in df.columns:
        # Frames straight from load_results only carry 'purity'; derive the
        # x-axis without mutating the caller's frame.
        df = df.assign(pct_noise=100 - df['purity'])

    fig, ax = plt.subplots(figsize=figsize)
    sns.lineplot(
        data=df,
        x='pct_noise',
        y=auc_col,
        hue='scoring_method',
        hue_order=sc_names,
        legend=True,
        ax=ax,
    )
    ax.set_title(f'Signal-to-noise ratio on {dataset.upper()}')
    ax.set_xlabel('Noise in signature (%)')
    ax.axhline(auc_threshold, label=f'{auc_threshold} AUC', ls=':', c='grey', zorder=1)

    # Per method: highest noise level whose mean AUC is still above the
    # threshold; the vertical marker is the median of these over methods.
    mean_auc = df.groupby(['scoring_method', 'pct_noise'])[auc_col].mean().reset_index()
    above = mean_auc[mean_auc[auc_col] > auc_threshold]
    median_tolerated = above.groupby('scoring_method')['pct_noise'].max().median()

    # No method above the threshold gives NaN; round(nan) would raise.
    if pd.notna(median_tolerated):
        thresh = round(median_tolerated)
        ax.axvline(thresh, label=f'{thresh}% noise', ls=':', c='grey', zorder=1)

    ax.legend()
    return fig

## CRC

In [None]:
# Dataset analysed in this section; figures are saved into its result folder.
dataset = 'crc'
storing_path = os.path.join(base_path, dataset)

#### Signature length 100

In [None]:
# Signature length selects the sub-folder holding the AUC tables.
sig_length = 100
sig_dir = f'sig_len_{sig_length}'
auc_paths = os.path.join(base_path, dataset, 'AUCROCS', sig_dir)

print('Storing_path: ', storing_path)
print('auc_paths:', auc_paths)

In [None]:
# Collect every result CSV produced with the current noise-gene thresholds.
csv_pattern = os.path.join(auc_paths, f'*_{suffix}.csv')
AUC_fns = glob.glob(csv_pattern)
AUC_fns

In [None]:
# Parse all matched CSVs into long-format AUCROC and AUCPR tables.
df_aucroc, df_aucpr = load_results(AUC_fns)

In [None]:
# Noise percentage is the complement of signature purity (both in percent).
for _frame in (df_aucroc, df_aucpr):
    _frame['pct_noise'] = 100 - _frame.purity

In [None]:
plt.rcParams.update({
    'pdf.fonttype': 42,
    'font.family': 'sans-serif',
    'font.sans-serif': 'Arial',
    'font.size': 14,
})

# AUCROC curve for this dataset / signature length, exported as SVG.
roc_svg = os.path.join(storing_path, f'signal2noise_AUCROC_sig_len_{sig_length}_{suffix}.svg')
fig = draw_lines(df_aucroc, auc_col='AUCROC', dataset=dataset, figsize=(12,5))
fig.savefig(roc_svg)
plt.show()

# Same figure for the precision-recall AUC.
pr_svg = os.path.join(storing_path, f'signal2noise_AUCPR_sig_len_{sig_length}_{suffix}.svg')
fig = draw_lines(df_aucpr, auc_col='AUCPR', dataset=dataset, figsize=(12,5))
fig.savefig(pr_svg)
plt.show()

#### Signature length 650

In [None]:
# Signature length selects the sub-folder holding the AUC tables.
sig_length = 650
sig_dir = f'sig_len_{sig_length}'
auc_paths = os.path.join(base_path, dataset, 'AUCROCS', sig_dir)

print('Storing_path: ', storing_path)

In [None]:
# Collect every result CSV produced with the current noise-gene thresholds.
csv_pattern = os.path.join(auc_paths, f'*_{suffix}.csv')
AUC_fns = glob.glob(csv_pattern)
AUC_fns

In [None]:
df_aucroc, df_aucpr = load_results(AUC_fns)

# The freshly loaded frames only carry 'purity'; the plots below use
# 'pct_noise' on the x-axis, so it has to be recomputed after every reload.
df_aucroc['pct_noise'] = 100 - df_aucroc.purity
df_aucpr['pct_noise'] = 100 - df_aucpr.purity

In [None]:
# AUCROC curve for this dataset / signature length, exported as SVG.
roc_svg = os.path.join(storing_path, f'signal2noise_AUCROC_sig_len_{sig_length}_{suffix}.svg')
fig = draw_lines(df_aucroc, auc_col='AUCROC', dataset=dataset, figsize=(12,5))
fig.savefig(roc_svg)
plt.show()

# Same figure for the precision-recall AUC.
pr_svg = os.path.join(storing_path, f'signal2noise_AUCPR_sig_len_{sig_length}_{suffix}.svg')
fig = draw_lines(df_aucpr, auc_col='AUCPR', dataset=dataset, figsize=(12,5))
fig.savefig(pr_svg)
plt.show()

## ESCC

In [None]:
# Dataset analysed in this section; figures are saved into its result folder.
dataset = 'escc'
storing_path = os.path.join(base_path, dataset)

#### Signature length 100

In [None]:
# Signature length selects the sub-folder holding the AUC tables.
sig_length = 100
sig_dir = f'sig_len_{sig_length}'
auc_paths = os.path.join(base_path, dataset, 'AUCROCS', sig_dir)

print('Storing_path: ', storing_path)

In [None]:
# Find the result CSVs for the current thresholds and parse them into
# long-format AUCROC / AUCPR tables.
csv_pattern = os.path.join(auc_paths, f'*_{suffix}.csv')
AUC_fns = glob.glob(csv_pattern)
df_aucroc, df_aucpr = load_results(AUC_fns)

In [None]:
# Noise percentage is the complement of signature purity (both in percent).
for _frame in (df_aucroc, df_aucpr):
    _frame['pct_noise'] = 100 - _frame.purity

In [None]:
# AUCROC curve for this dataset / signature length, exported as SVG.
roc_svg = os.path.join(storing_path, f'signal2noise_AUCROC_sig_len_{sig_length}_{suffix}.svg')
fig = draw_lines(df_aucroc, auc_col='AUCROC', dataset=dataset, figsize=(12,5))
fig.savefig(roc_svg)
plt.show()

# Same figure for the precision-recall AUC.
pr_svg = os.path.join(storing_path, f'signal2noise_AUCPR_sig_len_{sig_length}_{suffix}.svg')
fig = draw_lines(df_aucpr, auc_col='AUCPR', dataset=dataset, figsize=(12,5))
fig.savefig(pr_svg)
plt.show()

#### Signature length 400

In [None]:
# Signature length selects the sub-folder holding the AUC tables.
sig_length = 400
sig_dir = f'sig_len_{sig_length}'
auc_paths = os.path.join(base_path, dataset, 'AUCROCS', sig_dir)

print('Storing_path: ', storing_path)

In [None]:
AUC_fns = glob.glob(os.path.join(auc_paths, f'*_{suffix}.csv'))
df_aucroc, df_aucpr = load_results(AUC_fns)

# The freshly loaded frames only carry 'purity'; the plots below use
# 'pct_noise' on the x-axis, so it has to be recomputed after every reload.
df_aucroc['pct_noise'] = 100 - df_aucroc.purity
df_aucpr['pct_noise'] = 100 - df_aucpr.purity

In [None]:
# AUCROC curve for this dataset / signature length, exported as SVG.
roc_svg = os.path.join(storing_path, f'signal2noise_AUCROC_sig_len_{sig_length}_{suffix}.svg')
fig = draw_lines(df_aucroc, auc_col='AUCROC', dataset=dataset, figsize=(12,5))
fig.savefig(roc_svg)
plt.show()

# Same figure for the precision-recall AUC.
pr_svg = os.path.join(storing_path, f'signal2noise_AUCPR_sig_len_{sig_length}_{suffix}.svg')
fig = draw_lines(df_aucpr, auc_col='AUCPR', dataset=dataset, figsize=(12,5))
fig.savefig(pr_svg)
plt.show()

## LUAD

In [None]:
# Dataset analysed in this section; figures are saved into its result folder.
dataset = 'luad'
storing_path = os.path.join(base_path, dataset)

#### Signature length 100

In [None]:
# Signature length selects the sub-folder holding the AUC tables.
sig_length = 100
sig_dir = f'sig_len_{sig_length}'
auc_paths = os.path.join(base_path, dataset, 'AUCROCS', sig_dir)

print('Storing_path: ', storing_path)

In [None]:
# Find the result CSVs for the current thresholds and parse them into
# long-format AUCROC / AUCPR tables.
csv_pattern = os.path.join(auc_paths, f'*_{suffix}.csv')
AUC_fns = glob.glob(csv_pattern)
df_aucroc, df_aucpr = load_results(AUC_fns)

In [None]:
# Noise percentage is the complement of signature purity (both in percent).
for _frame in (df_aucroc, df_aucpr):
    _frame['pct_noise'] = 100 - _frame.purity

In [None]:
# AUCROC curve for this dataset / signature length, exported as SVG.
roc_svg = os.path.join(storing_path, f'signal2noise_AUCROC_sig_len_{sig_length}_{suffix}.svg')
fig = draw_lines(df_aucroc, auc_col='AUCROC', dataset=dataset, figsize=(12,5))
fig.savefig(roc_svg)
plt.show()

# Same figure for the precision-recall AUC.
pr_svg = os.path.join(storing_path, f'signal2noise_AUCPR_sig_len_{sig_length}_{suffix}.svg')
fig = draw_lines(df_aucpr, auc_col='AUCPR', dataset=dataset, figsize=(12,5))
fig.savefig(pr_svg)
plt.show()

#### Signature length 388

In [None]:
# Signature length selects the sub-folder holding the AUC tables.
sig_length = 388
sig_dir = f'sig_len_{sig_length}'
auc_paths = os.path.join(base_path, dataset, 'AUCROCS', sig_dir)

print('Storing_path: ', storing_path)

In [None]:
AUC_fns = glob.glob(os.path.join(auc_paths, f'*_{suffix}.csv'))
df_aucroc, df_aucpr = load_results(AUC_fns)

# The freshly loaded frames only carry 'purity'; the plots below use
# 'pct_noise' on the x-axis, so it has to be recomputed after every reload.
df_aucroc['pct_noise'] = 100 - df_aucroc.purity
df_aucpr['pct_noise'] = 100 - df_aucpr.purity

In [None]:
# AUCROC curve for this dataset / signature length, exported as SVG.
roc_svg = os.path.join(storing_path, f'signal2noise_AUCROC_sig_len_{sig_length}_{suffix}.svg')
fig = draw_lines(df_aucroc, auc_col='AUCROC', dataset=dataset, figsize=(12,5))
fig.savefig(roc_svg)
plt.show()

# Same figure for the precision-recall AUC.
pr_svg = os.path.join(storing_path, f'signal2noise_AUCPR_sig_len_{sig_length}_{suffix}.svg')
fig = draw_lines(df_aucpr, auc_col='AUCPR', dataset=dataset, figsize=(12,5))
fig.savefig(pr_svg)
plt.show()