In [1]:
import pandas as pd
from datetime import datetime
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import os
from itertools import cycle
import numpy as np
from sklearn.metrics import (
    roc_curve,
    precision_recall_curve,
    auc,
    average_precision_score,
    accuracy_score,
    f1_score,
    matthews_corrcoef)
from scipy import interpolate

In [2]:
baseline_results_path = '/projects/0/einf2380/data/results/best_models_metrics.csv'
baseline_df = pd.read_csv(baseline_results_path)
models = {
    'SHUFF_Cnn_SumFeat_ChannExpand': 'CNN_shuffle_class',
    'PEPT_Cnn_SumFeat_ChannExpand': 'CNN_peptide_class',
    'ALLELE_Cnn_SumFeat_ChannExpand': 'CNN_allele_class',
    'SHUFF_Group_Reg4': 'CNN_shuffle_reg',
    'PEPT_Group_Reg4': 'CNN_peptide_reg',
    'ALLELE_Group_Reg4': 'CNN_allele_reg',
    'mlp_classification_blosum_with_allele_encoder_500_neurons_50_epochs_shuffled_64_batch_size.pt': 'MLP_shuffle_class',
    'mlp_classification_blosum_with_allele_encoder_500_neurons_50_epochs_LOGO_anch_rep_64_batch_size.pt': 'MLP_peptide_class',
    'mlp_classification_blosum_with_allele_encoder_500_neurons_50_epochs_pseudoseq_cluster_64_batch_size.pt': 'MLP_allele_class',
    'mhcflurry_held_out_trained': 'MHCFlurry_shuffle',
    'mhcflurry_peptide_clustered_trained': 'MHCFlurry_peptide',
    'mhcflurry_allele_clustered_trained': 'MHCFlurry_allele',
}
baseline_df.rename(columns={"Model":"model"}, inplace=True)
baseline_df.replace(models, inplace=True)
baseline_df.head()

Unnamed: 0,model,acc,auc,f1,mcc,tnr,tpr
0,CNN_shuffle_class,0.787383,0.855928,0.751575,0.566675,0.831581,0.73111
1,CNN_peptide_class,0.717805,0.788124,0.70636,0.43834,0.695017,0.745038
2,CNN_allele_class,0.649774,0.697055,0.431115,0.245109,0.878164,0.322732
3,CNN_shuffle_reg,0.780295,0.871268,0.712325,0.557802,0.907503,0.618334
4,CNN_peptide_reg,0.713439,0.807924,0.624438,0.42716,0.872852,0.52293


In [3]:
######## Modify here
exp_path = './experiments/'
# exps we want to compare with sequence-based model baseline and best CNN
exp_ids = ['exp_100k_std_gpu_nw16_1']
new_exp_ids = ['GNN_shuffle_class']
comparison_id = 'standardization'
########
exp_log = pd.read_excel(exp_path + '_experiments_log.xlsx', index_col='exp_id')
exp_log.head()

Unnamed: 0_level_0,exp_fullname,exp_path,start_time,end_time,input_data_path,protein_class,target_data,resolution,task,node_features,...,training_accuracy,validation_accuracy,testing_accuracy,training_precision,validation_precision,testing_precision,training_recall,validation_recall,testing_recall,test_clusters
exp_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
exp_100k_pssm_rm_std_classw_bs64_bn1_0,exp_100k_pssm_rm_std_classw_bs64_bn1_0_230330,./experiments/exp_100k_pssm_rm_std_classw_bs64...,30/Mar/2023_16:05:51,30/Mar/2023_18:56:57,['/projects/0/einf2380/data/pMHCI/features_out...,I,BA,residue,classif,all,...,0.711,0.68,0.774,0.61,0.584,0.729,0.956,0.951,0.773,
exp_692_pssm_rm_std_classw_bs64_bn1_1,exp_692_pssm_rm_std_classw_bs64_bn1_1_230330,./experiments/exp_692_pssm_rm_std_classw_bs64_...,30/Mar/2023_15:09:06,30/Mar/2023_15:15:47,['/home/ccrocion/snellius_data_sample/data/pMH...,I,BA,residue,classif,all,...,0.483,0.456,0.557,0.47,0.456,1.0,0.956,1.0,0.031,
exp_100k_pssm_rm_allele_C_std_classw_gpu_nw16_0,exp_100k_pssm_rm_allele_C_std_classw_gpu_nw16_...,./experiments/exp_100k_pssm_rm_allele_C_std_cl...,27/Mar/2023_00:34:34,27/Mar/2023_04:04:16,['/projects/0/einf2380/data/pMHCI/features_out...,I,BA,residue,classif,all,...,0.466,0.466,0.704,0.45,0.45,0.717,0.999,0.999,0.968,['C']
exp_100k_pssm_rm_clustering_std_classw_gpu_nw16_0,exp_100k_pssm_rm_clustering_std_classw_gpu_nw1...,./experiments/exp_100k_pssm_rm_clustering_std_...,22/Mar/2023_10:31:56,22/Mar/2023_16:25:01,['/projects/0/einf2380/data/pMHCI/features_out...,I,BA,residue,classif,all,...,0.8,0.776,0.775,0.741,0.714,0.729,0.84,0.82,0.766,[3]
exp_100k_pssm_rm_std_classw_gpu_nw16_0,exp_100k_pssm_rm_std_classw_gpu_nw16_0_230321,./experiments/exp_100k_pssm_rm_std_classw_gpu_...,21/Mar/2023_15:01:26,21/Mar/2023_22:58:50,['/projects/0/einf2380/data/pMHCI/features_out...,I,BA,residue,classif,all,...,0.783,0.764,0.774,0.711,0.69,0.729,0.853,0.845,0.773,


In [4]:
######## Definitions used in the plotting
comparisons_path = os.path.join(exp_path, 'comparisons', 'baseline')
comparison_path = os.path.join(comparisons_path, comparison_id)

if not os.path.exists(comparisons_path):
    os.makedirs(comparisons_path)

if not os.path.exists(comparison_path):
    os.makedirs(comparison_path)
else:
    print(f'Folder comparisons/{comparison_id}/ already exists! \
          \nChange comparison_id if you want to save plots for a different comparison.')

def get_single_exp_df(exp_id, exp_log, exp_path):
    exp_fullname = exp_log.loc[exp_id].exp_fullname
    exp_path = os.path.join(exp_path, exp_fullname)
    output_path = os.path.join(exp_path, 'output')
    output_train = pd.read_hdf(os.path.join(output_path, 'output_exporter.hdf5'), key='training')
    output_test = pd.read_hdf(os.path.join(output_path, 'output_exporter.hdf5'), key='testing')
    df = pd.concat([output_train, output_test])
    df.sort_values(by=['epoch'], inplace = True)
    return df

Folder comparisons/standardization/ already exists!           
Change comparison_id if you want to save plots for a different comparison.


In [5]:
for idx, exp_id in enumerate(exp_ids):
    exp_dict = {}
    exp_dict["model"] = new_exp_ids[idx]
    df = get_single_exp_df(exp_id, exp_log, exp_path)
    df_plot = df[(df.epoch == 0) & (df.phase == 'testing')]

    y_true = df_plot.target
    y_score = np.array(df_plot.output.values.tolist())[:, 1]

    thrs = np.linspace(0,1,100)
    accuracy = []
    f1 = []
    mcc = []
    for thr in thrs:
        y_pred = (y_score > thr)*1
        accuracy.append(accuracy_score(y_true, y_pred))
        f1.append(f1_score(y_true, y_pred))
        mcc.append(matthews_corrcoef(y_true, y_pred))

    thr_df = pd.DataFrame({
        'thr': thrs,
        'accuracy': accuracy,
        'f1': f1,
        'mcc': mcc})
    # maximize mcc
    mcc_idxmax = thr_df.mcc.idxmax()
    sel_thr = thr_df.loc[mcc_idxmax].thr
    exp_dict["model"] = exp_dict["model"] + f"_{sel_thr:.2f}"
    exp_dict["mcc"] = thr_df.loc[mcc_idxmax].mcc
    exp_dict["acc"] = thr_df.loc[mcc_idxmax].accuracy
    exp_dict["f1"] = thr_df.loc[mcc_idxmax].f1

    fpr_roc, tpr_roc, thr_roc = roc_curve(y_true, y_score)
    auc = auc(fpr_roc, tpr_roc)
    exp_dict["auc"] = auc
    tpr_intrp = interpolate.interp1d(thr_roc, tpr_roc)
    exp_dict["tpr"] = float(tpr_intrp(sel_thr)) # recall

    baseline_df = baseline_df.append(exp_dict, ignore_index=True)

  baseline_df = baseline_df.append(exp_dict, ignore_index=True)


In [6]:
baseline_df

Unnamed: 0,model,acc,auc,f1,mcc,tnr,tpr
0,CNN_shuffle_class,0.787383,0.855928,0.751575,0.566675,0.831581,0.73111
1,CNN_peptide_class,0.717805,0.788124,0.70636,0.43834,0.695017,0.745038
2,CNN_allele_class,0.649774,0.697055,0.431115,0.245109,0.878164,0.322732
3,CNN_shuffle_reg,0.780295,0.871268,0.712325,0.557802,0.907503,0.618334
4,CNN_peptide_reg,0.713439,0.807924,0.624438,0.42716,0.872852,0.52293
5,CNN_allele_reg,0.639004,0.725724,0.332978,0.221437,0.932223,0.21913
6,MLP_shuffle_class,0.79577,0.79577,0.892664,0.631333,0.818141,0.793863
7,MLP_peptide_class,0.745867,0.745867,0.855406,0.553975,0.784439,0.747912
8,MLP_allele_class,0.195628,0.195628,0.459883,0.042437,0.567818,0.255109
9,MHCFlurry_shuffle,0.757804,0.713289,0.735547,0.471687,0.736301,0.723246


In [37]:
######## Compare
cl_type = 'shuffle'
metrics = ['auc', 'f1', 'mcc', 'tpr', 'acc']
########
models = [
    f'CNN_{cl_type}_class', 
    f'CNN_{cl_type}_reg',
    f'MLP_{cl_type}_class',
    f'MHCFlurry_{cl_type}',
    f'GNN_{cl_type}_class_0.43']
fig = go.Figure()

for model in models:
    idx = baseline_df.index[baseline_df['model'] == model].tolist()[0]
    fig.add_trace(go.Bar(
        x = metrics,
        y = [baseline_df.loc[idx][metric] for metric in metrics],
        name = ''.join(model.split(f"_{cl_type}")),
        legendgroup = model
    ))

fig.update_yaxes(title_text="Value")
fig.update_layout(
    barmode='group',
    title=f'Experiments type: {cl_type}',
    title_x=0.5,
    width=1100, height=600)
fig.show()
# fig.write_html(os.path.join(comparison_path, 'timings.html'))