In [1]:
import pandas as pd
from datetime import datetime
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import os
from itertools import cycle
import numpy as np
from sklearn.metrics import (
    roc_curve,
    precision_recall_curve,
    auc,
    average_precision_score,
    accuracy_score,
    f1_score,
    matthews_corrcoef)
from scipy import interpolate

In [2]:
baseline_results_path = '/projects/0/einf2380/data/results/best_models_metrics.csv'
baseline_df = pd.read_csv(baseline_results_path)
models = {
    'SHUFF_CNN': 'CNN_shuffle_class',
    'PEPT_CNN': 'CNN_peptide_class',
    'ALLELE_CNN': 'CNN_allele_class',
    'SHUFF_Group_Reg4': 'CNN_shuffle_reg',
    'PEPT_Group_Reg4': 'CNN_peptide_reg',
    'ALLELE_Group_Reg4': 'CNN_allele_reg',
    'mlp_classification_blosum_with_allele_encoder_500_neurons_50_epochs_shuffled_64_batch_size.pt': 'MLP_shuffle_class',
    'mlp_classification_blosum_with_allele_encoder_500_neurons_50_epochs_LOGO_anch_rep_64_batch_size.pt': 'MLP_peptide_class',
    'mlp_classification_blosum_with_allele_encoder_500_neurons_50_epochs_pseudoseq_cluster_64_batch_size.pt': 'MLP_allele_class',
    'mhcflurry_held_out_trained': 'MHCFlurry_shuffle',
    'mhcflurry_peptide_clustered_trained': 'MHCFlurry_peptide',
    'mhcflurry_allele_clustered_trained': 'MHCFlurry_allele',
}
baseline_df.rename(columns={"Model":"model"}, inplace=True)
baseline_df.replace(models, inplace=True)
baseline_df.head()

Unnamed: 0,model,acc,auc,f1,mcc,tnr,tpr
0,SHUFF_Cnn_SumFeat_ChannExpand,0.787383,0.855928,0.751575,0.566675,0.831581,0.73111
1,PEPT_Cnn_SumFeat_ChannExpand,0.717805,0.788124,0.70636,0.43834,0.695017,0.745038
2,ALLELE_Cnn_SumFeat_ChannExpand,0.649774,0.697055,0.431115,0.245109,0.878164,0.322732
3,SHUFF_Group_Class4,0.777401,0.848964,0.740999,0.546493,0.819462,0.723848
4,PEPT_Group_Class4,0.710789,0.750552,0.692116,0.42058,0.708477,0.713552


In [3]:
baseline_df

Unnamed: 0,model,acc,auc,f1,mcc,tnr,tpr
0,SHUFF_Cnn_SumFeat_ChannExpand,0.787383,0.855928,0.751575,0.566675,0.831581,0.73111
1,PEPT_Cnn_SumFeat_ChannExpand,0.717805,0.788124,0.70636,0.43834,0.695017,0.745038
2,ALLELE_Cnn_SumFeat_ChannExpand,0.649774,0.697055,0.431115,0.245109,0.878164,0.322732
3,SHUFF_Group_Class4,0.777401,0.848964,0.740999,0.546493,0.819462,0.723848
4,PEPT_Group_Class4,0.710789,0.750552,0.692116,0.42058,0.708477,0.713552
5,ALLELE_Group_Class4,0.646601,0.674479,0.49623,0.244818,0.802548,0.423293
6,CNN_shuffle_class,0.780295,0.871268,0.712325,0.557802,0.907503,0.618334
7,CNN_peptide_class,0.713439,0.807924,0.624438,0.42716,0.872852,0.52293
8,CNN_allele_class,0.639004,0.725724,0.332978,0.221437,0.932223,0.21913
9,MLP_shuffle_class,0.818141,0.892664,0.793863,0.631333,0.79577,0.79577


In [4]:
######## Modify here
project_folder = '/projects/0/einf2380'
protein_class = 'I'
exp_path = f'{project_folder}/data/pMHC{protein_class}/trained_models/deeprankcore/experiments/'
# exps we want to compare with sequence-based model baseline and best CNN
exp_ids = [
    'exp_100k_std_transf_bs64_naivegnn_0',
    'exp_100k_std_transf_bs64_naivegnn_cl_peptide2_0',
    'exp_100k_std_transf_bs64_naivegnn_cl_allele_0']
new_exp_ids = [
    'GNN_shuffle_class',
    'GNN_peptide_class',
    'GNN_allele_class']
comparison_id = 'std_transf_bs64_naivegnn'
########
exp_log = pd.read_excel(exp_path + '_experiments_log.xlsx', index_col='exp_id')
exp_log.head()

Unnamed: 0_level_0,exp_fullname,exp_path,start_time,end_time,input_data_path,protein_class,target_data,resolution,task,node_features,...,training_accuracy,validation_accuracy,testing_accuracy,training_precision,validation_precision,testing_precision,training_recall,validation_recall,testing_recall,features
exp_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
exp_100k_std_transf_bs64_naivegnn1_wloss_wdecay_cl_allele_0,exp_100k_std_transf_bs64_naivegnn1_wloss_wdeca...,/projects/0/einf2380/data/pMHCI/trained_models...,08/Jun/2023_16:52:15,09/Jun/2023_07:26:14,['/projects/0/einf2380/data/pMHCI/features_out...,I,BA,residue,classif,all,...,0.714,0.704,0.641,0.619,0.609,0.55,0.924,0.931,0.703,
exp_100k_std_transf_bs64_naivegnn1_wloss_wdecay_cl_peptide2_0,exp_100k_std_transf_bs64_naivegnn1_wloss_wdeca...,/projects/0/einf2380/data/pMHCI/trained_models...,08/Jun/2023_16:47:56,09/Jun/2023_06:17:44,['/projects/0/einf2380/data/pMHCI/features_out...,I,BA,residue,classif,all,...,0.689,0.694,0.754,0.603,0.61,0.628,0.947,0.938,0.708,
exp_100k_std_transf_bs64_naivegnn1_wloss_wdecay_0,exp_100k_std_transf_bs64_naivegnn1_wloss_wdeca...,/projects/0/einf2380/data/pMHCI/trained_models...,08/Jun/2023_14:51:43,09/Jun/2023_03:22:05,['/projects/0/einf2380/data/pMHCI/features_out...,I,BA,residue,classif,all,...,0.764,0.745,0.761,0.693,0.664,0.721,0.833,0.854,0.746,
exp_100k_std_transf_bs64_naivegnn1_wloss_cl_allele_0,exp_100k_std_transf_bs64_naivegnn1_wloss_cl_al...,/projects/0/einf2380/data/pMHCI/trained_models...,07/Jun/2023_09:54:50,07/Jun/2023_17:27:29,['/projects/0/einf2380/data/pMHCI/features_out...,I,BA,residue,classif,all,...,0.758,0.724,0.661,0.663,0.628,0.58,0.924,0.928,0.64,
exp_100k_std_transf_bs64_naivegnn1_wloss_0,exp_100k_std_transf_bs64_naivegnn1_wloss_0_230607,/projects/0/einf2380/data/pMHCI/trained_models...,07/Jun/2023_09:46:52,07/Jun/2023_17:04:29,['/projects/0/einf2380/data/pMHCI/features_out...,I,BA,residue,classif,all,...,0.81,0.779,0.781,0.757,0.729,0.733,0.838,0.791,0.79,


In [5]:
######## Definitions used in the plotting
comparisons_path = os.path.join(exp_path, 'comparisons', 'baseline')
comparison_path = os.path.join(comparisons_path, comparison_id)

if not os.path.exists(comparisons_path):
    os.makedirs(comparisons_path)

if not os.path.exists(comparison_path):
    os.makedirs(comparison_path)
else:
    print(f'Folder comparisons/{comparison_id}/ already exists! \
          \nChange comparison_id if you want to save plots for a different comparison.')

def get_single_exp_df(exp_id, exp_log, exp_path):
    exp_fullname = exp_log.loc[exp_id].exp_fullname
    exp_path = os.path.join(exp_path, exp_fullname)
    output_path = os.path.join(exp_path, 'output')
    output_train = pd.read_hdf(os.path.join(output_path, 'output_exporter.hdf5'), key='training')
    output_test = pd.read_hdf(os.path.join(output_path, 'output_exporter.hdf5'), key='testing')
    df = pd.concat([output_train, output_test])
    df.sort_values(by=['epoch'], inplace = True)
    return df

Folder comparisons/std_transf_bs64_naivegnn/ already exists!           
Change comparison_id if you want to save plots for a different comparison.


In [6]:
# # maximize mcc
# for idx, exp_id in enumerate(exp_ids):
#     exp_dict = {}
#     exp_dict["model"] = new_exp_ids[idx]
#     epoch = exp_log.loc[exp_id].saved_epoch
#     df = get_single_exp_df(exp_id, exp_log, exp_path)
#     df_plot = df[(df.epoch == epoch) & (df.phase == 'testing')]

#     y_true = df_plot.target
#     y_score = np.array(df_plot.output.values.tolist())[:, 1]

#     thrs = np.linspace(0,1,100)
#     accuracy = []
#     f1 = []
#     mcc = []
#     for thr in thrs:
#         y_pred = (y_score > thr)*1
#         accuracy.append(accuracy_score(y_true, y_pred))
#         f1.append(f1_score(y_true, y_pred))
#         mcc.append(matthews_corrcoef(y_true, y_pred))

#     thr_df = pd.DataFrame({
#         'thr': thrs,
#         'accuracy': accuracy,
#         'f1': f1,
#         'mcc': mcc})
#     # maximize mcc
#     mcc_idxmax = thr_df.mcc.idxmax()
#     sel_thr = thr_df.loc[mcc_idxmax].thr
#     exp_dict["model"] = exp_dict["model"] + f"_{sel_thr:.2f}"
#     exp_dict["mcc"] = thr_df.loc[mcc_idxmax].mcc
#     exp_dict["acc"] = thr_df.loc[mcc_idxmax].accuracy
#     exp_dict["f1"] = thr_df.loc[mcc_idxmax].f1

#     fpr_roc, tpr_roc, thr_roc = roc_curve(y_true, y_score)
#     exp_dict["auc"] = auc(fpr_roc, tpr_roc)
#     tpr_intrp = interpolate.interp1d(thr_roc, tpr_roc)
#     exp_dict["tpr"] = float(tpr_intrp(sel_thr)) # recall

#     baseline_df = baseline_df.append(exp_dict, ignore_index=True)

In [7]:
for idx, exp_id in enumerate(exp_ids):
    exp_dict = {}
    exp_dict["model"] = new_exp_ids[idx]
    epoch = exp_log.loc[exp_id].saved_epoch
    df = get_single_exp_df(exp_id, exp_log, exp_path)
    df_plot = df[(df.epoch == epoch) & (df.phase == 'testing')]

    y_true = df_plot.target
    y_score = np.array(df_plot.output.values.tolist())[:, 1]

    thr = 0.5
    y_pred = (y_score > thr)*1
    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    mcc = matthews_corrcoef(y_true, y_pred)

    exp_dict["model"] = exp_dict["model"] + f"_{thr:.2f}"
    exp_dict["mcc"] = mcc
    exp_dict["acc"] = accuracy
    exp_dict["f1"] = f1

    fpr_roc, tpr_roc, thr_roc = roc_curve(y_true, y_score)
    exp_dict["auc"] = auc(fpr_roc, tpr_roc)
    tpr_intrp = interpolate.interp1d(thr_roc, tpr_roc)
    exp_dict["tpr"] = float(tpr_intrp(thr)) # recall

    baseline_df = baseline_df.append(exp_dict, ignore_index=True)

  baseline_df = baseline_df.append(exp_dict, ignore_index=True)
  baseline_df = baseline_df.append(exp_dict, ignore_index=True)
  baseline_df = baseline_df.append(exp_dict, ignore_index=True)


In [8]:
baseline_df

Unnamed: 0,model,acc,auc,f1,mcc,tnr,tpr
0,SHUFF_Cnn_SumFeat_ChannExpand,0.787383,0.855928,0.751575,0.566675,0.831581,0.73111
1,PEPT_Cnn_SumFeat_ChannExpand,0.717805,0.788124,0.70636,0.43834,0.695017,0.745038
2,ALLELE_Cnn_SumFeat_ChannExpand,0.649774,0.697055,0.431115,0.245109,0.878164,0.322732
3,SHUFF_Group_Class4,0.777401,0.848964,0.740999,0.546493,0.819462,0.723848
4,PEPT_Group_Class4,0.710789,0.750552,0.692116,0.42058,0.708477,0.713552
5,ALLELE_Group_Class4,0.646601,0.674479,0.49623,0.244818,0.802548,0.423293
6,CNN_shuffle_class,0.780295,0.871268,0.712325,0.557802,0.907503,0.618334
7,CNN_peptide_class,0.713439,0.807924,0.624438,0.42716,0.872852,0.52293
8,CNN_allele_class,0.639004,0.725724,0.332978,0.221437,0.932223,0.21913
9,MLP_shuffle_class,0.818141,0.892664,0.793863,0.631333,0.79577,0.79577


In [11]:
######## Compare
cl_type = 'allele'
metrics = ['auc', 'f1', 'mcc', 'tpr', 'acc']
########
models = [
    f'CNN_{cl_type}_class', 
    # f'CNN_{cl_type}_reg',
    f'MLP_{cl_type}_class',
    f'MHCFlurry_{cl_type}',
    f'GNN_{cl_type}_class_0.43']
fig = go.Figure()

for model in models:
    idx = baseline_df.index[baseline_df['model'] == model].tolist()[0]
    fig.add_trace(go.Bar(
        x = metrics,
        y = [baseline_df.loc[idx][metric] for metric in metrics],
        name = ''.join(model.split(f"_{cl_type}")),
        legendgroup = model
    ))

fig.update_yaxes(title_text="Value")
fig.update_layout(
    barmode='group',
    title=f'Experiments type: {cl_type}',
    title_x=0.5,
    width=1100, height=600)
fig.write_html(os.path.join(comparison_path, f'{cl_type}.html'))

In [10]:
#### Poster
x_axis = ['Shuffled', 'Peptide-clustered', 'Allele-clustered']

models = {
    '3D-CNN': [0.871268, 0, 0.725724],
    '3D-GNN': [0.8586, 0.842082, 0.674662],
    'Re-trained MHCFlurry2.0': [0.735547, 0, 0.606256],
    'Seq-based NN': [0.892664, 0, 0.459883]}
fig = go.Figure()

for key, value in models.items():
    fig.add_trace(go.Bar(
        x = x_axis,
        y = value,
        name = key,
        legendgroup = key,
        # text = key
    ))

fig.update_yaxes(title_text="AUC", tickfont_size=15)
fig.update_xaxes(title_text="Dataset", tickfont_size=15)
fig.update_layout(
    barmode='group',
    title_x=0.5,
    width=900, height=500,
    showlegend = True,
    font=dict(
        size=16,
        color="#421A48"
    )
    )
# fig.write_html(os.path.join(comparison_path, f'{cl_type}.html'))
fig.write_image("plot1.svg")
fig.show()