In [1]:
import pandas as pd
from datetime import datetime
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import os
from itertools import cycle
import numpy as np
from sklearn.metrics import (
    roc_curve,
    precision_recall_curve,
    auc,
    average_precision_score,
    accuracy_score,
    f1_score,
    matthews_corrcoef)
from scipy import interpolate

In [2]:
# ---------------------------------------------------------------------------
# Notebook configuration: input locations, experiment selection, output folder
# ---------------------------------------------------------------------------
project_folder = '/projects/0/einf2380'
protein_class = 'I'

# deeprank/seq-based models (derived from project_folder so the root is set once)
deeprank_path = os.path.join(project_folder, 'data', 'pop_paper_data')
cnn_path = os.path.join(deeprank_path, 'cnn_outputs')
# NOTE: mlp_path and mhcf_path temporarily point at the CNN outputs.
mlp_path = os.path.join(deeprank_path, 'cnn_outputs')
mhcf_path = os.path.join(deeprank_path, 'cnn_outputs')
# mlp_path = os.path.join(deeprank_path, 'mlp_outputs') # when files are ready, modify these back
# mhcf_path = os.path.join(deeprank_path, 'mhcflurry_outputs')

# deeprankcore
comparison_id = 'paper_plots_best_models'
# exp we want to compare with sequence-based models baseline and best CNN
best_gnn_prefix = 'exp_100k_std_transf_bs64_naivegnn1_wloss_wdecay' # edit removing wdecay when gnn's models are ready
exp_path = f'{project_folder}/data/pMHC{protein_class}/trained_models/deeprankcore/experiments/'
# Experiments log, indexed by experiment id (rows are looked up with exp_log.loc[exp_id]).
exp_log = pd.read_excel(os.path.join(exp_path, '_experiments_log.xlsx'), index_col='exp_id')

######## Definitions used in the plotting
comparisons_path = os.path.join(exp_path, 'comparisons', 'baseline')
comparison_path = os.path.join(comparisons_path, comparison_id)

# Record whether the target folder was already there *before* creating it, then
# create it (and any missing parents) in a single race-free call.
comparison_folder_existed = os.path.isdir(comparison_path)
os.makedirs(comparison_path, exist_ok=True)

if comparison_folder_existed:
    # Adjacent literals instead of a backslash continuation inside the string,
    # so the source indentation no longer leaks into the printed message.
    print(f'Folder comparisons/baseline/{comparison_id}/ already exists!\n'
          'Change comparison_id if you want to save plots for a different comparison.')

Folder comparisons/paper_plots_best_models/ already exists!           
Change comparison_id if you want to save plots for a different comparison.


In [3]:
# compute metrics for deeprank/seq-based models
def metrics_from_csv(df, model_name, phase):
    """Pick an MCC-maximizing threshold on ``phase`` and score the testing set.

    Parameters
    ----------
    df : pandas.DataFrame
        Model outputs. The columns read here are ``PHASE``, ``TARGET`` (labels)
        and ``OUTPUT_1`` (score that gets thresholded; presumably the
        positive-class output -- TODO confirm against the exporter).
    model_name : str
        Label stored under the ``'model'`` key of the result.
    phase : str
        Value of ``PHASE`` whose rows are used to select the threshold.

    Returns
    -------
    dict
        Keys ``'model'``, ``'mcc'``, ``'acc'``, ``'f1'`` and ``'auc'``, each
        mapped to a one-element list, computed on rows with
        ``PHASE == 'testing'``.

    Raises
    ------
    ValueError
        If ``df`` has no rows for ``phase`` or no rows for ``'testing'``.
    """
    df_phase = df[df.PHASE == phase]
    if df_phase.empty:
        raise ValueError(f"No rows with PHASE == {phase!r}; cannot select a threshold.")
    y_true = np.array(df_phase.TARGET.values.tolist())
    y_score = np.array(df_phase.OUTPUT_1.values.tolist())

    # Only MCC drives the threshold choice, so it is the only metric evaluated
    # on the 100-point grid. A sample is predicted positive when score > thr.
    thrs = np.linspace(0, 1, 100)
    mcc = pd.Series([matthews_corrcoef(y_true, (y_score > thr) * 1) for thr in thrs])
    # idxmax returns the first maximum, i.e. the lowest threshold among ties.
    sel_thr = thrs[mcc.idxmax()]

    # use selected threshold on the testing set
    df_test = df[df.PHASE == 'testing']
    if df_test.empty:
        raise ValueError("No rows with PHASE == 'testing'; cannot compute test metrics.")
    y_true = np.array(df_test.TARGET.values.tolist())
    y_score = np.array(df_test.OUTPUT_1.values.tolist())
    y_pred = (y_score > sel_thr) * 1
    # AUC is threshold-free: it is computed from the raw scores.
    fpr_roc, tpr_roc, _ = roc_curve(y_true, y_score)

    return {
        'model': [model_name],
        'mcc': [matthews_corrcoef(y_true, y_pred)],
        'acc': [accuracy_score(y_true, y_pred)],
        'f1': [f1_score(y_true, y_pred)],
        'auc': [auc(fpr_roc, tpr_roc)],
    }

In [4]:
def get_single_exp_df(exp_id, exp_log, exp_path):
    """Load the exported outputs of one deeprankcore experiment.

    The experiment folder is ``exp_path`` joined with the ``exp_fullname``
    value found in ``exp_log`` for ``exp_id``. The ``'training'`` and
    ``'testing'`` keys of ``output/output_exporter.hdf5`` inside that folder
    are read, concatenated in that order, and returned sorted by ``epoch``.
    """
    run_dir = os.path.join(exp_path, exp_log.loc[exp_id].exp_fullname)
    hdf5_file = os.path.join(run_dir, 'output', 'output_exporter.hdf5')
    per_key_frames = [pd.read_hdf(hdf5_file, key=key) for key in ('training', 'testing')]
    return pd.concat(per_key_frames).sort_values(by=['epoch'])

# compute metrics for deeprankcore models
def metrics_from_hdf5(exp_id, model_name):
    """Pick an MCC-maximizing threshold on validation and score the testing set.

    Reads the notebook-level ``exp_log`` and ``exp_path``: the experiment's
    ``saved_epoch`` is taken from ``exp_log`` and its outputs are loaded with
    ``get_single_exp_df``. Only rows of that epoch are used.

    Parameters
    ----------
    exp_id : str
        Index label of the experiment in ``exp_log``.
    model_name : str
        Label stored under the ``'model'`` key of the result.

    Returns
    -------
    dict
        Keys ``'model'``, ``'mcc'``, ``'acc'``, ``'f1'`` and ``'auc'``, each
        mapped to a one-element list, computed on rows with
        ``phase == 'testing'`` at the saved epoch.

    Raises
    ------
    ValueError
        If the saved epoch has no ``'validation'`` rows or no ``'testing'`` rows.
    """
    epoch = exp_log.loc[exp_id].saved_epoch
    df = get_single_exp_df(exp_id, exp_log, exp_path)
    at_saved_epoch = df.epoch == epoch

    df_valid = df[at_saved_epoch & (df.phase == 'validation')]
    if df_valid.empty:
        raise ValueError(
            f"No validation rows at epoch {epoch} for experiment {exp_id!r}; "
            "cannot select a threshold.")
    y_true = df_valid.target
    # Each `output` entry is indexable; column 1 is the score being thresholded
    # (presumably the positive-class output -- TODO confirm).
    y_score = np.array(df_valid.output.values.tolist())[:, 1]

    # Only MCC drives the threshold choice, so it is the only metric evaluated
    # on the 100-point grid. A sample is predicted positive when score > thr.
    thrs = np.linspace(0, 1, 100)
    mcc = pd.Series([matthews_corrcoef(y_true, (y_score > thr) * 1) for thr in thrs])
    # idxmax returns the first maximum, i.e. the lowest threshold among ties.
    sel_thr = thrs[mcc.idxmax()]

    # use selected threshold on the testing set
    df_test = df[at_saved_epoch & (df.phase == 'testing')]
    if df_test.empty:
        raise ValueError(
            f"No testing rows at epoch {epoch} for experiment {exp_id!r}; "
            "cannot compute test metrics.")
    y_true = df_test.target
    y_score = np.array(df_test.output.values.tolist())[:, 1]
    y_pred = (y_score > sel_thr) * 1
    # AUC is threshold-free: it is computed from the raw scores.
    fpr_roc, tpr_roc, _ = roc_curve(y_true, y_score)

    return {
        'model': [model_name],
        'mcc': [matthews_corrcoef(y_true, y_pred)],
        'acc': [accuracy_score(y_true, y_pred)],
        'f1': [f1_score(y_true, y_pred)],
        'auc': [auc(fpr_roc, tpr_roc)],
    }

In [5]:
# Build `all_metrics`: one row per (configuration, model) with the columns
# model, mcc, acc, f1 and auc. Row order per configuration: cnn, gnn, mlp, mhcf.

# configs are shuffled, peptide (Gibbs), peptide2_10set (Marieke's cluster with 10 clusters), allele
# cnns, mhcflurry and mlp should have peptide only for now (Gibbs)

# Per-model frames are collected here and concatenated once after the loop,
# instead of re-copying a growing DataFrame on every iteration.
metrics_frames = []

for config in ['shuffled', 'peptide', 'allele']:

    cnn_res = pd.read_csv(os.path.join(cnn_path, f'{config}_cnn_outputs.csv'))
    # NOTE: the MLP and MHCFlurry results are read from the CNN output file for now.
    mlp_res = pd.read_csv(os.path.join(cnn_path, f'{config}_cnn_outputs.csv'))
    mhcf_res = pd.read_csv(os.path.join(cnn_path, f'{config}_cnn_outputs.csv'))
    # mlp_res = pd.read_csv(os.path.join(mlp_path, f'{config}_mlp_outputs.csv')) # when files are ready, modify these back
    # mhcf_res = pd.read_csv(os.path.join(mhcf_path, f'{config}_mhcflurry_outputs.csv'))
    # set deeprankcore model name
    if config == 'shuffled':
        exp_id = best_gnn_prefix + '_0'
    else:
        exp_id = best_gnn_prefix + f'_cl_{config}_0'

    cnn_metrics = metrics_from_csv(cnn_res, f'{config}_cnn', 'validation')
    gnn_metrics = metrics_from_hdf5(exp_id, f'{config}_gnn')
    mlp_metrics = metrics_from_csv(mlp_res, f'{config}_mlp', 'validation')
    mhcf_metrics = metrics_from_csv(mhcf_res, f'{config}_mhcf', 'validation') # modify with training when file is ready

    metrics_frames.extend(
        pd.DataFrame(model_metrics)
        for model_metrics in (cnn_metrics, gnn_metrics, mlp_metrics, mhcf_metrics))

all_metrics = pd.concat(metrics_frames, ignore_index=True)

In [41]:
# Grouped bar chart for ONE configuration: each metric on the x axis gets one
# bar per model. The figure is shown and saved as <config>.html in comparison_path.
config = 'allele'
models = [f'{config}_{suffix}' for suffix in ('cnn', 'gnn', 'mhcf', 'mlp')]
models_names = ['3D CNN', 'GNN', 'Re-trained MHCFlurry 2.0', 'Seq-based MLP']
metrics = ['auc', 'f1', 'mcc', 'acc']
metrics_names = ['AUC', 'F1', 'MCC', 'Accuracy']
colors = ['#56B4E9', '#009E73', '#F0E442', '#E69F00']

fig = go.Figure()
for model, label, color in zip(models, models_names, colors):
    # first row of all_metrics whose 'model' column matches this model
    idx = all_metrics.index[all_metrics['model'] == model].tolist()[0]
    row = all_metrics.loc[idx]
    fig.add_trace(go.Bar(
        x=metrics_names,
        y=[row[metric] for metric in metrics],
        marker_color=color,
        name=label,
        legendgroup=model,
    ))

# zero-width bar outlines
fig.update_traces(marker_line_color='rgb(0,0,0)', marker_line_width=0.0)

# all layout settings in a single call
fig.update_layout(
    title=f'Configuration type: {config.upper()}',
    title_x=0.5,
    barmode='group',
    plot_bgcolor='white',
    width=900,
    height=600,
)

# both axes share the same tick/line/grid styling
axis_style = dict(
    ticks='outside',
    showline=True,
    linecolor='black',
    gridcolor='lightgrey',
    tickfont_size=15,
)
fig.update_xaxes(**axis_style)
fig.update_yaxes(title_text='Score value', **axis_style)

fig.show()
fig.write_html(os.path.join(comparison_path, f'{config}.html'))

In [40]:
# Grouped bar chart for ONE metric: each configuration on the x axis gets one
# bar per model. The figure is shown and saved as <metric>.html in comparison_path.
metric = 'auc'
metric_name = 'AUC'
configs = ['shuffled', 'peptide', 'allele']
configs_names = ['Shuffled', 'Peptide-clustered', 'Allele-clustered']
# model suffixes; the row key in all_metrics is <config><suffix>
models = ['_cnn', '_gnn', '_mhcf', '_mlp']
models_names = ['3D CNN', 'GNN', 'Re-trained MHCFlurry 2.0', 'Seq-based MLP']
colors = ['#56B4E9', '#009E73', '#F0E442', '#E69F00']

fig = go.Figure()
for suffix, label, color in zip(models, models_names, colors):
    y_values = []
    for config in configs:
        # first row of all_metrics whose 'model' column equals <config><suffix>
        matching_rows = all_metrics.index[all_metrics['model'] == config + suffix].tolist()
        y_values.append(all_metrics.loc[matching_rows[0]][metric])

    fig.add_trace(go.Bar(
        x=configs_names,
        y=y_values,
        marker_color=color,
        name=label,
        legendgroup=label,
    ))

# zero-width bar outlines
fig.update_traces(marker_line_color='rgb(0,0,0)', marker_line_width=0.0)

fig.update_layout(
    barmode='group',
    plot_bgcolor='white',
    width=900,
    height=600,
)

# both axes share the same tick/line/grid styling
axis_style = dict(
    ticks='outside',
    showline=True,
    linecolor='black',
    gridcolor='lightgrey',
    tickfont_size=15,
)
fig.update_xaxes(**axis_style)
fig.update_yaxes(title_text=metric_name, **axis_style)

fig.show()
fig.write_html(os.path.join(comparison_path, f'{metric}.html'))

In [None]:
#### Poster
# Grouped AUC bar chart built from hard-coded values (independent of all_metrics).
x_axis = ['Shuffled', 'Peptide-clustered', 'Allele-clustered']

# One list per model, ordered like x_axis.
# NOTE(review): the 0 entries are presumably placeholders for results that
# were not available -- confirm before reusing the figure.
models = {
    '3D CNN': [0.871268, 0, 0.725724],
    'GNN': [0.8586, 0.842082, 0.674662],
    'Re-trained MHCFlurry 2.0': [0.735547, 0, 0.606256],
    'Seq-based NN': [0.892664, 0, 0.459883],
}

fig = go.Figure(data=[
    go.Bar(x=x_axis, y=scores, name=label, legendgroup=label)
    for label, scores in models.items()
])

fig.update_layout(
    barmode='group',
    title_x=0.5,
    width=900,
    height=500,
    showlegend=True,
    font=dict(size=16, color="#421A48"),
)
fig.update_xaxes(title_text="Dataset", tickfont_size=15)
fig.update_yaxes(title_text="AUC", tickfont_size=15)

# fig.write_html(os.path.join(comparison_path, f'{cl_type}.html'))
# fig.write_image("plot1.svg")
fig.show()