In [77]:
import h5py
import glob
import os
import sys
from pathlib import Path
import torch
import pandas as pd
import numpy as np
import logging
from deeprankcore.trainer import Trainer
from deeprankcore.dataset import GraphDataset
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from sklearn.metrics import (
    roc_curve,
    precision_recall_curve,
    auc,
    average_precision_score,
    precision_score,
    recall_score,
    accuracy_score,
    f1_score,
    matthews_corrcoef)

In [78]:
# ---------------------------------------------------------------------------
# User settings: edit the values below to choose the experiment to inspect.
# ---------------------------------------------------------------------------
exp_id = 'exp_100k_final_Increase2_seed55_rmpssm_0'
cluster_dataset = False  # fill in only if the experiment has clusters
project_folder = '/projects/0/einf2380'
protein_class = 'I'
exp_basepath = f'{project_folder}/data/pMHC{protein_class}/trained_models/deeprankcore/experiments/cyulin'
# ---------------------------------------------------------------------------

# Experiments log: one row per experiment, indexed by its `exp_id`.
exp_log_file = os.path.join(exp_basepath, 'cyulin_experiments_log.xlsx')
exp_df = pd.read_excel(exp_log_file, index_col='exp_id')
exp_df.head()

Unnamed: 0_level_0,exp_fullname,exp_path,start_time,end_time,input_data_path,protein_class,target_data,resolution,task,node_features,...,testing_f1,training_accuracy,validation_accuracy,testing_accuracy,training_precision,validation_precision,testing_precision,training_recall,validation_recall,testing_recall
exp_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
exp_100k_final_feattrans_Increase1_cl_allele_seed55_rmpssm_0,exp_100k_final_feattrans_Increase1_cl_allele_s...,/projects/0/einf2380/data/pMHCI/trained_models...,04/Jun/2023_00:48:35,04/Jun/2023_05:37:54,['/projects/0/einf2380/data/pMHCI/features_out...,I,BA,residue,classif,all,...,0.579,0.696,0.685,0.629,0.598,0.589,0.543,0.96,0.957,0.62
exp_100k_final_feattrans_Increase1_seed55_rmpssm_0,exp_100k_final_feattrans_Increase1_seed55_rmps...,/projects/0/einf2380/data/pMHCI/trained_models...,04/Jun/2023_00:34:14,04/Jun/2023_05:27:42,['/projects/0/einf2380/data/pMHCI/features_out...,I,BA,residue,classif,all,...,0.757,0.792,0.765,0.771,0.725,0.69,0.711,0.849,0.847,0.809
exp_100k_final_feattrans_Increase1_cl_peptide2_seed55_rmpssm_0,exp_100k_final_feattrans_Increase1_cl_peptide2...,/projects/0/einf2380/data/pMHCI/trained_models...,04/Jun/2023_00:37:48,04/Jun/2023_05:23:14,['/projects/0/einf2380/data/pMHCI/features_out...,I,BA,residue,classif,all,...,0.692,0.798,0.769,0.773,0.746,0.719,0.65,0.85,0.818,0.741
exp_100k_final_feattrans_Increase1_seed44_rmpssm_0,exp_100k_final_feattrans_Increase1_seed44_rmps...,/projects/0/einf2380/data/pMHCI/trained_models...,04/Jun/2023_00:28:18,04/Jun/2023_05:22:43,['/projects/0/einf2380/data/pMHCI/features_out...,I,BA,residue,classif,all,...,0.763,0.805,0.782,0.783,0.774,0.757,0.735,0.787,0.743,0.795
exp_100k_final_Increase2_seed55_rmpssm_0,exp_100k_final_Increase2_seed55_rmpssm_0_230603,/projects/0/einf2380/data/pMHCI/trained_models...,03/Jun/2023_13:34:06,03/Jun/2023_20:29:05,['/projects/0/einf2380/data/pMHCI/features_out...,I,BA,residue,classif,all,...,0.756,0.778,0.757,0.771,0.695,0.677,0.711,0.885,0.856,0.808


In [79]:
# Resolve the folders of the selected experiment from its row in the experiments log.
exp_fullname = exp_df.loc[exp_id, 'exp_fullname']
exp_path = os.path.join(exp_basepath, exp_fullname)
output_path = os.path.join(exp_path, 'output')
img_path = os.path.join(exp_path, 'images')

# Summary table of the experiment (read from the 'summary' key).
df_summ = pd.read_hdf(os.path.join(output_path, 'summary_data.hdf5'), key='summary')

# Exported model outputs, stored under two keys of the same HDF5 file.
# NOTE(review): the 'training' key presumably also holds the validation-phase rows,
# since later cells filter on phase == 'validation' -- confirm against the exporter.
exporter_file = os.path.join(output_path, 'output_exporter.hdf5')
output_train = pd.read_hdf(exporter_file, key='training')
output_test = pd.read_hdf(exporter_file, key='testing')

# Stack both tables and order the rows by epoch.
df = pd.concat([output_train, output_test]).sort_values(by=['epoch'])
print(df.shape)

(3792611, 6)


In [80]:
# For clustered experiments, attach the `cluster` label of each entry (taken from
# the summary table) to the exported outputs.
if cluster_dataset:
    # Merge only once: re-running the cell on an already-merged frame would add
    # suffixed duplicates (`cluster_x`, `cluster_y`) instead of a single `cluster` column.
    if 'cluster' not in df.columns:
        df = df.merge(df_summ[['entry', 'cluster']], how='left', on='entry')
    print(df.shape)
    # A bare expression inside an `if` block is not auto-displayed by Jupyter,
    # so the preview is shown explicitly (`display` is provided by the IPython kernel).
    display(df.head())

In [81]:
# Epoch recorded as `saved_epoch` for this experiment in the experiments log.
epoch = exp_df.loc[exp_id].saved_epoch

# Keep the rows of that epoch, plus the testing-phase rows logged at epoch 0.
# NOTE(review): presumably the testing outputs are exported with epoch 0 -- confirm.
from_saved_epoch = df.epoch == epoch
testing_at_epoch_zero = (df.epoch == 0) & (df.phase == 'testing')
df_plot = df[from_saved_epoch | testing_at_epoch_zero]
print(df_plot.shape)

# Targets and scores over all selected rows (every phase mixed together).
y_true = df_plot['target']
# The score is the second element of each `output` entry --
# presumably the positive-class probability; TODO confirm.
outputs = np.array(df_plot['output'].values.tolist())
y_score = outputs[:, 1]

(100069, 6)


In [82]:
# MCC-vs-threshold scan for the training phase of `df_plot`.
# NOTE(review): this cell is repeated per phase with only the phase name changed;
# consider moving the logic into one function called once per phase.
phase = 'training'  # plain name; the former loop variable `set` shadowed the builtin for all later cells

df_plot_phase = df_plot[df_plot.phase == phase]
if df_plot_phase.empty:
    raise ValueError(f"No rows with phase '{phase}' in df_plot; cannot scan thresholds.")
y_true = df_plot_phase.target
# The score is the second element of each `output` entry --
# presumably the positive-class probability; TODO confirm.
y_score = np.array(df_plot_phase.output.values.tolist())[:, 1]

# MCC at 100 evenly spaced thresholds in [0, 1]; a sample is predicted positive
# when its score is strictly greater than the threshold.
thrs = np.linspace(0, 1, 100)
mcc = [matthews_corrcoef(y_true, (y_score > thr) * 1) for thr in thrs]

# Threshold-independent scores, stored as constant columns next to the scan.
fpr_roc, tpr_roc, thr_roc = roc_curve(y_true, y_score)
auc_score = auc(fpr_roc, tpr_roc)
aucpr = average_precision_score(y_true, y_score)

# Build the table in one go instead of concatenating onto an empty frame.
# precision/recall/accuracy/f1 are never filled in this cell; `reindex` keeps them
# as NaN columns so the layout of `thr_df` is unchanged.
thr_df = pd.DataFrame(
    {'thr': thrs, 'mcc': mcc, 'auc': auc_score, 'aucpr': aucpr, 'phase': phase}
).reindex(columns=['thr', 'precision', 'recall', 'accuracy', 'f1', 'mcc', 'auc', 'aucpr', 'phase'])

# Threshold with the highest MCC for this phase.
best_idx = thr_df.mcc.idxmax()
sel_thr = thr_df.loc[best_idx, 'thr']
best_mcc = thr_df.loc[best_idx, 'mcc']
if best_mcc <= 0:
    # `thr_df` only holds this phase, so there is no other data to fall back on.
    print(f"WARNING: Maximum mcc of {phase} set is not positive. The selected threshold is not informative.\n")

fig_thresh = px.line(
    thr_df,
    x='thr',
    y=['mcc'],
    facet_col='phase',
    category_orders={'phase': [phase]},
    width=600,
    height=500
)
fig_thresh.add_vline(x=sel_thr, line_width=3, line_dash="dash", line_color="green")
fig_thresh.update_layout(
    title='MCC score vs thresholds (Standardization Applied)',
    title_x=0.5)
fig_thresh.update_yaxes(range=[-0.2, 1.2], scaleanchor="x", scaleratio=1, constrain='domain')
fig_thresh.update_xaxes(range=[0, 1], scaleratio=1, constrain='domain')
# Phase-specific file name: a later cell writes 'thresholds_metrics.html',
# which previously overwrote this plot.
fig_thresh.write_html(os.path.join(img_path, f'thresholds_metrics_{phase}.html'))


print(f"Maximum MCC score for {phase} appears at threshold: {sel_thr}")
print(f"Maximum MCC score is: {best_mcc}")

Maximum MCC score for training appears at threshold: 0.5050505050505051
Maximum MCC score is: 0.5979369673411115


In [83]:
# MCC-vs-threshold scan for the validation phase of `df_plot`.
# NOTE(review): this cell is repeated per phase with only the phase name changed;
# consider moving the logic into one function called once per phase.
phase = 'validation'  # plain name; the former loop variable `set` shadowed the builtin for all later cells

df_plot_phase = df_plot[df_plot.phase == phase]
if df_plot_phase.empty:
    raise ValueError(f"No rows with phase '{phase}' in df_plot; cannot scan thresholds.")
y_true = df_plot_phase.target
# The score is the second element of each `output` entry --
# presumably the positive-class probability; TODO confirm.
y_score = np.array(df_plot_phase.output.values.tolist())[:, 1]

# MCC at 100 evenly spaced thresholds in [0, 1]; a sample is predicted positive
# when its score is strictly greater than the threshold.
thrs = np.linspace(0, 1, 100)
mcc = [matthews_corrcoef(y_true, (y_score > thr) * 1) for thr in thrs]

# Threshold-independent scores, stored as constant columns next to the scan.
fpr_roc, tpr_roc, thr_roc = roc_curve(y_true, y_score)
auc_score = auc(fpr_roc, tpr_roc)
aucpr = average_precision_score(y_true, y_score)

# Build the table in one go instead of concatenating onto an empty frame.
# precision/recall/accuracy/f1 are never filled in this cell; `reindex` keeps them
# as NaN columns so the layout of `thr_df` is unchanged.
thr_df = pd.DataFrame(
    {'thr': thrs, 'mcc': mcc, 'auc': auc_score, 'aucpr': aucpr, 'phase': phase}
).reindex(columns=['thr', 'precision', 'recall', 'accuracy', 'f1', 'mcc', 'auc', 'aucpr', 'phase'])

# Threshold with the highest MCC for this phase.
best_idx = thr_df.mcc.idxmax()
sel_thr = thr_df.loc[best_idx, 'thr']
best_mcc = thr_df.loc[best_idx, 'mcc']
if best_mcc <= 0:
    # `thr_df` only holds this phase, so there is no other data to fall back on.
    print(f"WARNING: Maximum mcc of {phase} set is not positive. The selected threshold is not informative.\n")

fig_thresh = px.line(
    thr_df,
    x='thr',
    y=['mcc'],
    facet_col='phase',
    category_orders={'phase': [phase]},
    width=600,
    height=500
)
fig_thresh.add_vline(x=sel_thr, line_width=3, line_dash="dash", line_color="green")
fig_thresh.update_layout(
    title='MCC score vs thresholds (Standardization Applied)',
    title_x=0.5)
fig_thresh.update_yaxes(range=[-0.2, 1.2], scaleanchor="x", scaleratio=1, constrain='domain')
fig_thresh.update_xaxes(range=[0, 1], scaleratio=1, constrain='domain')
# Phase-specific file name: a later cell writes 'thresholds_metrics.html',
# which previously overwrote this plot.
fig_thresh.write_html(os.path.join(img_path, f'thresholds_metrics_{phase}.html'))


print(f"Maximum MCC score for {phase} appears at threshold: {sel_thr}")
print(f"Maximum MCC score is: {best_mcc}")

Maximum MCC score for validation appears at threshold: 0.5555555555555556
Maximum MCC score is: 0.5597089079789267


In [84]:
# MCC-vs-threshold scan for the testing phase of `df_plot`.
# NOTE(review): this cell is repeated per phase with only the phase name changed;
# consider moving the logic into one function called once per phase.
phase = 'testing'  # plain name; the former loop variable `set` shadowed the builtin for all later cells

df_plot_phase = df_plot[df_plot.phase == phase]
if df_plot_phase.empty:
    raise ValueError(f"No rows with phase '{phase}' in df_plot; cannot scan thresholds.")
y_true = df_plot_phase.target
# The score is the second element of each `output` entry --
# presumably the positive-class probability; TODO confirm.
y_score = np.array(df_plot_phase.output.values.tolist())[:, 1]

# MCC at 100 evenly spaced thresholds in [0, 1]; a sample is predicted positive
# when its score is strictly greater than the threshold.
thrs = np.linspace(0, 1, 100)
mcc = [matthews_corrcoef(y_true, (y_score > thr) * 1) for thr in thrs]

# Threshold-independent scores, stored as constant columns next to the scan.
fpr_roc, tpr_roc, thr_roc = roc_curve(y_true, y_score)
auc_score = auc(fpr_roc, tpr_roc)
aucpr = average_precision_score(y_true, y_score)

# Build the table in one go instead of concatenating onto an empty frame.
# precision/recall/accuracy/f1 are never filled in this cell; `reindex` keeps them
# as NaN columns so the layout of `thr_df` is unchanged.
thr_df = pd.DataFrame(
    {'thr': thrs, 'mcc': mcc, 'auc': auc_score, 'aucpr': aucpr, 'phase': phase}
).reindex(columns=['thr', 'precision', 'recall', 'accuracy', 'f1', 'mcc', 'auc', 'aucpr', 'phase'])

# Threshold with the highest MCC on the test set.
test_df = thr_df.loc[thr_df.phase == phase]
test_mcc_idxmax = test_df.mcc.idxmax()
sel_thr = thr_df.loc[test_mcc_idxmax, 'thr']
best_mcc = thr_df.loc[test_mcc_idxmax, 'mcc']
if best_mcc <= 0:
    # `thr_df` only holds this phase, so there is no other data to fall back on.
    print(f"WARNING: Maximum mcc of {phase} set is not positive. The selected threshold is not informative.\n")

fig_thresh = px.line(
    thr_df,
    x='thr',
    y=['mcc'],
    facet_col='phase',
    category_orders={'phase': [phase]},
    width=600,
    height=500
)
fig_thresh.add_vline(x=sel_thr, line_width=3, line_dash="dash", line_color="green")
fig_thresh.update_layout(
    title='MCC score vs thresholds (Standardization Applied)',
    title_x=0.5)
fig_thresh.update_yaxes(range=[-0.2, 1.2], scaleanchor="x", scaleratio=1, constrain='domain')
fig_thresh.update_xaxes(range=[0, 1], scaleratio=1, constrain='domain')
fig_thresh.write_html(os.path.join(img_path, 'thresholds_metrics.html'))


print(f"Maximum MCC score for {phase} appears at threshold: {sel_thr}")
print(f"Maximum MCC score is: {best_mcc}")

Maximum MCC score for testing appears at threshold: 0.3535353535353536
Maximum MCC score is: 0.5456289628153725
