In [25]:
import h5py
import glob
import os
import sys
from pathlib import Path
import torch
import pandas as pd
import numpy as np
import logging
from deeprankcore.trainer import Trainer
from deeprankcore.dataset import GraphDataset
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from sklearn.metrics import (
    roc_curve,
    precision_recall_curve,
    auc,
    average_precision_score,
    precision_score,
    recall_score,
    accuracy_score,
    f1_score,
    matthews_corrcoef)

In [26]:
# ---- User settings: edit these two values before running the notebook ----
# Identifier of the experiment to analyse (a value of the 'exp_id' column of
# the experiments log read below).
exp_id = 'exp_100k_final_nostd_seed55_rmpssm_0'
# Switch to True only if the experiment has clusters.
cluster_dataset = False
# ---------------------------------------------------------------------------

# Read the experiments log, using its 'exp_id' column as the index.
exp_df = pd.read_excel(
    './experiments/_experiments_log.xlsx',
    index_col='exp_id',
)
# Preview the first rows of the log as the cell output.
exp_df.head()

Unnamed: 0_level_0,exp_fullname,exp_path,start_time,end_time,input_data_path,protein_class,target_data,resolution,task,node_features,...,training_accuracy,validation_accuracy,testing_accuracy,training_precision,validation_precision,testing_precision,training_recall,validation_recall,testing_recall,test_clusters
exp_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
exp_100k_final_feat_trans_seed11_rmpssm_0,exp_100k_final_feat_trans_seed11_rmpssm_0_230530,./experiments/exp_100k_final_feat_trans_seed11...,30/May/2023_02:50:52,30/May/2023_05:29:15,['/projects/0/einf2380/data/pMHCI/features_out...,I,BA,residue,classif,all,...,0.792,0.773,0.777,0.734,0.711,0.72,0.829,0.815,0.806,
exp_100k_final_nostd_seed44_rmpssm_0,exp_100k_final_nostd_seed44_rmpssm_0_230529,./experiments/exp_100k_final_nostd_seed44_rmps...,29/May/2023_22:49:24,30/May/2023_03:26:25,['/projects/0/einf2380/data/pMHCI/features_out...,I,BA,residue,classif,all,...,0.764,0.754,0.747,0.715,0.69,0.702,0.77,0.798,0.739,
exp_100k_final_nostd_seed55_rmpssm_0,exp_100k_final_nostd_seed55_rmpssm_0_230529,./experiments/exp_100k_final_nostd_seed55_rmps...,29/May/2023_22:49:43,30/May/2023_03:26:09,['/projects/0/einf2380/data/pMHCI/features_out...,I,BA,residue,classif,all,...,0.758,0.758,0.753,0.726,0.727,0.702,0.724,0.72,0.764,
exp_100k_final_nostd_seed33_rmpssm_0,exp_100k_final_nostd_seed33_rmpssm_0_230529,./experiments/exp_100k_final_nostd_seed33_rmps...,29/May/2023_22:49:09,30/May/2023_03:24:54,['/projects/0/einf2380/data/pMHCI/features_out...,I,BA,residue,classif,all,...,0.753,0.755,0.76,0.772,0.777,0.733,0.624,0.622,0.715,
exp_100k_final_nostd_seed11_rmpssm_0,exp_100k_final_nostd_seed11_rmpssm_0_230529,./experiments/exp_100k_final_nostd_seed11_rmps...,29/May/2023_22:48:45,30/May/2023_02:38:32,['/projects/0/einf2380/data/pMHCI/features_out...,I,BA,residue,classif,all,...,0.766,0.761,0.752,0.746,0.744,0.706,0.71,0.697,0.749,


In [27]:
# Resolve the experiment's folders from its row in the experiments log.
exp_fullname = exp_df.loc[exp_id].exp_fullname
exp_path = os.path.join('experiments', exp_fullname)
output_path = os.path.join(exp_path, 'output')
img_path = os.path.join(exp_path, 'images')
# Figures are written into img_path further down; create the folder up front
# so those writes do not fail with FileNotFoundError when it does not exist yet.
os.makedirs(img_path, exist_ok=True)

# Summary table of the experiment's data.
df_summ = pd.read_hdf(os.path.join(output_path, 'summary_data.hdf5'), key='summary')

# The 'training' and 'testing' outputs live in the same exporter file under
# different keys.
exporter_file = os.path.join(output_path, 'output_exporter.hdf5')
output_train = pd.read_hdf(exporter_file, key='training')
output_test = pd.read_hdf(exporter_file, key='testing')

# Stack both outputs and order the rows by epoch (chained, no inplace mutation).
df = pd.concat([output_train, output_test]).sort_values(by=['epoch'])
print(df.shape)

(3706578, 6)


In [28]:
# Attach the 'cluster' column of the summary table to every row of df, matching
# rows on 'entry' (only for experiments flagged as having clusters).
# The "'cluster' not in df.columns" guard keeps the cell idempotent: without it,
# re-running the cell would merge a second time and produce 'cluster_x' /
# 'cluster_y' columns instead of a single 'cluster' column.
if cluster_dataset and 'cluster' not in df.columns:
    df = df.merge(df_summ[['entry', 'cluster']], how='left', on='entry')
    print(df.shape)
    # A bare `df.head()` inside an `if` block is never rendered by the notebook
    # (only a cell's last top-level expression is), so print the preview.
    print(df.head())

In [29]:
# The 'saved_epoch' value recorded for this experiment in the experiments log.
epoch = exp_df.loc[exp_id].saved_epoch

# Keep every row logged at that epoch, plus the testing rows logged at epoch 0.
at_saved_epoch = df.epoch == epoch
testing_at_epoch_zero = (df.phase == 'testing') & (df.epoch == 0)
df_plot = df[at_saved_epoch | testing_at_epoch_zero]
print(df_plot.shape)

# Targets, and the element at index 1 of each row's output, for the selected rows.
y_true = df_plot.target
output_rows = df_plot.output.values.tolist()
y_score = np.array(output_rows)[:, 1]

(100178, 6)


In [30]:
# Threshold scan for one phase: compute the MCC at 100 evenly spaced thresholds
# in [0, 1], select the threshold with the highest MCC, and plot MCC vs threshold.
# `phase` replaces the former loop variable `set`, which shadowed the builtin.
phase = 'training'

df_plot_phase = df_plot[df_plot.phase == phase]
if df_plot_phase.empty:
    # Fail with a clear message instead of a cryptic IndexError further down.
    raise ValueError(f"No rows for phase '{phase}' in df_plot; cannot compute thresholds.")

y_true = df_plot_phase.target
# The element at index 1 of each output is the score that gets thresholded.
y_score = np.array(df_plot_phase.output.values.tolist())[:, 1]

thrs = np.linspace(0, 1, 100)
# A sample is predicted positive when its score is strictly above the threshold.
mcc = [matthews_corrcoef(y_true, (y_score > thr) * 1) for thr in thrs]

fpr_roc, tpr_roc, thr_roc = roc_curve(y_true, y_score)
auc_score = auc(fpr_roc, tpr_roc)
aucpr = average_precision_score(y_true, y_score)

# Build the table directly rather than concatenating onto an empty template
# frame (concat with empty entries is deprecated in pandas and can alter dtypes).
# reindex keeps the original column layout; the metrics that are not computed
# here (precision, recall, accuracy, f1) remain NaN columns, as before.
thr_columns = ['thr', 'precision', 'recall', 'accuracy', 'f1', 'mcc', 'auc', 'aucpr', 'phase']
thr_df = pd.DataFrame(
    {'thr': thrs, 'mcc': mcc, 'auc': auc_score, 'aucpr': aucpr, 'phase': phase}
).reindex(columns=thr_columns)

# Row with the highest MCC for this phase. thr_df holds this phase only, so there
# is no larger data set to fall back to; a non-positive maximum is just reported.
test_mcc_idxmax = thr_df.mcc.idxmax()
best_mcc = thr_df.loc[test_mcc_idxmax].mcc
sel_thr = thr_df.loc[test_mcc_idxmax].thr
if not best_mcc > 0:
    print(f"WARNING: Maximum mcc of {phase} set is not above 0. The selected threshold is not informative.\n")

fig_thresh = px.line(
    thr_df,
    x='thr',
    y=['mcc'],
    facet_col='phase',
    category_orders={'phase': [phase]},
    width=600,
    height=500,
)
fig_thresh.add_vline(x=sel_thr, line_width=3, line_dash="dash", line_color="green")
# NOTE(review): the title says "Standardization Applied" while the exp_id
# configured in this notebook contains "nostd" - confirm the title is intended.
fig_thresh.update_layout(
    title='MCC score vs thresholds (Standardization Applied)',
    title_x=0.5)
fig_thresh.update_yaxes(range=[-0.2, 1.2], scaleanchor="x", scaleratio=1, constrain='domain')
fig_thresh.update_xaxes(range=[0, 1], scaleratio=1, constrain='domain')
# Per-phase file name, so this plot is not overwritten by the other phases' cells.
fig_thresh.write_html(os.path.join(img_path, f'thresholds_metrics_{phase}.html'))


print(f"Maximum MCC score for {phase} appears at threshold: {sel_thr}")
print(f"Maximum MCC score is: {best_mcc}")

Maximum MCC score for training appears at threshold: 0.494949494949495
Maximum MCC score is: 0.5263538139786798


In [31]:
# Threshold scan for one phase: compute the MCC at 100 evenly spaced thresholds
# in [0, 1], select the threshold with the highest MCC, and plot MCC vs threshold.
# `phase` replaces the former loop variable `set`, which shadowed the builtin.
phase = 'validation'

df_plot_phase = df_plot[df_plot.phase == phase]
if df_plot_phase.empty:
    # Fail with a clear message instead of a cryptic IndexError further down.
    raise ValueError(f"No rows for phase '{phase}' in df_plot; cannot compute thresholds.")

y_true = df_plot_phase.target
# The element at index 1 of each output is the score that gets thresholded.
y_score = np.array(df_plot_phase.output.values.tolist())[:, 1]

thrs = np.linspace(0, 1, 100)
# A sample is predicted positive when its score is strictly above the threshold.
mcc = [matthews_corrcoef(y_true, (y_score > thr) * 1) for thr in thrs]

fpr_roc, tpr_roc, thr_roc = roc_curve(y_true, y_score)
auc_score = auc(fpr_roc, tpr_roc)
aucpr = average_precision_score(y_true, y_score)

# Build the table directly rather than concatenating onto an empty template
# frame (concat with empty entries is deprecated in pandas and can alter dtypes).
# reindex keeps the original column layout; the metrics that are not computed
# here (precision, recall, accuracy, f1) remain NaN columns, as before.
thr_columns = ['thr', 'precision', 'recall', 'accuracy', 'f1', 'mcc', 'auc', 'aucpr', 'phase']
thr_df = pd.DataFrame(
    {'thr': thrs, 'mcc': mcc, 'auc': auc_score, 'aucpr': aucpr, 'phase': phase}
).reindex(columns=thr_columns)

# Row with the highest MCC for this phase. thr_df holds this phase only, so there
# is no larger data set to fall back to; a non-positive maximum is just reported.
test_mcc_idxmax = thr_df.mcc.idxmax()
best_mcc = thr_df.loc[test_mcc_idxmax].mcc
sel_thr = thr_df.loc[test_mcc_idxmax].thr
if not best_mcc > 0:
    print(f"WARNING: Maximum mcc of {phase} set is not above 0. The selected threshold is not informative.\n")

fig_thresh = px.line(
    thr_df,
    x='thr',
    y=['mcc'],
    facet_col='phase',
    category_orders={'phase': [phase]},
    width=600,
    height=500,
)
fig_thresh.add_vline(x=sel_thr, line_width=3, line_dash="dash", line_color="green")
# NOTE(review): the title says "Standardization Applied" while the exp_id
# configured in this notebook contains "nostd" - confirm the title is intended.
fig_thresh.update_layout(
    title='MCC score vs thresholds (Standardization Applied)',
    title_x=0.5)
fig_thresh.update_yaxes(range=[-0.2, 1.2], scaleanchor="x", scaleratio=1, constrain='domain')
fig_thresh.update_xaxes(range=[0, 1], scaleratio=1, constrain='domain')
# Per-phase file name, so this plot is not overwritten by the other phases' cells.
fig_thresh.write_html(os.path.join(img_path, f'thresholds_metrics_{phase}.html'))


print(f"Maximum MCC score for {phase} appears at threshold: {sel_thr}")
print(f"Maximum MCC score is: {best_mcc}")

Maximum MCC score for validation appears at threshold: 0.5858585858585859
Maximum MCC score is: 0.5161546092300213


In [32]:
# Threshold scan for one phase: compute the MCC at 100 evenly spaced thresholds
# in [0, 1], select the threshold with the highest MCC, and plot MCC vs threshold.
# `phase` replaces the former loop variable `set`, which shadowed the builtin.
phase = 'testing'

df_plot_phase = df_plot[df_plot.phase == phase]
if df_plot_phase.empty:
    # Fail with a clear message instead of a cryptic IndexError further down.
    raise ValueError(f"No rows for phase '{phase}' in df_plot; cannot compute thresholds.")

y_true = df_plot_phase.target
# The element at index 1 of each output is the score that gets thresholded.
y_score = np.array(df_plot_phase.output.values.tolist())[:, 1]

thrs = np.linspace(0, 1, 100)
# A sample is predicted positive when its score is strictly above the threshold.
mcc = [matthews_corrcoef(y_true, (y_score > thr) * 1) for thr in thrs]

fpr_roc, tpr_roc, thr_roc = roc_curve(y_true, y_score)
auc_score = auc(fpr_roc, tpr_roc)
aucpr = average_precision_score(y_true, y_score)

# Build the table directly rather than concatenating onto an empty template
# frame (concat with empty entries is deprecated in pandas and can alter dtypes).
# reindex keeps the original column layout; the metrics that are not computed
# here (precision, recall, accuracy, f1) remain NaN columns, as before.
thr_columns = ['thr', 'precision', 'recall', 'accuracy', 'f1', 'mcc', 'auc', 'aucpr', 'phase']
thr_df = pd.DataFrame(
    {'thr': thrs, 'mcc': mcc, 'auc': auc_score, 'aucpr': aucpr, 'phase': phase}
).reindex(columns=thr_columns)

# Row with the highest MCC for this phase. thr_df holds this phase only, so there
# is no larger data set to fall back to; a non-positive maximum is just reported.
test_mcc_idxmax = thr_df.mcc.idxmax()
best_mcc = thr_df.loc[test_mcc_idxmax].mcc
sel_thr = thr_df.loc[test_mcc_idxmax].thr
if not best_mcc > 0:
    print(f"WARNING: Maximum mcc of {phase} set is not above 0. The selected threshold is not informative.\n")

fig_thresh = px.line(
    thr_df,
    x='thr',
    y=['mcc'],
    facet_col='phase',
    category_orders={'phase': [phase]},
    width=600,
    height=500,
)
fig_thresh.add_vline(x=sel_thr, line_width=3, line_dash="dash", line_color="green")
# NOTE(review): the title says "Standardization Applied" while the exp_id
# configured in this notebook contains "nostd" - confirm the title is intended.
fig_thresh.update_layout(
    title='MCC score vs thresholds (Standardization Applied)',
    title_x=0.5)
fig_thresh.update_yaxes(range=[-0.2, 1.2], scaleanchor="x", scaleratio=1, constrain='domain')
fig_thresh.update_xaxes(range=[0, 1], scaleratio=1, constrain='domain')
# Per-phase file name, so each phase's plot gets its own file.
fig_thresh.write_html(os.path.join(img_path, f'thresholds_metrics_{phase}.html'))
# Also keep writing the shared legacy file name: this cell is the last one to
# write it, so existing consumers of 'thresholds_metrics.html' still find the
# testing plot there.
fig_thresh.write_html(os.path.join(img_path, 'thresholds_metrics.html'))


print(f"Maximum MCC score for {phase} appears at threshold: {sel_thr}")
print(f"Maximum MCC score is: {best_mcc}")

Maximum MCC score for testing appears at threshold: 0.5050505050505051
Maximum MCC score is: 0.49076233434383365
