In [25]:
import pandas as pd
from datetime import datetime
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import os
from itertools import cycle
import numpy as np
from sklearn.metrics import (
    roc_curve,
    precision_recall_curve,
    auc,
    average_precision_score,
    f1_score,
    matthews_corrcoef)

In [26]:
######## Modify here
# Storage root and pMHC class used to build the experiments folder path.
project_folder = '/projects/0/einf2380'
protein_class = 'I'
exp_path = f'{project_folder}/data/pMHC{protein_class}/trained_models/deeprankcore/experiments/cyulin'

# Experiment ids grouped by condition: ids without 'feattrans' in the name
# first, then the 'feattrans' ids (seeds 11..55 in each group).
# NOTE(review): the two seed11 ids are spelled differently from their siblings
# ('increase1' vs 'Increase1'; 'withIncrease1' and 'rmpssm_5') — confirm they
# match the exp_id index of the experiments log.
exp_ids = [
    'exp_100k_final_increase1_seed11_rmpssm_0',
    'exp_100k_final_Increase1_seed22_rmpssm_0',
    'exp_100k_final_Increase1_seed33_rmpssm_0',
    'exp_100k_final_Increase1_seed44_rmpssm_0',
    'exp_100k_final_Increase1_seed55_rmpssm_0',
] + [
    'exp_100k_final_feattrans_withIncrease1_seed11_rmpssm_5',
    'exp_100k_final_feattrans_Increase1_seed22_rmpssm_0',
    'exp_100k_final_feattrans_Increase1_seed33_rmpssm_0',
    'exp_100k_final_feattrans_Increase1_seed44_rmpssm_0',
    'exp_100k_final_feattrans_Increase1_seed55_rmpssm_0',
]

# Display names, in the same order as exp_ids.
# NOTE(review): the fifth label says 'Feat' where the others say 'Feature';
# kept as-is here — confirm whether that is intentional.
exp_show_name = (
    [f'Without Feature Transformation_{run_no}' for run_no in range(1, 5)]
    + ['Without Feat Transformation_5']
    + [f'With Feature Transformation_{run_no}' for run_no in range(1, 6)]
)

# Name of the sub-folder (under comparisons/) where this notebook saves its plots.
comparison_id = 'feattrans_experiment'
exp_type = 'shuffle'

# Experiments log, indexed by exp_id; the last expression shows its first rows.
exp_log_file = os.path.join(exp_path, 'cyulin_experiments_log.xlsx')
exp_log = pd.read_excel(exp_log_file, index_col='exp_id')
exp_log.head()

Unnamed: 0_level_0,exp_fullname,exp_path,start_time,end_time,input_data_path,protein_class,target_data,resolution,task,node_features,...,training_accuracy,validation_accuracy,testing_accuracy,training_precision,validation_precision,testing_precision,training_recall,validation_recall,testing_recall,test_clusters
exp_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
exp_100k_final_feat_trans_seed11_rmpssm_0,exp_100k_final_feat_trans_seed11_rmpssm_0_230530,./experiments/exp_100k_final_feat_trans_seed11...,30/May/2023_02:50:52,30/May/2023_05:29:15,['/projects/0/einf2380/data/pMHCI/features_out...,I,BA,residue,classif,all,...,0.792,0.773,0.777,0.734,0.711,0.72,0.829,0.815,0.806,
exp_100k_final_nostd_seed44_rmpssm_0,exp_100k_final_nostd_seed44_rmpssm_0_230529,./experiments/exp_100k_final_nostd_seed44_rmps...,29/May/2023_22:49:24,30/May/2023_03:26:25,['/projects/0/einf2380/data/pMHCI/features_out...,I,BA,residue,classif,all,...,0.764,0.754,0.747,0.715,0.69,0.702,0.77,0.798,0.739,
exp_100k_final_nostd_seed55_rmpssm_0,exp_100k_final_nostd_seed55_rmpssm_0_230529,./experiments/exp_100k_final_nostd_seed55_rmps...,29/May/2023_22:49:43,30/May/2023_03:26:09,['/projects/0/einf2380/data/pMHCI/features_out...,I,BA,residue,classif,all,...,0.758,0.758,0.753,0.726,0.727,0.702,0.724,0.72,0.764,
exp_100k_final_nostd_seed33_rmpssm_0,exp_100k_final_nostd_seed33_rmpssm_0_230529,./experiments/exp_100k_final_nostd_seed33_rmps...,29/May/2023_22:49:09,30/May/2023_03:24:54,['/projects/0/einf2380/data/pMHCI/features_out...,I,BA,residue,classif,all,...,0.753,0.755,0.76,0.772,0.777,0.733,0.624,0.622,0.715,
exp_100k_final_nostd_seed11_rmpssm_0,exp_100k_final_nostd_seed11_rmpssm_0_230529,./experiments/exp_100k_final_nostd_seed11_rmps...,29/May/2023_22:48:45,30/May/2023_02:38:32,['/projects/0/einf2380/data/pMHCI/features_out...,I,BA,residue,classif,all,...,0.766,0.761,0.752,0.746,0.744,0.706,0.71,0.697,0.749,


In [27]:
######## Definitions used in the plotting
# All plots of this notebook are written to <exp_path>/comparisons/<comparison_id>/.
comparisons_path = os.path.join(exp_path, 'comparisons')
comparison_path = os.path.join(comparisons_path, comparison_id)

# Remember whether the target folder was already there, so the reminder below
# is only printed for a pre-existing comparison.
comparison_folder_existed = os.path.exists(comparison_path)

# makedirs also creates the intermediate comparisons/ folder, and exist_ok=True
# makes the call safe to re-run (no separate check-then-create step needed).
os.makedirs(comparison_path, exist_ok=True)

if comparison_folder_existed:
    print(f'Folder comparisons/{comparison_id}/ already exists! '
          '          \nChange comparison_id if you want to save plots for a different comparison.')

def get_single_exp_df(exp_id, exp_log, exp_path):
    """Load the training and testing outputs of one experiment as a single frame.

    Args:
        exp_id: Label of the experiment's row in ``exp_log``.
        exp_log: DataFrame indexed by experiment id, with an ``exp_fullname``
            column holding the experiment's folder name.
        exp_path: Folder that contains the experiment folders.

    Returns:
        The rows stored under the ``training`` and ``testing`` keys of
        ``<exp_path>/<exp_fullname>/output/output_exporter.hdf5``, concatenated
        (training first) and sorted by the ``epoch`` column.
    """
    hdf5_file = os.path.join(
        exp_path,
        exp_log.loc[exp_id].exp_fullname,
        'output',
        'output_exporter.hdf5',
    )
    per_phase = [pd.read_hdf(hdf5_file, key=phase) for phase in ('training', 'testing')]
    return pd.concat(per_phase).sort_values(by=['epoch'])

Folder comparisons/standardize_experiment/ already exists!           
Change comparison_id if you want to save plots for a different comparison.


In [28]:
# AUC score bar comparison (testing set)

palette = cycle(px.colors.qualitative.Plotly)

# One bar group per random split; the AUC values below are hard-coded.
random_split = [f'Random_split{split_no}' for split_no in range(1, 6)]
auc_scores = []
auc_scores_notrans = [0.8254, 0.8335, 0.8344, 0.8225, 0.8268]
auc_scores_trans = [0.8515, 0.8495, 0.8522, 0.8577, 0.8506]

# Legend labels: mean over the splits, followed by "±" and (max - mean).
# NOTE(review): the "±" value is the distance from the mean to the maximum,
# not a standard deviation — confirm this is the intended spread measure.
avg_notrans = sum(auc_scores_notrans) / len(auc_scores_notrans)
gap_avg_notrans = max(auc_scores_notrans) - avg_notrans
name_notrans = f'Without Feat Transformation: AUC={avg_notrans:.4f}±{gap_avg_notrans:.4f}'

avg_trans = sum(auc_scores_trans) / len(auc_scores_trans)
gap_avg_trans = max(auc_scores_trans) - avg_trans
name_trans = f'With Feat Transformation: AUC={avg_trans:.4f}±{gap_avg_trans:.4f}'

# Grouped bars: one trace per condition, in legend order.
fig = go.Figure()
fig.add_trace(go.Bar(name=name_notrans, x=random_split, y=auc_scores_notrans))
fig.add_trace(go.Bar(name=name_trans, x=random_split, y=auc_scores_trans))

fig.update_yaxes(title_text="AUC Score(Testing Set)")
fig.update_layout(
    barmode='group',
    title='AUC Scores for Feature Transformation Experiment(Testing Set)',  # modify
    title_x=0.5,
    width=700,
    height=600,
    yaxis_range=[0.80, 0.89],
)
fig.write_html(os.path.join(comparison_path, 'feattrans_auc_bars_testingset.html'))


In [29]:
# AUC score bar comparison (validation set)

palette = cycle(px.colors.qualitative.Plotly)

# One bar group per random split; the AUC values below are hard-coded.
random_split = [f'Random_split{split_no}' for split_no in range(1, 6)]
auc_scores = []
auc_scores_notrans = [0.8341, 0.8340, 0.8379, 0.8381, 0.8316]
auc_scores_trans = [0.8582, 0.8587, 0.8580, 0.8605, 0.8570]

# Legend labels: mean over the splits, followed by "±" and (max - mean).
# NOTE(review): the "±" value is the distance from the mean to the maximum,
# not a standard deviation — confirm this is the intended spread measure.
avg_notrans = sum(auc_scores_notrans) / len(auc_scores_notrans)
gap_avg_notrans = max(auc_scores_notrans) - avg_notrans
name_notrans = f'Without Feat Transformation: AUC={avg_notrans:.4f}±{gap_avg_notrans:.4f}'

avg_trans = sum(auc_scores_trans) / len(auc_scores_trans)
gap_avg_trans = max(auc_scores_trans) - avg_trans
name_trans = f'With Feat Transformation: AUC={avg_trans:.4f}±{gap_avg_trans:.4f}'

# Grouped bars: one trace per condition, in legend order.
fig = go.Figure()
fig.add_trace(go.Bar(name=name_notrans, x=random_split, y=auc_scores_notrans))
fig.add_trace(go.Bar(name=name_trans, x=random_split, y=auc_scores_trans))

fig.update_yaxes(title_text="AUC Score(Validation Set)")
fig.update_layout(
    barmode='group',
    title='AUC Scores for Feature Transformation Experiment(Validation Set)',  # modify
    title_x=0.5,
    width=700,
    height=600,
    yaxis_range=[0.80, 0.89],
)
fig.write_html(os.path.join(comparison_path, 'feattrans_auc_bars_validationset.html'))

In [30]:
# AUC score bar comparison (training set)

palette = cycle(px.colors.qualitative.Plotly)

# One bar group per random split; the AUC values below are hard-coded.
random_split = [f'Random_split{split_no}' for split_no in range(1, 6)]
auc_scores = []
auc_scores_notrans = [0.8446, 0.8422, 0.8386, 0.8430, 0.8355]
auc_scores_trans = [0.8737, 0.8818, 0.8796, 0.8813, 0.8770]

# Legend labels: mean over the splits, followed by "±" and (max - mean).
# NOTE(review): the "±" value is the distance from the mean to the maximum,
# not a standard deviation — confirm this is the intended spread measure.
avg_notrans = sum(auc_scores_notrans) / len(auc_scores_notrans)
gap_avg_notrans = max(auc_scores_notrans) - avg_notrans
name_notrans = f'Without Feat Transformation: AUC={avg_notrans:.4f}±{gap_avg_notrans:.4f}'

avg_trans = sum(auc_scores_trans) / len(auc_scores_trans)
gap_avg_trans = max(auc_scores_trans) - avg_trans
name_trans = f'With Feat Transformation: AUC={avg_trans:.4f}±{gap_avg_trans:.4f}'

# Grouped bars: one trace per condition, in legend order.
fig = go.Figure()
fig.add_trace(go.Bar(name=name_notrans, x=random_split, y=auc_scores_notrans))
fig.add_trace(go.Bar(name=name_trans, x=random_split, y=auc_scores_trans))

fig.update_yaxes(title_text="AUC Score(Training Set)")
fig.update_layout(
    barmode='group',
    title='AUC Scores for Feature Transformation Experiment(Training Set)',  # modify
    title_x=0.5,
    width=700,
    height=600,
    yaxis_range=[0.80, 0.89],
)
fig.write_html(os.path.join(comparison_path, 'feattrans_auc_bars_trainingset.html'))

In [31]:
######## MCC bar plots(training set)
palette = cycle(px.colors.qualitative.Plotly)

# Hard-coded MCC values, one per random split (x labels come from `random_split`,
# which is defined in the AUC cells above).
mcc_notrans = [0.5279, 0.5245, 0.5188, 0.5263, 0.5119]
mcc_trans = [0.5873, 0.6030, 0.5992, 0.6029, 0.5934]

# Ten threshold values; not used by the plot in this cell.
thr = [0.50, 0.49, 0.50, 0.49, 0.50,
       0.50, 0.51, 0.52, 0.52, 0.49,
       ]  # modify, max mcc threshold

# Legend labels: mean over the splits, followed by "±" and (max - mean).
avg_notrans = sum(mcc_notrans) / len(mcc_notrans)
gap_avg_notrans = max(mcc_notrans) - avg_notrans
name_notrans = f'Without Feat Transformation: MCC={avg_notrans:.4f}±{gap_avg_notrans:.4f}'

avg_trans = sum(mcc_trans) / len(mcc_trans)
gap_avg_trans = max(mcc_trans) - avg_trans
name_trans = f'With Feat Transformation: MCC={avg_trans:.4f}±{gap_avg_trans:.4f}'

fig = go.Figure(data=[
    go.Bar(name=name_notrans, x=random_split, y=mcc_notrans),
    go.Bar(name=name_trans, x=random_split, y=mcc_trans)
])

fig.update_yaxes(title_text="MCC Score(Training Set)")
fig.update_layout(
    barmode='group',
    title='MCC Scores for Feature Transformation Experiment(Training Set)',  # Modify
    title_x=0.5,
    width=800, height=500,
    # Upper bound is 0.61 (not 0.6): two training MCC values (0.6030, 0.6029)
    # exceed 0.6 and their bars would otherwise be cut off at the top of the plot.
    yaxis_range=[0.49, 0.61])
fig.write_html(os.path.join(comparison_path, 'feattrans_mcc_thr_trainingset.html'))

In [32]:
######## MCC bar plots (validation set)
palette = cycle(px.colors.qualitative.Plotly)

# Hard-coded MCC values, one per random split (x labels come from `random_split`,
# which is defined in the AUC cells above).
mcc_notrans = [0.5136, 0.5097, 0.5200, 0.5161, 0.5076]
mcc_trans = [0.5611, 0.5554, 0.5580, 0.5609, 0.5500]

# Ten threshold values; not used by the plot in this cell.
thr = [
    0.56, 0.50, 0.53, 0.58, 0.45,
    0.58, 0.51, 0.50, 0.47, 0.45,
]  # modify, max mcc threshold

# Legend labels: mean over the splits, followed by "±" and (max - mean).
avg_notrans = sum(mcc_notrans) / len(mcc_notrans)
gap_avg_notrans = max(mcc_notrans) - avg_notrans
name_notrans = f'Without Feat Transformation: MCC={avg_notrans:.4f}±{gap_avg_notrans:.4f}'

avg_trans = sum(mcc_trans) / len(mcc_trans)
gap_avg_trans = max(mcc_trans) - avg_trans
name_trans = f'With Feat Transformation: MCC={avg_trans:.4f}±{gap_avg_trans:.4f}'

# Grouped bars: one trace per condition, in legend order.
fig = go.Figure()
fig.add_trace(go.Bar(name=name_notrans, x=random_split, y=mcc_notrans))
fig.add_trace(go.Bar(name=name_trans, x=random_split, y=mcc_trans))

fig.update_yaxes(title_text="MCC Score(Validation Set)")
fig.update_layout(
    barmode='group',
    title='MCC Scores for Feature Transformation Experiment(Validation Set)',  # Modify
    title_x=0.5,
    width=800,
    height=500,
    yaxis_range=[0.49, 0.6],
)
fig.write_html(os.path.join(comparison_path, 'feattrans_mcc_thr_validationset.html'))

In [33]:
######## MCC bar plots (testing set)
palette = cycle(px.colors.qualitative.Plotly)

# Hard-coded MCC values, one per random split (x labels come from `random_split`,
# which is defined in the AUC cells above).
mcc_notrans = [0.5014, 0.5070, 0.5117, 0.4907, 0.5053]
mcc_trans = [0.5465, 0.5390, 0.5457, 0.5498, 0.5400]

# Ten threshold values; not used by the plot in this cell.
thr = [
    0.57, 0.57, 0.64, 0.50, 0.54,
    0.47, 0.56, 0.61, 0.39, 0.58,
]  # modify, max mcc threshold

# Legend labels: mean over the splits, followed by "±" and (max - mean).
avg_notrans = sum(mcc_notrans) / len(mcc_notrans)
gap_avg_notrans = max(mcc_notrans) - avg_notrans
name_notrans = f'Without Feat Transformation: MCC={avg_notrans:.4f}±{gap_avg_notrans:.4f}'

avg_trans = sum(mcc_trans) / len(mcc_trans)
gap_avg_trans = max(mcc_trans) - avg_trans
name_trans = f'With Feat Transformation: MCC={avg_trans:.4f}±{gap_avg_trans:.4f}'

# Grouped bars: one trace per condition, in legend order.
fig = go.Figure()
fig.add_trace(go.Bar(name=name_notrans, x=random_split, y=mcc_notrans))
fig.add_trace(go.Bar(name=name_trans, x=random_split, y=mcc_trans))

fig.update_yaxes(title_text="MCC Score(Testing Set)")
fig.update_layout(
    barmode='group',
    title='MCC Scores for Feature Transformation Experiment(Testing Set)',  # Modify
    title_x=0.5,
    width=800,
    height=500,
    yaxis_range=[0.49, 0.6],
)
fig.write_html(os.path.join(comparison_path, 'feattrans_mcc_thr_testingset.html'))

In [37]:
### Final comparison: without vs. with feature transformation (AUC) ###
# One bar group per data set; the AUC values below are hard-coded.
x_group = ['Training Set', 'Validation Set', 'Testing Set']
auc_nostd = [0.8408, 0.8351, 0.8285]  # plotted under the "Without ..." legend entry
auc_std = [0.8787, 0.8585, 0.8523]    # plotted under the "With ..." legend entry
name_notrans = "Without Feat Transformation (Original)"
name_trans = "With Feat Transformation (Adopted)"

# Grouped bars with the value printed on each bar (text=...).
fig = go.Figure()
fig.add_trace(go.Bar(name=name_notrans, x=x_group, y=auc_nostd, text=auc_nostd))
fig.add_trace(go.Bar(name=name_trans, x=x_group, y=auc_std, text=auc_std))

fig.update_yaxes(title_text="AUC Score")
fig.update_layout(
    barmode='group',
    title='Improvement of AUC Score for Feature Transformation Experiment',
    title_x=0.5,
    width=800,
    height=600,
    yaxis_range=[0.80, 0.89],
)
fig.write_html(os.path.join(comparison_path, 'final_feattrans_auc.html'))

In [38]:
### Final comparison: without vs. with feature transformation (MCC) ###
# One bar group per data set; the MCC values below are hard-coded.
x_group = ['Training Set', 'Validation Set', 'Testing Set']
mcc_notrans = [0.5219, 0.5134, 0.5032]
mcc_trans = [0.5972, 0.5571, 0.5442]
name_notrans = "Without Feat Transformation (Original)"
name_trans = "With Feat Transformation (Adopted)"

# Grouped bars with the value printed on each bar (text=...).
fig = go.Figure()
fig.add_trace(go.Bar(name=name_notrans, x=x_group, y=mcc_notrans, text=mcc_notrans))
fig.add_trace(go.Bar(name=name_trans, x=x_group, y=mcc_trans, text=mcc_trans))

fig.update_yaxes(title_text="MCC Score")
fig.update_layout(
    barmode='group',
    title='Improvement of MCC Score for Feature Transformation Experiment',
    title_x=0.5,
    width=800,
    height=500,
    yaxis_range=[0.49, 0.60],
)
fig.write_html(os.path.join(comparison_path, 'final_feattrans_mcc.html'))