In [1]:
import pandas as pd
from datetime import datetime
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import os
from itertools import cycle
import numpy as np
from sklearn.metrics import (
    roc_curve,
    precision_recall_curve,
    auc,
    average_precision_score,
    f1_score,
    matthews_corrcoef)

In [2]:
######## Modify here
exp_path = './experiments/'
exp_ids = [
    'exp_100k_std_bs16_0',
    'exp_100k_std_bs16_net2_0']
comparison_id = 'naive_vs_net2'
exp_type = 'Naive GNN vs Net2'
exp_log = pd.read_excel(exp_path + '_experiments_log.xlsx', index_col='exp_id')
exp_log.head()

Unnamed: 0_level_0,exp_fullname,exp_path,start_time,end_time,input_data_path,protein_class,target_data,resolution,task,node_features,...,testing_f1,training_accuracy,validation_accuracy,testing_accuracy,training_precision,validation_precision,testing_precision,training_recall,validation_recall,testing_recall
exp_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
exp_100k_dist_res_type_std_bs16_cl_allele_C_0,exp_100k_dist_res_type_std_bs16_cl_allele_C_0_...,./experiments/exp_100k_dist_res_type_std_bs16_...,21/Apr/2023_14:26:44,21/Apr/2023_22:41:06,['/projects/0/einf2380/data/pMHCI/features_out...,I,BA,residue,classif,all,...,0.601,0.704,0.702,0.535,0.609,0.608,0.774,0.903,0.894,0.491
exp_100k_std_bs16_net2_0,exp_100k_std_bs16_net2_0_230421,./experiments/exp_100k_std_bs16_net2_0_230421,21/Apr/2023_17:49:12,21/Apr/2023_22:04:17,['/projects/0/einf2380/data/pMHCI/features_out...,I,BA,residue,classif,all,...,0.755,0.815,0.785,0.788,0.838,0.816,0.769,0.719,0.659,0.743
exp_692_std_bs16_net2_0,exp_692_std_bs16_net2_0_230421,./experiments/exp_692_std_bs16_net2_0_230421,21/Apr/2023_15:15:43,21/Apr/2023_15:16:16,['/home/ccrocion/snellius_data_sample/data/pMH...,I,BA,residue,classif,all,...,0.615,0.517,0.504,0.571,0.488,0.478,0.522,0.961,0.965,0.75
exp_100k_dist_res_type_std_bs16_cl_peptide_0,exp_100k_dist_res_type_std_bs16_cl_peptide_0_2...,./experiments/exp_100k_dist_res_type_std_bs16_...,17/Apr/2023_17:00:02,17/Apr/2023_22:49:06,['/projects/0/einf2380/data/pMHCI/features_out...,I,BA,residue,classif,all,...,0.696,0.751,0.752,0.711,0.7,0.695,0.64,0.761,0.779,0.764
exp_100k_dist_res_type_std_bs16_0,exp_100k_dist_res_type_std_bs16_0_230417,./experiments/exp_100k_dist_res_type_std_bs16_...,17/Apr/2023_16:56:38,17/Apr/2023_22:39:50,['/projects/0/einf2380/data/pMHCI/features_out...,I,BA,residue,classif,all,...,0.727,0.739,0.741,0.74,0.673,0.671,0.676,0.791,0.804,0.785


In [3]:
######## Definitions used in the plotting
comparisons_path = os.path.join(exp_path, 'comparisons')
comparison_path = os.path.join(comparisons_path, comparison_id)

if not os.path.exists(comparisons_path):
    os.makedirs(comparisons_path)

if not os.path.exists(comparison_path):
    os.makedirs(comparison_path)
else:
    print(f'Folder comparisons/{comparison_id}/ already exists! \
          \nChange comparison_id if you want to save plots for a different comparison.')

def get_single_exp_df(exp_id, exp_log, exp_path):
    exp_fullname = exp_log.loc[exp_id].exp_fullname
    exp_path = os.path.join(exp_path, exp_fullname)
    output_path = os.path.join(exp_path, 'output')
    output_train = pd.read_hdf(os.path.join(output_path, 'output_exporter.hdf5'), key='training')
    output_test = pd.read_hdf(os.path.join(output_path, 'output_exporter.hdf5'), key='testing')
    df = pd.concat([output_train, output_test])
    df.sort_values(by=['epoch'], inplace = True)
    return df

Folder comparisons/naive_vs_net2/ already exists!           
Change comparison_id if you want to save plots for a different comparison.


In [4]:
######## Losses curves
palette = cycle(px.colors.qualitative.Plotly)
fig = go.Figure()

for exp_id in exp_ids:
    df = get_single_exp_df(exp_id, exp_log, exp_path)
    df_grouped = df.groupby(['phase', 'epoch']).mean().reset_index()[['phase', 'epoch', 'loss']]
    color = next(palette)
    df_train_plot = df_grouped[(df_grouped.phase =='training') & (df_grouped.epoch > 0)]
    df_valid_plot = df_grouped[(df_grouped.phase =='validation') & (df_grouped.epoch > 0)]

    fig.add_trace(go.Scatter(
        x=df_train_plot.epoch,
        y=df_train_plot.loss,
        mode = 'lines+markers',
        line=dict(color=color),
        name='training',
        legendgroup=exp_id,
        legendgrouptitle_text=exp_id))
    fig.add_trace(go.Scatter(
        x=df_valid_plot.epoch,
        y=df_valid_plot.loss,
        mode = 'lines+markers',
        line=dict(color=color, dash='dot'),
        name='validation',
        legendgroup=exp_id,
        legendgrouptitle_text=exp_id))
    
    epoch = exp_log.loc[exp_id].saved_epoch
    fig.add_trace(go.Scatter(
        x=[epoch],
        y=[exp_log.loc[exp_id].training_loss],
        mode = 'markers',
        marker_symbol = 'star',
        marker_color = 'firebrick',
        marker_size = 15,
        name = f'epoch {epoch}',
        legendgroup=exp_id,
        showlegend = False))

fig.update_layout(
    xaxis_title='Epoch #',
    yaxis_title='Loss',
    width=800, height=500,
    title='Loss vs epochs',
    title_x=0.5,
    margin=go.layout.Margin(
        l=50,
        r=50,
        b=50,
        t=50,
        pad=4),
    legend=dict(
        yanchor="top",
        xanchor="right",
        x=0.99
        )
)
fig.write_html(os.path.join(comparison_path, 'losses.html'))


The default value of numeric_only in DataFrameGroupBy.mean is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.


The default value of numeric_only in DataFrameGroupBy.mean is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.



In [5]:
######## AUC and AUCPR curves
palette = cycle(px.colors.qualitative.Plotly)
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles = ['ROC Curves (AUC)', 'PR Curves (AUCPR)'],
    horizontal_spacing = 0.05)

for exp_id in exp_ids:
    df = get_single_exp_df(exp_id, exp_log, exp_path)
    df_plot = df[(df.epoch == 0) & (df.phase == 'testing')]
    y_true = df_plot.target
    y_score = np.array(df_plot.output.values.tolist())[:, 1]
    color = next(palette)

    df_plot_phase = df_plot
    y_true = df_plot_phase.target
    y_score = np.array(df_plot_phase.output.values.tolist())[:, 1]

    fpr_roc, tpr_roc, thr_roc = roc_curve(y_true, y_score)
    pr_pr, rec_pr, thr_pr = precision_recall_curve(y_true, y_score)

    name_roc = f'AUC={auc(fpr_roc, tpr_roc):.4f}'
    name_pr = f'AUCPR={average_precision_score(y_true, y_score):.4f}'

    fig.add_trace(go.Scatter(
        x=fpr_roc,
        y=tpr_roc,
        name=name_roc,
        line=dict(color=color),
        legendgroup=exp_id,
        legendgrouptitle_text=exp_id),
        row=1,
        col=1)
    fig.add_trace(go.Scatter(
        x=rec_pr,
        y=pr_pr,
        name=name_pr,
        line=dict(color=color),
        legendgroup=exp_id,
        legendgrouptitle_text=exp_id),
        row=1,
        col=2)

fig.add_shape(
    type='line', line=dict(dash='dash'),
    x0=0, x1=1, y0=0, y1=1,
    row = 1, col = 1
)
fig.add_shape(
    type='line', line=dict(dash='dash'),
    x0=0, x1=1, y0=1, y1=0,
    row = 1, col = 2 
)
fig.update_layout(
    width=900, height=400,
    margin=go.layout.Margin(
        l=50,
        r=50,
        b=50,
        t=50,
        pad=4
    ),
    legend=dict(
        yanchor="top",
        y=1.05,
        xanchor="left",
        x=0.97
        ))
fig.update_xaxes(title_text="FPR", constrain='domain', scaleratio = 1, row=1, col=1)
fig.update_yaxes(title_text="TPR (Recall)", constrain='domain', scaleanchor = "x", scaleratio = 1, row=1, col=1)
fig.update_xaxes(title_text="Recall", constrain='domain', scaleanchor = "y", scaleratio = 1, row=1, col=2)
fig.update_yaxes(title_text="Precision", constrain='domain', scaleratio = 1, row=1, col=2)
fig.update_layout(
    title='AUC and AUCPR curves for testing set',
    title_x=0.5,
    margin={'t': 100})
fig.write_html(os.path.join(comparison_path, 'auc_aucpr_testing.html'))

In [6]:
fig = go.Figure()
auc_scores = []

for exp_id in exp_ids:
    df = get_single_exp_df(exp_id, exp_log, exp_path)
    df_plot = df[(df.epoch == 0) & (df.phase == 'testing')]
    y_true = df_plot.target
    y_score = np.array(df_plot.output.values.tolist())[:, 1]
    color = next(palette)

    df_plot_phase = df_plot
    y_true = df_plot_phase.target
    y_score = np.array(df_plot_phase.output.values.tolist())[:, 1]

    fpr_roc, tpr_roc, thr_roc = roc_curve(y_true, y_score)
    pr_pr, rec_pr, thr_pr = precision_recall_curve(y_true, y_score)

    auc_score = auc(fpr_roc, tpr_roc)
    auc_scores.append(auc_score)
    name_roc = f'AUC={auc_score:.4f}'

    fig.add_trace(go.Bar(
        x = ['AUC'],
        y = [auc_score],
        name = name_roc,
        legendgroup = exp_id,
        legendgrouptitle_text = exp_id
    ))

fig.update_yaxes(title_text="Value")
fig.update_layout(
    barmode='group',
    title=f'AUC bars comparison for {exp_type}',
    title_x=0.5,
    width=1100, height=600)
fig.write_html(os.path.join(comparison_path, 'auc_testing_bars.html'))

In [7]:
######## F1 and MCC bar plots
palette = cycle(px.colors.qualitative.Plotly)
fig = go.Figure()

thr = 0.5

for exp_id in exp_ids:
    df = get_single_exp_df(exp_id, exp_log, exp_path)
    df_plot = df[(df.epoch == 0) & (df.phase == 'testing')]
    y_true = df_plot.target
    y_score = np.array(df_plot.output.values.tolist())[:, 1]
    y_pred = (y_score > thr)*1
    color = next(palette)
    f1 = f1_score(y_true, y_pred)
    mcc = matthews_corrcoef(y_true, y_pred)

    name_f1 = f'F1={f1:.4f}'
    name_mcc = f'MCC={mcc:.4f}'

    fig.add_trace(go.Bar(
        x=['F1', 'MCC'],
        y=[f1, mcc],
        name = name_f1 + ', ' + name_mcc,
        legendgroup=exp_id,
        legendgrouptitle_text=exp_id,
        marker_color = color
    ))

fig.update_yaxes(title_text="Score")
fig.update_layout(
    barmode='group',
    title='F1 and MCC scores for testing set',
    title_x=0.5,
    width=800, height=500)
fig.write_html(os.path.join(comparison_path, 'f1_mcc_thr.5_testing.html'))

In [8]:
######## Timings
fig = go.Figure()

for exp_id in exp_ids:
    start = exp_log.loc[exp_id].start_time
    end = exp_log.loc[exp_id].end_time
    start_dt = datetime.strptime(start, '%d/%b/%Y_%H:%M:%S')
    end_dt = datetime.strptime(end, '%d/%b/%Y_%H:%M:%S')
    time = end_dt - start_dt

    fig.add_trace(go.Bar(
        x=[exp_id],
        y=[time.seconds],
        name = str(time) + ' hours'
    ))

fig.update_yaxes(title_text="Seconds")
fig.update_layout(
    barmode='group',
    title='Total timings for the experiments',
    title_x=0.5,
    width=600, height=500)
fig.write_html(os.path.join(comparison_path, 'timings.html'))