In [2]:
import sys
import os
import inspect
import numpy as np
import pandas as pd
import plotly.express as px
import json
import glob
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
import json

# Extend the import path: take the path reported for the current frame, make it
# absolute, and append the parent of its containing directory to sys.path.
# NOTE(review): inside a notebook the frame's "file" is presumably a synthetic
# cell name, so this likely resolves relative to the working directory — confirm
# the notebook is started from the expected folder.
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))))

# Project-local evaluator; presumably found through the sys.path entry added above.
from chemprop.train.uncertainty_evaluator import UncertaintyEvaluator

from plotly import io as pio

# Make 'ggplot2' the default plotly template for the figures in this notebook.
pio.templates.default = 'ggplot2'

# Quantitative Overview

In [3]:
# Metrics requested from the evaluator and the data sets that are looped over.
methods = ["spearman", "log_likelihood", "calibration_auc"]
data_sources = ["delaney", "freesolv", "lipo", "qm7", "logp"]

# Estimator identifiers grouped by family. Their concatenation fixes the order in
# which estimators are iterated and ranked further down the notebook.
mpnn_estimator_types = [
    "mpnn_ensemble", "mpnn_bootstrap", "mpnn_snapshot", "mpnn_dropout10", "mpnn_dropout20",
    "mpnn_mve", "mpnn_gaussian", "mpnn_random_forest", "mpnn_latent_space", "mpnn_tanimoto",
]
ffn_estimator_types = [
    "ffn_ensemble", "ffn_bootstrap", "ffn_snapshot", "ffn_dropout10", "ffn_dropout20",
    "ffn_mve", "ffn_gaussian", "ffn_random_forest", "ffn_latent_space", "ffn_tanimoto",
]
fp_estimator_types = ["fp_random_forest", "fp_gaussian"]
estimator_types = mpnn_estimator_types + ffn_estimator_types + fp_estimator_types

# Display names, again one group per family, merged into a single lookup table.
mpnn_display_names = {
    "mpnn_mve": "MPNN MVE",
    "mpnn_gaussian": "MPNN GP",
    "mpnn_random_forest": "MPNN RF",
    "mpnn_ensemble": "MPNN Ensemble",
    "mpnn_tanimoto": "MPNN Tanimoto Distance",
    "mpnn_latent_space": "MPNN Latent Distance",
    "mpnn_bootstrap": "MPNN Bootstrap",
    "mpnn_snapshot": "MPNN Snapshot Ensembling",
    "mpnn_dropout10": "MPNN Dropout (10%)",
    "mpnn_dropout20": "MPNN Dropout (20%)",
}
ffn_display_names = {
    "ffn_mve": "FFN MVE",
    "ffn_gaussian": "FFN GP",
    "ffn_random_forest": "FFN RF",
    "ffn_ensemble": "FFN Ensemble",
    "ffn_tanimoto": "FFN Tanimoto Distance",
    "ffn_latent_space": "FFN Latent Distance",
    "ffn_bootstrap": "FFN Bootstrap",
    "ffn_snapshot": "FFN Snapshot Ensembling",
    "ffn_dropout10": "FFN Dropout (10%)",
    "ffn_dropout20": "FFN Dropout (20%)",
}
fp_display_names = {
    "fp_random_forest": "FP RF",
    "fp_gaussian": "FP GP",
}
estimator_name_map = {**mpnn_display_names, **ffn_display_names, **fp_display_names}

# Display names of the data sets, keyed by the identifiers in data_sources.
dataset_name_map = dict([
    ("delaney", "Delaney"),
    ("freesolv", "freesolv"),
    ("lipo", "lipo"),
    ("qm7", "QM7"),
    ("logp", "CLogP"),
])

# Display names in iteration order; a missing key raises KeyError.
proper_estimator_names = list(map(estimator_name_map.__getitem__, estimator_types))
proper_dataset_names = list(map(dataset_name_map.__getitem__, data_sources))

# Display names of the two split strategies.
split_name_map = dict(random="Random Split", scaffold="Scaffold Split")

### Scale

In [10]:
# Build the evaluation table: one row per (estimator, data set, split, log file, task).
#
# Rows are collected in a plain list and turned into a DataFrame once at the end.
# The previous row-by-row DataFrame.append copied the frame on every row and is
# no longer available in pandas >= 2.0.
evaluation_columns = ['Estimator',
                      'Data Set',
                      'Task',
                      'Split',
                      'Spearman\'s Coefficient',
                      'NLL',
                      'Average NLL',
                      'Calibrated NLL',
                      'Average Calibrated NLL',
                      'Calibration Slope',
                      'Calibration Intercept',
                      'Optimal NLL',
                      'Average Optimal NLL',
                      'Miscalibration Area',
                      'File Path']

# Directory prefix shared by every uncalibrated log. 'File Path' stores the part
# of each file name that follows this prefix (previously a hard-coded [39:] slice).
uncalibrated_root = '../uncertainty_evaluation/uncalibrated/'

evaluation_records = []
for estimator in estimator_types:
    for data_source in data_sources:
        for split in ["random", "scaffold"]:
            uncalibrated_path = f'{uncalibrated_root}{estimator}/{data_source}/{split}/*.txt'
            for uncalibrated_file in glob.glob(uncalibrated_path):
                all_evaluations = UncertaintyEvaluator.evaluate(uncalibrated_file, methods)
                core_path = uncalibrated_file[len(uncalibrated_root):]
                for task, task_evaluations in all_evaluations.items():
                    log_likelihood = task_evaluations["log_likelihood"]
                    evaluation_records.append({
                        'Estimator': estimator_name_map[estimator],
                        'Data Set': dataset_name_map[data_source],
                        'Task': task,
                        'Split': split_name_map[split],
                        'Spearman\'s Coefficient': task_evaluations["spearman"]["rho"],
                        # Log-likelihoods are negated so the table holds negative log-likelihoods.
                        'NLL': -1 * log_likelihood["log_likelihood"],
                        'Average NLL': -1 * log_likelihood["average_log_likelihood"],
                        # Float placeholders (value 0, as before); the columns are
                        # overwritten with floats once calibrated values are known.
                        'Calibrated NLL': 0.0,
                        'Average Calibrated NLL': 0.0,
                        'Optimal NLL': -1 * log_likelihood["optimal_log_likelihood"],
                        'Average Optimal NLL': -1 * log_likelihood["average_optimal_log_likelihood"],
                        'Miscalibration Area': task_evaluations["calibration_auc"]["miscalibration_area"],
                        'File Path': core_path,
                    })

# Passing `columns` keeps the column order and creates the columns that no record
# fills in ('Calibration Slope', 'Calibration Intercept'), even when no file matched.
evaluations_df = pd.DataFrame(evaluation_records, columns=evaluation_columns)

In [12]:
# Calibrate every uncalibrated log, store the calibrated log under a mirrored
# directory tree, re-evaluate it and write the calibrated metrics back into
# the matching row of evaluations_df.
uncalibrated_root = '../uncertainty_evaluation/uncalibrated/'
calibrated_root = '../uncertainty_evaluation/calibrated/'

for estimator in estimator_types:
    for data_source in data_sources:
        for split in ["random", "scaffold"]:
            path = f'{uncalibrated_root}{estimator}/{data_source}/{split}/*.txt'

            for file in glob.glob(path):
                # Part of the path below the root; this is the value stored in 'File Path'.
                core_path = file[len(uncalibrated_root):]
                new_path = calibrated_root + core_path

                # NOTE(review): the two lambdas and [1, 0] are presumably the basis
                # functions (linear term, constant term) and their starting
                # coefficients — confirm against UncertaintyEvaluator.calibrate.
                calibrated_log, coefficients = UncertaintyEvaluator.calibrate([lambda x: x, lambda x: 1], [1, 0], file)

                # exist_ok avoids the check-then-create race of a separate exists() test.
                os.makedirs(os.path.dirname(new_path), exist_ok=True)
                # The context manager closes the file even if json.dump raises.
                with open(new_path, 'w') as f:
                    json.dump(calibrated_log, f)

                all_evaluations = UncertaintyEvaluator.evaluate(new_path, methods)
                for task, task_evaluations in all_evaluations.items():
                    # One combined mask instead of chained boolean indexing, which
                    # reindexes the second mask and emits a pandas warning.
                    row_mask = (evaluations_df["Task"] == task) & (evaluations_df["File Path"] == core_path)
                    # First matching row; raises IndexError if the row is missing.
                    index = evaluations_df.loc[row_mask].index[0]
                    calibrated_nll = -1 * task_evaluations["log_likelihood"]["log_likelihood"]
                    average_calibrated_nll = -1 * task_evaluations["log_likelihood"]["average_log_likelihood"]
                    evaluations_df.at[index, "Calibrated NLL"] = calibrated_nll
                    evaluations_df.at[index, "Average Calibrated NLL"] = average_calibrated_nll
                    # The first coefficient is stored as the slope, the second as the intercept.
                    evaluations_df.at[index, "Calibration Slope"] = coefficients[task][0]
                    evaluations_df.at[index, "Calibration Intercept"] = coefficients[task][1]



























In [13]:
# Distance of each average NLL (raw and calibrated) from the average optimal NLL.
average_optimal_nll = evaluations_df["Average Optimal NLL"]
for difference_column, source_column in (
    ("Average NLL Difference", "Average NLL"),
    ("Average Calibrated NLL Difference", "Average Calibrated NLL"),
):
    evaluations_df[difference_column] = evaluations_df[source_column] - average_optimal_nll

In [14]:
# Rank of every display name inside its reference list, used as a sort key.
estimator_rank = {name: proper_estimator_names.index(name) for name in proper_estimator_names}
dataset_rank = {name: proper_dataset_names.index(name) for name in proper_dataset_names}

# Names missing from the lookup tables map to NaN.
evaluations_df["Estimator Order"] = evaluations_df["Estimator"].map(estimator_rank)
evaluations_df["Data Set Order"] = evaluations_df["Data Set"].map(dataset_rank)

In [15]:
# Order rows by data set first and by estimator second, using the rank columns.
sort_keys = ["Data Set Order", "Estimator Order"]
evaluations_df = evaluations_df.sort_values(by=sort_keys)

In [16]:
# Show the first rows of the sorted evaluation table.
evaluations_df.head()

Unnamed: 0,Estimator,Data Set,Task,Split,Spearman's Coefficient,NLL,Average NLL,Calibrated NLL,Average Calibrated NLL,Calibration Slope,Calibration Intercept,Optimal NLL,Average Optimal NLL,Miscalibration Area,File Path,Average NLL Difference,Average Calibrated NLL Difference,Estimator Order,Data Set Order
0,MPNN Ensemble,Delaney,logSolubility,Random Split,0.253664,405.518614,1.19622,574.827,1.69565,0.1127798,0.2800009,117.22324,0.345791,0.14115,mpnn_ensemble/delaney/random/7.txt,0.850429,1.34986,0,0
1,MPNN Ensemble,Delaney,logSolubility,Random Split,0.157046,5172.95401,15.259451,493.965,1.45712,34.51976,7.45057e-09,104.260407,0.307553,0.360709,mpnn_ensemble/delaney/random/4.txt,14.951899,1.14957,0,0
2,MPNN Ensemble,Delaney,logSolubility,Random Split,0.188756,5374.731072,15.854664,506.3,1.49351,34.25046,0.02074724,90.086407,0.265742,0.36964,mpnn_ensemble/delaney/random/5.txt,15.588922,1.22777,0,0
3,MPNN Ensemble,Delaney,logSolubility,Random Split,0.238892,5305.769192,15.651237,409.633,1.20836,6.886822e-10,1.018857,99.141964,0.292454,0.368319,mpnn_ensemble/delaney/random/3.txt,15.358782,0.915903,0,0
4,MPNN Ensemble,Delaney,logSolubility,Random Split,0.272549,4612.547933,13.606336,408.385,1.20468,3.541902e-10,1.019571,78.659933,0.232035,0.351883,mpnn_ensemble/delaney/random/6.txt,13.374301,0.972641,0,0


In [28]:
# Display names of the distance-based estimators (Tanimoto and latent distance)
# for both model families, MPNN first.
uncalibrated_evaluation_methods = []
for family in ("MPNN", "FFN"):
    uncalibrated_evaluation_methods.extend([f"{family} Tanimoto Distance",
                                            f"{family} Latent Distance"])

In [39]:
# Copy of the evaluation table in which the pre-calibration metrics of the
# estimators listed in uncalibrated_evaluation_methods are blanked out (NaN).
precalibrated_evaluations_df = evaluations_df.copy()

# Columns that hold metrics computed before calibration.
precalibration_columns = ["NLL", "Average NLL", "Average NLL Difference", "Miscalibration Area"]

# Assign NaN directly to the affected rows. The previous approach multiplied by a
# boolean mask and then replaced every 0 with NaN, which also erased genuine zero
# values of other estimators, and relied on `df[col].replace(..., inplace=True)`,
# which does not write back to the frame under pandas Copy-on-Write.
is_masked_estimator = precalibrated_evaluations_df["Estimator"].isin(uncalibrated_evaluation_methods)
precalibrated_evaluations_df.loc[is_masked_estimator, precalibration_columns] = np.nan

In [40]:
# Panel letters 'a' .. 'z'.
labels = [chr(code) for code in range(ord('a'), ord('z') + 1)]

# One paper-referenced annotation per letter, pinned to the left edge (x=0);
# y starts at 1 for the first letter and drops by 1/5 for each following one.
annotations = [
    dict(text=letter,
         x=0,
         y=(5-position)/5,
         xref="paper",
         yref="paper",
         showarrow=False,
         textangle=0)
    for position, letter in enumerate(labels)
]

In [41]:
def update_label(a):
    """Prefix a data-set facet title with its bold panel letter.

    ``a.text`` is split on ``"="`` and the second piece is taken as the data set
    name. The letter is the entry of ``labels`` at the position that name has in
    ``proper_dataset_names``. Returns the result of ``a.update``.
    """
    dataset = a.text.split("=")[1]
    position = proper_dataset_names.index(dataset)
    label = labels[position]
    new_text = f'<b>({label})</b> {dataset}'
    return a.update(text=new_text)

In [42]:
def update_estimator_label(a):
    """Prefix an estimator facet title with its bold panel letter.

    ``a.text`` is split on ``"="`` and the second piece is taken as the estimator
    name. The letter is the entry of ``labels`` at the position that name has in
    ``proper_estimator_names``. Returns the result of ``a.update``.
    """
    estimator = a.text.split("=")[1]
    position = proper_estimator_names.index(estimator)
    label = labels[position]
    new_text = f'<b>({label})</b> {estimator}'
    return a.update(text=new_text)

In [76]:
# Box plot of Spearman's coefficient per estimator: one facet row per data set,
# one colour per split.
spearman_box_options = dict(
    x="Estimator",
    y="Spearman's Coefficient",
    color="Split",
    facet_row="Data Set",
    points=False,
    height=468*2,
    width=234*2,
    color_discrete_sequence=["#f49a94", "#50c2f2"],
    range_y=[-0.35, 0.8],
    labels={"Spearman's Coefficient": "\u03C1"},
)
fig = px.box(evaluations_df, **spearman_box_options)

# Horizontal, centred legend below the plot and tight margins.
legend_style = dict(xanchor='center', x=0.5, y=-0.25, orientation='h', title=None)
plot_margin = go.layout.Margin(l=25, r=25, b=0, t=10, pad=0)
fig.update_layout(legend=legend_style,
                  margin=plot_margin,
                  autosize=False,
                  font=dict(size=12))

fig.update_xaxes(ticks="outside", tickson="boundaries", title=None)

# Fixed y ticks every 0.25, starting at 0.
fig.update_yaxes(tickmode='linear', tick0=0.0, dtick=0.25)

# Replace each facet title with "(letter) data set".
fig.for_each_annotation(update_label)

fig.show()

In [77]:
# Box plot of the miscalibration area per estimator, drawn from the
# pre-calibration table: one facet row per data set, one colour per split.
miscalibration_box_options = dict(
    x="Estimator",
    y="Miscalibration Area",
    color="Split",
    facet_row="Data Set",
    points=False,
    height=468*2,
    width=234*2,
    color_discrete_sequence=["#f49a94", "#50c2f2"],
    range_y=[0, 0.5],
    labels={"Miscalibration Area": "AUC"},
)
fig = px.box(precalibrated_evaluations_df, **miscalibration_box_options)

# Horizontal, centred legend below the plot and tight margins.
legend_style = dict(xanchor='center', x=0.5, y=-0.25, orientation='h', title=None)
plot_margin = go.layout.Margin(l=25, r=25, b=0, t=10, pad=0)
fig.update_layout(legend=legend_style,
                  margin=plot_margin,
                  autosize=False,
                  font=dict(size=12))

fig.update_xaxes(ticks="outside", tickson="boundaries", title=None)

# Fixed y ticks every 0.125, starting at 0.
fig.update_yaxes(tickmode='linear', tick0=0.0, dtick=0.125)

# Replace each facet title with "(letter) data set".
fig.for_each_annotation(update_label)

fig.show()

In [78]:
# Box plot of the average NLL per estimator, drawn from the pre-calibration
# table: one facet row per data set, one colour per split.
average_nll_box_options = dict(
    x="Estimator",
    y="Average NLL",
    color="Split",
    facet_row="Data Set",
    points=False,
    height=468*2,
    width=234*2,
    color_discrete_sequence=["#f49a94", "#50c2f2"],
    labels={"Average NLL": "NLL"},
)
fig = px.box(precalibrated_evaluations_df, **average_nll_box_options)

fig.update_xaxes(ticks="outside", tickson="boundaries", title=None)

# Let every facet row use its own y range.
fig.update_yaxes(matches=None)

# Horizontal, centred legend below the plot and tight margins.
legend_style = dict(xanchor='center', x=0.5, y=-0.25, orientation='h', title=None)
plot_margin = go.layout.Margin(l=25, r=25, b=0, t=10, pad=0)
fig.update_layout(legend=legend_style,
                  margin=plot_margin,
                  autosize=False,
                  font=dict(size=12))

# Replace each facet title with "(letter) data set".
fig.for_each_annotation(update_label)

fig.show()

In [79]:
# Box plot of the average calibrated NLL per estimator: one facet row per data
# set, one colour per split.
calibrated_nll_box_options = dict(
    x="Estimator",
    y="Average Calibrated NLL",
    color="Split",
    facet_row="Data Set",
    points=False,
    height=468*2,
    width=234*2,
    color_discrete_sequence=["#f49a94", "#50c2f2"],
    labels={"Average Calibrated NLL": "cNLL"},
)
fig = px.box(evaluations_df, **calibrated_nll_box_options)

fig.update_xaxes(ticks="outside", tickson="boundaries", title=None)

# Let every facet row use its own y range.
fig.update_yaxes(matches=None)

# Horizontal, centred legend below the plot and tight margins.
legend_style = dict(xanchor='center', x=0.5, y=-0.25, orientation='h', title=None)
plot_margin = go.layout.Margin(l=25, r=25, b=0, t=10, pad=0)
fig.update_layout(legend=legend_style,
                  margin=plot_margin,
                  autosize=False,
                  font=dict(size=12))

# Replace each facet title with "(letter) data set".
fig.for_each_annotation(update_label)

fig.show()

## Calibration Coefficients

In [50]:
def _cap_slope(slope):
    """Clamp a calibration slope to [0, 10]; values inside the range pass through unchanged."""
    if slope > 10:
        return 10
    if slope < 0:
        return 0
    return slope


evaluations_df["Capped Calibration Slope"] = evaluations_df["Calibration Slope"].apply(_cap_slope)

In [51]:
# Histogram of the capped calibration slope, one facet per estimator (four
# facets per row), coloured by split.
# NOTE(review): the labels mapping renames "Calibrated NLL", which is not one of
# the plotted columns, so it appears to have no visible effect here — confirm.
slope_histogram_options = dict(
    x="Capped Calibration Slope",
    color="Split",
    facet_col="Estimator",
    facet_col_wrap=4,
    height=468*2,
    width=234*4,
    nbins=20,
    color_discrete_sequence=["#f49a94", "#50c2f2"],
    labels={"Calibrated NLL": "cNLL"},
)
fig = px.histogram(evaluations_df, **slope_histogram_options)

# Replace each facet title with "(letter) estimator".
fig.for_each_annotation(update_estimator_label)

fig.show()

## NLL Differences

In [80]:
# Box plot of the average NLL difference per estimator, drawn from the
# pre-calibration table: one facet row per data set, one colour per split.
nll_difference_box_options = dict(
    x="Estimator",
    y="Average NLL Difference",
    color="Split",
    facet_row="Data Set",
    points=False,
    height=468*2,
    width=234*2,
    color_discrete_sequence=["#f49a94", "#50c2f2"],
    labels={"Average NLL Difference": "NLL Difference"},
)
fig = px.box(precalibrated_evaluations_df, **nll_difference_box_options)

fig.update_xaxes(ticks="outside", tickson="boundaries", title=None)

# Let every facet row use its own y range.
fig.update_yaxes(matches=None)

# Horizontal, centred legend below the plot and tight margins.
legend_style = dict(xanchor='center', x=0.5, y=-0.25, orientation='h', title=None)
plot_margin = go.layout.Margin(l=25, r=25, b=0, t=10, pad=0)
fig.update_layout(legend=legend_style,
                  margin=plot_margin,
                  autosize=False,
                  font=dict(size=12))

# Replace each facet title with "(letter) data set".
fig.for_each_annotation(update_label)

fig.show()

In [71]:
# Box plot of the average calibrated NLL difference per estimator, drawn from
# the pre-calibration table: one facet row per data set, one colour per split.
calibrated_difference_box_options = dict(
    x="Estimator",
    y="Average Calibrated NLL Difference",
    color="Split",
    facet_row="Data Set",
    points=False,
    height=468*2,
    width=234*2,
    color_discrete_sequence=["#f49a94", "#50c2f2"],
    labels={"Average Calibrated NLL Difference": "cNLL Difference"},
)
fig = px.box(precalibrated_evaluations_df, **calibrated_difference_box_options)

fig.update_xaxes(ticks="outside", tickson="boundaries", title=None)

# Let every facet row use its own y range.
fig.update_yaxes(matches=None)

# Horizontal, centred legend below the plot and tight margins.
legend_style = dict(xanchor='center', x=0.5, y=-0.25, orientation='h', title=None)
plot_margin = go.layout.Margin(l=25, r=25, b=0, t=10, pad=0)
fig.update_layout(legend=legend_style,
                  margin=plot_margin,
                  autosize=False,
                  font=dict(size=12))

# Replace each facet title with "(letter) data set".
fig.for_each_annotation(update_label)

fig.show()

## Slides

In [87]:
# RMSE of the last x% of each task's `sets_by_uncertainty` list, averaged over
# the log files of every (estimator, data set) pair on the random split.
# NOTE(review): taking the tail of the list presumably selects the most
# confident predictions — confirm the sort direction of `sets_by_uncertainty`.
top5_columns = ['Estimator',
                'Data Set',
                'Selection',
                'RMSE']

estimators = ["mpnn_ensemble", "mpnn_mve", "mpnn_tanimoto", "mpnn_random_forest",
              "ffn_ensemble", "ffn_mve", "ffn_tanimoto", "ffn_random_forest"]
split = "random"
percents = [100, 50, 25, 10, 5]

# Rows are collected in a list and turned into a DataFrame once at the end;
# DataFrame.append is no longer available in pandas >= 2.0.
top5_records = []

for estimator in estimators:
    for data_source in data_sources:
        uncalibrated_path = f'../uncertainty_evaluation/uncalibrated/{estimator}/{data_source}/{split}/*.txt'
        uncalibrated_files = glob.glob(uncalibrated_path)

        avg_rmse = {percent: 0 for percent in percents}
        for uncalibrated_file in uncalibrated_files:
            # The context manager closes the file even if parsing fails.
            with open(uncalibrated_file) as f:
                test_log = json.load(f)["test"]

            # NOTE(review): contributions are summed over tasks but divided only by
            # the number of files, so a multi-task log would yield a sum across
            # tasks rather than a mean — confirm this is intended.
            for task, task_info in test_log.items():
                sets_by_uncertainty = task_info["sets_by_uncertainty"]
                if not sets_by_uncertainty:
                    # Nothing to select from; contributes 0, as before.
                    continue

                for percent in percents:
                    # Keep at least one entry: a count of 0 would make the slice
                    # below select the whole list and then divide by zero.
                    topx = max(1, int(len(sets_by_uncertainty) * percent / 100))

                    mse = sum(set_["error"]**2 / topx for set_ in sets_by_uncertainty[-topx:])

                    avg_rmse[percent] += np.sqrt(mse) / len(uncalibrated_files)

        for percent in percents:
            top5_records.append({'Estimator': estimator_name_map[estimator],
                                 'Data Set': dataset_name_map[data_source],
                                 'Selection': f'Top {percent}%',
                                 'RMSE': avg_rmse[percent]})

top5_df = pd.DataFrame(top5_records, columns=top5_columns)

In [93]:
# Bar chart of RMSE per estimator, coloured by selection, one facet row per data set.
rmse_bar_options = dict(
    x="Estimator",
    y="RMSE",
    color="Selection",
    height=468*2,
    width=234*2,
    facet_row='Data Set',
)
fig = px.bar(top5_df, **rmse_bar_options)

# Let every facet row use its own y range.
fig.update_yaxes(matches=None)

# Grouped bar mode, horizontal centred legend below the plot, tight margins.
legend_style = dict(xanchor='center', x=0.5, y=-0.2, orientation='h', title=None)
plot_margin = go.layout.Margin(l=25, r=25, b=0, t=10, pad=0)
fig.update_layout(legend=legend_style,
                  barmode="group",
                  margin=plot_margin,
                  autosize=False,
                  font=dict(size=12))

# Replace each facet title with "(letter) data set".
fig.for_each_annotation(update_label)

fig.show()