# Metrics Analysis Notebook

This notebook mirrors the cleaned workflow from `scripts/metrics_analysis.py`. It:

- Loads the metrics CSVs for the selected activity type (`ACTIVITY_TYPE`).
- Deduplicates and preprocesses the DataFrame.
- Aggregates calibration data.
- Produces correlation heatmaps, grouped bar plots, calibration curves, and RMSE rejection curves.

Use this for interactive exploration; for batch runs, prefer the script.

## Configuration

In [None]:
import os

import pandas as pd

from uqdd.metrics import (
    group_cols, numeric_cols, string_cols, order_by,
    group_order_no_time, hatches_dict_no_time,
    accmetrics, accmetrics2, uctmetrics,
    aggregate_results_csv,
    find_highly_correlated_metrics, plot_metrics, plot_comparison_metrics,
    plot_calibration_data, plot_rmse_rejection_curves, plot_auc_comparison,
    save_stats_df, plot_pairplot
)

In [None]:
# --- Run configuration ---
# Everything below (input files, output folders, colour maps) is derived from these values.
DATA_NAME = "papyrus"
ACTIVITY_TYPE = "xc50"  # or 'kx'
PROJECT_NAME = "notebook-run"
COLOR_MAP = "tab10_r"
COLOR_MAP_2 = None  # defaults to COLOR_MAP if None
CORR_CMAP = "YlGnBu"
SHOW_LEGEND = True

COLOR_MAP_2 = COLOR_MAP if COLOR_MAP_2 is None else COLOR_MAP_2

# Paths (relative to repo root)
# If the working directory is named 'notebooks', the root is its grandparent; otherwise its parent.
# NOTE(review): this assumes where the kernel is launched from — confirm against the repo layout.
_cwd = os.path.abspath(os.getcwd())
if os.path.basename(_cwd) == 'notebooks':
    repo_root = os.path.dirname(os.path.dirname(_cwd))
else:
    repo_root = os.path.dirname(_cwd)
base_path = os.path.join(repo_root, "uqdd", "figures")

project_out_name = PROJECT_NAME
TYPE_N_TARGETS = "all"
DATA_SPECIFIC_PATH = f"{DATA_NAME}/{ACTIVITY_TYPE}/{TYPE_N_TARGETS}"

# Input metrics files. They are built from the same DATA_NAME / TYPE_N_TARGETS settings as the
# output folders, so changing the configuration keeps inputs and outputs pointing at one dataset.
input_dir = os.path.join(base_path, DATA_NAME, ACTIVITY_TYPE, TYPE_N_TARGETS)
file_1 = os.path.join(input_dir, f"reassess-runs_ensemble_mcdp_{ACTIVITY_TYPE}", "metrics.csv")
file_2 = os.path.join(input_dir, f"reassess-runs_evidential_{ACTIVITY_TYPE}", "metrics.csv")
file_3 = os.path.join(input_dir, f"reassess-runs_pnn_{ACTIVITY_TYPE}", "metrics.csv")

# Output folders: one for all splits, one for the analysis that leaves out the 'time' split.
save_dir = os.path.join(base_path, DATA_SPECIFIC_PATH, project_out_name, COLOR_MAP)
save_dir_no_time = os.path.join(base_path, DATA_SPECIFIC_PATH, f"{project_out_name}-no-time", COLOR_MAP)

for p in [save_dir, save_dir_no_time]:
    os.makedirs(p, exist_ok=True)

print("Input files:", file_1, file_2, file_3, sep='\n')
print("Output dirs:", save_dir, save_dir_no_time, sep='\n')

## Load and preprocess

In [None]:
# Read the three metrics tables in order; the first row of each CSV holds the column names.
df_1, df_2, df_3 = (pd.read_csv(csv_path, header=0) for csv_path in (file_1, file_2, file_3))

In [None]:
# Stack the three tables into one frame.
df_main = pd.concat([df_1, df_2, df_3])

# Rows sharing the same 'wandb run' and 'Task' are duplicates; every occurrence after the
# first is flagged, counted, and then removed.
is_repeat = df_main.duplicated(subset=["wandb run", "Task"], keep="first")
num_duplicates = is_repeat.sum()
print(f"Duplicates found: {num_duplicates}")

df_main = df_main[~is_repeat]

In [None]:
# Harmonize split names: rows labelled "random" are relabelled "stratified"; all others are kept.
df_main["Split"] = df_main["Split"].mask(df_main["Split"] == "random", "stratified")

In [None]:
# Remove specific MCDropout rows.
# Each (split, dropout) pair below names one MC-Dropout configuration that is dropped.
excluded_mcdropout_runs = [
    ("scaffold_cluster", 0.2),
    ("stratified", 0.1),
    ("time", 0.1),
]

df_merged = df_main.copy()
for excluded_split, excluded_dropout in excluded_mcdropout_runs:
    is_excluded = (
        (df_merged["Model type"] == "mcdropout")
        & (df_merged["Split"] == excluded_split)
        & (df_merged["dropout"] == excluded_dropout)
    )
    df_merged = df_merged[~is_excluded]

# Label every row as "<Split>_<Model type>".
df_merged["Group"] = df_merged.apply(lambda rec: f"{rec['Split']}_{rec['Model type']}", axis=1)

In [None]:
# Task subsets: split the merged table by the value in its 'Task' column.
task = df_merged["Task"]

df_pcm = df_merged[task == "PCM"].copy()

# Tag the before/after-calibration rows so they can be told apart once concatenated.
df_before_calib = df_merged[task == "PCM_before_calibration"].assign(Calibration="Before Calibration")
df_after_calib = df_merged[task == "PCM_after_calibration_with_isotonic_regression"].assign(
    Calibration="After Calibration"
)
df_calib = pd.concat([df_before_calib, df_after_calib])

# Variants without the 'time' split.
df_calib_no_time = df_calib.loc[df_calib["Split"] != "time"].copy()
df_no_time = df_pcm.loc[df_pcm["Split"] != "time"].copy()

# Save both PCM tables (without the index column) and report their shapes.
df_pcm.to_csv(os.path.join(save_dir, "final.csv"), index=False)
df_no_time.to_csv(os.path.join(save_dir_no_time, "final_no_time.csv"), index=False)
print(df_pcm.shape, df_no_time.shape)

## Aggregate calibration data

In [None]:
# Destination CSVs for the aggregated tables, one per output folder.
output_file_path = os.path.join(save_dir, "final_aggregated.csv")
output_file_path_no_time = os.path.join(save_dir_no_time, "final_aggregated_no_time.csv")

In [None]:
def _split_model_label(row):
    """Return the "<Split>_<Model type>" label for one row."""
    return f"{row['Split']}_{row['Model type']}"


# Aggregate the PCM results (all splits); the output CSV path is passed to the helper.
final_aggregated = aggregate_results_csv(
    df_pcm, group_cols, numeric_cols, string_cols, order_by, output_file_path
)
final_aggregated["Group"] = final_aggregated.apply(_split_model_label, axis=1)

# Same aggregation for the table without the 'time' split.
final_aggregated_no_time = aggregate_results_csv(
    df_no_time, group_cols, numeric_cols, string_cols, order_by, output_file_path_no_time
)
final_aggregated_no_time["Group"] = final_aggregated_no_time.apply(_split_model_label, axis=1)

# Preview the first rows of the aggregated table.
final_aggregated.head()

## Correlation analysis

In [None]:
# Settings shared by every correlation run in this cell.
corr_kwargs = dict(threshold=0.9, cmap=CORR_CMAP, show_legend=SHOW_LEGEND)

# Accuracy metrics: all splits, then without the 'time' split.
find_highly_correlated_metrics(df_pcm, accmetrics, save_dir=save_dir, **corr_kwargs)
find_highly_correlated_metrics(df_no_time, accmetrics, save_dir=save_dir_no_time, **corr_kwargs)

# Full set of uncertainty metrics.
find_highly_correlated_metrics(df_pcm, uctmetrics, save_dir=save_dir, **corr_kwargs)
find_highly_correlated_metrics(df_no_time, uctmetrics, save_dir=save_dir_no_time, **corr_kwargs)

# Hand-picked subset of the uncertainty metrics, run through the same check.
uctmetrics_uncorr = ["Miscalibration Area", "Sharpness", "CRPS", "NLL", "Interval"]
find_highly_correlated_metrics(df_pcm, uctmetrics_uncorr, save_dir=save_dir, **corr_kwargs)
find_highly_correlated_metrics(df_no_time, uctmetrics_uncorr, save_dir=save_dir_no_time, **corr_kwargs)

## Grouped bar plots

In [None]:
# Options common to all three bar plots; only the metric list and figure width vary.
bar_kwargs = dict(
    cmap=COLOR_MAP,
    save_dir=save_dir_no_time,
    hatches_dict=hatches_dict_no_time,
    group_order=group_order_no_time,
    fig_height=3,
    show_legend=SHOW_LEGEND,
)

# The value returned by the first call is kept as color_dict and reused by later cells.
color_dict = plot_metrics(df_no_time, accmetrics, fig_width=12, **bar_kwargs)
plot_metrics(df_no_time, accmetrics2, fig_width=6, **bar_kwargs)

# Defined again here so this cell can run without the correlation cell.
uctmetrics_uncorr = ["Miscalibration Area", "Sharpness", "CRPS", "NLL", "Interval"]
plot_metrics(df_no_time, uctmetrics_uncorr, fig_width=10, **bar_kwargs)

## Pairplots

In [None]:
# Options shared by both pairplots (data without the 'time' split).
pairplot_kwargs = dict(
    save_dir=save_dir_no_time,
    cmap=COLOR_MAP,
    group_order=group_order_no_time,
    show_legend=SHOW_LEGEND,
)

plot_pairplot(df_no_time, "Pairplot for Accuracy Metrics", accmetrics, **pairplot_kwargs)
plot_pairplot(df_no_time, "Pairplot for Uncertainty Metrics", uctmetrics, **pairplot_kwargs)

## Calibration curves

In [None]:
# Not referenced anywhere else in this cell.
models_order = ["pnn", "ensemble", "mcdropout", "evidential", "eoe", "emc"]

# Before/after-calibration comparison of the selected uncertainty metrics.
plot_comparison_metrics(
    df_calib_no_time,
    ["Miscalibration Area", "Sharpness", "NLL", "CRPS", "Interval"],
    cmap=COLOR_MAP,
    color_dict=color_dict,
    save_dir=save_dir_no_time,
    fig_width=20,
    fig_height=3,
    show_legend=SHOW_LEGEND,
)

# Options shared by every calibration-curve figure below.
calib_kwargs = dict(
    color_name=COLOR_MAP_2,
    group_order=group_order_no_time,
    fig_width=5,
    fig_height=5,
    show_legend=SHOW_LEGEND,
)

# Overall calibration
plot_calibration_data(final_aggregated_no_time, base_path, save_dir_no_time,
                      title="Calibration Curves for Models", **calib_kwargs)

# Per-group: one figure per aggregated row, passed as a one-row DataFrame.
for row_pos, group_label in enumerate(final_aggregated_no_time["Group"]):
    df_sub = final_aggregated_no_time.iloc[[row_pos]]
    plot_calibration_data(df_sub, base_path, save_dir_no_time,
                          title=f"Calibration Curves for {group_label}", **calib_kwargs)

# Per-split: one figure per distinct value in the 'Split' column.
for split_name in final_aggregated_no_time["Split"].unique():
    df_split = final_aggregated_no_time[final_aggregated_no_time["Split"] == split_name]
    plot_calibration_data(df_split, base_path, save_dir_no_time,
                          title=f"Calibration Curves for {split_name}", **calib_kwargs)

## RMSE Rejection curves + AUC comparison

In [None]:

save_dir_plot = os.path.join(save_dir_no_time, "rrcs")
os.makedirs(save_dir_plot, exist_ok=True)

uct_types = ["aleatoric", "epistemic", "both"]

# Per split (excluding time)
df_pcm_stratified = df_pcm[df_pcm["Split"] == "stratified"]
df_pcm_scaffold = df_pcm[df_pcm["Split"] == "scaffold_cluster"]

# Each entry is (title prefix, data, width of the AUC bar plot). The combined no-time data
# runs first under the prefix "all", followed by the two individual splits.
rrc_inputs = [
    ("all", df_no_time, 4),
    ("stratified", df_pcm_stratified, 2),
    ("scaffold", df_pcm_scaffold, 2),
]

for name, df_sub, auc_fig_width in rrc_inputs:
    # AUC plot options that stay fixed for this dataset.
    auc_kwargs = dict(
        cmap=COLOR_MAP,
        color_dict=color_dict,
        save_dir=save_dir_plot,
        hatches_dict=hatches_dict_no_time,
        group_order=group_order_no_time,
        fig_width=auc_fig_width,
        fig_height=3,
        show_legend=SHOW_LEGEND,
    )
    for uct_t in uct_types:
        for normalize_rmse in [True, False]:
            # Suffix encodes the normalisation flag and the uncertainty type.
            add_to_title = ("-normalized" if normalize_rmse else "") + "-" + uct_t
            run_title = name + add_to_title

            stats_df = plot_rmse_rejection_curves(
                df_sub,
                base_path,
                cmap=COLOR_MAP_2,
                save_dir_plot=save_dir_plot,
                add_to_title=run_title,
                normalize_rmse=normalize_rmse,
                unc_type=uct_t,
                max_rejection_ratio=0.95,
                group_order=group_order_no_time,
                fig_width=6,
                fig_height=5,
                show_legend=SHOW_LEGEND,
            )

            # Two AUC comparisons per run: one with default options, one passing min_y_axis=0.5.
            plot_auc_comparison(stats_df, add_to_title=run_title, **auc_kwargs)
            plot_auc_comparison(stats_df, add_to_title=run_title + "-min-0.5", min_y_axis=0.5, **auc_kwargs)

            save_stats_df(stats_df, save_dir_plot, add_to_title=run_title)

## Special MC Dropout Experiment

In [None]:
# Restrict to the MC-Dropout rows once, then split them by task.
mcdp_rows = df_merged[df_merged["Model type"] == "mcdropout"]
mcdp_task = mcdp_rows["Task"]

mcdp_before_calib = mcdp_rows[mcdp_task == "PCM_before_calibration"].copy()
mcdp_after_calib_iso = mcdp_rows[mcdp_task == "PCM_after_calibration_with_isotonic_regression"].copy()
mcdp_after_calib_std = mcdp_rows[mcdp_task == "PCM_after_calibration_with_std_recalibrator"].copy()

# Plot order of the MC-Dropout groups ("<Split>_<Model type>" labels).
mc_group_order = ["stratified_mcdropout", "scaffold_cluster_mcdropout", "time_mcdropout"]



In [None]:
# Repeat the bar plots and pairplot for the MC-Dropout (MCDP) runs only.
#
# NOTE(review): these plot_metrics calls used to pass a title string as the second positional
# argument together with plot_type='bar'. Every other plot_metrics call in this notebook passes
# the metrics list second and no title/plot_type, so the calls are aligned with that shape here.
# Confirm against the plot_metrics signature in uqdd.metrics.
# NOTE(review): all three calls write to save_dir; check that their output files do not collide.

# Accuracy Metrics Before Calibration (MCDP)
plot_metrics(mcdp_before_calib, accmetrics, save_dir=save_dir, group_order=mc_group_order)

# Uncertainty Metrics After Calibration (Isotonic Regression) 1 (MCDP)
plot_metrics(mcdp_after_calib_iso, uctmetrics, save_dir=save_dir, group_order=mc_group_order)

# Uncertainty Metrics After Calibration (Std Recalibrator) 1 (MCDP)
plot_metrics(mcdp_after_calib_std, uctmetrics, save_dir=save_dir, group_order=mc_group_order)

# Pair plot (title second, metrics third — same shape as the other plot_pairplot calls)
plot_pairplot(mcdp_before_calib, 'Pairplot for Accuracy Metrics (MCDP)', accmetrics, save_dir=save_dir)

# Comparing Calibration Metrics Before and After Calibration in one plot
# Add a column to indicate calibration state
mcdp_before_calib['Calibration'] = 'Before Calibration'
mcdp_after_calib_iso['Calibration'] = 'After Isotonic Regression'
mcdp_after_calib_std['Calibration'] = 'After Std Recalibrator'

# Combine all data into one DataFrame
mcdp_combined = pd.concat([mcdp_before_calib, mcdp_after_calib_iso, mcdp_after_calib_std])


## Backup and Experimental Code

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.table import Table

# Data preparation: one row per parameter with two signed values each.
data = {
    'Parameter': ['weight_decay', 'batch_size', 'lr', 'dropout'],
    'Value1': [0.424, 0.287, 0.210, 0.079],
    'Value2': [-0.185, -0.093, 0.081, -0.204]
}
df = pd.DataFrame(data)

# Blank axes used only to host the table.
fig, ax = plt.subplots(figsize=(10, 6))
ax.axis('tight')
ax.axis('off')

# Table filling the whole axes, with a fixed 12 pt font.
tbl = Table(ax, bbox=[0, 0, 1, 1])
tbl.auto_set_font_size(False)
tbl.set_fontsize(12)

# Cell geometry: equal-width columns, plus one extra row for the header.
n_rows, n_cols = df.shape
width = 1.0 / n_cols
height = 1.0 / (n_rows + 1)

# Header row (table row 0) shows the column names.
for col, header in enumerate(df.columns):
    tbl.add_cell(0, col, width, height, text=header, loc='center', facecolor='lightgrey')

# Data rows start at table row 1.
for row in range(n_rows):
    for col in range(n_cols):
        value = df.iloc[row, col]
        if col == 0:
            # First column: the parameter name as plain text.
            tbl.add_cell(row + 1, col, width, height, text=value, loc='center', facecolor='white')
            continue
        # Value columns: an unlabelled coloured cell whose width scales with |value|,
        # blue for positive values and red otherwise.
        bar_color = 'blue' if value > 0 else 'red'
        tbl.add_cell(row + 1, col, width * abs(value), height, text='', loc='left', facecolor=bar_color,
                     edgecolor='none')

ax.add_table(tbl)
# Display the figure (nothing is written to disk here).
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt

def normal_pdf(x, mu, sigma):
    """Return the normal probability density with mean ``mu`` and std ``sigma`` at ``x``."""
    return (1 / (sigma * np.sqrt(2 * np.pi))) * np.exp(- (x - mu) ** 2 / (2 * sigma ** 2))


# Parameters for the normal distributions
mu1, sigma1 = 7, 0.1  # mean and standard deviation
mu2, sigma2 = 7, 0.2
mu3, sigma3 = 6, 0.3
mu4, sigma4 = 8, 0.4

# Generate data points
x = np.linspace(5, 9, 1000)
y1 = normal_pdf(x, mu1, sigma1)
y2 = normal_pdf(x, mu2, sigma2)
y3 = normal_pdf(x, mu3, sigma3)
y4 = normal_pdf(x, mu4, sigma4)

# Create the plot
plt.figure(figsize=(6, 5))
# Raw f-strings keep the LaTeX backslashes literal; in a plain string '\m' and '\s'
# are invalid escape sequences and trigger a warning.
for mu, sigma, y in [(mu1, sigma1, y1), (mu2, sigma2, y2), (mu3, sigma3, y3), (mu4, sigma4, y4)]:
    plt.plot(x, y, label=rf'$\mu={mu}, \sigma={sigma}$', linewidth=2)

# Optional sigma annotations (disabled): an arrow pointing at each curve one sigma right of its mean.
# for mu, sigma in [(mu1, sigma1), (mu2, sigma2), (mu3, sigma3), (mu4, sigma4)]:
#     density_at_sigma = normal_pdf(mu + sigma, mu, sigma)
#     plt.annotate(rf'$\sigma={sigma}$', xy=(mu + sigma, density_at_sigma),
#                  xytext=(mu + sigma + 1, density_at_sigma),
#                  arrowprops=dict(facecolor='black', shrink=0.05))

# Add title and labels
# plt.title(r'Normal Distributions with Different $\mu$ and $\sigma$')
plt.xlabel('output')
plt.ylabel('Probability Density')
# plt.legend()

# No grid lines and a white axes background
plt.grid(False)
plt.gca().set_facecolor('white')

# Save the figure as PNG, PDF and SVG in the current working directory, then display it.
plt.tight_layout()
plt.savefig('normal_distributions.png', dpi=1200)
plt.savefig('normal_distributions.pdf')
plt.savefig('normal_distributions.svg')
plt.show()