# Model Evaluation

## Set Up

Libraries/packages

In [None]:
import sys
import warnings

sys.path.append("../")
from src.data_utils import get_data, get_models
from src.eval import evaluate_models, NumpyEncoder, BIN_NAMES
from src.config import BASE_PATH
import numpy as np
import pandas as pd
import json
from shutil import rmtree

# Device reporting is switched off; DEVICE is not among the names imported above.
# print(f"Using device: {DEVICE}")
# Show the project root that the data, model and result paths are built from.
print(f"Path: {BASE_PATH}")

Globals

In [None]:
# Directory that receives the evaluation results
SAVE_PATH = BASE_PATH / "results"

# Data: one get_data() call per outcome, all read from the processed-data folder
file_dir = BASE_PATH / "data" / "processed"
OUTCOME_DICT = dict(
    med=get_data("med_outcome", file_dir),
    surg=get_data("surg_outcome", file_dir),
    mort=get_data("mort_outcome", file_dir),
    reop=get_data("reop_outcome", file_dir),
    vte=get_data("vte_outcome", file_dir),
)

# Models: the same list of prefixes is requested for every outcome key above
model_dir = BASE_PATH / "models" / "calibrated"
model_prefix_list = ["lgbm", "lr", "xgb", "nn", "stack"]
MODEL_DICT = {
    outcome: get_models(model_prefix_list, outcome, model_dir)
    for outcome in OUTCOME_DICT
}

## Compute traditional and per-bin metrics

In [None]:
all_models_test_dict = {}
ALL_DICT = {}
rows = []
index = []

# Settings passed unchanged to evaluate_models() for every outcome
eval_options = {
    "results_path": SAVE_PATH,
    "threshold_str": "val",
    "show_cm": False,
    "show_roc": False,
    "show_cal": False,
    "n_bootstraps": 3000,
    "show_progress": False,
}

for outcome, cur_data in OUTCOME_DICT.items():
    outcome_results = {}
    ALL_DICT[outcome] = outcome_results
    print(f"{'*' *30} Outcome: {outcome} {'*' *30}")

    # Features are passed as stored; targets are flattened with .values.ravel()
    split_data = {
        "X_train": cur_data["X_train"],
        "y_train": cur_data["y_train"].values.ravel(),
        "X_val": cur_data["X_val"],
        "y_val": cur_data["y_val"].values.ravel(),
        "X_test": cur_data["X_test"],
        "y_test": cur_data["y_test"].values.ravel(),
    }
    class_report, bin_report = evaluate_models(
        model_dict=MODEL_DICT[outcome],
        outcome_name=outcome,
        **split_data,
        **eval_options,
    )
    outcome_results["class"] = class_report
    outcome_results["bins"] = bin_report

    # Only the "test" entries of the class report go into the summary table;
    # the complete report stays available through ALL_DICT.
    test_metrics = class_report["test"]
    rows.extend(test_metrics.values())
    index.extend(f"{outcome}_{model}" for model in test_metrics)

all_models_outcomes_df = pd.DataFrame(rows, index=index)

# Save every result (class + bins, all outcomes) as one JSON file,
# removing any file left at that path beforehand
all_save_path = BASE_PATH / "results" / "tables" / "all_dict_results.json"
if all_save_path.exists():
    all_save_path.unlink()
all_save_path.parent.mkdir(exist_ok=True, parents=True)
with all_save_path.open("w") as out_file:
    json.dump(ALL_DICT, out_file, cls=NumpyEncoder, indent=2)

Reformat the bin metrics into a flat table

In [None]:
def _format_pct(value):
    """Return *value* as a two-decimal percentage string, or "N/A" if it is not a number.

    The "%" format code is only valid for numbers, so a missing value must be
    replaced by its placeholder instead of being passed to the format spec.
    """
    if isinstance(value, (int, float)):
        return f"{value:.2%}"
    return "N/A"


def _is_missing(value):
    """Return True when *value* is None (JSON null) or NaN."""
    return value is None or np.isnan(value)


# Re-read the saved results; expects `all_save_path` to be defined by an earlier cell.
# After json.load every value is a plain JSON type (dict, list, str, number, None).
with open(all_save_path, "r") as f:
    loaded_dict = json.load(f)
rows = []
index = []

for outcome_name, outcome_results in loaded_dict.items():
    bins_dict = outcome_results["bins"]

    for model_name, bins in bins_dict.items():
        for bin_name, metrics in bins.items():
            # Bin size: count and share of the cohort
            n_perc = metrics["n_perc"]
            n = n_perc["n"]
            perc_cohort = n_perc["perc"]

            # Positives in the bin: count and share of all positives
            perc_all_pos = metrics["perc_all_pos"]
            n_pos = perc_all_pos["n"]
            perc_pos = perc_all_pos["perc"]

            # Event rate with its confidence bounds; the bounds are optional keys
            # and render as "N/A" when absent or non-numeric.
            event_dict = metrics["event_rate_w_CIs"]
            event_rate_str = _format_pct(event_dict["event_rate"])
            ci_lower_str = _format_pct(event_dict.get("lower_CI"))
            ci_upper_str = _format_pct(event_dict.get("upper_CI"))
            ci_str = f"({ci_lower_str}, {ci_upper_str})"

            lift = metrics["lift"]
            thresholds = metrics["thresholds"]
            mean_output = metrics["mean_model_output"]

            # One display-ready row per (outcome, model, bin);
            # missing lift / mean output stay as NaN rather than text.
            row = {
                "N (% of tot cohort)": f"{int(n)} ({float(perc_cohort):.2%})",
                "N pos (% of All Positives)": f"{int(n_pos)} ({float(perc_pos):.2%})",
                "Event Rate (95% CI)": f"{event_rate_str} {ci_str}",
                "Lift": np.nan if _is_missing(lift) else f"{float(lift):.2f}",
                "Thresholds": thresholds,
                "Mean Model Output": (
                    np.nan if _is_missing(mean_output) else f"{float(mean_output):.2%}"
                ),
            }
            rows.append(row)
            index.append((outcome_name, model_name, bin_name))

# One row per (Outcome, Model, Bin), addressed through a three-level MultiIndex
bin_index = pd.MultiIndex.from_tuples(index, names=["Outcome", "Model", "Bin"])
df = pd.DataFrame(rows, index=bin_index)

# Bins are ordered as listed in BIN_NAMES
bin_order = BIN_NAMES

# Swap the Bin level for an ordered categorical so sorting follows bin_order
ordered_bins = pd.CategoricalIndex(
    df.index.levels[2],  # type: ignore
    categories=bin_order,
    ordered=True,
)
df.index = df.index.set_levels(ordered_bins, level=2)  # type: ignore

# Sort on the index, then move the index levels back into ordinary columns
df = df.sort_index()
df_flat = df.reset_index()

Export tables

In [None]:
report_path = SAVE_PATH / "tables" / "metrics"

# Remove any existing report directory, then create it again empty
if report_path.exists():
    rmtree(report_path)
report_path.mkdir(exist_ok=True, parents=True)

# Write each table to its own workbook inside the report directory
for table, filename in (
    (df_flat, "bin_report.xlsx"),
    (all_models_outcomes_df, "class_report.xlsx"),
):
    table.to_excel(report_path / filename)