# Calibrate Models

## Set-Up

Import Packages/Libraries

In [None]:
# Standard library
import sys
import warnings

# Add the parent directory to the import path; this must run before the
# `src.*` imports below (presumably the notebook lives one level under the
# project root -- confirm).
sys.path.append("../")
from src.data_utils import get_data, get_models
from src.nn_model import load_nn_clf

# Third-party tooling (calibration, metrics, CV splitting, persistence) plus
# the project configuration constants BASE_PATH and SEED.
from sklearn.calibration import CalibratedClassifierCV
from sklearn.frozen import FrozenEstimator
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from src.config import BASE_PATH, SEED
import joblib

# Echo the configured project root as a quick sanity check.
print(f"Path to project root: {BASE_PATH}")

Load Data and Models + Set Globals

In [None]:
# ---- Data: one processed dataset per outcome ----
file_dir = BASE_PATH / "data" / "processed"
OUTCOME_DICT = {
    key: get_data(dataset, file_dir)
    for key, dataset in (
        ("med", "med_outcome"),
        ("surg", "surg_outcome"),
        ("mort", "mort_outcome"),
        ("reop", "reop_outcome"),
        ("vte", "vte_outcome"),
    )
}
# Feature count is read from the "surg" training split; same for all outcomes.
X_shape = OUTCOME_DICT["surg"]["X_train"].shape[1]

# ---- Models: trained estimators per outcome ----
model_dir = BASE_PATH / "models" / "trained"
model_prefix_list = ["lgbm", "lr", "xgb", "stack"]
MODEL_DICT = {}

for outcome in OUTCOME_DICT:
    # Base models come from the shared loader ...
    per_outcome = get_models(model_prefix_list, outcome, model_dir)
    # ... while the neural network is loaded separately from its "nn.pt"
    # file and attached under the "nn" key.
    per_outcome["nn"] = load_nn_clf(
        data_path=model_dir / outcome / "nn.pt",
        in_dim=X_shape,
        device="cpu",
    )
    MODEL_DICT[outcome] = per_outcome

# Number of stratified folds used by the calibration step.
N_SPLITS = 5

## Calibrate Models

In [None]:
# For every (outcome, model) pair: fit a calibrator on the outcome's
# validation split, save the calibrated estimator under
# models/calibrated/<outcome>/<model>.joblib, and print train/val AUROC.
for outcome_name, model_dict in MODEL_DICT.items():
    print(f"Working on outcome: {outcome_name}...")
    # The data splits depend only on the outcome, so look them up once here
    # instead of once per model.
    outcome_data = OUTCOME_DICT[outcome_name]
    X_val = outcome_data["X_val"]
    y_val = outcome_data["y_val"].values.ravel()
    X_train = outcome_data["X_train"]
    # Flattened the same way as y_val so both AUROC calls receive 1-D labels
    # (assumes y_train is the same container type as y_val).
    y_train = outcome_data["y_train"].values.ravel()

    # For each model
    for model_name, model in model_dict.items():
        print(f"Model: {model_name}")

        ### Fit calibrated classifier on the validation set ###
        skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
        # Frozen estimator -- the already-trained model won't be refit; only
        # the calibrator is learned from (X_val, y_val).
        calibrated_model = CalibratedClassifierCV(FrozenEstimator(model), cv=skf, n_jobs=-1)  # type: ignore
        calibrated_model.fit(X_val, y_val)

        ### Export model ###
        export_path = (
            BASE_PATH / "models" / "calibrated" / outcome_name / f"{model_name}.joblib"
        )
        if export_path.exists():
            # Make the overwrite visible, then drop the stale file.
            warnings.warn(f"Over-writing models at path: {export_path}")
            export_path.unlink()
        export_path.parent.mkdir(exist_ok=True, parents=True)
        joblib.dump(calibrated_model, export_path)

        ### Get prelim results ###
        # Val -- NOTE: the calibrator was fit on this same split, so this
        # score is in-sample with respect to the calibration step.
        val_proba = calibrated_model.predict_proba(X_val)[:, 1]  # type: ignore
        val_score = roc_auc_score(y_val, val_proba)

        # Train
        train_proba = calibrated_model.predict_proba(X_train)[:, 1]  # type: ignore
        train_score = roc_auc_score(y_train, train_proba)

        print(f"Train AUROC: \t{train_score:.3f}")
        print(f"Val AUROC: \t{val_score:.3f}")
        print("*" * 20)
    print("-" * 50)

In [None]:
# Load the processed data for every outcome.
file_dir = BASE_PATH / "data" / "processed"
OUTCOME_DICT = dict(
    med=get_data("med_outcome", file_dir),
    surg=get_data("surg_outcome", file_dir),
    mort=get_data("mort_outcome", file_dir),
    reop=get_data("reop_outcome", file_dir),
    vte=get_data("vte_outcome", file_dir),
)

# Load the models stored under models/calibrated; here "nn" is fetched
# through get_models together with the other prefixes.
model_dir = BASE_PATH / "models" / "calibrated"
model_prefix_list = ["lgbm", "lr", "xgb", "nn", "stack"]
MODEL_DICT = {}
for outcome in OUTCOME_DICT:
    MODEL_DICT[outcome] = get_models(model_prefix_list, outcome, model_dir)

In [None]:
# Spot check: run the "nn" entry for the "surg" outcome on that outcome's
# validation features.
X = OUTCOME_DICT["surg"]["X_val"]
model = MODEL_DICT["surg"]["nn"]
# Kept as the trailing bare expression so the notebook displays the result.
model.predict_proba(X)