# 03 — Modeling & Evaluation (Early Default)
#
# **Objective**
# Train and evaluate predictive models for `early_default` using a realistic temporal split.
#
# **Datasets**
# - **BASE**: origination-only features (no macro)
# - **MACRO**: BASE + monthly macroeconomic indicators (UNRATE, FEDFUNDS, CPI)
# 
# **Models**
# 1) Logistic Regression (interpretable baseline)
# 2) XGBoost (nonlinear model with class-imbalance handling)
#
# **Evaluation**
# - ROC-AUC (ranking ability)
# - PR-AUC (rare-event performance)
# - Threshold policy (recall / review-rate tradeoffs)
# - Calibration check (are probabilities meaningful as PDs?)
# - Lift (risk concentration in high-score segments)


In [2]:
import pandas as pd
import numpy as np

# 1) Load Processed Dataset (from Notebook 02)
# This dataset is already:
# - target-defined (`early_default`)
# - filtered for observability (>= 6 months observable OR early default within 6 months)
# - imputed (median for numeric; "Unknown" for categorical)
# - one-hot encoded
#
# `issue_d` is retained only for temporal splitting, not as a predictive feature.


In [3]:
# Load the BASE and MACRO modeling tables with identical options; `issue_d`
# is parsed as a datetime because the temporal split compares it to a date.
df_base, df_macro = (
    pd.read_csv(csv_path, parse_dates=["issue_d"])
    for csv_path in (
        "../data/processed/early_default_modeling_dataset.csv",
        "../data/processed/early_default_modeling_dataset_macro.csv",
    )
)

print("Base shape:", df_base.shape)
print("Macro shape:", df_macro.shape)


Base shape: (1976721, 136)
Macro shape: (1976721, 139)


# 2) Temporal Train/Test Split
# To simulate real deployment:
# - Train on older loans (pre-2015)
# - Test on newer loans (2015+)
# This avoids look-ahead bias and provides realistic generalization testing.

In [4]:
def time_split(df, split_date="2015-01-01"):
    """Split a loan table chronologically into train/test features and targets.

    Rows whose ``issue_d`` is strictly before ``split_date`` form the training
    set; rows on or after it form the test set. ``early_default`` is the
    target, and both it and ``issue_d`` are removed from the feature frames.

    Returns:
        ``(X_train, X_test, y_train, y_test)``.
    """
    non_features = ["early_default", "issue_d"]

    # Two explicit comparisons (rather than one mask and its negation) so a
    # row with a missing ``issue_d`` lands in neither partition.
    older = df.loc[df["issue_d"] < split_date].copy()
    newer = df.loc[df["issue_d"] >= split_date].copy()

    return (
        older.drop(columns=non_features),
        newer.drop(columns=non_features),
        older["early_default"],
        newer["early_default"],
    )

# Apply the same default cutoff to both feature sets.
Xtr_base, Xte_base, ytr, yte = time_split(df_base)
Xtr_macro, Xte_macro, ytr2, yte2 = time_split(df_macro)

# The mean of the 0/1 target is the default rate on each side of the split.
for rate_label, target in (
    ("Train default rate (base):", ytr),
    ("Test default rate (base):", yte),
    ("Train default rate (macro):", ytr2),
    ("Test default rate (macro):", yte2),
):
    print(rate_label, target.mean())


Train default rate (base): 0.018583582566276607
Test default rate (base): 0.023350342115899526
Train default rate (macro): 0.018583582566276607
Test default rate (macro): 0.023350342115899526


# Sanity check: NaNs
# Since Notebook 02 performed imputation, we expect no missing values.

In [5]:
# Total count of missing cells per feature frame (column sums, then summed).
for nan_label, frame in (
    ("NaNs base train:", Xtr_base),
    ("NaNs base test:", Xte_base),
    ("NaNs macro train:", Xtr_macro),
    ("NaNs macro test:", Xte_macro),
):
    print(nan_label, frame.isna().sum().sum())


NaNs base train: 0
NaNs base test: 0
NaNs macro train: 0
NaNs macro test: 0


# 3) Utility: Metric evaluation (ROC-AUC and PR-AUC)
# PR-AUC is particularly important because early default is a rare event.


In [6]:
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc

def eval_probs(y_true, y_proba, label):
    """Print and return ROC-AUC and PR-AUC for a set of predicted scores.

    Args:
        y_true: Ground-truth binary labels.
        y_proba: Predicted scores/probabilities for the positive class.
        label: Prefix used in the two printed lines.

    Returns:
        ``(roc_auc, pr_auc)``, where ``pr_auc`` is the area under the
        precision-recall curve computed by ``auc`` over (recall, precision).
    """
    roc_auc = roc_auc_score(y_true, y_proba)

    # precision_recall_curve also returns the thresholds; only the curve
    # coordinates are needed for the area.
    precision, recall, _thresholds = precision_recall_curve(y_true, y_proba)
    pr_auc = auc(recall, precision)

    print(f"{label} ROC-AUC: {roc_auc:.4f}")
    print(f"{label} PR-AUC : {pr_auc:.4f}")
    return roc_auc, pr_auc


# 4) Logistic Regression (Baseline)
# We standardize features and use `class_weight='balanced'` due to class imbalance.


In [7]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

def train_logistic(X_train, y_train, X_test):
    """Fit a standardized, class-weighted logistic regression and score X_test.

    The scaler is fitted on the training features only and then applied to
    both sets, so no test-set statistics enter the preprocessing.

    Returns:
        ``(model, scaler, proba)`` where ``proba`` is the second column of
        ``predict_proba`` on the scaled test features (class 1 for 0/1
        labels).
    """
    scaler = StandardScaler().fit(X_train)
    train_scaled = scaler.transform(X_train)
    test_scaled = scaler.transform(X_test)

    # class_weight="balanced" compensates for the rare positive class.
    model = LogisticRegression(max_iter=1000, class_weight="balanced").fit(
        train_scaled, y_train
    )

    positive_proba = model.predict_proba(test_scaled)[:, 1]
    return model, scaler, positive_proba

# Fit and score the logistic baseline on the BASE feature set.
log_base, sc_base, proba_log_base = train_logistic(Xtr_base, ytr, Xte_base)
eval_probs(yte, proba_log_base, "Logistic (BASE)")

# Same procedure on the MACRO feature set. This call is the cell's last
# expression, so the notebook also echoes its returned (ROC-AUC, PR-AUC) tuple.
log_macro, sc_macro, proba_log_macro = train_logistic(Xtr_macro, ytr2, Xte_macro)
eval_probs(yte2, proba_log_macro, "Logistic (MACRO)")


Logistic (BASE) ROC-AUC: 0.7188
Logistic (BASE) PR-AUC : 0.0586
Logistic (MACRO) ROC-AUC: 0.7123
Logistic (MACRO) PR-AUC : 0.0577


(np.float64(0.7122670565694902), np.float64(0.0577295642505537))

# 5) XGBoost (Advanced)
# XGBoost is strict about feature names. Some one-hot encoded columns can contain special characters
# that XGBoost rejects (e.g., `[`, `]`, `<`). We sanitize column names and align train/test columns.


In [8]:
import re

def clean_xgb_columns(X_train, X_test):
    """Return copies of both frames with XGBoost-safe, unique column names.

    Every run of characters outside ``[A-Za-z0-9_]`` is replaced by ``_``.
    When two training columns sanitize to the same name (e.g. ``a<b`` and
    ``a>b``), later ones get a numeric suffix (``a_b``, ``a_b_1``, ...) so the
    result never has duplicate labels. Test columns are renamed through the
    training raw-name -> clean-name mapping, so a feature keeps the same clean
    name on both sides regardless of column order; test columns unknown to the
    training frame are simply sanitized.

    The test frame is then reindexed to the training columns: extra test
    columns are dropped and missing ones are added filled with 0.

    Assumes raw column labels are unique within each frame.

    Returns:
        ``(X_train, X_test)`` as new DataFrames; the inputs are not modified.
    """
    pattern = re.compile(r"[^A-Za-z0-9_]+")

    def _sanitize(name):
        return pattern.sub("_", str(name))

    X_train = X_train.copy()
    X_test = X_test.copy()

    # Build unique sanitized names for the training columns.
    taken = set()
    train_names = []
    for raw in X_train.columns:
        base = _sanitize(raw)
        candidate, suffix = base, 1
        while candidate in taken:
            candidate = f"{base}_{suffix}"
            suffix += 1
        taken.add(candidate)
        train_names.append(candidate)

    raw_to_clean = dict(zip(X_train.columns, train_names))

    # Rename test columns via the training mapping so suffixes line up with
    # the same underlying feature, not with positional order.
    X_test.columns = [raw_to_clean.get(raw, _sanitize(raw)) for raw in X_test.columns]
    X_train.columns = train_names

    # reindex raises on duplicate labels; keep the first occurrence of any
    # name that an unknown test column happened to collide with.
    X_test = X_test.loc[:, ~X_test.columns.duplicated()]

    # Ensure same columns and same order
    X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

    return X_train, X_test

# Sanitized copies of both feature sets for XGBoost.
Xtr_base_xgb, Xte_base_xgb = clean_xgb_columns(Xtr_base, Xte_base)
Xtr_macro_xgb, Xte_macro_xgb = clean_xgb_columns(Xtr_macro, Xte_macro)

# Quick sanity: no column name may still contain "[", "]" or "<".
bad_cols = [
    col
    for col in Xtr_base_xgb.columns
    if not {"[", "]", "<"}.isdisjoint(col)
]
print("Bad cols remaining:", len(bad_cols))


Bad cols remaining: 0


In [13]:
from xgboost import XGBClassifier

def train_xgb(X_train, y_train, X_test):
    """Fit an XGBoost classifier with imbalance weighting and score X_test.

    ``scale_pos_weight`` is set to (# negative) / (# positive) counted on
    ``y_train``. The two-value unpacking of ``np.bincount`` assumes the labels
    are exactly the integers 0 and 1 with both present.

    NOTE(review): up-weighting positives changes the scale of the predicted
    scores; the calibration table later in this notebook shows average
    predictions well above observed default rates. Treat the output as a
    ranking score unless it is recalibrated.

    Returns:
        ``(model, spw, proba)`` where ``spw`` is the weight used and ``proba``
        is the second column of ``predict_proba`` on ``X_test``.
    """
    class_counts = np.bincount(y_train)
    neg, pos = class_counts
    spw = neg / pos

    params = {
        "n_estimators": 300,
        "max_depth": 5,
        "learning_rate": 0.05,
        "random_state": 42,
        "scale_pos_weight": spw,
        "eval_metric": "logloss",
    }
    model = XGBClassifier(**params)
    model.fit(X_train, y_train)

    positive_proba = model.predict_proba(X_test)[:, 1]
    return model, spw, positive_proba

# Fit and score XGBoost on the sanitized BASE features.
xgb_base, spw_base, proba_xgb_base = train_xgb(Xtr_base_xgb, ytr, Xte_base_xgb)
eval_probs(yte, proba_xgb_base, "XGBoost (BASE)")

# Same procedure on the sanitized MACRO features. This call is the cell's last
# expression, so the notebook also echoes its returned (ROC-AUC, PR-AUC) tuple.
xgb_macro, spw_macro, proba_xgb_macro = train_xgb(Xtr_macro_xgb, ytr2, Xte_macro_xgb)
eval_probs(yte2, proba_xgb_macro, "XGBoost (MACRO)")


XGBoost (BASE) ROC-AUC: 0.7187
XGBoost (BASE) PR-AUC : 0.0601
XGBoost (MACRO) ROC-AUC: 0.6808
XGBoost (MACRO) PR-AUC : 0.0483


(np.float64(0.6808290574625906), np.float64(0.04826998178318193))

# 6) Model Selection Decision
# Based on the results, macro variables reduced performance (both ROC-AUC and PR-AUC).
# Therefore, we proceed with **XGBoost (BASE)** as the final operational model.
#
# Next we will:
# - choose a threshold based on an operational policy (e.g., >=80% recall)
# - report review rate and confusion matrix
# - check calibration and lift
# - evaluate review-capacity scenarios (top 10/20/30%)


In [14]:
# Final model artifacts used downstream: test labels and the XGBoost (BASE)
# test-set scores.
y_final, proba_final = yte, proba_xgb_base

print("Final model chosen: XGBoost (BASE)")
print("Final test default rate:", y_final.mean())


Final model chosen: XGBoost (BASE)
Final test default rate: 0.023350342115899526


# 7) Save trained artifacts
# We save:
# - base scaler + logistic (baseline)
# - base xgboost (final model)
# - policy threshold later

In [15]:
import os

import joblib

# Create the output folder up front so the dumps below do not fail on a
# checkout where ../models does not exist yet.
os.makedirs("../models", exist_ok=True)

# Baseline artifacts (scaler + logistic model) and the final XGBoost model.
joblib.dump(sc_base, "../models/scaler_base.joblib")
joblib.dump(log_base, "../models/log_model_base.joblib")
joblib.dump(xgb_base, "../models/xgb_model_base.joblib")

print("Saved: models/scaler_base.joblib, models/log_model_base.joblib, models/xgb_model_base.joblib")

Saved: models/scaler_base.joblib, models/log_model_base.joblib, models/xgb_model_base.joblib


# 8) Threshold Optimization (Reference)
# We evaluate thresholds from 0.01 to 0.50.
# 
# Note:
# - F1 is included for reference only.
# - In credit early-warning, threshold selection is usually policy-driven (recall vs review capacity).


In [16]:
from sklearn.metrics import precision_score, recall_score, f1_score

thresholds = np.arange(0.01, 0.51, 0.01)

# One (precision, recall, f1) triple per candidate threshold; a loan is
# flagged when its score is at or above the threshold.
metric_rows = [
    (
        precision_score(y_final, flagged),
        recall_score(y_final, flagged),
        f1_score(y_final, flagged),
    )
    for flagged in ((proba_final >= cut).astype(int) for cut in thresholds)
]
precisions, recalls, f1s = map(list, zip(*metric_rows))

# Rank thresholds by F1 (reference only); the original row index is kept.
results = pd.DataFrame(
    {
        "threshold": thresholds,
        "precision": precisions,
        "recall": recalls,
        "f1": f1s,
    }
).sort_values("f1", ascending=False)

print("Top thresholds by F1:")
display(results.head(10))

Top thresholds by F1:


Unnamed: 0,threshold,precision,recall,f1
49,0.5,0.048943,0.573005,0.090183
48,0.49,0.047998,0.590875,0.088784
47,0.48,0.047018,0.607596,0.087282
46,0.47,0.046098,0.624289,0.085856
45,0.46,0.045172,0.640142,0.08439
44,0.45,0.044277,0.655435,0.08295
43,0.44,0.043432,0.670672,0.08158
42,0.43,0.042564,0.684984,0.080148
41,0.42,0.041805,0.700417,0.078901
40,0.41,0.041101,0.71585,0.077739


# 9) Policy Threshold: Target Recall (Operational Early Warning)
# Early warning systems often prioritize catching defaults (high recall).
# We choose the threshold that achieves at least 80% recall with the best precision available.


In [19]:
# NOTE(review): `results` was computed on the test labels/scores
# (y_final / proba_final), so this threshold is tuned on the same data it is
# later evaluated on. Consider selecting it on a held-out validation window.
target_recall = 0.80
eligible = results[results["recall"] >= target_recall]
if eligible.empty:
    # Fail with an explicit message instead of an IndexError at `.iloc[0]`.
    raise ValueError(
        f"No threshold in the sweep reaches recall >= {target_recall:.0%}; "
        "widen the threshold grid or lower target_recall."
    )

# Among thresholds that meet the recall target, keep the most precise one.
best_policy = eligible.sort_values("precision", ascending=False).head(1)
display(best_policy)

policy_threshold = float(best_policy["threshold"].iloc[0])
# Label is derived from target_recall so it cannot drift from the value used.
print(f"Policy threshold (>={target_recall:.0%} recall):", policy_threshold)

joblib.dump({"policy_threshold": policy_threshold}, "../models/policy_threshold.joblib")
print("Saved: models/policy_threshold.joblib")


Unnamed: 0,threshold,precision,recall,f1
33,0.34,0.036374,0.810912,0.069625


Policy threshold (>=80% recall): 0.34
Saved: models/policy_threshold.joblib


# 10) Performance at Policy Threshold
# We report:
# - review/flag rate (how many loans get flagged)
# - confusion matrix
# - classification report (precision/recall/F1)


In [20]:
from sklearn.metrics import confusion_matrix, classification_report

# Turn scores into 0/1 flags; a score equal to the threshold is flagged.
y_pred_policy = (proba_final >= policy_threshold).astype(int)
# Mean of the 0/1 flags = share of loans that would be sent to review.
flag_rate = y_pred_policy.mean()

print("Policy threshold:", policy_threshold)
print("Flag / review rate:", flag_rate)

# Rows are true classes and columns are predicted classes (0 then 1).
cm = confusion_matrix(y_final, y_pred_policy)
print("Confusion Matrix:\n", cm)

# Per-class precision / recall / F1 at the policy threshold.
print("\nClassification Report:\n", classification_report(y_final, y_pred_policy))


Policy threshold: 0.34
Flag / review rate: 0.5205681569952924
Confusion Matrix:
 [[726307 767004]
 [  6751  28952]]

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.49      0.65   1493311
           1       0.04      0.81      0.07     35703

    accuracy                           0.49   1529014
   macro avg       0.51      0.65      0.36   1529014
weighted avg       0.97      0.49      0.64   1529014



# 11) Calibration Check (Deciles)
# Calibration compares predicted probabilities vs observed event rates.
# We bin loans into risk deciles and compute:
# - avg_pred: mean predicted probability in bin
# - event_rate: observed default rate in bin
# - count: number of loans in bin


In [21]:
# Pair each test score with its label; `.values` drops y_final's index so the
# two columns align by position.
cal = pd.DataFrame({"p": proba_final, "y": y_final.values})
# Equal-frequency score bins (up to 10); duplicates="drop" merges bins whose
# edges coincide instead of raising.
cal["bin"] = pd.qcut(cal["p"], 10, duplicates="drop")

# `bin` is categorical: pass `observed` explicitly to avoid pandas'
# FutureWarning about the changing default. Every qcut bin contains rows, so
# observed=True yields the same table.
cal_summary = cal.groupby("bin", observed=True).agg(
    avg_pred=("p", "mean"),
    event_rate=("y", "mean"),
    count=("y", "size")
).reset_index()

display(cal_summary)

  cal_summary = cal.groupby("bin").agg(


Unnamed: 0,bin,avg_pred,event_rate,count
0,"(0.0073999999999999995, 0.128]",0.090003,0.003643,152902
1,"(0.128, 0.19]",0.159579,0.006422,152901
2,"(0.19, 0.243]",0.216755,0.00981,152901
3,"(0.243, 0.295]",0.268897,0.012099,152902
4,"(0.295, 0.352]",0.323195,0.015638,152901
5,"(0.352, 0.414]",0.382835,0.020268,152901
6,"(0.414, 0.481]",0.447318,0.024297,152902
7,"(0.481, 0.554]",0.517102,0.03138,152901
8,"(0.554, 0.642]",0.596156,0.040647,152901
9,"(0.642, 0.927]",0.715281,0.069299,152902


# 12) Lift (Risk Concentration)
# Lift compares default rate in the highest-risk decile to the overall default rate.


In [22]:
# Baseline: default rate over the whole test set.
overall_rate = y_final.mean()
# The last row of cal_summary is the highest-score bin, because the bins are
# ordered by increasing predicted probability.
top_decile_rate = cal_summary.iloc[-1]["event_rate"]
# Lift > 1 means defaults are concentrated in the top-scored bin.
lift = top_decile_rate / overall_rate

print("Overall default rate:", overall_rate)
print("Top decile default rate:", top_decile_rate)
print("Top decile lift:", lift)

Overall default rate: 0.023350342115899526
Top decile default rate: 0.06929928974114138
Top decile lift: 2.967806184473617


# 13) Review-Capacity Scenarios (Top X% flagged)
# Instead of selecting threshold via F1, evaluate operational scenarios:
# - If we can review top 10%, 20%, 30% of loans by score,
#   how much recall do we get and what precision?


In [23]:
# For each review capacity, flag the top share of loans by score and report
# the resulting precision and recall.
for target_flag_rate in [0.10, 0.20, 0.30]:
    # Score cutoff at the (1 - capacity) percentile of the test scores.
    thresh = np.percentile(proba_final, 100 * (1 - target_flag_rate))
    # `>=` flags every score tied at the cutoff, so the realised flag rate can
    # slightly exceed the target.
    y_pred_tmp = (proba_final >= thresh).astype(int)

    recall_tmp = recall_score(y_final, y_pred_tmp)
    precision_tmp = precision_score(y_final, y_pred_tmp)

    print(f"\nTarget flag rate: {target_flag_rate}")
    print("Threshold:", round(thresh, 3))
    print("Precision:", round(precision_tmp, 4))
    print("Recall:", round(recall_tmp, 4))


Target flag rate: 0.1
Threshold: 0.642
Precision: 0.0693
Recall: 0.2968

Target flag rate: 0.2
Threshold: 0.554
Precision: 0.055
Recall: 0.4709

Target flag rate: 0.3
Threshold: 0.481
Precision: 0.0471
Recall: 0.6052
