## Step 1: Setup

In [11]:
from pathlib import Path
import pandas as pd
import numpy as np
import joblib

# Paths
PROJECT_ROOT = Path("..").resolve()
DATA_PATH   = PROJECT_ROOT / "data_work" / "loans_fe.parquet"   # adjust if name differs
RES_DIR     = PROJECT_ROOT / "results"
FIG_DIR     = RES_DIR / "figures"
MODEL_DIR   = RES_DIR / "models"

RES_DIR.mkdir(exist_ok=True, parents=True)
FIG_DIR.mkdir(exist_ok=True, parents=True)

In [12]:
print("Project root:", PROJECT_ROOT)
print("Data path:", DATA_PATH)
print("Model dir:", MODEL_DIR)

Project root: /Users/binodtandan/UNT Research/ai_stress_testing
Data path: /Users/binodtandan/UNT Research/ai_stress_testing/data_work/loans_fe.parquet
Model dir: /Users/binodtandan/UNT Research/ai_stress_testing/results/models


In [13]:

# 1) Load engineered dataset
df = pd.read_parquet(DATA_PATH)
print("Data shape:", df.shape)
print(df.columns.tolist()[:20], "...")

TARGET   = "target"
TIME_COL = "issue_q_start"

Data shape: (2258953, 33)
['issue_q_start', 'loan_amnt', 'term_m', 'int_rate', 'dti', 'fico', 'emp_length', 'GDPC1', 'UNRATE', 'CPIAUCSL', 'FEDFUNDS', 'target', 'log_annual_inc', 'grade_b', 'grade_c', 'grade_d', 'grade_e', 'grade_f', 'home_ownership_mortgage', 'home_ownership_own'] ...


In [14]:
# 2) Time-based train/test split (same logic as in 03_model_training.ipynb)
split_date = pd.Timestamp("2017-12-31")
train_df = df[df[TIME_COL] <= split_date].copy()
test_df  = df[df[TIME_COL] >  split_date].copy()

X_train = train_df.drop(columns=[TARGET, TIME_COL])
y_train = train_df[TARGET].astype(int)
X_test  = test_df.drop(columns=[TARGET, TIME_COL])
y_test  = test_df[TARGET].astype(int)

print("Train shape:", X_train.shape, " Test shape:", X_test.shape)
print("Train default rate:", y_train.mean(), " Test default rate:", y_test.mean())

Train shape: (1764843, 31)  Test shape: (494110, 31)
Train default rate: 0.14756893389383646  Test default rate: 0.01791706300216551


In [15]:
# 3) Load trained models (from 03_model_training.ipynb)
logreg_path = MODEL_DIR / "logistic_balanced.joblib"
tree_path   = MODEL_DIR / "tree_balanced.joblib"

logreg = joblib.load(logreg_path)
print("Loaded logistic model from", logreg_path)

Loaded logistic model from /Users/binodtandan/UNT Research/ai_stress_testing/results/models/logistic_balanced.joblib


In [16]:
# 4) Compute baseline PDs on 2018 test set (this is our reference scenario)
proba_lr_baseline = logreg.predict_proba(X_test)[:, 1]
baseline_summary = {
    "mean_pd": float(proba_lr_baseline.mean()),
    "p50_pd":  float(np.quantile(proba_lr_baseline, 0.5)),
    "p90_pd":  float(np.quantile(proba_lr_baseline, 0.9)),
    "p99_pd":  float(np.quantile(proba_lr_baseline, 0.99)),
}
baseline_summary

{'mean_pd': 0.16906894089845018,
 'p50_pd': 0.14116718042866067,
 'p90_pd': 0.33145939305832345,
 'p99_pd': 0.54159379087249}

## Step 2: Define & Apply Data-Driven Macro Stress Scenarios

In [17]:
# Macro level features (levels)
macro_cols = ["GDPC1", "UNRATE", "CPIAUCSL", "FEDFUNDS"]

# Macro dynamics / deltas (already engineered)
delta_cols = ["UNRATE_delta_qoq", "FEDFUNDS_delta_qoq",
              "GDPC1_delta_qoq", "inflation_qoq", "real_rate_qoq"]

all_macro_feats = macro_cols + delta_cols

# Use the full history to derive empirical quantiles
macro_quantiles = df[all_macro_feats].quantile([0.1, 0.25, 0.5, 0.75, 0.9])
macro_quantiles

Unnamed: 0,GDPC1,UNRATE,CPIAUCSL,FEDFUNDS,UNRATE_delta_qoq,FEDFUNDS_delta_qoq,GDPC1_delta_qoq,inflation_qoq,real_rate_qoq
0.1,17953.974,3.833333,234.162667,0.093333,-0.06044,-0.035714,0.001845,-0.000621,0.068379
0.25,18782.243,4.166667,236.96,0.136667,-0.041958,0.0625,0.004002,0.001585,0.091345
0.5,19197.938,4.9,240.607333,0.396667,-0.027211,0.134454,0.005795,0.004245,0.121902
0.75,19882.352,5.433333,247.238333,1.203333,-0.013072,0.202216,0.007885,0.006348,0.154176
0.9,20276.154,6.933333,251.686333,1.923333,0.006803,0.555556,0.01127,0.007955,0.19055


In [18]:
# Helper to build a scenario vector from quantiles
def make_scenario(name, level_q, delta_q):
    """
    name: string label for scenario
    level_q: which quantile to use for macro levels  (GDPC1, UNRATE, CPI, FEDFUNDS)
    delta_q: which quantile to use for macro deltas (UNRATE_delta_qoq, etc.)
    """
    s = {"scenario": name}
    # Levels
    for col in macro_cols:
        s[col] = float(macro_quantiles.loc[level_q, col])
    # Deltas
    for col in delta_cols:
        s[col] = float(macro_quantiles.loc[delta_q, col])
    return s

scenarios = []

# 0) Baseline (reference) = actual 2018 macro, i.e. "no shock"
#    We already have baseline PDs from proba_lr_baseline.
#    We'll still store it in the summary table below as "baseline_actual".
scenarios.append({"scenario": "baseline_actual"})  # placeholder; no override

# 1) Mild Adverse: moderately bad macro conditions
#    - High unemployment (75th)
#    - Low GDP (25th)
#    - Higher rates & inflation (75th)
scenarios.append(make_scenario("mild_adverse", level_q=0.75, delta_q=0.75))

# 2) Severe Adverse: very stressed macro conditions
#    - Very high unemployment (90th)
#    - Very low GDP (10th)
#    - Strong rate shock & inflation dynamics (90th deltas)
scenarios.append(make_scenario("severe_adverse", level_q=0.90, delta_q=0.90))

scenarios


[{'scenario': 'baseline_actual'},
 {'scenario': 'mild_adverse',
  'GDPC1': 19882.352,
  'UNRATE': 5.433333333333334,
  'CPIAUCSL': 247.23833333333334,
  'FEDFUNDS': 1.2033333333333334,
  'UNRATE_delta_qoq': -0.013071895424836777,
  'FEDFUNDS_delta_qoq': 0.2022160664819943,
  'GDPC1_delta_qoq': 0.00788524130554702,
  'inflation_qoq': 0.006347825364148241,
  'real_rate_qoq': 0.15417567635744467},
 {'scenario': 'severe_adverse',
  'GDPC1': 20276.154,
  'UNRATE': 6.933333333333334,
  'CPIAUCSL': 251.68633333333332,
  'FEDFUNDS': 1.9233333333333331,
  'UNRATE_delta_qoq': 0.006802721088435604,
  'FEDFUNDS_delta_qoq': 0.5555555555555556,
  'GDPC1_delta_qoq': 0.011270466267692791,
  'inflation_qoq': 0.007955306776687543,
  'real_rate_qoq': 0.1905504756186524}]

In [19]:
# === Apply scenarios and compute PD distributions ===================

stress_records = []

# Baseline reference (no macro override; just use proba_lr_baseline)
baseline_mean = float(proba_lr_baseline.mean())
baseline_p50  = float(np.quantile(proba_lr_baseline, 0.5))
baseline_p90  = float(np.quantile(proba_lr_baseline, 0.9))
baseline_p99  = float(np.quantile(proba_lr_baseline, 0.99))

stress_records.append({
    "scenario": "baseline_actual",
    "mean_pd": baseline_mean,
    "p50_pd":  baseline_p50,
    "p90_pd":  baseline_p90,
    "p99_pd":  baseline_p99,
    "uplift_vs_baseline": 0.0  # by definition
})

# Function to override macro features and recompute PDs
def apply_macro_scenario(X, scen_dict):
    X_new = X.copy()
    # For placeholder baseline_actual, we don't touch X
    if scen_dict.get("scenario") == "baseline_actual":
        return X_new
    for col in all_macro_feats:
        if col in scen_dict:
            X_new[col] = scen_dict[col]
    return X_new

for scen in scenarios:
    name = scen["scenario"]
    if name == "baseline_actual":
        continue  # already handled above

    X_scen = apply_macro_scenario(X_test, scen)
    proba_scen = logreg.predict_proba(X_scen)[:, 1]

    mean_pd = float(proba_scen.mean())
    p50_pd  = float(np.quantile(proba_scen, 0.5))
    p90_pd  = float(np.quantile(proba_scen, 0.9))
    p99_pd  = float(np.quantile(proba_scen, 0.99))

    uplift = (mean_pd / baseline_mean - 1.0) * 100.0 if baseline_mean > 0 else np.nan

    stress_records.append({
        "scenario": name,
        "mean_pd": mean_pd,
        "p50_pd":  p50_pd,
        "p90_pd":  p90_pd,
        "p99_pd":  p99_pd,
        "uplift_vs_baseline": uplift
    })

stress_df = pd.DataFrame(stress_records)
stress_df


Unnamed: 0,scenario,mean_pd,p50_pd,p90_pd,p99_pd,uplift_vs_baseline
0,baseline_actual,0.169069,0.141167,0.331459,0.541594,0.0
1,mild_adverse,0.195667,0.185901,0.317873,0.437739,15.731878
2,severe_adverse,0.06686,0.060201,0.11561,0.179246,-60.453837


In [20]:
out_path = RES_DIR / "stress_summary.csv"
stress_df.to_csv(out_path, index=False)
print("Saved stress scenario summary to:", out_path)

Saved stress scenario summary to: /Users/binodtandan/UNT Research/ai_stress_testing/results/stress_summary.csv


## Step 3: Define Fed Macro Stress Scenarios

In [None]:
# # === FED 2018 SCENARIOS: manual table =======================================

# fed_rows = [

#     # ---- Baseline ----
#     {"scenario": "Fed_Baseline", "quarter": "2018Q1", "UNRATE": 4.0 "GDPC1": 2.5 "CPIAUCSL": 2.1 "FEDFUNDS": 1.4},
#     {"scenario": "Fed_Baseline", "quarter": "2018Q2", "UNRATE": 4.0 "GDPC1": 2.8 "CPIAUCSL": ..., "FEDFUNDS": ...},
#     # ...
#     # ---- Adverse ----
#     # {"scenario": "Fed_Adverse", "quarter": "2018Q1", "UNRATE": ..., "GDPC1": ..., "CPIAUCSL": ..., "FEDFUNDS": ...},
#     # ...
#     # ---- Severely Adverse ----
#     # {"scenario": "Fed_Severe", "quarter": "2018Q1", "UNRATE": ..., "GDPC1": ..., "CPIAUCSL": ..., "FEDFUNDS": ...},
#     # ...
# ]

# fed_macro = pd.DataFrame(fed_rows)

# if fed_macro.empty:
#     print("⚠️ fed_macro is empty. Fill fed_rows with Fed scenario values from the PDF.")
# else:
#     display(fed_macro.head())
