### Stage 3 - EAD & LGD assumptions

This notebook extends the Stage 1 dataset with:
- exposure-at-default proxies at 6 and 12 months (scheduled amortisation)
- base and downturn LGD assumptions (grade-segmented)

In [None]:
import os
import re

import numpy as np
import pandas as pd
os.makedirs("data", exist_ok=True)

In [None]:
#Configuration: file locations and the seasoning window
DATA_PATH = "data/loans_stage1_sampled.csv"
OUT_PATH = "data/loans_stage3_enriched.csv"
#Loans issued within this many months of the latest issue date are dropped
CUTOFF_MONTHS = 24

#LGD assumptions (flat, portfolio-level)
LGD_BASE = 0.40
LGD_DOWNTURN = 0.55

#Grade segmentation multipliers applied on top of the flat LGDs
#These are stylised and used for sensitivity/segmentation, not estimation
GRADE_MULT = dict(
    A=0.75,
    B=0.875,
    C=1.00,
    D=1.125,
    E=1.25,
    F=1.35,
    G=1.45,
)

In [None]:
#Load the Stage 1 sample, parse issue dates (unparseable values become NaT
#and are dropped) and order the frame chronologically
df = (
    pd.read_csv(DATA_PATH)
    .assign(issue_d=lambda frame: pd.to_datetime(frame["issue_d"], errors="coerce"))
    .dropna(subset=["issue_d"])
    .sort_values("issue_d")
    .reset_index(drop=True)
)

#maturity cutoff: keep only loans issued at least CUTOFF_MONTHS before the
#most recent issue date in the data
max_issue = df["issue_d"].max()
cutoff = max_issue - pd.DateOffset(months=CUTOFF_MONTHS)
issued_before_cutoff = df["issue_d"].le(cutoff)
df = df.loc[issued_before_cutoff].copy().reset_index(drop=True)

print("Rows:", len(df))
print("Issue date range:", df["issue_d"].min().date(), "->", df["issue_d"].max().date())
print("Default rate (proxy):", float(df["default_12m"].mean()))

#engineered vars used later as well
df["revol_credit"] = df["open_acc"].mul(df["revol_util"].div(100.0))
#ratio is left as NaN wherever income is not strictly positive
df["loan_to_income"] = df["loan_amnt"].div(df["annual_inc"]).where(df["annual_inc"] > 0)

Rows: 541736
Issue date range: 2011-01-01 -> 2016-12-01
Default rate (proxy): 0.3986683550659362


In [None]:
def term_to_months(x):
    """Convert a loan-term value to a number of months as a float.

    Parameters
    ----------
    x : str, int, float or missing
        Either a label such as " 36 months" or an already-numeric term.

    Returns
    -------
    float
        The term in months, or NaN when x is missing or contains no number.

    Notes
    -----
    Numeric inputs are returned as-is (as float). For text, the first number
    found is used. Joining every digit in the text would misread a value such
    as 36.0 / "36.0" as 360, so the digits are not concatenated.
    """
    if pd.isna(x):
        return np.nan
    #already numeric (bool excluded: True/False is not a term length)
    if isinstance(x, (int, float, np.integer, np.floating)) and not isinstance(x, bool):
        return float(x)
    #first integer or decimal number in the text, e.g. "36" in " 36 months"
    match = re.search(r"\d+(?:\.\d+)?", str(x))
    if match is None:
        return np.nan
    return float(match.group())
    
#Term in months (numeric) and the monthly decimal rate implied by the
#annual percentage rate
df["term_m"] = df["term"].apply(term_to_months)
df["r_m"] = df["int_rate"].div(100.0).div(12.0)

#quick checks
term_counts = df["term_m"].value_counts(dropna=False).sort_index()
print("Term months distribution:")
print(term_counts)

Term months distribution:
term_m
36.0    369129
60.0    172607
Name: count, dtype: int64


### EAD proxy

I use scheduled amortisation to approximate outstanding balance after *t* months.
This is a contractual EAD proxy and does not model prepayments or delinquency dynamics.

In [None]:
def balance_after_t_months(P, r_m, n, t):
    """Outstanding principal on a level-payment loan once t instalments are paid.

    Arguments:
        P   -- amount originally lent
        r_m -- interest rate per month, as a decimal
        n   -- contractual number of monthly payments
        t   -- months elapsed (clamped into the range 0..n)

    Returns NaN if any argument is NaN or if n is not positive; otherwise the
    scheduled balance, floored at zero. A rate that is effectively zero is
    treated as straight-line repayment.
    """
    if any(np.isnan(value) for value in (P, r_m, n, t)) or n <= 0:
        return np.nan

    #clamp elapsed months into [0, n]
    elapsed = 0 if t < 0 else t
    if elapsed > n:
        elapsed = n

    #near-zero rate -> equal principal repayments each month
    if abs(r_m) < 1e-10:
        instalment = P / n
        return max(P - instalment * elapsed, 0.0)

    #annuity formula: level instalment, then roll the balance forward
    growth_full = (1 + r_m) ** n
    growth_elapsed = (1 + r_m) ** elapsed
    instalment = P * (r_m * growth_full) / (growth_full - 1)
    remaining = P * growth_elapsed - instalment * ((growth_elapsed - 1) / r_m)
    return max(remaining, 0.0)

#Row-wise apply of the scalar helper (not truly vectorised, but fast enough
#at this scale and kept explicit for clarity)
ead_horizons = {"ead_6m": 6, "ead_12m": 12}
for ead_col, months_elapsed in ead_horizons.items():
    df[ead_col] = df.apply(
        lambda row, t=months_elapsed: balance_after_t_months(
            row["loan_amnt"], row["r_m"], row["term_m"], t
        ),
        axis=1,
    )

print("EAD_12m summary:")
ead_12m_summary = df["ead_12m"].describe(percentiles=[0.1, 0.5, 0.9])
display(ead_12m_summary)

EAD_12m summary:


count    541736.000000
mean      11598.974451
std        7139.202732
min         684.196071
10%        3532.481047
50%       10308.143123
90%       21877.884440
max       35890.068275
Name: ead_12m, dtype: float64

In [None]:
#Sanity check: a scheduled balance should generally not exceed the amount lent
exceeds_principal = df["ead_12m"].gt(df["loan_amnt"])
share_gt = float(exceeds_principal.mean())
print("Share where EAD_12m > loan_amnt:", share_gt)

#Average exposure at each horizon, for a quick sensitivity read
mean_ead_6m = float(df["ead_6m"].mean())
mean_ead_12m = float(df["ead_12m"].mean())
print("\nEAD sensitivity (mean):")
print("mean ead_6m :", mean_ead_6m)
print("mean ead_12m:", mean_ead_12m)

Share where EAD_12m > loan_amnt: 0.0

EAD sensitivity (mean):
mean ead_6m : 13363.571047400692
mean ead_12m: 11598.97445124416


In [None]:
# EAD invariants: under scheduled amortisation with non-negative rates the
# balance can only fall, so loan_amnt >= ead_6m >= ead_12m should hold per row.
# NOTE(review): the printed diagnostics repeat the preceding cell; what this
# cell adds is the assertions, which turn the check into a hard failure.
share_gt = float((df["ead_12m"] > df["loan_amnt"]).mean())
print("Share where EAD_12m > loan_amnt:", share_gt)
print("\nEAD sensitivity (mean):")
print("mean ead_6m :", float(df["ead_6m"].mean()))
print("mean ead_12m:", float(df["ead_12m"].mean()))

# Comparisons involving NaN evaluate to False, so rows with a missing EAD or
# principal do not trip these assertions.
assert not (df["ead_6m"] > df["loan_amnt"]).any(), "ead_6m exceeds original principal for some loans"
assert not (df["ead_12m"] > df["ead_6m"]).any(), "ead_12m exceeds ead_6m for some loans"

Share where EAD_12m > loan_amnt: 0.0

EAD sensitivity (mean):
mean ead_6m : 13363.571047400692
mean ead_12m: 11598.97445124416


In [None]:
#Base / downturn LGD (flat assumptions applied to every loan)
df["lgd_base"] = LGD_BASE
df["lgd_downturn"] = LGD_DOWNTURN

#Grade segmentation
#Normalise the labels but keep genuinely missing grades as NaN: astype(str) on
#its own turns NaN into the text "nan" (-> "NAN"), which would then be written
#to the export as if it were a real grade.
grade_missing = df["grade"].isna()
df["grade"] = df["grade"].astype(str).str.strip().str.upper().mask(grade_missing)
#Missing or unrecognised grades fall back to a neutral multiplier of 1.0
mult = df["grade"].map(GRADE_MULT).fillna(1.0)
#Clip so that a segmented LGD always stays within [0.05, 0.95]
df["lgd_base_seg"] = (LGD_BASE * mult).clip(0.05, 0.95)
df["lgd_downturn_seg"] = (LGD_DOWNTURN * mult).clip(0.05, 0.95)

#quick view
seg_counts = df["grade"].value_counts()
display(seg_counts)
print("LGD segmented summary (base):")
display(df["lgd_base_seg"].describe())
print("LGD segmented summary (downturn):")
display(df["lgd_downturn_seg"].describe())

grade
C    158910
B    144851
D     93236
A     72003
E     49816
F     18215
G      4705
Name: count, dtype: int64

LGD segmented summary (base):


count    541736.000000
mean          0.397411
std           0.065430
min           0.300000
25%           0.350000
50%           0.400000
75%           0.450000
max           0.580000
Name: lgd_base_seg, dtype: float64

LGD segmented summary (downturn):


count    541736.000000
mean          0.546440
std           0.089967
min           0.412500
25%           0.481250
50%           0.550000
75%           0.618750
max           0.797500
Name: lgd_downturn_seg, dtype: float64

In [None]:
#Columns carried into the next stage, grouped by role (order defines the CSV layout)
id_and_target_cols = ["issue_d", "default_12m", "sample_weight"]
loan_term_cols = ["loan_amnt", "installment", "int_rate", "term", "term_m"]
segment_cols = ["grade", "sub_grade", "purpose"]
borrower_cols = ["annual_inc", "emp_length", "home_ownership", "verification_status"]
credit_file_cols = [
    "dti", "delinq_2yrs", "inq_last_6mths", "open_acc", "pub_rec",
    "revol_bal", "revol_util", "total_acc", "acc_now_delinq", "addr_state",
]
engineered_cols = ["revol_credit", "loan_to_income"]
ead_cols = ["ead_6m", "ead_12m"]
lgd_cols = ["lgd_base", "lgd_downturn", "lgd_base_seg", "lgd_downturn_seg"]

keep_cols = (
    id_and_target_cols
    + loan_term_cols
    + segment_cols
    + borrower_cols
    + credit_file_cols
    + engineered_cols
    + ead_cols
    + lgd_cols
)

#Fail loudly rather than export a partial file
missing = list(filter(lambda col: col not in df.columns, keep_cols))
if len(missing) > 0:
    raise ValueError(f"Missing expected columns before export: {missing}")

df.loc[:, keep_cols].to_csv(OUT_PATH, index=False)
print("Saved:", OUT_PATH, "| rows:", len(df))

#Preview of the amortisation inputs alongside the resulting EAD proxies
preview_cols = ["loan_amnt", "installment", "int_rate", "term_m", "ead_6m", "ead_12m"]
df.loc[:, preview_cols].head()

Saved: data/loans_stage3_enriched.csv | rows: 541736


Unnamed: 0,loan_amnt,installment,int_rate,term_m,ead_6m,ead_12m
0,4800,146.4,6.17,36.0,4060.256205,3297.395949
1,5000,168.62,13.06,36.0,4295.898722,3544.550349
2,4000,129.77,10.37,36.0,3416.328549,2801.732336
3,5000,167.54,12.61,36.0,4291.676352,3537.503079
4,25000,882.89,16.32,36.0,21629.940258,17975.362463
