### Stage 5 - Expected Credit Loss (ECL)

This notebook computes simplified IFRS 9 ECL using outputs from Stage 4:

- Stage 1: 12-month ECL
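- Stage 2/3: lifetime ECL proxy using a constant-hazard approximation, $PD_{\text{life}} = 1 - (1 - PD_{12m})^{T}$, where $T$ is the loan term in years (bounded below by 1 and above by `MAX_YEARS`)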
- base vs downturn LGD
- scenario-weighted ECL (configurable)
- portfolio summaries using `sample_weight` (to correct for Stage 1 downsampling)

In [None]:
import os

import numpy as np
import pandas as pd

# Configuration
# DATA_PATH is the staged loan file read by this notebook; OUT_PATH is where
# the same rows plus the ECL columns are written.
DATA_PATH = "data/loans_stage4_staged.csv"
OUT_PATH = "data/loans_stage5_ecl.csv"

# Scenario weights applied to the base-LGD and downturn-LGD ECL.
W_BASE = 0.6
W_DOWN = 0.4
# The scenario-weighted ECL is only a proper expectation when the weights sum
# to 1; fail fast instead of silently mis-scaling every downstream total.
if abs(W_BASE + W_DOWN - 1.0) > 1e-9:
    raise ValueError(
        f"Scenario weights must sum to 1, got W_BASE + W_DOWN = {W_BASE + W_DOWN}"
    )

# Guardrails
EPS = 1e-6       # PDs are clipped to [EPS, 1 - EPS]
MAX_YEARS = 6.0  # upper bound on the lifetime horizon, in years

# Create the output directory from OUT_PATH itself so the two cannot drift
# apart when the path is reconfigured.
_out_dir = os.path.dirname(OUT_PATH)
if _out_dir:
    os.makedirs(_out_dir, exist_ok=True)

In [None]:
# Load the Stage 4 output and validate its schema BEFORE touching any column,
# so a missing column (including `issue_d`) produces one clear error instead
# of a raw KeyError part-way through the cell.
required = ["issue_d", "stage", "pd_hat", "ead_12m", "lgd_base_used", "lgd_downturn_used", "term", "sample_weight"]
df = pd.read_csv(DATA_PATH)
missing = [c for c in required if c not in df.columns]
if missing:
    raise ValueError(f"Missing required columns from Stage 4: {missing}")

# Unparseable dates become NaT and are dropped; report how many so the loss
# of rows is visible rather than silent.
df["issue_d"] = pd.to_datetime(df["issue_d"], errors="coerce")
n_bad_dates = int(df["issue_d"].isna().sum())
if n_bad_dates:
    print("Rows dropped (unparseable issue_d):", n_bad_dates)
df = df.dropna(subset=["issue_d"]).sort_values("issue_d").reset_index(drop=True)

print("Rows:", len(df))
print("Issue date range:", df["issue_d"].min().date(), "->", df["issue_d"].max().date())
print("Stage distribution:")
print(df["stage"].value_counts(normalize=True).sort_index())

Rows: 541736
Issue date range: 2011-01-01 -> 2016-12-01
Stage distribution:
stage
1    0.573789
2    0.027543
3    0.398668
Name: proportion, dtype: float64


In [None]:
def term_to_months(x):
    """Parse a loan term into a number of months.

    Parameters
    ----------
    x : str, int, float or missing
        Raw term value, e.g. ``" 36 months"``, ``60`` or ``36.0``.

    Returns
    -------
    float
        The term in months, or ``np.nan`` when ``x`` is missing or contains
        no number.

    Notes
    -----
    Numeric inputs are returned as-is. Strings are parsed by taking the first
    number they contain. Concatenating every digit in the string would turn a
    float term such as ``36.0`` (string form ``"36.0"``) into ``360``.
    """
    import re  # local import: only this helper needs it

    if pd.isna(x):
        return np.nan
    # bool is an int subclass; it is not a meaningful term, so it falls
    # through to the text path (where it yields NaN).
    if isinstance(x, (int, float, np.integer, np.floating)) and not isinstance(x, bool):
        return float(x)
    found = re.search(r"\d+(?:\.\d+)?", str(x))
    if found is None:
        return np.nan
    return float(found.group())

# Term in months, parsed from the raw `term` column.
term_months = df["term"].apply(term_to_months)
df["term_m"] = term_months

# Horizon in years, bounded to [1, MAX_YEARS].
horizon_years = term_months / 12.0
df["T_years"] = horizon_years.clip(lower=1.0, upper=MAX_YEARS)

# 12-month PD, kept away from exactly 0 and 1.
pd_floor, pd_cap = EPS, 1 - EPS
df["pd_12m"] = df["pd_hat"].clip(pd_floor, pd_cap)

# Lifetime PD: one minus the 12-month survival probability compounded over
# the horizon.
survival_12m = 1 - df["pd_12m"]
df["pd_lifetime"] = 1 - survival_12m ** df["T_years"]

display(df[["pd_12m", "pd_lifetime", "term_m", "T_years"]].describe(percentiles=[0.1, 0.5, 0.9]))

Unnamed: 0,pd_12m,pd_lifetime,term_m,T_years
count,541736.0,541736.0,541736.0,541736.0
mean,0.184726,0.487285,43.646839,3.637237
std,0.099959,0.231714,11.182586,0.931882
min,0.019447,0.057213,36.0,3.0
10%,0.066846,0.189176,36.0,3.0
50%,0.170036,0.460384,36.0,3.0
90%,0.320851,0.83323,60.0,5.0
max,0.929389,0.999648,60.0,5.0


In [None]:
# Exposure proxy: the 12-month EAD is used for every stage.
df["ead_used"] = df["ead_12m"]

# Main case: Stage 1 uses the 12-month PD, every other stage the lifetime PD.
is_stage1 = df["stage"] == 1
df["pd_used_main"] = np.where(is_stage1, df["pd_12m"], df["pd_lifetime"])

# Sensitivity case: identical, except Stage 3 is given PD = 1.
is_stage3 = df["stage"] == 3
df["pd_used_s3_1"] = np.where(is_stage3, 1.0, df["pd_used_main"])

# ECL = PD x LGD x EAD for each (PD variant, LGD scenario) pair.
lgd_columns = {"base": "lgd_base_used", "downturn": "lgd_downturn_used"}
pd_variants = ("main", "s3_1")
for variant in pd_variants:
    for scenario, lgd_col in lgd_columns.items():
        df[f"ecl_{scenario}_{variant}"] = df[f"pd_used_{variant}"] * df[lgd_col] * df["ead_used"]

# Scenario-weighted ECL per PD variant.
for variant in pd_variants:
    df[f"ecl_weighted_{variant}"] = (
        W_BASE * df[f"ecl_base_{variant}"] + W_DOWN * df[f"ecl_downturn_{variant}"]
    )

display(df[["ecl_base_main", "ecl_downturn_main"]].describe(percentiles=[0.1, 0.5, 0.9]))

Unnamed: 0,ecl_base_main,ecl_downturn_main
count,541736.0,541736.0
mean,1928.388127,2651.533675
std,2554.333773,3512.208938
min,5.642921,7.759016
10%,145.846569,200.539032
50%,869.382462,1195.400886
90%,5386.291974,7406.151464
max,20144.098705,27698.13572


In [None]:
def wmean(x, w):
    """Return the weighted mean of ``x`` with weights ``w``: sum(x * w) / sum(w)."""
    weighted_total = (x * w).sum()
    weight_total = w.sum()
    return weighted_total / weight_total
# Row weights, cast to float, used for every weighted figure in this cell.
w = df["sample_weight"].astype(float)
n_rows_total = len(df)
w_total = w.sum()

# One summary record per stage: raw and weighted shares plus weighted average
# ECL under each scenario.
summary = []
for stage_id, stage_rows in df.groupby("stage"):
    stage_w = w.loc[stage_rows.index]
    stage_w_sum = stage_w.sum()
    record = {
        "stage": stage_id,
        "rows": len(stage_rows),
        "share": len(stage_rows) / n_rows_total,
        "w_sum": float(stage_w_sum),
        "w_share": float(stage_w_sum / w_total),
    }
    for ecl_col in ("ecl_base_main", "ecl_downturn_main", "ecl_weighted_main"):
        record[f"avg_{ecl_col}_w"] = float(wmean(stage_rows[ecl_col], stage_w))
    summary.append(record)

summary_df = pd.DataFrame(summary).set_index("stage").sort_index()
print(summary_df)

# Weighted portfolio totals, each computed once and reused below.
weighted_totals = {
    col: float((df[col] * w).sum())
    for col in ("ecl_base_main", "ecl_downturn_main", "ecl_weighted_main", "ecl_weighted_s3_1")
}
print("\nWEIGHTED TOTALS:")
for col, value in weighted_totals.items():
    print(f"total_{col}_w :", value)

total_ead = float((df["ead_used"] * w).sum())
total_ecl = weighted_totals["ecl_weighted_main"]
print("\nWeighted total EAD:", total_ead)
print("Weighted total ECL:", total_ecl)
print("ECL / EAD:", total_ecl / total_ead)

         rows     share         w_sum   w_share  avg_ecl_base_main_w  \
stage                                                                  
1      310842  0.573789  1.036140e+06  0.795898           771.491584   
2       14921  0.027543  4.973667e+04  0.038205          3468.951603   
3      215973  0.398668  2.159730e+05  0.165897          3487.033364   

       avg_ecl_downturn_main_w  avg_ecl_weighted_main_w  
stage                                                    
1                  1060.800928               887.215322  
2                  4769.808454              3989.294343  
3                  4794.670876              4010.088369  

WEIGHTED TOTALS:
total_ecl_base_main_w : 1725012436.3026912
total_ecl_downturn_main_w : 2371892099.9162016
total_ecl_weighted_main_w : 1983764301.748095
total_ecl_weighted_s3_1_w : 2436120045.1446166

Weighted total EAD: 14764123459.98682
Weighted total ECL: 1983764301.748095
ECL / EAD: 0.13436383860676995


In [None]:
# Weighted EAD / ECL by issue year and stage.
#
# The per-row weighted quantities are computed once, vectorized, and then
# summed per group with a named aggregation. This replaces a
# `groupby(...).apply(lambda g: pd.Series(...))`, which emitted a pandas
# warning, upcast the integer row count to float, and ran a Python-level
# function per group.
weighted_rows = pd.DataFrame({
    "year": df["issue_d"].dt.year,
    "stage": df["stage"],
    "w_ead": df["ead_used"] * df["sample_weight"],
    "w_ecl": df["ecl_weighted_main"] * df["sample_weight"],
})
by_year_stage = (
    weighted_rows
    .groupby(["year", "stage"])
    .agg(
        w_ead=("w_ead", "sum"),
        w_ecl=("w_ecl", "sum"),
        rows=("w_ead", "size"),  # "size" counts every row in the group
    )
    .reset_index()
)

by_year_stage["ecl_over_ead"] = by_year_stage["w_ecl"] / by_year_stage["w_ead"]
display(by_year_stage.sort_values(["year", "stage"]))

# Persist the full loan-level frame (all ECL columns included) for later use.
df.to_csv(OUT_PATH, index=False)
print("Saved:", OUT_PATH, "| rows:", len(df))

  .apply(lambda g: pd.Series({


Unnamed: 0,year,stage,w_ead,w_ecl,rows,ecl_over_ead
0,2011,1,159906500.0,13472530.0,5249.0,0.084253
1,2011,2,7490088.0,2612355.0,213.0,0.348775
2,2011,3,35344810.0,12336790.0,3297.0,0.349041
3,2012,1,429240700.0,34380650.0,12931.0,0.080096
4,2012,2,15577750.0,3585931.0,503.0,0.230196
5,2012,3,99903340.0,31333100.0,8644.0,0.313634
6,2013,1,1211850000.0,97485200.0,33106.0,0.080443
7,2013,2,40283100.0,12405030.0,1059.0,0.307946
8,2013,3,263201300.0,85144080.0,21022.0,0.323494
9,2014,1,2079935000.0,175352000.0,55385.0,0.084306


Saved: data/loans_stage5_ecl.csv | rows: 541736


In [None]:
# Spot-check the first rows: inputs, derived PDs and the scenario-weighted ECL.
preview_cols = [
    "issue_d", "stage", "pd_hat", "pd_12m", "pd_lifetime",
    "ead_used", "lgd_base_used", "lgd_downturn_used",
    "ecl_weighted_main", "sample_weight",
]
df[preview_cols].head()

Unnamed: 0,issue_d,stage,pd_hat,pd_12m,pd_lifetime,ead_used,lgd_base_used,lgd_downturn_used,ecl_weighted_main,sample_weight
0,2011-01-01,1,0.038266,0.038266,0.110461,3297.395949,0.3,0.4125,43.531207,3.333333
1,2011-01-01,3,0.551231,0.551231,0.981798,13278.647494,0.58,0.7975,8695.646676,1.0
2,2011-01-01,3,0.108335,0.108335,0.291067,2801.732336,0.35,0.48125,328.235706,1.0
3,2011-01-01,3,0.166766,0.166766,0.421502,3537.503079,0.4,0.55,685.890404,1.0
4,2011-01-01,3,0.288221,0.288221,0.639392,17975.362463,0.5,0.6875,6608.651634,1.0


### Summary

Dataset size after cutoff: 541,736 rows

Issue date range: 2011-01-01 → 2016-12-01

Stage distribution (raw):

- Stage 1: 0.573789

- Stage 2: 0.027543

- Stage 3: 0.398668

Weighted stage shares (w_share):

- Stage 1: 0.795898

- Stage 2: 0.038205

- Stage 3: 0.165897

Weighted totals:

- total_ecl_base_main_w: 1,725,012,436.30

- total_ecl_downturn_main_w: 2,371,892,099.92

- total_ecl_weighted_main_w: 1,983,764,301.75

- Weighted total EAD: 14,764,123,459.99

ECL/EAD: 0.134364

Sensitivity (Stage 3 PD=1), total_ecl_weighted_s3_1_w: 2,436,120,045.14