In [9]:
import pandas as pd
import numpy as np

from pathlib import Path

# Directory holding the processed datasets (relative to the notebook's working directory).
DATA_PATH = Path("../data/processed")

# Load the cleaned credit-default table.
clean_csv = DATA_PATH / "credit_default_clean.csv"
df = pd.read_csv(clean_csv)

# Preview the first five rows.
df.head()


Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
1,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,3,90000,2,2,2,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,4,50000,2,2,1,37,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,5,50000,1,2,1,57,-1,0,-1,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [10]:
# Name of the label column.
TARGET = "default payment next month"

# Separate the label series from the remaining columns, which become the feature frame.
# NOTE(review): the ID column appears to remain in X — confirm it is excluded before modelling.
y = df[TARGET]
X = df.drop(columns=TARGET)

# Show the feature-frame shape together with the class proportions.
X.shape, y.value_counts(normalize=True)


((30000, 24),
 default payment next month
 0    0.7788
 1    0.2212
 Name: proportion, dtype: float64)

## Feature Engineering Strategy

Credit risk is driven by:
- Credit utilization
- Payment behavior over time
- Recent delinquency
- Stability vs volatility of payments

We engineer features that summarize **behavioral patterns**, rather than relying on raw monthly values alone.


In [11]:
limit_col = "LIMIT_BAL"
bill_cols = ["BILL_AMT" + str(month) for month in range(1, 7)]

# Credit utilization: each BILL_AMTn divided by the credit limit.
# One util_bill_amtN column is added per BILL_AMTn column; a zero limit would
# yield inf/NaN here.
utilization = {
    "util_" + bill.lower(): X[bill] / X[limit_col]
    for bill in bill_cols
}
X = X.assign(**utilization)

# Preview the new utilization columns.
X.filter(like="util_").head()


Unnamed: 0,util_bill_amt1,util_bill_amt2,util_bill_amt3,util_bill_amt4,util_bill_amt5,util_bill_amt6
0,0.19565,0.1551,0.03445,0.0,0.0,0.0
1,0.02235,0.014375,0.02235,0.027267,0.028792,0.027175
2,0.324878,0.155856,0.150656,0.159233,0.166089,0.172767
3,0.9398,0.96466,0.98582,0.56628,0.57918,0.59094
4,0.17234,0.1134,0.7167,0.4188,0.38292,0.38262


In [12]:
pay_amt_cols = [f"PAY_AMT{i}" for i in range(1, 7)]

# Payment-to-bill ratio for each pair (PAY_AMTn over BILL_AMTn).
# The bill is floored at zero before the +1 offset so the denominator is
# always >= 1. A plain `bill + 1` is zero when the bill is -1 (division by
# zero -> inf) and negative for any bill below -1, which flips the sign of
# the ratio. Rows whose bills are non-negative are unaffected by the floor.
for pay_col, bill_col in zip(pay_amt_cols, bill_cols):
    denominator = X[bill_col].clip(lower=0) + 1
    X[f"pay_ratio_{pay_col.lower()}"] = X[pay_col] / denominator

# Preview the new pay-ratio columns.
X[[c for c in X.columns if "pay_ratio_" in c]].head()


Unnamed: 0,pay_ratio_pay_amt1,pay_ratio_pay_amt2,pay_ratio_pay_amt3,pay_ratio_pay_amt4,pay_ratio_pay_amt5,pay_ratio_pay_amt6
0,0.0,0.222043,0.0,0.0,0.0,0.0
1,0.0,0.579374,0.372717,0.30553,0.0,0.613121
2,0.051915,0.106929,0.073746,0.069774,0.066894,0.321543
3,0.042561,0.041858,0.024345,0.038849,0.036913,0.033843
4,0.232072,6.468171,0.279049,0.429779,0.035985,0.03549


In [13]:
# Payment level and payment volatility, computed row-wise

payments = X[pay_amt_cols]

# Mean of the six payment amounts, zeros included.
X["avg_payment"] = payments.mean(axis=1)

# Standard deviation across the non-zero payments only: every zero payment is
# treated as missing before the row-wise std is taken. Rows with fewer than
# two non-zero payments have no defined std and are assigned 0.
nonzero_payments = payments.replace(0, np.nan)
payment_std = nonzero_payments.std(axis=1)
X["payment_consistency"] = payment_std.fillna(0)


In [14]:
# Final safety cleanup: convert +/-inf to NaN, then zero-fill every NaN.
non_finite = [np.inf, -np.inf]
X = X.replace(non_finite, np.nan).fillna(0)

# Total count of remaining missing values across the whole frame.
X.isna().sum().sum()


np.int64(0)

In [15]:
# Build the output frame: a copy of the engineered features with the label
# column appended as the last column.
df_features = X.assign(**{TARGET: y})

df_features.shape


(30000, 39)

In [17]:
# Write the feature table to the processed-data directory, omitting the index.
OUTPUT_PATH = Path("../data/processed")
features_csv = OUTPUT_PATH / "credit_default_features.csv"
df_features.to_csv(features_csv, index=False)
