In [1]:
import sys
import os
sys.path.append(os.path.abspath('..'))
from src import data_loading, preprocessing

# Load the cleaned credit-default CSV through the project's loader and
# preview the first rows.
cleaned_csv_path = '../data/processed/credit_default_cleaned.csv'
df = data_loading.load_data(cleaned_csv_path)
df.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default
0,20000,F,University,Married,2,2,-1,-1,-2,-2,...,0,0,0,0,689,0,0,0,0,1
1,120000,F,University,Single,-1,2,0,0,0,2,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,90000,F,University,Single,0,0,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,50000,F,University,Married,0,0,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,50000,M,University,Married,-1,0,-1,0,0,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


## Feature Engineering & Dimensionality Reduction

The following table outlines the engineered feature categories and the specific variables removed to address multicollinearity and improve model performance.

| Category | Engineered Features | Correlation Pruning (Dropped) | Description |
| :--- | :--- | :--- | :--- |
| **Behavioral** | `max_delinquency`, `avg_delinquency`, `num_missed_payments`, `recent_delinquency`, `delinq_trend` | `PAY_2`, `PAY_3`, `PAY_4`, `PAY_5`, `PAY_6` | Captures historical payment patterns and trajectory while removing the redundant lagged status columns. |
| **Utilization & Exposure** | `avg_bill_amt`, `credit_utilization`, `max_utilization` | `BILL_AMT1`, `BILL_AMT2`, `BILL_AMT3` | Consolidates highly correlated billing statements into aggregate exposure metrics. |
| **Stability & Volatility** | `bill_vol`, `uti_vol` | — | Measures the variance and consistency of credit usage over time. |
| **Payment Capacity** | `avg_payment`, `payment_ratio`, `payment_std` | — | Assesses the borrower's ability to cover balances relative to their total debt. |


In [10]:
# Run the project's feature-engineering step on the loaded frame and rebind
# `df` to the result, so the cells below work on the engineered columns.
# NOTE(review): re-running this cell passes the already-engineered frame back
# into feature_engineering — restart the kernel and run all to reproduce.
engineered = preprocessing.feature_engineering(df)
df = engineered
df.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,PAY_0,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,...,recent_delinquency,delinq_trend,avg_bill_amt,credit_utilization,max_utilization,bill_vol,uti_vol,avg_payment,payment_ratio,payment_std
0,20000,F,University,Married,2,0,0,0,0,689,...,2,4,1284.0,0.0642,0.19565,1761.633219,0.088082,114.833333,0.089364,281.283072
1,120000,F,University,Single,-1,3272,3455,3261,0,1000,...,-1,-3,2846.166667,0.023718,0.028792,637.967841,0.005316,833.333333,0.292689,752.772653
2,90000,F,University,Single,0,14331,14948,15549,1518,1500,...,0,0,16942.166667,0.188246,0.324878,6064.518593,0.067384,1836.333333,0.108382,1569.815488
3,50000,F,University,Married,0,28314,28959,29547,2000,2019,...,0,0,38555.666667,0.771113,0.98582,10565.793518,0.211316,1398.0,0.036258,478.058155
4,50000,M,University,Married,-1,20940,19146,19131,2000,36681,...,-1,-1,18223.166667,0.364463,0.7167,10668.590074,0.213372,9841.5,0.540025,13786.230736


In [3]:
# Count missing values per column. This frame is written to disk at the end
# of the notebook, so fail loudly on any null instead of relying on a visual
# scan of the counts below.
null_counts = df.isnull().sum()
assert null_counts.sum() == 0, (
    f"Missing values found in: {null_counts[null_counts > 0].to_dict()}"
)
# Last expression: still display the per-column counts as the cell output.
null_counts

LIMIT_BAL              0
SEX                    0
EDUCATION              0
MARRIAGE               0
PAY_0                  0
BILL_AMT4              0
BILL_AMT5              0
BILL_AMT6              0
PAY_AMT1               0
PAY_AMT2               0
PAY_AMT3               0
PAY_AMT4               0
PAY_AMT5               0
PAY_AMT6               0
default                0
max_delinquency        0
avg_delinquency        0
num_missed_payments    0
recent_delinquency     0
delinq_trend           0
avg_bill_amt           0
credit_utilization     0
max_utilization        0
bill_vol               0
uti_vol                0
avg_payment            0
payment_ratio          0
payment_std            0
dtype: int64

In [14]:
# Persist the feature-engineered frame as CSV; index=False keeps the row
# index out of the written file.
engineered_csv_path = '../data/processed/credit_default_engineered.csv'
df.to_csv(engineered_csv_path, index=False)