In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer 
from sklearn.preprocessing import StandardScaler

# Read the cleaned credit-default table written by the earlier cleaning step,
# then preview the first rows as the cell output.
cleaned_csv_path = '../data/processed/credit_default_cleaned.csv'
df = pd.read_csv(cleaned_csv_path)
df.head()

Unnamed: 0,LIMIT_BAL,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,...,PAY_AMT4,PAY_AMT5,PAY_AMT6,default,SEX_M,EDUCATION_High School,EDUCATION_Other,EDUCATION_University,MARRIAGE_Other,MARRIAGE_Single
0,20000,2,2,-1,-1,-2,-2,3913,3102,689,...,0,0,0,1,False,False,False,True,False,False
1,120000,-1,2,0,0,0,2,2682,1725,2682,...,1000,0,2000,1,False,False,False,True,False,True
2,90000,0,0,0,0,0,0,29239,14027,13559,...,1000,1000,5000,0,False,False,False,True,False,True
3,50000,0,0,0,0,0,0,46990,48233,49291,...,1100,1069,1000,0,False,False,False,True,False,False
4,50000,-1,0,-1,0,0,0,8617,5670,35835,...,9000,689,679,0,True,False,False,True,False,False


# Behavioral Features

In [None]:
# Payment Behavior Features
# Row-wise summaries of the six repayment-status columns.
pay_cols = ['PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']
pay_status = df[pay_cols]

# Worst and average status across the six columns.
df['max_delinquency'] = pay_status.max(axis=1)
df['avg_delinquency'] = pay_status.mean(axis=1)

# Number of status columns with a strictly positive value.
df['num_missed_payments'] = pay_status.gt(0).sum(axis=1)

# NOTE(review): this is an exact copy of PAY_0, which is not removed by the
# later column drop -- the two columns are perfectly collinear.
df['recent_delinquency'] = df['PAY_0']

# Delinquency Trend: PAY_0 minus PAY_6.
df['delinq_trend'] = df['PAY_0'].sub(df['PAY_6'])

df.shape

(30000, 31)

# Utilization & Exposure Features

In [None]:
# Utilization features: bill amounts expressed relative to the credit limit.
bill_cols = ['BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6']
bill_amounts = df[bill_cols]
credit_limit = df['LIMIT_BAL']

# Mean bill over the six bill columns, and that mean as a share of the limit.
df['avg_bill_amt'] = bill_amounts.mean(axis=1)
df['credit_utilization'] = df['avg_bill_amt'].div(credit_limit)

# Largest single bill as a share of the limit.
df['max_utilization'] = bill_amounts.max(axis=1).div(credit_limit)

df.shape


(30000, 34)

# Payment Capacity & Liquidity Features

In [None]:
# Payment-to-bill ratio: average payment amount relative to average bill amount.

pay_amt_cols = ['PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']

df['avg_payment'] = df[pay_amt_cols].mean(axis=1)

# The "+ 1" keeps the denominator non-zero when the average bill is 0, but it
# does NOT protect against avg_bill_amt == -1: there the denominator is 0, so
# 0/0 yields NaN and any non-zero payment yields +/-inf. isnull() does not
# report inf, so convert it to NaN here; that way every zero-denominator row
# is visible to the missing-value check and filled by the median imputation
# cell further down.
# NOTE(review): rows with avg_bill_amt < -1 still produce a negative ratio --
# confirm whether that is intended.
ratio_denominator = df['avg_bill_amt'] + 1
df['payment_ratio'] = (df['avg_payment'] / ratio_denominator).replace(
    [np.inf, -np.inf], np.nan
)

# Payment Consistency: row-wise standard deviation of the payment amounts.
df['payment_std'] = df[pay_amt_cols].std(axis=1)

df.shape

(30000, 37)

# Stability & Volatility Features 

In [None]:
# Volatility features: row-wise standard deviation of the bill columns,
# in raw amounts and as a fraction of the credit limit.
monthly_bills = df[bill_cols]

# Bill Volatility
df['bill_vol'] = monthly_bills.std(axis=1)

# Utilization Volatility: divide each bill column by LIMIT_BAL row by row first.
monthly_utilization = monthly_bills.div(df['LIMIT_BAL'], axis=0)
df['uti_vol'] = monthly_utilization.std(axis=1)

df.shape

(30000, 39)

# Feature Reduction

In [None]:
# Correlation pruning: remove three of the raw bill columns and all
# repayment-status columns except PAY_0.
# Re-running this cell on the already-pruned frame raises KeyError.
columns_to_drop = [
    'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3',
    'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6',
]
df = df.drop(columns=columns_to_drop)

df.shape

(30000, 31)

In [None]:
# Count missing values per column (NaN only; infinite values are not counted).
df.isna().sum()

LIMIT_BAL                0
PAY_0                    0
BILL_AMT4                0
BILL_AMT5                0
BILL_AMT6                0
PAY_AMT1                 0
PAY_AMT2                 0
PAY_AMT3                 0
PAY_AMT4                 0
PAY_AMT5                 0
PAY_AMT6                 0
default                  0
SEX_M                    0
EDUCATION_High School    0
EDUCATION_Other          0
EDUCATION_University     0
MARRIAGE_Other           0
MARRIAGE_Single          0
max_delinquency          0
avg_delinquency          0
num_missed_payments      0
recent_delinquency       0
delinq_trend             0
avg_bill_amt             0
credit_utilization       0
max_utilization          0
avg_payment              0
payment_ratio            2
payment_std              0
bill_vol                 0
uti_vol                  0
dtype: int64

In [None]:
# Fill the missing payment_ratio entries with the column median.
ratio_col = ['payment_ratio']
imputer = SimpleImputer(strategy='median')
imputed_ratio = imputer.fit_transform(df[ratio_col])
df[ratio_col] = imputed_ratio

In [None]:
# Re-count missing values per column after imputation.
df.isna().sum()

LIMIT_BAL                0
PAY_0                    0
BILL_AMT4                0
BILL_AMT5                0
BILL_AMT6                0
PAY_AMT1                 0
PAY_AMT2                 0
PAY_AMT3                 0
PAY_AMT4                 0
PAY_AMT5                 0
PAY_AMT6                 0
default                  0
SEX_M                    0
EDUCATION_High School    0
EDUCATION_Other          0
EDUCATION_University     0
MARRIAGE_Other           0
MARRIAGE_Single          0
max_delinquency          0
avg_delinquency          0
num_missed_payments      0
recent_delinquency       0
delinq_trend             0
avg_bill_amt             0
credit_utilization       0
max_utilization          0
avg_payment              0
payment_ratio            0
payment_std              0
bill_vol                 0
uti_vol                  0
dtype: int64

In [None]:
# Write the feature-engineered frame to disk without the row index.
engineered_csv_path = '../data/processed/credit_default_engineered.csv'
df.to_csv(engineered_csv_path, index=False)