# 0. IMPORT PACKAGES & DATASET

In [1]:
import pandas as pd 
import numpy as np
np.random.seed(42)

pd.set_option('display.max_columns', 500)
pd.set_option('display.max_columns', 100)
# pd.set_option('display.max_rows', None)  # Show all rows

import seaborn as sns 
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix
from scipy.stats import skew

import shap
import joblib

In [2]:
df = pd.read_parquet("loan_final.parquet")

In [3]:
# Drop columns due to possible data leakage or missing values (>60%) or noisy 
cols_to_drop = [
    'annual_inc_joint', 'verification_status_joint', 'sec_app_earliest_cr_line', 'sec_app_inq_last_6mths', 'sec_app_mort_acc',
    'sec_app_open_acc', 'sec_app_revol_util', 'sec_app_open_act_il', 'sec_app_num_rev_accts', 'last_credit_pull_d', 'mths_since_last_record', 'mths_since_last_major_derog',
    'mths_since_recent_bc_dlq', 'mths_since_recent_revol_delinq', 'sec_app_chargeoff_within_12_mths', 'sec_app_collections_12_mths_ex_med', 'sec_app_mths_since_last_major_derog', 
    'dti_joint', 'revol_bal_joint', 'url', 'desc', 'out_prncp', 'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int', 'total_rec_late_fee', 
    'recoveries', 'collection_recovery_fee', 'last_pymnt_d', 'last_pymnt_amnt', 'policy_code', 'hardship_flag', 'hardship_type', 'hardship_reason', 'hardship_status', 
    'deferral_term', 'hardship_amount', 'hardship_start_date', 'hardship_end_date', 'payment_plan_start_date', 'hardship_length', 'hardship_dpd', 'hardship_loan_status',
    'orig_projected_additional_accrued_interest', 'hardship_payoff_balance_amount', 'hardship_last_payment_amount', 'debt_settlement_flag_date', 'settlement_status', 
    'settlement_date', 'settlement_amount', 'settlement_percentage', 'settlement_term', 'next_pymnt_d', 'issue_d', 'loan_status', 'title', 'emp_title', 'addr_state', 
    'zip_code', 'earliest_cr_line'
]
df = df.drop(columns=cols_to_drop)
print(df.shape)
df.head()

(2260668, 84)


Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,pymnt_plan,purpose,initial_list_status,disbursement_method,debt_settlement_flag,snapshot_date,emp_length,home_ownership,annual_inc,verification_status,application_type,mort_acc,num_tl_op_past_12m,inq_last_6mths,inq_last_12m,inq_fi,mths_since_last_delinq,mths_since_recent_inq,mths_since_rcnt_il,mths_since_recent_bc,acc_now_delinq,delinq_2yrs,pub_rec,collections_12_mths_ex_med,chargeoff_within_12_mths,tax_liens,pub_rec_bankruptcies,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_accts_ever_120_pd,delinq_amnt,dti,revol_bal,revol_util,total_rev_hi_lim,tot_coll_amt,tot_cur_bal,avg_cur_bal,all_util,max_bal_bc,il_util,bc_util,total_bal_il,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,tot_hi_cred_lim,open_acc,total_acc,open_acc_6m,open_act_il,open_il_12m,open_il_24m,open_rv_12m,open_rv_24m,acc_open_past_24mths,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,num_actv_bc_tl,num_actv_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_il_tl,num_bc_tl,num_op_rev_tl,num_sats,num_bc_sats,total_cu_tl,bc_open_to_buy,percent_bc_gt_75,pct_tl_nvr_dlq
0,LOAN_1653274,MEM_931131,6000,6000,6000.0,36 months,8.39,189.1,A,A5,n,credit_card,f,Cash,N,2022-01-01,< 1 year,RENT,48000.0,Not Verified,Individual,0.0,0.0,0.0,,,19.0,13.0,,17.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,18.98,6151,39.4,15600.0,0.0,41018.0,6836.0,,,,45.2,,41018.0,13600.0,40773.0,56373.0,6.0,20.0,,,,,,,6.0,90.0,86.0,17.0,13.0,1.0,1.0,8.0,1.0,11.0,6.0,5.0,6.0,4.0,,7449.0,25.0,85.0
1,LOAN_812253,MEM_1111718,35000,35000,35000.0,60 months,20.99,946.68,E,E4,n,debt_consolidation,f,Cash,N,2022-01-01,< 1 year,MORTGAGE,110000.0,Verified,Individual,3.0,1.0,1.0,,,67.0,3.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,11.27,4093,40.5,10100.0,59.0,153530.0,21933.0,,,,,,49839.0,0.0,57417.0,186222.0,8.0,16.0,,,,,,,2.0,141.0,189.0,31.0,3.0,0.0,2.0,6.0,2.0,6.0,3.0,3.0,8.0,0.0,,,,87.5
2,LOAN_2156025,MEM_545144,15000,15000,15000.0,36 months,7.49,466.53,A,A4,n,credit_card,w,Cash,N,2022-01-01,2 years,RENT,95000.0,Not Verified,Individual,1.0,3.0,0.0,2.0,0.0,7.0,10.0,7.0,10.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19.09,34397,44.7,76900.0,0.0,88183.0,4409.0,63.0,15084.0,86.0,50.7,53786.0,88183.0,63000.0,62783.0,139683.0,21.0,29.0,0.0,4.0,2.0,4.0,1.0,2.0,6.0,128.0,298.0,10.0,7.0,7.0,12.0,22.0,12.0,6.0,14.0,17.0,21.0,10.0,0.0,31060.0,22.2,93.1
3,LOAN_1914224,MEM_196440,7000,7000,7000.0,36 months,10.99,229.14,B,B4,n,debt_consolidation,w,Cash,N,2022-01-01,1 year,RENT,80000.0,Source Verified,Individual,0.0,1.0,1.0,3.0,0.0,,1.0,40.0,85.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.89,14258,75.4,18900.0,0.0,21285.0,3041.0,65.0,4224.0,50.0,83.5,7027.0,21285.0,6200.0,14000.0,32900.0,7.0,11.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,53.0,103.0,11.0,11.0,2.0,5.0,8.0,5.0,3.0,4.0,6.0,7.0,3.0,2.0,1020.0,33.3,100.0
4,LOAN_683879,MEM_968196,15600,15600,15600.0,36 months,12.69,523.3,C,C2,n,credit_card,w,Cash,N,2022-01-01,4 years,RENT,50000.0,Not Verified,Individual,0.0,6.0,0.0,,,,12.0,,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,35.02,19643,77.0,25500.0,0.0,31290.0,2235.0,,,,95.1,,31290.0,17500.0,26208.0,51708.0,15.0,22.0,,,,,,,12.0,52.0,90.0,10.0,8.0,5.0,8.0,12.0,8.0,10.0,6.0,12.0,15.0,6.0,,856.0,80.0,100.0


# 1. SPLIT DATA

In [4]:
# Sort by date column 
df = df.sort_values('snapshot_date')

# Define cutoff dates to split into train/val/test 
# Train/val/test: 2022-01-01 to 2024-03-31
# OOT1: 2024-04-01 to 2024-06-30
# OOT2: 2024-07-01 to 2024-09-30
# OOT3: 2024-10-01 to 2024-12-31

oot3 = df[(df['snapshot_date'] >= '2024-10-01') & (df['snapshot_date'] <= '2024-12-31')]
oot2 = df[(df['snapshot_date'] >= '2024-07-01') & (df['snapshot_date'] <= '2024-09-30')]
oot1 = df[(df['snapshot_date'] >= '2024-04-01') & (df['snapshot_date'] <= '2024-06-30')]
train_val_test = df[df['snapshot_date'] < '2024-04-01']

In [5]:
X = train_val_test.drop(columns=['id', 'member_id', 'grade', 'sub_grade', 'snapshot_date'])
y = train_val_test['grade']

In [6]:
X_train, X_, y_train, y_ = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_, y_, test_size=0.5, random_state=42)

print(X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, y_test.shape)

(1185713, 79) (1185713,) (254081, 79) (254081,) (254082, 79) (254082,)


# 2. DATA CLEANING & PREPROCESSING

## 2.1 Demographic Features

In [7]:
def clean_demographic_features(df): 
    df['home_ownership'] = df['home_ownership'].fillna('MISSING')
    df['annual_inc'] = df['annual_inc'].fillna(-1)
    df['emp_length'] = df['emp_length'].fillna('MISSING')

    emp_length_map = {
    '< 1 year': 0,
    '1 year': 1,
    '2 years': 2,
    '3 years': 3,
    '4 years': 4,
    '5 years': 5,
    '6 years': 6,
    '7 years': 7,
    '8 years': 8,
    '9 years': 9,
    '10+ years': 10,
    'MISSING': -1  # or use np.nan if you plan to impute
    }
    df['emp_length'] = df['emp_length'].map(emp_length_map)

    df['application_type'] = df['application_type'].map({'Individual': 0, 'Joint App': 1})


    return df 

# X_train = clean_demographic_features(X_train)

## 2.2 Credit History Features

In [8]:
def clean_credit_history_features(df):
    # Fill NA with -1. 
    cols = [
        'mort_acc', 'num_tl_op_past_12m', 'inq_last_12m', 'inq_fi', 'mths_since_last_delinq',
        'mths_since_recent_inq', 'mths_since_rcnt_il', 'mths_since_recent_bc', 'num_tl_120dpd_2m',
        'num_tl_30dpd', 'num_tl_90g_dpd_24m', 'num_accts_ever_120_pd'
    ]
    df = df.fillna({col: -1 for col in cols})

    # Impute missing values with the mode. 
    cols = [
        'inq_last_6mths', 'acc_now_delinq', 'delinq_2yrs', 'pub_rec', 
        'collections_12_mths_ex_med', 'chargeoff_within_12_mths', 'tax_liens', 'pub_rec_bankruptcies',
        'delinq_amnt'
    ]

    for col in cols:
        df[col] = df[col].fillna(df[col].mode()[0])

    return df

# X_train = clean_credit_history_features(X_train)

## 2.3 Financial Features

In [9]:
def clean_financial_features(df):
    # Impute with mean 
    df['dti'] = df['dti'].fillna(df['dti'].mean())
    
    # Impute with 0
    cols = [
        "revol_util", "total_rev_hi_lim", "tot_coll_amt", "tot_cur_bal", "avg_cur_bal", "all_util", "max_bal_bc", 
        "open_acc", "total_acc", "open_acc_6m", "open_act_il", "open_il_12m", "open_il_24m", "open_rv_12m", "open_rv_24m", 
        "acc_open_past_24mths", "num_actv_bc_tl", "num_actv_rev_tl", "num_rev_accts", "num_rev_tl_bal_gt_0", "num_il_tl", 
        "num_bc_tl", "num_op_rev_tl", "num_sats", "num_bc_sats", "total_cu_tl"
    ]
    df[cols] = df[cols].fillna(0)

    # Impute it with -1 
    cols = [
        "il_util", "bc_util", "total_bal_il", "total_bal_ex_mort", "total_bc_limit", "total_il_high_credit_limit", 
        "tot_hi_cred_lim", "mo_sin_old_il_acct", "mo_sin_old_rev_tl_op", "mo_sin_rcnt_rev_tl_op", "mo_sin_rcnt_tl", 
        "bc_open_to_buy", "percent_bc_gt_75", "pct_tl_nvr_dlq"
    ]
    df = df.fillna({col: -1 for col in cols})

    return df

# X_train = clean_financial_features(X_train)

## 2.4 Loan Terms & Payment Info Features

In [10]:
def clean_loan_terms_and_payment_info_features(df):
    # Convert string values to 0/1 
    binary_cols = ['pymnt_plan', 'debt_settlement_flag', 'initial_list_status', 'disbursement_method']
    df[binary_cols] = df[binary_cols].apply(lambda col: col.str.lower().map({'y': 1, 'n': 0, 'w': 1, 'f': 0, 'cash': 1, 'directpay': 0}))

    # Extract only the months
    df['term'] = df['term'].str.extract(r'(\d+)').astype(int)

    return df

# X_train = clean_loan_terms_and_payment_info_features(X_train)

In [11]:
# # Sanity check for missing values 
# for col in X_train:
#     print(col)
#     print(X_train[col].isna().sum())

## 2.5 Clean & Preprocess

In [12]:
X_train = clean_demographic_features(X_train)
X_train = clean_credit_history_features(X_train)
X_train = clean_financial_features(X_train)
X_train = clean_loan_terms_and_payment_info_features(X_train)

X_val = clean_demographic_features(X_val)
X_val = clean_credit_history_features(X_val)
X_val = clean_financial_features(X_val)
X_val = clean_loan_terms_and_payment_info_features(X_val)

X_test = clean_demographic_features(X_test)
X_test = clean_credit_history_features(X_test)
X_test = clean_financial_features(X_test)
X_test = clean_loan_terms_and_payment_info_features(X_test)

# 3. Splitting and Preprocessing for OOT data

In [13]:
X_oot1 = oot1.drop(columns=['id', 'member_id', 'grade', 'sub_grade', 'snapshot_date'])
y_oot1 = oot1['grade']

X_oot2 = oot2.drop(columns=['id', 'member_id', 'grade', 'sub_grade', 'snapshot_date'])
y_oot2 = oot2['grade']

X_oot3 = oot3.drop(columns=['id', 'member_id', 'grade', 'sub_grade', 'snapshot_date'])
y_oot3 = oot3['grade']

In [14]:
X_oot1 = clean_demographic_features(X_oot1)
X_oot1 = clean_credit_history_features(X_oot1)
X_oot1 = clean_financial_features(X_oot1)
X_oot1 = clean_loan_terms_and_payment_info_features(X_oot1)

X_oot2 = clean_demographic_features(X_oot2)
X_oot2 = clean_credit_history_features(X_oot2)
X_oot2 = clean_financial_features(X_oot2)
X_oot2 = clean_loan_terms_and_payment_info_features(X_oot2)

X_oot3 = clean_demographic_features(X_oot3)
X_oot3 = clean_credit_history_features(X_oot3)
X_oot3 = clean_financial_features(X_oot3)
X_oot3 = clean_loan_terms_and_payment_info_features(X_oot3)

# 4. EXTRA PREPROCESSING FOR RESPECTIVE MODELS

In [15]:
X_train.head()

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,pymnt_plan,purpose,initial_list_status,disbursement_method,debt_settlement_flag,emp_length,home_ownership,annual_inc,verification_status,application_type,mort_acc,num_tl_op_past_12m,inq_last_6mths,inq_last_12m,inq_fi,mths_since_last_delinq,mths_since_recent_inq,mths_since_rcnt_il,mths_since_recent_bc,acc_now_delinq,delinq_2yrs,pub_rec,collections_12_mths_ex_med,chargeoff_within_12_mths,tax_liens,pub_rec_bankruptcies,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_accts_ever_120_pd,delinq_amnt,dti,revol_bal,revol_util,total_rev_hi_lim,tot_coll_amt,tot_cur_bal,avg_cur_bal,all_util,max_bal_bc,il_util,bc_util,total_bal_il,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,tot_hi_cred_lim,open_acc,total_acc,open_acc_6m,open_act_il,open_il_12m,open_il_24m,open_rv_12m,open_rv_24m,acc_open_past_24mths,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,num_actv_bc_tl,num_actv_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_il_tl,num_bc_tl,num_op_rev_tl,num_sats,num_bc_sats,total_cu_tl,bc_open_to_buy,percent_bc_gt_75,pct_tl_nvr_dlq
876556,21000,21000,21000.0,36,19.72,777.45,0,debt_consolidation,0,1,0,10,RENT,56276.0,Verified,0,0.0,0.0,2.0,4.0,4.0,44.0,0.0,13.0,23.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.05,5144,51.4,10000.0,0.0,108176.0,18029.0,68.0,4868.0,73.0,97.4,103032.0,108176.0,5000.0,103113.0,113113.0,6.0,31.0,0.0,4.0,0.0,2.0,0.0,1.0,3.0,135.0,32.0,23.0,13.0,1.0,2.0,2.0,2.0,29.0,1.0,2.0,6.0,1.0,3.0,132.0,100.0,92.0
1231439,16000,16000,16000.0,60,7.97,324.2,0,debt_consolidation,0,1,0,0,MORTGAGE,82000.0,Not Verified,1,1.0,1.0,0.0,0.0,0.0,-1.0,13.0,40.0,40.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.16,1692,6.9,24400.0,0.0,163837.0,23405.0,44.0,84.0,69.0,0.6,25980.0,27672.0,14400.0,37925.0,210778.0,8.0,24.0,0.0,2.0,0.0,0.0,1.0,1.0,1.0,137.0,136.0,9.0,9.0,2.0,3.0,8.0,3.0,15.0,4.0,5.0,8.0,3.0,1.0,14308.0,0.0,100.0
1131787,30000,30000,29900.0,60,13.99,697.9,0,debt_consolidation,0,1,0,1,RENT,192000.0,Source Verified,0,0.0,0.0,0.0,-1.0,-1.0,-1.0,11.0,-1.0,26.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.27,24173,86.0,28100.0,0.0,244376.0,22216.0,0.0,0.0,-1.0,86.0,-1.0,244376.0,28100.0,218549.0,246649.0,11.0,29.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,123.0,113.0,26.0,15.0,6.0,6.0,9.0,6.0,20.0,9.0,6.0,11.0,6.0,0.0,3927.0,83.3,100.0
505502,10800,10800,10800.0,36,13.44,366.19,0,debt_consolidation,1,1,0,1,MORTGAGE,88000.0,Source Verified,0,4.0,2.0,0.0,2.0,1.0,5.0,9.0,4.0,30.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,27.2,23070,73.5,31400.0,0.0,549530.0,22897.0,84.0,6576.0,89.0,94.5,138884.0,162502.0,23500.0,164915.0,599509.0,25.0,59.0,1.0,10.0,1.0,3.0,1.0,1.0,5.0,160.0,150.0,9.0,4.0,6.0,9.0,26.0,9.0,28.0,10.0,12.0,25.0,7.0,2.0,1290.0,100.0,96.4
910324,10800,10800,10775.0,60,13.99,251.25,0,credit_card,0,1,0,10,RENT,51500.0,Not Verified,0,0.0,1.0,1.0,-1.0,-1.0,26.0,5.0,-1.0,14.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.18,11167,72.7,15356.0,0.0,11167.0,1861.0,0.0,0.0,-1.0,96.6,-1.0,11167.0,11556.0,0.0,15356.0,8.0,27.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,125.0,211.0,9.0,9.0,3.0,3.0,26.0,3.0,1.0,17.0,8.0,8.0,4.0,0.0,389.0,75.0,100.0


In [16]:
# Standard Scaling 
# loan_amnt, funded_amnt, funded_amnt_inv, term, int_rate, installment, emp_length
# annual_inc, mort_acc, num_tl_op_past_12m, inq_last_6mths, inq_last_12m, inq_fi,
# mths_since_last_delinq, mths_since_recent_inq, mths_since_rcnt_il, mths_since_recent_bc,
# acc_now_delinq, delinq_2yrs, pub_rec, collections_12_mths_ex_med, chargeoff_within_12_mths
# tax_liens, pub_rec_bankruptcies, num_tl_120dpd_2m, num_tl_30dpd, num_tl_90g_dpd_24m, 
# num_accts_ever_120_pd, delinq_amnt, dti, revol_bal, revol_util, total_rev_hi_lim, tot_coll_amt
# tot_cur_bal, avg_cur_bal and the rest. 

# One-hot encoding
# purpose, home_ownership, verification_status, 

# Binary 
# pymnt_plan, initial_list_status, disbursement_method, debt_settlement_flag, application_type

X_train.avg_cur_bal.value_counts(dropna=False)
# X_train.loan_status.info()

avg_cur_bal
0.0        37277
2542.0       177
2442.0       165
2750.0       164
2758.0       161
           ...  
80002.0        1
53364.0        1
57824.0        1
71695.0        1
66912.0        1
Name: count, Length: 68863, dtype: int64

## 4.1 Label

In [17]:
# convert label to numerical value
label_encoder = LabelEncoder()

y_train = label_encoder.fit_transform(y_train)
y_val = label_encoder.transform(y_val)
y_test = label_encoder.transform(y_test)
y_oot1 = label_encoder.transform(y_oot1)
y_oot2 = label_encoder.transform(y_oot2)
y_oot3 = label_encoder.transform(y_oot3)

## 4.2 Features

In [18]:
# get categorical feature and numerical feature
categorical_features = ['term', 'pymnt_plan', 'purpose', 'initial_list_status', 'disbursement_method', 'debt_settlement_flag', 'home_ownership', 'verification_status', 'application_type']

numerical_cols = [col for col in X_train.columns if col not in X_train.columns[X_train.columns.str.startswith(tuple(categorical_features))]]

In [19]:
"""
skewness = df[numerical_cols].apply(skew)
# display skewness values
print(skewness.sort_values(ascending=False))
"""

'\nskewness = df[numerical_cols].apply(skew)\n# display skewness values\nprint(skewness.sort_values(ascending=False))\n'

In [20]:
datasets = {
    "train": X_train,
    "val": X_val,
    "test": X_test,
    "oot1": X_oot1,
    "oot2": X_oot2,
    "oot3": X_oot3,
}

for name, df in datasets.items():
    # one-hot encoding for categorical features
    df = pd.get_dummies(df, columns=categorical_features, drop_first=True)

    # impute missing values of numerical feature with median
    df[numerical_cols] = df[numerical_cols].replace(-1, np.nan)
    df[numerical_cols] = df[numerical_cols].fillna(df[numerical_cols].median())
    # normalization
    scaler = StandardScaler()
    df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

    """
    # handle skewness
    for col in skewness[(skewness > 1) | (skewness < -1)].index:
        if df[col].min() <= 0:
            shift = abs(df[col].min()) + 1
            df[col] = np.log1p(df[col] + shift)
        else:
            df[col] = np.log1p(df[col])
    """

    datasets[name] = df

X_train = datasets["train"]
X_val = datasets["val"]
X_test = datasets["test"]
X_oot1 = datasets["oot1"]
X_oot2 = datasets["oot2"]
X_oot3 = datasets["oot3"]

In [21]:
X_train

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,int_rate,installment,emp_length,annual_inc,mort_acc,num_tl_op_past_12m,inq_last_6mths,inq_last_12m,inq_fi,mths_since_last_delinq,mths_since_recent_inq,mths_since_rcnt_il,mths_since_recent_bc,acc_now_delinq,delinq_2yrs,pub_rec,collections_12_mths_ex_med,chargeoff_within_12_mths,tax_liens,pub_rec_bankruptcies,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_accts_ever_120_pd,delinq_amnt,dti,revol_bal,revol_util,total_rev_hi_lim,tot_coll_amt,tot_cur_bal,avg_cur_bal,all_util,max_bal_bc,il_util,bc_util,total_bal_il,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,tot_hi_cred_lim,open_acc,total_acc,open_acc_6m,open_act_il,open_il_12m,open_il_24m,open_rv_12m,open_rv_24m,acc_open_past_24mths,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,num_actv_bc_tl,num_actv_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_il_tl,num_bc_tl,num_op_rev_tl,num_sats,num_bc_sats,total_cu_tl,bc_open_to_buy,percent_bc_gt_75,pct_tl_nvr_dlq,term_60,pymnt_plan_1,purpose_credit_card,purpose_debt_consolidation,purpose_educational,purpose_home_improvement,purpose_house,purpose_major_purchase,purpose_medical,purpose_moving,purpose_other,purpose_renewable_energy,purpose_small_business,purpose_vacation,purpose_wedding,initial_list_status_1,disbursement_method_1,debt_settlement_flag_1,home_ownership_MORTGAGE,home_ownership_NONE,home_ownership_OTHER,home_ownership_OWN,home_ownership_RENT,verification_status_Source Verified,verification_status_Verified,application_type_1
876556,0.647935,0.648672,0.650416,1.373364,1.242467,1.129274,-0.283767,-0.817885,-1.149385,1.604179,1.217104,2.537603,0.732482,-1.206730,-0.238568,-0.047630,-0.058230,-0.353274,-0.346104,-0.119996,-0.080712,-0.123211,-0.352114,-0.023382,-0.049021,-0.165163,-0.364702,-0.017335,0.089164,-0.505421,0.045257,-0.652551,-0.027347,-0.186677,0.300835,1.017342,0.231190,0.143911,1.402389,2.060808,1.166377,-0.789519,1.346190,-0.353736,-0.995278,0.569190,-0.573187,0.839783,-0.522685,0.710430,-0.593513,-0.284769,-0.444413,0.174463,-1.558740,0.528058,0.524760,-1.078448,-0.996092,-1.397308,-1.007432,2.813288,-1.345804,-1.240952,-0.908340,-1.180080,0.935031,-0.676957,1.619997,-0.257989,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,True,False
1231439,0.104018,0.104643,0.106629,-1.060137,-0.454878,-1.650973,0.053079,-0.287581,-0.595413,-0.650311,-0.845214,-0.857379,-0.112694,1.114747,1.082726,0.486885,-0.058230,-0.353274,-0.346104,-0.119996,-0.080712,-0.123211,-0.352114,-0.023382,-0.049021,-0.165163,-0.364702,-0.017335,-0.838026,-0.656803,-1.752347,-0.252995,-0.027347,0.161351,0.630152,0.273139,-0.631026,-0.086676,-2.041590,-0.136406,-0.468470,-0.378396,-0.122853,0.193765,-0.640672,-0.013902,-0.573187,0.103979,-0.522685,-0.662786,0.153495,-0.284769,-1.068985,0.213106,-0.470576,-0.279421,0.084456,-0.657364,-0.707574,-0.672642,-0.711355,0.923370,-0.723392,-0.619443,-0.567479,-0.533709,0.039268,0.190242,-1.185725,0.635762,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,True
1131787,1.626986,1.627925,1.618356,0.186644,0.944566,-1.372948,1.493488,-0.817885,-1.149385,-0.650311,-0.329634,-0.008633,-0.112694,0.757597,-0.238568,0.046696,-0.058230,-0.353274,-0.346104,-0.119996,-0.080712,-0.123211,-0.352114,-0.023382,-0.049021,-0.165163,-0.364702,-0.017335,-0.614321,0.329065,1.442945,-0.150332,-0.027347,0.664931,0.557318,-1.091232,-0.646165,0.086264,0.996796,-0.218276,3.932281,0.220794,3.947596,0.394855,-0.108764,0.402592,-0.573187,-0.631826,-0.522685,-0.662786,-0.593513,-0.695023,-0.444413,-0.057394,-0.711227,0.701089,0.744912,1.026971,0.157980,-0.551864,0.176874,1.598341,0.313962,-0.412273,-0.056189,0.435848,-0.408614,-0.444803,1.151441,0.635762,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,True,True,False,False
505502,-0.461655,-0.461148,-0.458909,0.072735,-0.297632,-1.372948,0.131647,1.303332,-0.041442,-0.650311,0.185945,-0.008633,-1.803047,0.400446,-0.678999,0.172465,-0.058230,1.953270,-0.346104,-0.119996,-0.080712,-0.123211,-0.352114,-0.023382,-0.049021,-0.165163,-0.364702,-0.017335,0.603470,0.280694,0.938000,-0.058767,-0.027347,2.572944,0.599034,1.513477,0.539022,1.066259,1.299212,3.083164,2.269611,0.019606,2.738928,2.372956,2.373477,2.901555,0.423999,3.047197,0.728949,1.397038,0.153495,-0.284769,0.180159,0.657499,-0.324092,-0.279421,-0.465925,1.026971,1.023534,1.501355,1.065104,2.678294,0.521432,0.830745,2.329835,0.759034,0.487150,-0.606118,1.619997,0.233574,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,True,False,True,False,False,False,False,True,False,False
910324,-0.461655,-0.461148,-0.461628,0.186644,-0.728063,1.129274,-0.346307,-0.817885,-0.595413,0.476934,-0.329634,-0.008633,-0.437762,-0.313855,-0.238568,-0.330608,-0.058230,-0.353274,-0.346104,-0.119996,-0.080712,-0.123211,-0.352114,-0.023382,-0.049021,-0.165163,-0.364702,-0.017335,-0.836588,-0.241292,0.905684,-0.503938,-0.027347,-0.793237,-0.689568,-1.091232,-0.646165,0.086264,1.373926,-0.218276,-0.803648,-0.502782,-0.977511,-0.901753,-0.640672,0.235995,-0.573187,-0.631826,-0.522685,-0.662786,-0.593513,-0.695023,-0.132127,-0.018751,0.314158,-0.279421,0.084456,-0.236281,-0.707574,1.501355,-0.711355,-0.966549,1.973727,0.002066,-0.567479,-0.210523,-0.408614,-0.661235,0.918566,0.635762,True,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
257858,-0.548682,-0.548193,-0.545915,1.739942,-0.248613,0.017175,-0.431422,-0.817885,0.512530,0.476934,0.701525,-0.857379,1.382618,-0.849580,-0.385378,-0.110514,-0.058230,-0.353274,-0.346104,-0.119996,-0.080712,-0.123211,-0.352114,-0.023382,-0.049021,-0.165163,-0.364702,-0.017335,0.649506,-0.528400,-0.350619,-0.622029,0.016653,-0.641974,-0.606688,1.265409,-0.359601,0.893319,0.655245,-0.000698,-0.312365,-0.863871,-0.174527,-0.725862,-0.108764,-0.180499,-0.573187,0.471881,0.728949,1.397038,0.900503,1.356243,1.117017,-0.849574,-1.025121,-0.337098,-0.025620,-0.657364,-0.130538,0.414356,-0.119202,-0.426572,-0.308450,0.002066,-0.056189,-0.533709,-0.408614,-0.637316,0.685691,0.133027,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,True,False
1413575,1.191852,1.192702,1.194202,1.131049,0.832745,0.573224,3.824331,-0.287581,1.066501,0.476934,1.217104,-0.008633,-2.063101,-1.206730,-0.630063,-0.393492,14.020933,0.799998,1.402225,-0.119996,-0.080712,2.506487,-0.352114,-0.023382,17.912833,-0.165163,-0.364702,0.409169,-0.371914,0.950642,0.913763,0.529467,-0.027347,2.306057,1.584720,1.234401,2.505501,0.316851,0.637456,0.411557,1.274699,1.073656,0.359226,2.286513,0.423145,0.235995,1.421185,0.839783,1.980584,1.397038,0.900503,0.945990,0.804731,1.121214,1.213986,-0.683161,-0.686077,-0.236281,-0.130538,-0.189532,-0.119202,0.518387,-0.100980,-0.205104,0.284672,0.112663,-0.408614,0.015407,-0.063436,-0.224473,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,True,False,True,False,False,False,False,True,False,False
130569,0.104018,0.104643,0.106629,-0.973152,0.219454,0.017175,0.314972,-0.817885,1.620473,-0.650311,0.185945,-0.008633,-1.868060,0.043296,0.299737,-0.582145,-0.058230,1.953270,-0.346104,-0.119996,9.398409,-0.123211,-0.352114,-0.023382,-0.049021,1.894167,1.140348,-0.017335,-0.521530,0.028494,0.715824,-0.222474,-0.021320,-0.707401,-0.664943,1.017342,0.288683,-1.009024,0.569857,-0.661214,-0.524865,-0.185955,-0.652527,-0.764044,-0.108764,0.569190,0.423999,-0.263923,-0.522685,0.023822,3.141526,1.766497,0.804731,0.019892,0.607125,-0.452452,-0.245773,1.026971,1.312052,1.742910,1.361181,-0.696561,2.181198,0.416405,-0.056189,2.374962,0.039268,-0.493375,0.217136,-2.268928,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,True,False,True,False,False,False,False,True,False,False
670483,2.170903,2.171955,2.173018,0.928085,1.628784,0.017175,1.205406,-0.817885,-1.149385,-0.650311,-0.329634,-0.008633,-0.112694,-0.492430,-0.238568,0.801305,-0.058230,-0.353274,1.402225,-0.119996,-0.080712,-0.123211,2.393425,-0.023382,-0.049021,-0.165163,-0.364702,-0.017335,-0.656760,-0.336893,0.699666,-0.560986,-0.027347,0.097418,1.078370,-1.091232,-0.646165,0.086264,-2.002454,-0.218276,2.089076,-0.907607,2.536357,-0.039172,-1.172581,0.319293,-0.573187,-0.631826,-0.522685,-0.662786,-0.593513,-0.695023,-1.381271,1.333750,-0.020662,1.162506,2.836356,-1.078448,-0.996092,-0.672642,-1.007432,1.598341,-0.515921,-1.033783,-1.078770,-1.180080,-0.408614,-0.546718,-1.185725,0.635762,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,True,False,True,False,False,False,False,True,False,False


# 5. Model training and evaluation

## 5.1 Base Model

### 5.1.1 Training

In [22]:
model_lr = LinearRegression()
model_lr.fit(X_train, y_train)

joblib.dump(model_lr, 'linear_regression_base.pkl')

['linear_regression_base.pkl']

### 5.1.2 Evaluation

In [23]:
def evaluate_linear_model(model, X, y_true, dataset_name):
    y_pred_continuous = model.predict(X)
    # round to nearest class label
    y_pred = np.round(y_pred_continuous).astype(int)
    # clip to valid range
    y_pred = np.clip(y_pred, y_true.min(), y_true.max())

    f1 = f1_score(y_true, y_pred, average='weighted')
    precision = precision_score(y_true, y_pred, average=None, zero_division=0)
    recall = recall_score(y_true, y_pred, average=None, zero_division=0)
    cm = confusion_matrix(y_true, y_pred)

    print(f"\n{dataset_name} Evaluation:")
    print(f"F1 Score: {f1:.4f}")
    print(f"Per-class Precision: {precision}")
    print(f"Per-class Recall: {recall}")
    print("Confusion Matrix:\n", cm)

    return {
        "f1": f1,
        "precision": precision,
        "recall": recall,
        "confusion_matrix": cm
    }

In [24]:
# evaluate
results = {}
for name, X, y in [
    ('Train', X_train, y_train),
    ('Validation', X_val, y_val),
    ('Test', X_test, y_test),
    ('OOT1', X_oot1, y_oot1),
    ('OOT2', X_oot2, y_oot2),
    ('OOT3', X_oot3, y_oot3),
]:
    results[name] = evaluate_linear_model(model_lr, X, y, name)


Train Evaluation:
F1 Score: 0.8387
Per-class Precision: [0.92644688 0.88472559 0.85869541 0.77794385 0.57113286 0.32341045
 0.41171453]
Per-class Recall: [0.92753477 0.87480839 0.89467921 0.77696087 0.45783471 0.36746324
 0.45880862]
Confusion Matrix:
 [[210901  16473      4      0      0      0      0]
 [ 16499 304757  27095     18      1      0      0]
 [   117  23234 304751  12521      3      0      0]
 [    82      1  22850 132203  15017      1      0]
 [    31      0    193  24716  32558  13615      0]
 [    11      0      7    414   9194   7996   4138]
 [     4      0      0     67    233   3112   2896]]

Validation Evaluation:
F1 Score: 0.8378
Per-class Precision: [0.92786253 0.88445691 0.85690612 0.77510403 0.5701495  0.33099924
 0.40900066]
Per-class Recall: [0.92771035 0.87505196 0.89421097 0.77721046 0.45446212 0.3646789
 0.45109489]
Confusion Matrix:
 [[45250  3526     0     0     0     0     0]
 [ 3457 65257  5856     5     0     0     0]
 [   25  4998 65154  2685     0  

In [25]:
# SHAP
explainer = shap.LinearExplainer(model_lr, X_train, feature_perturbation="interventional")
shap_values = explainer.shap_values(X_train)



In [26]:
# top 5 features
mean_abs_shap = np.abs(shap_values).mean(axis=0)
shap_importance = pd.Series(mean_abs_shap, index=X_train.columns)
top_5_features = shap_importance.sort_values(ascending=False).head(5)

print("Top 5 important features:\n", top_5_features)

Top 5 important features:
 int_rate           0.915455
funded_amnt_inv    0.635894
funded_amnt        0.429422
loan_amnt          0.221292
num_sats           0.046974
dtype: object


## 5.2 Lasso Regression

### 5.2.1 Training

In [23]:
lasso = LassoCV(alphas=[0.001, 0.01, 0.1, 1], cv=5)
lasso.fit(X_train, y_train)

joblib.dump(model_lr, 'linear_regression_lasso.pkl')

['linear_regression_lasso.pkl']

### 5.2.2 Evaluation

In [28]:
# evaluate
results = {}
for name, X, y in [
    ('Train', X_train, y_train),
    ('Validation', X_val, y_val),
    ('Test', X_test, y_test),
    ('OOT1', X_oot1, y_oot1),
    ('OOT2', X_oot2, y_oot2),
    ('OOT3', X_oot3, y_oot3),
]:
    results[name] = evaluate_linear_model(lasso, X, y, name)


Train Evaluation:
F1 Score: 0.8385
Per-class Precision: [0.92528375 0.88693238 0.85834631 0.77702659 0.56838052 0.31584291
 0.40656246]
Per-class Recall: [0.92823404 0.87474237 0.89679003 0.77543284 0.45251923 0.3573989
 0.45738276]
Confusion Matrix:
 [[211060  16318      0      0      0      0      0]
 [ 16798 304734  26838      0      0      0      0]
 [   117  22510 305470  12529      0      0      0]
 [    82     20  23167 131943  14942      0      0]
 [    31      0    341  24839  32180  13722      0]
 [    11      0     66    393   9299   7777   4214]
 [     4      0      0    101    196   3124   2887]]

Validation Evaluation:
F1 Score: 0.8377
Per-class Precision: [0.92637133 0.88686891 0.85672687 0.77425629 0.56578839 0.32430385
 0.4064772 ]
Per-class Recall: [0.92861243 0.87459604 0.89643436 0.77589283 0.4485039  0.35696414
 0.44890511]
Confusion Matrix:
 [[45294  3482     0     0     0     0     0]
 [ 3538 65223  5814     0     0     0     0]
 [   26  4832 65316  2688     0  

In [29]:
# SHAP
explainer = shap.LinearExplainer(lasso, X_train, feature_perturbation="interventional")
shap_values = explainer.shap_values(X_train)



In [30]:
# top 5 features
mean_abs_shap = np.abs(shap_values).mean(axis=0)
shap_importance = pd.Series(mean_abs_shap, index=X_train.columns)
top_5_features = shap_importance.sort_values(ascending=False).head(5)

print("Top 5 important features:\n", top_5_features)

Top 5 important features:
 int_rate              0.915712
term_60               0.040244
num_tl_op_past_12m    0.021615
all_util              0.018192
open_rv_12m           0.014928
dtype: object


## 5.3 Ridge

### 5.3.1 Training

In [24]:
alphas = [0.001, 0.01, 0.1, 1, 10]

# initialize and fit RidgeCV
ridge = RidgeCV(alphas=alphas, cv=5)
ridge.fit(X_train, y_train)

joblib.dump(model_lr, 'linear_regression_ridge.pkl')

['linear_regression_ridge.pkl']

### 5.3.2 Evaluation

In [32]:
# evaluate
results = {}
for name, X, y in [
    ('Train', X_train, y_train),
    ('Validation', X_val, y_val),
    ('Test', X_test, y_test),
    ('OOT1', X_oot1, y_oot1),
    ('OOT2', X_oot2, y_oot2),
    ('OOT3', X_oot3, y_oot3),
]:
    results[name] = evaluate_linear_model(ridge, X, y, name)


Train Evaluation:
F1 Score: 0.8387
Per-class Precision: [0.92645844 0.88473497 0.85869375 0.77795693 0.57116792 0.32347826
 0.41183163]
Per-class Recall: [0.92752597 0.87482275 0.89470269 0.7769785  0.45780659 0.36755515
 0.45880862]
Confusion Matrix:
 [[210899  16475      4      0      0      0      0]
 [ 16496 304762  27093     18      1      0      0]
 [   117  23229 304759  12518      3      0      0]
 [    82      1  22852 132206  15012      1      0]
 [    31      0    195  24717  32556  13614      0]
 [    11      0      7    414   9194   7998   4136]
 [     4      0      0     67    233   3112   2896]]

Validation Evaluation:
F1 Score: 0.8378
Per-class Precision: [0.92784351 0.8844689  0.85692304 0.77516769 0.5701848  0.33099924
 0.40900066]
Per-class Recall: [0.92771035 0.87505196 0.89425215 0.77721046 0.4545276  0.3646789
 0.45109489]
Confusion Matrix:
 [[45250  3526     0     0     0     0     0]
 [ 3458 65257  5855     5     0     0     0]
 [   25  4997 65157  2683     0  

In [33]:
# SHAP
explainer = shap.LinearExplainer(ridge, X_train, feature_perturbation="interventional")
shap_values = explainer.shap_values(X_train)



In [34]:
# top 5 features
mean_abs_shap = np.abs(shap_values).mean(axis=0)
shap_importance = pd.Series(mean_abs_shap, index=X_train.columns)
top_5_features = shap_importance.sort_values(ascending=False).head(5)

print("Top 5 important features:\n", top_5_features)

Top 5 important features:
 int_rate           0.915419
funded_amnt_inv    0.629355
funded_amnt        0.418893
loan_amnt          0.225081
num_sats            0.04701
dtype: object
