In [17]:
import numpy as np
import pandas as pd
import scorecardpy as sc
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

In [3]:
# Load the (already imputed) loan-default dataset from the working directory.
data_path = 'Loan_Default_imputed.csv'
df = pd.read_csv(data_path)

In [4]:
# Cast the target column to plain integers before binning.
df['Status'] = df['Status'].astype(int)

In [5]:
# Build WoE bins for every column of `df` against the target column.
# The result is consumed further down as a mapping of variable name -> bin table.
target_column = 'Status'
bins = sc.woebin(df, y=target_column)

[INFO] creating woe binning ...


  dat.loc[:,y] = dat[y].apply(lambda x: x if pd.isnull(x) else int(x)) #dat[y].astype(int)
  datetime_cols = dat.apply(pd.to_numeric,errors='ignore').select_dtypes(object).apply(pd.to_datetime,errors='ignore').select_dtypes('datetime64').columns.tolist()
  datetime_cols = dat.apply(pd.to_numeric,errors='ignore').select_dtypes(object).apply(pd.to_datetime,errors='ignore').select_dtypes('datetime64').columns.tolist()
  datetime_cols = dat.apply(pd.to_numeric,errors='ignore').select_dtypes(object).apply(pd.to_datetime,errors='ignore').select_dtypes('datetime64').columns.tolist()
  datetime_cols = dat.apply(pd.to_numeric,errors='ignore').select_dtypes(object).apply(pd.to_datetime,errors='ignore').select_dtypes('datetime64').columns.tolist()
  datetime_cols = dat.apply(pd.to_numeric,errors='ignore').select_dtypes(object).apply(pd.to_datetime,errors='ignore').select_dtypes('datetime64').columns.tolist()
  datetime_cols = dat.apply(pd.to_numeric,errors='ignore').select_dtypes(object).apply(pd

Binning on 143942 rows and 31 columns in 00:00:28


In [6]:
# Replace each raw value with the WoE value of the bin it falls into;
# the transformed columns carry a `_woe` suffix.
df_woe = sc.woebin_ply(dt=df, bins=bins)

[INFO] converting into woe values ...
Woe transformating on 143942 rows and 30 columns in 00:00:15


In [13]:
# Inspect the WoE-transformed frame (pandas shows a truncated first/last-rows view).
df_woe

Unnamed: 0,Status,business_or_commercial_woe,loan_purpose_woe,rate_of_interest_woe,loan_limit_woe,occupancy_type_woe,Gender_woe,Secured_by_woe,approv_in_adv_woe,loan_type_woe,...,loan_amount_woe,construction_type_woe,open_credit_woe,total_units_woe,age_woe,submission_of_application_woe,co-applicant_credit_type_woe,Upfront_charges_woe,term_woe,Credit_Score_woe
0,1,-0.084870,0.094115,1.389952,-0.029674,-0.018941,0.191430,0.0,0.035346,-0.099372,...,0.451443,0.0,0.0,0.0,-0.128688,0.199346,-0.363815,0.478947,-0.025679,-0.001903
1,1,0.465009,0.094115,-0.017534,-0.029674,-0.018941,0.065339,0.0,0.035346,0.465009,...,-0.032819,0.0,0.0,0.0,0.071628,0.199346,0.306957,0.478947,-0.025679,0.029713
2,0,-0.084870,0.094115,0.286010,-0.029674,-0.018941,0.065339,0.0,-0.198225,-0.099372,...,-0.183942,0.0,0.0,0.0,-0.128688,0.199346,-0.363815,-0.219889,-0.025679,0.027824
3,0,-0.084870,-0.090582,0.286010,-0.029674,-0.018941,0.065339,0.0,0.035346,-0.099372,...,-0.183942,0.0,0.0,0.0,-0.028398,-0.427978,-0.363815,0.478947,-0.025679,0.029713
4,0,-0.084870,0.094115,-0.017534,-0.029674,-0.018941,-0.319219,0.0,-0.198225,-0.099372,...,0.149209,0.0,0.0,0.0,-0.128688,-0.427978,0.306957,-1.877887,-0.025679,-0.022570
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
143937,0,-0.084870,0.022735,-1.103975,-0.029674,-0.018941,0.191430,0.0,0.035346,-0.099372,...,-0.183942,0.0,0.0,0.0,0.071628,0.199346,0.306957,-0.943607,-0.038733,-0.022570
143938,0,-0.084870,0.094115,-2.327266,-0.029674,0.236439,0.065339,0.0,0.035346,-0.099372,...,-0.295659,0.0,0.0,0.0,-0.128688,-0.427978,-0.363815,-1.877887,-0.025679,0.029713
143939,0,-0.084870,-0.090582,-1.103975,-0.029674,-0.018941,0.065339,0.0,0.035346,-0.099372,...,-0.183942,0.0,0.0,0.0,-0.028398,-0.427978,0.306957,-0.219889,-0.038733,-0.022570
143940,0,-0.084870,-0.090582,-1.103975,-0.029674,-0.018941,0.065339,0.0,0.035346,-0.099372,...,0.165988,0.0,0.0,0.0,0.071628,0.199346,0.306957,0.478947,-0.038733,-0.022570


In [7]:
# Collect one information value (IV) per variable: each bin table repeats the
# variable's total IV on every row, so the first row is enough.
iv_values = {}
for var_name, bin_table in bins.items():
    iv_values[var_name] = bin_table['total_iv'].iloc[0]

# Two-column summary frame: variable name and its IV.
iv_records = list(iv_values.items())
iv_df = pd.DataFrame(iv_records, columns=['Variable', 'IV'])


In [8]:
# Rank the variables from highest to lowest information value.
iv_df.sort_values('IV', ascending=False)

Unnamed: 0,Variable,IV
7,credit_type,4.657177
2,rate_of_interest,0.619766
21,Interest_rate_spread,0.231496
22,income,0.154297
12,dtir1,0.141857
14,co-applicant_credit_type,0.110648
27,submission_of_application,0.084715
20,Neg_ammortization,0.083731
29,LTV,0.062206
10,loan_amount,0.04066


In [9]:
# Keep only the variables whose IV reaches the selection threshold.
iv_threshold = 0.03
features = iv_df.loc[iv_df['IV'] >= iv_threshold, 'Variable'].tolist()

After calculating the information value (IV) of each variable, we conclude that the features 'business_or_commercial', 'rate_of_interest', 'Gender', 'loan_type', 'income', 'Neg_ammortization', 'dtir1', 'credit_type', 'property_value', 'Interest_rate_spread', 'LTV', 'loan_amount', 'submission_of_application', 'co-applicant_credit_type', and 'Upfront_charges' have sufficient predictive power (IV >= 0.03) and will be used to train the models.

In [10]:
# Carry the target column along with the selected features.
# Guarded so that re-running this cell does not append 'Status' a second time,
# which would produce duplicate target columns when indexing `df[features]`.
if 'Status' not in features:
    features.append('Status')

In [11]:
# Restrict the raw frame to the selected features plus the target, then preview it.
df_learn = df.loc[:, features]
df_learn.head()

Unnamed: 0,rate_of_interest,credit_type,Gender,loan_amount,dtir1,co-applicant_credit_type,property_value,Neg_ammortization,Interest_rate_spread,income,business_or_commercial,loan_type,submission_of_application,LTV,Status
0,4.199231,EXP,Sex Not Available,116500.0,45.0,CIB,118000.0,not_neg,0.522692,1740.0,nob/c,type1,to_inst,98.728814,1
1,3.94,EQUI,Male,206500.0,43.615385,EXP,228769.230769,not_neg,1.006962,4980.0,b/c,type2,to_inst,87.282005,1
2,4.56,EXP,Male,406500.0,46.0,CIB,508000.0,neg_amm,0.2,9480.0,nob/c,type1,to_inst,80.019685,0
3,4.25,EXP,Male,456500.0,42.0,CIB,658000.0,not_neg,0.681,11880.0,nob/c,type1,not_inst,69.3769,0
4,4.0,CRIF,Joint,696500.0,39.0,EXP,758000.0,not_neg,0.3042,10440.0,nob/c,type1,not_inst,91.886544,0


In [12]:
# Split the modelling columns by dtype. The target 'Status' is numeric, so it
# appears in `numerical_columns`; it is deliberately kept out of the scaler below.
numerical_columns = df_learn.select_dtypes(include=[np.number]).columns.to_list()
categorical_columns = df_learn.select_dtypes(include=['object']).columns.tolist()
feature_numeric_columns = [col for col in numerical_columns if col != 'Status']

# One-hot encode the categorical features; pandas output keeps column names and index.
oh = OneHotEncoder(sparse_output=False, handle_unknown='ignore').set_output(transform='pandas')
encoded = oh.fit_transform(df_learn[categorical_columns])

# Named `scaler` rather than `sc`: `sc` is the alias of the scorecardpy module
# imported at the top, and rebinding it would break the WoE cells on re-run.
scaler = RobustScaler()
scaled = pd.DataFrame(
    scaler.fit_transform(df_learn[feature_numeric_columns]),
    columns=feature_numeric_columns,
    index=df_learn.index,  # keep row alignment with `encoded` in the concat below
)

# Features first, untouched target last (same column order as before).
df_enc_scal = pd.concat([encoded, scaled, df_learn[['Status']]], axis=1)

In [13]:
# Preview the first five rows of the encoded and scaled frame.
df_enc_scal.head(n=5)

Unnamed: 0,credit_type_CIB,credit_type_CRIF,credit_type_EQUI,credit_type_EXP,Gender_Female,Gender_Joint,Gender_Male,Gender_Sex Not Available,co-applicant_credit_type_CIB,co-applicant_credit_type_EXP,...,submission_of_application_not_inst,submission_of_application_to_inst,rate_of_interest,loan_amount,dtir1,property_value,Interest_rate_spread,income,LTV,Status
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,1.0,0.318501,-0.75,0.545455,-0.882353,0.16392,-0.857143,0.984135,1.0
1,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,-0.076112,-0.375,0.41958,-0.556561,0.90655,-0.155844,0.506653,1.0
2,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.867681,0.458333,0.636364,0.264706,-0.330931,0.818182,0.203719,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.395785,0.666667,0.272727,0.705882,0.406686,1.337662,-0.240224,0.0
4,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.015222,1.666667,0.0,1.0,-0.171139,1.025974,0.698722,0.0


In [14]:
# Separate the design matrix from the target vector.
X = df_enc_scal.drop(columns=['Status'])
y = df_enc_scal['Status']

In [None]:
# Tune the regularisation strength and penalty type of a logistic regression
# with 5-fold cross-validation on (X, y).
# `liblinear` is used because the default solver does not support the 'l1'
# penalty that is part of the search grid.
logistic_model = LogisticRegression(max_iter=300, solver='liblinear')

# The estimator is passed to GridSearchCV directly (not wrapped in a Pipeline),
# so parameter names carry no `<step>__` prefix.
param_grid = {
    'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
}
grid_search_logistic = GridSearchCV(logistic_model, param_grid, cv=5)
grid_search_logistic.fit(X, y)

# Report the results of the search object that was actually fitted above.
print("Best Parameters:", grid_search_logistic.best_params_)
print("Best Score:", grid_search_logistic.best_score_)