In [1]:
import numpy as np
import pandas as pd
import datetime
from sklearn.utils import resample
from typing import Dict
from transformers.columnSelectorTransformer import ColumnSelectorTransformer
from transformers.woeTransformer import WOETransformer
from transformers.binningTransformer import BinningTransformer

In [2]:
from pd_modeling.bins import bins

In [3]:
# Load the raw credit-risk extract. low_memory=False makes pandas read the
# file in one pass instead of in chunks, which avoids mixed-dtype inference.
dataset = pd.read_csv(
    filepath_or_buffer='../files/credit_risk_data_v2.csv',
    low_memory=False,
)

In [4]:
# Columns retained from the raw dataset. 'status' is the target: it is used as
# `y` for undersampling and split off from the features before binning.
cols_to_keep = [
    'loan_amnt',
    'funded_amnt',
    'term',
    'int_rate',
    'installment',
    'grade',
    'sub_grade',
    'emp_length',
    'home_ownership',
    'annual_inc',
    'verification_status',
    'purpose',
    'addr_state',
    'dti',
    'revol_bal',
    'revol_util',
    'total_acc',
    'initial_list_status',
    'out_prncp',
    'total_pymnt',
    'total_rec_prncp',
    'total_rec_int',
    'total_rec_late_fee',
    'last_pymnt_amnt',
    'tot_coll_amt',
    'tot_cur_bal',
    'total_rev_hi_lim',
    'status',
]

In [5]:
# Instantiate the three pipeline steps used in the cells below: column
# selection (also used for cleaning and undersampling), binning driven by the
# imported `bins` definition, and Weight-of-Evidence encoding.
column_t = ColumnSelectorTransformer(columns=cols_to_keep)
binning_t = BinningTransformer(bins=bins)
# NOTE(review): cols_to_keep still contains the target 'status', which is
# dropped from the feature frame before fit/transform. Confirm WOETransformer
# ignores columns that are absent from X, or pass a feature-only list here.
woe_t = WOETransformer(columns=cols_to_keep)

In [6]:
# Run the column selector on the raw frame and preview the first rows.
# Presumably this restricts the frame to `cols_to_keep` -- confirm in
# ColumnSelectorTransformer.transform.
dataset_c = column_t.transform(dataset)
dataset_c.head()

Unnamed: 0,loan_amnt,funded_amnt,term,int_rate,installment,grade,sub_grade,emp_length,home_ownership,annual_inc,...,out_prncp,total_pymnt,total_rec_prncp,total_rec_int,total_rec_late_fee,last_pymnt_amnt,tot_coll_amt,tot_cur_bal,total_rev_hi_lim,status
0,5000,5000,36 months,10.65,162.87,B,B2,10+ years,RENT,24000.0,...,0.0,5861.071414,5000.0,861.07,0.0,171.62,,,,0
1,2500,2500,60 months,15.27,59.83,C,C4,< 1 year,RENT,30000.0,...,0.0,1008.71,456.46,435.17,0.0,119.66,,,,1
2,2400,2400,36 months,15.96,84.33,C,C5,10+ years,RENT,12252.0,...,0.0,3003.653644,2400.0,603.65,0.0,649.91,,,,0
3,10000,10000,36 months,13.49,339.31,C,C1,10+ years,RENT,49200.0,...,0.0,12226.30221,10000.0,2209.33,16.97,357.48,,,,0
4,3000,3000,60 months,12.69,67.79,B,B5,1 year,RENT,80000.0,...,766.9,3242.17,2233.1,1009.07,0.0,67.79,,,,0


In [7]:
# Clean the selected columns and preview the result. In the recorded output the
# preview now starts at index 42535 and the tot_* columns are populated, so
# rows appear to have been dropped without resetting the index -- presumably
# rows with missing values; confirm in ColumnSelectorTransformer.clean.
# NOTE(review): this overwrites `dataset_c` with a value derived from itself,
# so the cell depends on the previous cell having run exactly once before it.
dataset_c = column_t.clean(X=dataset_c)
dataset_c.head()

Unnamed: 0,loan_amnt,funded_amnt,term,int_rate,installment,grade,sub_grade,emp_length,home_ownership,annual_inc,...,out_prncp,total_pymnt,total_rec_prncp,total_rec_int,total_rec_late_fee,last_pymnt_amnt,tot_coll_amt,tot_cur_bal,total_rev_hi_lim,status
42535,27050,27050,36 months,10.99,885.46,B,B2,10+ years,OWN,55000.0,...,10018.9,21251.04,17031.1,4219.94,0.0,885.46,0.0,114834.0,59900.0,0
42536,9750,9750,36 months,13.98,333.14,C,C1,1 year,RENT,26000.0,...,3710.96,7994.83,6039.04,1955.79,0.0,333.14,0.0,14123.0,15100.0,0
42537,12000,12000,36 months,6.62,368.45,A,A2,10+ years,MORTGAGE,105000.0,...,4266.62,8842.8,7733.38,1109.42,0.0,368.45,0.0,267646.0,61100.0,0
42538,12000,12000,36 months,13.53,407.4,B,B5,10+ years,RENT,40000.0,...,0.0,13359.77,11999.99,1359.78,0.0,119.17,15386.0,13605.0,8100.0,0
42539,15000,15000,36 months,8.9,476.3,A,A5,2 years,MORTGAGE,63000.0,...,5449.27,11431.2,9550.73,1880.47,0.0,476.3,1514.0,272492.0,15400.0,0


In [8]:
# Undersample using 'status' as the class label, then preview the result
# (the recorded output shows a fresh 0-based index).
# NOTE(review): no seed is passed here. If undersampling draws rows at random,
# the numbers in the cells below will differ between runs -- confirm whether
# ColumnSelectorTransformer.undersampling fixes a random_state internally.
dataset_c = column_t.undersampling(X=dataset_c, y='status')
dataset_c.head()

Unnamed: 0,loan_amnt,funded_amnt,term,int_rate,installment,grade,sub_grade,emp_length,home_ownership,annual_inc,...,out_prncp,total_pymnt,total_rec_prncp,total_rec_int,total_rec_late_fee,last_pymnt_amnt,tot_coll_amt,tot_cur_bal,total_rev_hi_lim,status
0,25000,25000,60 months,14.3,585.61,C,C1,10+ years,MORTGAGE,60000.0,...,16678.72,14640.25,8321.28,6318.97,0.0,585.61,0.0,104064.0,30500.0,0
1,25000,25000,60 months,14.98,594.49,C,C3,7 years,MORTGAGE,82000.0,...,17529.68,13673.27,7470.32,6202.95,0.0,594.49,0.0,254150.0,63200.0,0
2,10000,10000,36 months,18.55,364.29,D,D2,10+ years,RENT,89000.0,...,0.0,10472.83,9999.99,472.84,0.0,51.38,0.0,60834.0,4900.0,0
3,5000,5000,36 months,7.9,156.46,A,A4,3 years,OWN,80000.0,...,0.0,5573.942773,5000.0,573.94,0.0,1346.07,0.0,19319.0,46400.0,0
4,6000,6000,36 months,14.3,205.95,C,C1,8 years,RENT,75000.0,...,2111.65,5148.0,3888.35,1259.65,0.0,205.95,0.0,30785.0,21700.0,0


In [9]:
# Sanity check after cleaning and undersampling: row count, dtypes and
# non-null counts per column.
dataset_c.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75362 entries, 0 to 75361
Data columns (total 28 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   loan_amnt            75362 non-null  int64  
 1   funded_amnt          75362 non-null  int64  
 2   term                 75362 non-null  object 
 3   int_rate             75362 non-null  float64
 4   installment          75362 non-null  float64
 5   grade                75362 non-null  object 
 6   sub_grade            75362 non-null  object 
 7   emp_length           75362 non-null  object 
 8   home_ownership       75362 non-null  object 
 9   annual_inc           75362 non-null  float64
 10  verification_status  75362 non-null  object 
 11  purpose              75362 non-null  object 
 12  addr_state           75362 non-null  object 
 13  dti                  75362 non-null  float64
 14  revol_bal            75362 non-null  int64  
 15  revol_util           75362 non-null 

In [10]:
# Separate the target from the features: `status` becomes the label vector and
# every remaining column stays in the feature frame.
y_train = dataset_c["status"]
x_train_c = dataset_c.drop(columns="status")
x_train_c.head()

Unnamed: 0,loan_amnt,funded_amnt,term,int_rate,installment,grade,sub_grade,emp_length,home_ownership,annual_inc,...,initial_list_status,out_prncp,total_pymnt,total_rec_prncp,total_rec_int,total_rec_late_fee,last_pymnt_amnt,tot_coll_amt,tot_cur_bal,total_rev_hi_lim
0,25000,25000,60 months,14.3,585.61,C,C1,10+ years,MORTGAGE,60000.0,...,w,16678.72,14640.25,8321.28,6318.97,0.0,585.61,0.0,104064.0,30500.0
1,25000,25000,60 months,14.98,594.49,C,C3,7 years,MORTGAGE,82000.0,...,f,17529.68,13673.27,7470.32,6202.95,0.0,594.49,0.0,254150.0,63200.0
2,10000,10000,36 months,18.55,364.29,D,D2,10+ years,RENT,89000.0,...,w,0.0,10472.83,9999.99,472.84,0.0,51.38,0.0,60834.0,4900.0
3,5000,5000,36 months,7.9,156.46,A,A4,3 years,OWN,80000.0,...,f,0.0,5573.942773,5000.0,573.94,0.0,1346.07,0.0,19319.0,46400.0
4,6000,6000,36 months,14.3,205.95,C,C1,8 years,RENT,75000.0,...,f,2111.65,5148.0,3888.35,1259.65,0.0,205.95,0.0,30785.0,21700.0


In [11]:
# Bin the feature frame with the predefined `bins`. In the recorded output the
# numeric columns become interval labels such as "(12.0,15.0)", while columns
# such as term and grade keep their original values.
x_train_b = binning_t.transform(x_train_c)
x_train_b.head()

Unnamed: 0,loan_amnt,funded_amnt,term,int_rate,installment,grade,sub_grade,emp_length,home_ownership,annual_inc,...,initial_list_status,out_prncp,total_pymnt,total_rec_prncp,total_rec_int,total_rec_late_fee,last_pymnt_amnt,tot_coll_amt,tot_cur_bal,total_rev_hi_lim
0,"(18000.0,inf)","(20429.0,25286.0)",60 months,"(12.0,15.0)","(486.0,717.0)",C,C1,10+ years,MORTGAGE,"(7000.0,178500.0)",...,w,"(16080.0,inf)","(13786.0,25000.0)","(5833.0,11667.0)","(6139.0,9208.0)","(-inf,2.0)","(100.0,6007.0)","(-inf,3000.0)","(-inf,500000.0)","(20000.0,40000.0)"
1,"(18000.0,inf)","(20429.0,25286.0)",60 months,"(12.0,15.0)","(486.0,717.0)",C,C3,7 years,MORTGAGE,"(7000.0,178500.0)",...,f,"(16080.0,inf)","(7000.0,13786.0)","(5833.0,11667.0)","(6139.0,9208.0)","(-inf,2.0)","(100.0,6007.0)","(-inf,3000.0)","(-inf,500000.0)","(60000.0,inf)"
2,"(9500.0,18000.0)","(5857.0,10714.0)",36 months,"(17.0,20.0)","(254.0,486.0)",D,D2,10+ years,RENT,"(7000.0,178500.0)",...,w,"(-inf,2000.0)","(7000.0,13786.0)","(5833.0,11667.0)","(-inf,1000)","(-inf,2.0)","(-inf,100.0)","(-inf,3000.0)","(-inf,500000.0)","(-inf,20000.0)"
3,"(1000.0,9500.0)","(1000.0,5857.0)",36 months,"(6.0,9.0)","(23.0,254.0)",A,A4,3 years,OWN,"(7000.0,178500.0)",...,f,"(-inf,2000.0)","(-inf,7000)","(2000.0,5833.0)","(-inf,1000)","(-inf,2.0)","(100.0,6007.0)","(-inf,3000.0)","(-inf,500000.0)","(40000.0,60000.0)"
4,"(1000.0,9500.0)","(5857.0,10714.0)",36 months,"(12.0,15.0)","(23.0,254.0)",C,C1,8 years,RENT,"(7000.0,178500.0)",...,f,"(2000.0,8040.0)","(-inf,7000)","(2000.0,5833.0)","(1000.0,3069.0)","(-inf,2.0)","(100.0,6007.0)","(-inf,3000.0)","(-inf,500000.0)","(20000.0,40000.0)"


In [12]:
# Fit the WoE encoder on the binned features against the target. The value
# displayed below is the repr of the fitted WOETransformer.
woe_t.fit(x_train_b, y_train)


WOETransformer(columns=['loan_amnt', 'funded_amnt', 'term', 'int_rate', 'installment', 'grade', 'sub_grade', 'emp_length', 'home_ownership', 'annual_inc', 'verification_status', 'purpose', 'addr_state', 'dti', 'revol_bal', 'revol_util', 'total_acc', 'initial_list_status', 'out_prncp', 'total_pymnt', 'total_rec_prncp', 'total_rec_int', 'total_rec_late_fee', 'last_pymnt_amnt', 'tot_coll_amt', 'tot_cur_bal', 'total_rev_hi_lim', 'status'], target_mappings={0: 'good', 1: 'bad'})

In [13]:
# Check how the observations are distributed across the total_rev_hi_lim bins.
x_train_b["total_rev_hi_lim"].value_counts()

total_rev_hi_lim
(-inf,20000.0)       34132
(20000.0,40000.0)    25836
(40000.0,60000.0)     9313
(60000.0,inf)         6081
Name: count, dtype: int64

In [14]:

# Replace every bin/category with its fitted WoE value and preview the result.
# NOTE(review): the recorded output starts with one list of categories per
# column, which looks like leftover debug printing inside
# WOETransformer.transform -- confirm and remove it there if unintended.
woe_t.transform(x_train_b).head()

['(18000.0,inf)', '(9500.0,18000.0)', '(1000.0,9500.0)', '(-inf,1000.0)']
['(25286.0,inf)', '(15571.0,20429.0)', '(10714.0,15571.0)', '(20429.0,25286.0)', '(5857.0,10714.0)', '(1000.0,5857.0)', '(-inf,1000.0)']
[' 60 months', ' 36 months']
['(20.0,inf)', '(17.0,20.0)', '(15.0,17.0)', '(12.0,15.0)', '(9.0,12.0)', '(-inf,6.0)', '(6.0,9.0)']
['(948.0,inf)', '(486.0,717.0)', '(254.0,486.0)', '(717.0,948.0)', '(23.0,254.0)']
['G', 'F', 'E', 'D', 'C', 'B', 'A']
['G1', 'F5', 'G2', 'G5', 'G3', 'F3', 'F4', 'E5', 'F2', 'E4', 'F1', 'G4', 'E3', 'E2', 'D5', 'D4', 'E1', 'D2', 'D3', 'D1', 'C4', 'C5', 'C3', 'C2', 'C1', 'B5', 'B4', 'B3', 'B2', 'B1', 'A5', 'A4', 'A3', 'A2', 'A1']
['< 1 year', '6 years', '5 years', '9 years', '7 years', '8 years', '2 years', '3 years', '1 year', '4 years', '10+ years']
['NONE', 'OTHER', 'RENT', 'OWN', 'MORTGAGE']
['(7000.0,178500.0)', '(178500.0,350000.0)', '(521500.0,693000.0)', '(350000.0,521500.0)', '(693000.0,inf)']
['Verified', 'Source Verified', 'Not Verified']
['s

Unnamed: 0,loan_amnt,funded_amnt,term,int_rate,installment,grade,sub_grade,emp_length,home_ownership,annual_inc,...,initial_list_status,out_prncp,total_pymnt,total_rec_prncp,total_rec_int,total_rec_late_fee,last_pymnt_amnt,tot_coll_amt,tot_cur_bal,total_rev_hi_lim
0,-0.05234,0.006831,-0.260854,0.148548,-0.070657,-0.053435,0.113274,0.10284,0.143572,-0.01175,...,0.149141,0.883355,0.916576,0.747297,0.12502,0.082051,-0.218153,-0.003029,-0.014988,-0.006193
1,-0.05234,0.006831,-0.260854,0.148548,-0.070657,-0.053435,-0.079615,-0.054376,0.143572,-0.01175,...,-0.09563,0.883355,0.253832,0.747297,0.12502,0.082051,-0.218153,-0.003029,-0.014988,0.490093
2,-0.010994,0.026456,0.124963,-0.550966,-0.024024,-0.418535,-0.419312,0.10284,-0.174877,-0.01175,...,0.149141,-0.622209,0.253832,0.747297,-0.173128,0.082051,-0.503628,-0.003029,-0.014988,-0.135211
3,0.073545,0.109302,0.124963,1.282256,0.178507,1.273492,1.11349,-0.031643,0.041779,-0.01175,...,-0.09563,-0.622209,-0.643936,-0.250153,-0.173128,0.082051,-0.218153,-0.003029,-0.014988,0.19887
4,0.073545,0.026456,0.124963,0.148548,0.178507,-0.053435,0.113274,-0.042903,-0.174877,-0.01175,...,-0.09563,1.290653,-0.643936,-0.250153,0.037963,0.082051,-0.218153,-0.003029,-0.014988,-0.006193


woe_t.transform(x_train_b).head()
# WOE

Now that we have transformed our data into the corresponding Weight of Evidence (WoE) values, we can inspect the mapping for any given column to analyze its WoE and Information Value (IV), and determine whether our bins have good predictive power or whether we need to restructure them.

In [15]:
# WoE table for loan_amnt: one row per bin with the good/bad shares, the WoE
# value and the bin's information-value contribution (info_val).
woe_t.woe_mappings["loan_amnt"]

Unnamed: 0,loan_amnt,good,bad,woe,info_val
2,"(18000.0,inf)",0.306706,0.323187,-0.05234,0.000863
3,"(9500.0,18000.0)",0.40811,0.412622,-0.010994,5e-05
1,"(1000.0,9500.0)",0.282583,0.262546,0.073545,0.001474
0,"(-inf,1000.0)",0.002601,0.001645,0.457833,0.000437


In [16]:
# WoE table for funded_amnt (per-bin good/bad shares, woe, info_val).
woe_t.woe_mappings["funded_amnt"]

Unnamed: 0,funded_amnt,good,bad,woe,info_val
5,"(25286.0,inf)",0.115284,0.128632,-0.109564,0.001463
3,"(15571.0,20429.0)",0.17465,0.183673,-0.050374,0.000455
2,"(10714.0,15571.0)",0.215865,0.214405,0.006785,1e-05
4,"(20429.0,25286.0)",0.109153,0.10841,0.006831,5e-06
6,"(5857.0,10714.0)",0.263263,0.256389,0.026456,0.000182
1,"(1000.0,5857.0)",0.119185,0.106844,0.109302,0.001349
0,"(-inf,1000.0)",0.002601,0.001645,0.457833,0.000437


In [17]:
# WoE table for int_rate (per-bin good/bad shares, woe, info_val).
# NOTE(review): in the recorded output the "(-inf,6.0)" bin holds well under
# 0.1% of either class, so its WoE is estimated from very few rows -- consider
# merging it with the neighbouring bin.
woe_t.woe_mappings["int_rate"]

Unnamed: 0,int_rate,good,bad,woe,info_val
4,"(20.0,inf)",0.085481,0.197314,-0.836507,0.09355
3,"(17.0,20.0)",0.132056,0.229108,-0.550966,0.053472
2,"(15.0,17.0)",0.140761,0.180781,-0.250225,0.010014
1,"(12.0,15.0)",0.291261,0.251055,0.148548,0.005972
6,"(9.0,12.0)",0.189565,0.097105,0.668943,0.061851
0,"(-inf,6.0)",0.000159,5.3e-05,1.098612,0.000117
5,"(6.0,9.0)",0.160718,0.044585,1.282256,0.148912


In [18]:
# WoE table for installment (per-bin good/bad shares, woe, info_val).
woe_t.woe_mappings["installment"]

Unnamed: 0,installment,good,bad,woe,info_val
4,"(948.0,inf)",0.04347,0.05114,-0.162488,0.001246
2,"(486.0,717.0)",0.231629,0.248587,-0.070657,0.001198
1,"(254.0,486.0)",0.410393,0.420371,-0.024024,0.00024
3,"(717.0,948.0)",0.096494,0.097529,-0.010669,1.1e-05
0,"(23.0,254.0)",0.218014,0.182373,0.178507,0.006362


In [19]:
# WoE table for annual_inc (per-bin good/bad shares, woe, info_val).
# NOTE(review): in the recorded output about 97% of both classes fall into the
# first bin, so these bin edges separate good from bad only marginally --
# a candidate for re-binning.
woe_t.woe_mappings["annual_inc"]

Unnamed: 0,annual_inc,good,bad,woe,info_val
4,"(7000.0,178500.0)",0.969958,0.981423,-0.01175,0.000135
0,"(178500.0,350000.0)",0.026751,0.016932,0.457385,0.004491
2,"(521500.0,693000.0)",0.000478,0.000292,0.492476,9.1e-05
1,"(350000.0,521500.0)",0.002282,0.001141,0.693147,0.000791
3,"(693000.0,inf)",0.000531,0.000212,0.916291,0.000292


In [20]:
# WoE table for dti (per-bin good/bad shares, woe, info_val).
woe_t.woe_mappings["dti"]

Unnamed: 0,dti,good,bad,woe,info_val
3,"(27.0,inf)",0.132799,0.174836,-0.275012,0.011561
2,"(20.0,27.0)",0.241634,0.280778,-0.150142,0.005877
1,"(13.0,20.0)",0.316472,0.30384,0.040735,0.000515
5,"(7.0,13.0)",0.221517,0.17611,0.229393,0.010416
0,"(-inf,3.0)",0.016135,0.011969,0.298708,0.001245
4,"(3.0,7.0)",0.071442,0.052467,0.308704,0.005858
