In [1]:
import warnings
# Silence every warning category for the rest of the kernel session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from datetime import datetime

In [3]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [4]:
# Load the loan data from the CSV file in the notebook's working directory.
file_path = Path('LoanStats_2019Q1.csv')
df = pd.read_csv(file_path)

# Keep only the columns that contain no missing values.
# (The former stand-alone `infer_datetime_format=True,` line merely bound an
# unused tuple, and the bare `df.dropna()` call discarded its result; neither
# changed `df`, so both were removed.)
df = df.dropna(axis='columns')
df.head()

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,dti,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,10500.0,0.1719,375.35,RENT,66000.0,Source Verified,Mar-2019,low_risk,n,27.24,...,85.7,100.0,0.0,0.0,65687.0,38199.0,2000.0,61987.0,N,N
1,25000.0,0.2,929.09,MORTGAGE,105000.0,Verified,Mar-2019,low_risk,n,20.23,...,91.2,50.0,1.0,0.0,271427.0,60641.0,41200.0,49197.0,N,N
2,20000.0,0.2,529.88,MORTGAGE,56000.0,Verified,Mar-2019,low_risk,n,24.26,...,66.7,50.0,0.0,0.0,60644.0,45684.0,7500.0,43144.0,N,N
3,10000.0,0.164,353.55,RENT,92000.0,Verified,Mar-2019,low_risk,n,31.44,...,100.0,50.0,1.0,0.0,99506.0,68784.0,19700.0,76506.0,N,N
4,22000.0,0.1474,520.39,MORTGAGE,52000.0,Not Verified,Mar-2019,low_risk,n,18.76,...,100.0,0.0,0.0,0.0,219750.0,25919.0,27600.0,20000.0,N,N


In [5]:
# Remove the issue_d / next_pymnt_d columns before encoding (issue_d holds
# strings such as 'Mar-2019'; next_pymnt_d is presumably similar — confirm).
# Reassigning instead of inplace=True, together with errors='ignore', keeps
# this cell safe to re-run: a second execution is a no-op, not a KeyError.
df = df.drop(columns=['issue_d', 'next_pymnt_d'], errors='ignore')

In [6]:
# One-hot encode the categorical (text) columns so that every feature is numeric.
categorical_columns = [
    "home_ownership",
    "verification_status",
    "pymnt_plan",
    "hardship_flag",
    "debt_settlement_flag",
    "initial_list_status",
    "application_type",
]
totaldf = pd.get_dummies(df, columns=categorical_columns)
totaldf

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,loan_status,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,...,verification_status_Not Verified,verification_status_Source Verified,verification_status_Verified,pymnt_plan_n,hardship_flag_N,debt_settlement_flag_N,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App
0,10500.0,0.1719,375.35,66000.0,low_risk,27.24,0.0,0.0,8.0,0.0,...,0,1,0,1,1,1,0,1,1,0
1,25000.0,0.2000,929.09,105000.0,low_risk,20.23,0.0,0.0,17.0,1.0,...,0,0,1,1,1,1,0,1,1,0
2,20000.0,0.2000,529.88,56000.0,low_risk,24.26,0.0,0.0,8.0,0.0,...,0,0,1,1,1,1,0,1,1,0
3,10000.0,0.1640,353.55,92000.0,low_risk,31.44,0.0,1.0,10.0,1.0,...,0,0,1,1,1,1,0,1,1,0
4,22000.0,0.1474,520.39,52000.0,low_risk,18.76,0.0,1.0,14.0,0.0,...,1,0,0,1,1,1,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68812,10000.0,0.1502,346.76,26000.0,low_risk,9.60,0.0,0.0,9.0,0.0,...,0,1,0,1,1,1,0,1,1,0
68813,12000.0,0.2727,368.37,63000.0,low_risk,29.07,0.0,0.0,8.0,0.0,...,1,0,0,1,1,1,0,1,1,0
68814,5000.0,0.1992,185.62,52000.0,low_risk,14.86,0.0,0.0,5.0,1.0,...,0,1,0,1,1,1,0,1,1,0
68815,40000.0,0.0646,1225.24,520000.0,low_risk,9.96,0.0,1.0,21.0,0.0,...,0,0,1,1,1,1,1,0,1,0


### Creating X and Y

In [7]:
# Separate the label column from the predictors.
target_column = 'loan_status'

# Target: the loan_status label of each row
y = totaldf[target_column]

# Features: every column except the target
X = totaldf.drop(columns=[target_column])

In [8]:
# Show the labels of the feature columns.
X.columns

Index(['loan_amnt', 'int_rate', 'installment', 'annual_inc', 'dti',
       'delinq_2yrs', 'inq_last_6mths', 'open_acc', 'pub_rec', 'revol_bal',
       'total_acc', 'out_prncp', 'out_prncp_inv', 'total_pymnt',
       'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int',
       'total_rec_late_fee', 'recoveries', 'collection_recovery_fee',
       'last_pymnt_amnt', 'collections_12_mths_ex_med', 'policy_code',
       'acc_now_delinq', 'tot_coll_amt', 'tot_cur_bal', 'open_acc_6m',
       'open_act_il', 'open_il_12m', 'open_il_24m', 'mths_since_rcnt_il',
       'total_bal_il', 'il_util', 'open_rv_12m', 'open_rv_24m', 'max_bal_bc',
       'all_util', 'total_rev_hi_lim', 'inq_fi', 'total_cu_tl', 'inq_last_12m',
       'acc_open_past_24mths', 'avg_cur_bal', 'bc_open_to_buy', 'bc_util',
       'chargeoff_within_12_mths', 'delinq_amnt', 'mo_sin_old_il_acct',
       'mo_sin_old_rev_tl_op', 'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl',
       'mort_acc', 'mths_since_recent_bc', 'mths_since_recent_in

In [9]:
# Display the target Series as a one-column DataFrame.
y.to_frame()

Unnamed: 0,loan_status
0,low_risk
1,low_risk
2,low_risk
3,low_risk
4,low_risk
...,...
68812,low_risk
68813,low_risk
68814,low_risk
68815,low_risk


In [10]:
# Summary statistics (count, mean, std, quartiles, min/max) for each feature column.
X.describe() 

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,verification_status_Not Verified,verification_status_Source Verified,verification_status_Verified,pymnt_plan_n,hardship_flag_N,debt_settlement_flag_N,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App
count,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,...,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0
mean,16677.594562,0.127718,480.652863,88213.71,21.778153,0.217766,0.497697,12.58734,0.12603,17604.142828,...,0.478007,0.373992,0.148001,1.0,1.0,1.0,0.123879,0.876121,0.86034,0.13966
std,10277.34859,0.04813,288.062432,115580.0,20.199244,0.718367,0.758122,6.022869,0.336797,21835.8804,...,0.49952,0.483865,0.355104,0.0,0.0,0.0,0.329446,0.329446,0.346637,0.346637
min,1000.0,0.06,30.89,40.0,0.0,0.0,0.0,2.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
25%,9000.0,0.0881,265.73,50000.0,13.89,0.0,0.0,8.0,0.0,6293.0,...,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0
50%,15000.0,0.118,404.56,73000.0,19.76,0.0,0.0,11.0,0.0,12068.0,...,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0
75%,24000.0,0.1557,648.1,104000.0,26.66,0.0,1.0,16.0,0.0,21735.0,...,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0
max,40000.0,0.3084,1676.23,8797500.0,999.0,18.0,5.0,72.0,4.0,587191.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [11]:
# Count how many rows fall into each loan_status class (raw counts, not proportions)
y.value_counts()

low_risk     68470
high_risk      347
Name: loan_status, dtype: int64

### Split the Data into Training and Testing

In [12]:
from sklearn.model_selection import train_test_split

# Partition features and target into train/test sets. stratify=y keeps the
# class proportions of y in both partitions; random_state=1 fixes the shuffle.
split_parts = train_test_split(
    X,
    y,
    random_state=1,
    stratify=y,
)
X_train, X_test, y_train, y_test = split_parts

In [13]:
# (rows, columns) of the training feature matrix.
X_train.shape 

(51612, 90)

### Data Pre-Processing

In [14]:
# Import StandardScaler for feature scaling
from sklearn.preprocessing import StandardScaler

In [15]:
# Unfitted scaler instance; it is fitted on the training features in the next cell.
data_scaler = StandardScaler()

In [16]:
# Learn the scaling parameters from the training features only.
# fit() hands back the fitted scaler, which is echoed as the cell output.
scalex = data_scaler.fit(X_train)
scalex

StandardScaler()

In [17]:
# Apply the scaling learned from the training set to both partitions
# (train first, then test).
x_scaled, x_scaledtest = (
    scalex.transform(partition) for partition in (X_train, X_test)
)

In [18]:
# Peek at the first scaled test row.
x_scaledtest[:1]

array([[-1.23536216, -1.196855  , -1.24170452, -0.7698957 ,  0.84655651,
        -0.30466134, -0.65734502, -1.58959925, -0.37432858, -0.80912184,
        -1.35175982, -1.23347867, -1.23288407, -0.49027494, -0.49009392,
        -0.3176014 , -0.87264252, -0.03171854,  0.        ,  0.        ,
        -0.48278872, -0.12022901,  0.        ,  0.        , -0.10541046,
        -0.60864983, -0.86744979, -0.77587596, -0.86943676, -0.58876428,
        -0.09486962, -0.66422271,  0.70648829, -0.84126057, -1.04548911,
        -1.10381271, -0.25574628, -0.92921628, -0.24892946, -0.58842955,
        -0.91344373, -1.21494145,  0.36200528, -0.44134787, -1.71827525,
        -0.07812619, -0.00769495, -1.98012018,  0.06467376,  0.61115568,
         0.92287889, -0.25583179,  1.24916058,  0.8784122 , -0.33774962,
        -1.54138186, -1.62554222, -1.27831377, -0.74386823, -1.12179819,
        -1.50556199, -0.9356566 , -1.62604944, -1.58743832,  0.        ,
         0.        , -0.1351054 , -1.16977358,  0.5

### Balanced Random Forest Classifier

In [19]:
from imblearn.ensemble import BalancedRandomForestClassifier

# Fit a balanced random forest (100 estimators, fixed seed) on the scaled
# training features; the fitted estimator is echoed as the cell output.
brf = BalancedRandomForestClassifier(
    random_state=1,
    n_estimators=100,
)
brf.fit(x_scaled, y_train)
brf

BalancedRandomForestClassifier(random_state=1)

In [20]:
# Predict loan_status for the scaled test features with the balanced random forest.
y_pred = brf.predict(x_scaledtest) 

In [21]:
# Calculate the balanced accuracy score
# balanced_accuracy_score is already imported in the metrics import cell near
# the top of the notebook, so the duplicate import was dropped here.
balanced_accuracy_score(y_test, y_pred)

0.7257262973840806

In [22]:
# Display the confusion matrix (rows = actual class, columns = predicted class).
# confusion_matrix is already imported in the metrics import cell near the top
# of the notebook, so the duplicate import was dropped here.
confusion_matrix(y_test, y_pred)

array([[   54,    33],
       [ 2897, 14221]])

In [23]:
# Print the imbalanced classification report.
# classification_report_imbalanced is already imported in the metrics import
# cell near the top of the notebook, so the duplicate import was dropped here.
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.02      0.62      0.83      0.04      0.72      0.50        87
   low_risk       1.00      0.83      0.62      0.91      0.72      0.53     17118

avg / total       0.99      0.83      0.62      0.90      0.72      0.53     17205



### Easy Ensemble Classifier

In [24]:
# Import the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier 

In [25]:
# Fit an Easy Ensemble classifier (100 estimators, fixed seed) on the scaled
# training features; the fitted estimator is echoed as the cell output.
eec = EasyEnsembleClassifier(
    random_state=1,
    n_estimators=100,
)
eec.fit(x_scaled, y_train)
eec

EasyEnsembleClassifier(n_estimators=100, random_state=1)

In [26]:
# Predict loan_status for the scaled test features with the Easy Ensemble classifier.
y_pred1 = eec.predict(x_scaledtest) 

In [27]:
# Balanced accuracy of the Easy Ensemble predictions on the test set.
balanced_accuracy_score(y_test, y_pred1)

0.7447212250867206

In [28]:
# Confusion matrix of the Easy Ensemble predictions on the test set.
confusion_matrix(y_test, y_pred1)

array([[   61,    26],
       [ 3624, 13494]])

In [29]:
# Imbalanced classification report for the Easy Ensemble predictions.
print(classification_report_imbalanced(y_test, y_pred1))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.02      0.70      0.79      0.03      0.74      0.55        87
   low_risk       1.00      0.79      0.70      0.88      0.74      0.56     17118

avg / total       0.99      0.79      0.70      0.88      0.74      0.56     17205

