# Import dependancies

In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

from collections import Counter

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.under_sampling import ClusterCentroids
from imblearn.combine import SMOTEENN
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import balanced_accuracy_score
from imblearn.metrics import classification_report_imbalanced
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.ensemble import EasyEnsembleClassifier

# Load and clean CSV

In [2]:
csv = pd.read_csv('Resources/LoanStats_2019Q1.csv')[:-2]

# Remove the `Issued` and 'Fully Paid'loan status
issued_mask = (csv['loan_status'] != 'Issued') & (csv['loan_status'] != 'Fully Paid')
csv = csv.loc[issued_mask]

loan_name = {
    'Current': 'low_risk',
    'Charged Off': 'high_risk',
    'In Grace Period': 'high_risk',
    'Late (31-120 days)': 'high_risk',
    'Late (16-30 days)': 'high_risk'
}

csv["loan_status"] = csv["loan_status"].apply(lambda x: loan_name[x])
# csv

# Encode the dataframe

In [3]:
df = csv.copy()

columns = df.columns.tolist()

for i in columns:
    le = LabelEncoder()
    print(i)
    try:
        df[i] = df[i].fillna(value='fill')
        df[i] = le.fit_transform(df[i])
    except:
        df[i] = csv[i]
        df[i] = df[i].fillna(value=0.0)
        df[i] = le.fit_transform(df[i])

df

id
member_id
loan_amnt
funded_amnt
funded_amnt_inv
term
int_rate
installment
grade
sub_grade
emp_title
emp_length
home_ownership
annual_inc
verification_status
issue_d
loan_status
pymnt_plan
url
desc
purpose
title
zip_code
addr_state
dti
delinq_2yrs
earliest_cr_line
inq_last_6mths
mths_since_last_delinq
mths_since_last_record
open_acc
pub_rec
revol_bal
revol_util
total_acc
initial_list_status
out_prncp
out_prncp_inv
total_pymnt
total_pymnt_inv
total_rec_prncp
total_rec_int
total_rec_late_fee
recoveries
collection_recovery_fee
last_pymnt_d
last_pymnt_amnt
next_pymnt_d
last_credit_pull_d
collections_12_mths_ex_med
mths_since_last_major_derog
policy_code
application_type
annual_inc_joint
dti_joint
verification_status_joint
acc_now_delinq
tot_coll_amt
tot_cur_bal
open_acc_6m
open_act_il
open_il_12m
open_il_24m
mths_since_rcnt_il
total_bal_il
il_util
open_rv_12m
open_rv_24m
max_bal_bc
all_util
total_rev_hi_lim
inq_fi
total_cu_tl
inq_last_12m
acc_open_past_24mths
avg_cur_bal
bc_open_to_buy
b

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,orig_projected_additional_accrued_interest,hardship_payoff_balance_amount,hardship_last_payment_amount,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
82,0,0,735,735,758,0,31,8120,3,17,...,0,0,0,0,0,0,0,0,0,0
93,0,0,365,365,382,0,24,4113,2,14,...,0,0,0,0,0,0,0,0,0,0
99,0,0,928,928,950,0,28,9263,3,16,...,0,0,0,0,0,0,0,0,0,0
132,0,0,735,735,758,1,28,5869,3,16,...,0,0,0,0,0,0,0,0,0,0
133,0,0,345,345,362,0,22,3832,2,13,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115670,0,0,147,147,162,0,15,1349,2,10,...,0,0,0,0,0,0,0,0,0,0
115671,0,0,186,186,202,0,1,1489,0,0,...,0,0,0,0,0,0,0,0,0,0
115672,0,0,576,576,598,0,21,6216,2,13,...,0,0,0,0,0,0,0,0,0,0
115673,0,0,576,576,598,1,9,3789,1,7,...,0,0,0,0,0,0,0,0,0,0


# Build and test the model 
### Oversampling

In [4]:
# split the table into features and outcomes
x_cols = [i for i in df.columns if i not in ('loan_status')]
X = df[x_cols]
y = df['loan_status']

# split features and outcomes into train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
print("Naive:\t\t", Counter(y_train))

# oversample to make up for the low number of risky loans
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
print("Oversampled:\t", Counter(y_resampled))

model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)
y_predictions = model.predict(X_test)

# Calculating the confusion matrix
cm = confusion_matrix(y_test, y_predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score.
acc_score = balanced_accuracy_score(y_test, y_predictions)

# Displaying results
print("\nConfusion Matrix")
display(cm_df)
print(f"\nAccuracy Score: {acc_score}\n")
print("Classification Report")
print(classification_report_imbalanced(y_test, y_predictions))

Naive:		 Counter({1: 70595, 0: 417})
Oversampled:	 Counter({1: 70595, 0: 70595})

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,104,46
Actual 1,4802,18719



Accuracy Score: 0.7445876734265833

Classification Report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.02      0.69      0.80      0.04      0.74      0.55       150
          1       1.00      0.80      0.69      0.89      0.74      0.56     23521

avg / total       0.99      0.80      0.69      0.88      0.74      0.56     23671



# Build and test the model 
### Undersampling

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
print("Naive:\t\t", Counter(y_train))

ros = RandomUnderSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
print("Undersampled:\t", Counter(y_resampled))

model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)
y_predictions = model.predict(X_test)

# Calculating the confusion matrix
cm = confusion_matrix(y_test, y_predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score.
acc_score = balanced_accuracy_score(y_test, y_predictions)

# Displaying results
print("\nConfusion Matrix")
display(cm_df)
print(f"\nAccuracy Score: {acc_score}\n")
print("Classification Report")
print(classification_report_imbalanced(y_test, y_predictions))

Naive:		 Counter({1: 70595, 0: 417})
Undersampled:	 Counter({0: 417, 1: 417})

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,107,43
Actual 1,5227,18294



Accuracy Score: 0.7455531936000453

Classification Report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.02      0.71      0.78      0.04      0.74      0.55       150
          1       1.00      0.78      0.71      0.87      0.74      0.56     23521

avg / total       0.99      0.78      0.71      0.87      0.74      0.56     23671



In [6]:
cc = ClusterCentroids(random_state=1)
X_resampled, y_resampled = cc.fit_resample(X_train, y_train)

model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

y_predictions = model.predict(X_test)

# Calculating the confusion matrix
cm = confusion_matrix(y_test, y_predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score.
acc_score = balanced_accuracy_score(y_test, y_predictions)

# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"\nAccuracy Score: {acc_score}\n")
print("Classification Report")
print(classification_report_imbalanced(y_test, y_predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,112,38
Actual 1,6613,16908



Accuracy Score: 0.7327568272324023

Classification Report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.02      0.75      0.72      0.03      0.73      0.54       150
          1       1.00      0.72      0.75      0.84      0.73      0.54     23521

avg / total       0.99      0.72      0.75      0.83      0.73      0.54     23671



# Build and test the model 
### SMOTEENN

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
smote_enn = SMOTEENN(random_state=0)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)

model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

y_predictions = model.predict(X_test)

# Calculating the confusion matrix
cm = confusion_matrix(y_test, y_predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score.
acc_score = balanced_accuracy_score(y_test, y_predictions)

# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"\nAccuracy Score: {acc_score}\n")
print("Classification Report")
print(classification_report_imbalanced(y_test, y_predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,108,42
Actual 1,5214,18307



Accuracy Score: 0.7491628757280728

Classification Report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.02      0.72      0.78      0.04      0.75      0.56       150
          1       1.00      0.78      0.72      0.87      0.75      0.56     23521

avg / total       0.99      0.78      0.72      0.87      0.75      0.56     23671



# Report
While all 3 of the resampling models have a decent recalls, which means that we are appropriately most of the risky loans accordingly, our precision is abysmal. Even though a false positive is much worse than a false negative because the loses from one default greatly outweigh the lose from denying a safe loan, a 2% precision is an order of magnitude off the mark.
None of these are exciting in their results, but the undersampling seems to have done marginally better than the others with the strongest recall and second strongest accuracy score.

# Forest of randomized trees

In [8]:
brf = BalancedRandomForestClassifier(n_estimators=100, random_state=0)
brf.fit(X_train, y_train)
y_predictions = brf.predict(X_test)

feature_importance = sorted(zip(brf.feature_importances_, X.columns.tolist()))[::-1]

# Calculating the confusion matrix
cm = confusion_matrix(y_test, y_predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score.
acc_score = balanced_accuracy_score(y_test, y_predictions)

# Displaying results
print("Feature Importance")
for i in feature_importance:
    print(i)
print("\nConfusion Matrix")
display(cm_df)
print(f"\nAccuracy Score: {acc_score}\n")
print("Classification Report")
print(classification_report_imbalanced(y_test, y_predictions))

Feature Importance
(0.12916274737959993, 'last_pymnt_d')
(0.07835293150730595, 'last_pymnt_amnt')
(0.07357118709850285, 'total_rec_int')
(0.06598765825488215, 'total_rec_prncp')
(0.05578880100468741, 'total_pymnt')
(0.0426909895694608, 'total_pymnt_inv')
(0.03273026638949338, 'next_pymnt_d')
(0.014592965346517215, 'total_rec_late_fee')
(0.013042253977801238, 'int_rate')
(0.012896508802225572, 'annual_inc')
(0.012219967131551284, 'sub_grade')
(0.011250446007832227, 'all_util')
(0.010754303038043747, 'out_prncp_inv')
(0.010717252135789248, 'out_prncp')
(0.010597684265182228, 'dti')
(0.01048182789913683, 'issue_d')
(0.010440691876068284, 'total_bc_limit')
(0.010276168243511317, 'grade')
(0.009888996413757335, 'tot_cur_bal')
(0.009556236418744497, 'max_bal_bc')
(0.009279580043228621, 'total_bal_ex_mort')
(0.009262189811591973, 'mo_sin_old_il_acct')
(0.009052376860090507, 'emp_title')
(0.008763058734121615, 'bc_util')
(0.008716908863325495, 'revol_bal')
(0.008518538063751973, 'earliest_cr_l

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,129,21
Actual 1,1342,22179



Accuracy Score: 0.9014723013477318

Classification Report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.09      0.86      0.94      0.16      0.90      0.80       150
          1       1.00      0.94      0.86      0.97      0.90      0.82     23521

avg / total       0.99      0.94      0.86      0.97      0.90      0.82     23671



# EasyEnsembleClassifier

In [9]:
eec = EasyEnsembleClassifier(n_estimators=100, random_state=0)
eec.fit(X_train, y_train)
y_predictions = eec.predict(X_test)

# Calculating the confusion matrix
cm = confusion_matrix(y_test, y_predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score.
acc_score = balanced_accuracy_score(y_test, y_predictions)

# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score: {acc_score}\n")
print("Classification Report")
print(classification_report_imbalanced(y_test, y_predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,143,7
Actual 1,379,23142


Accuracy Score: 0.9686100364213539

Classification Report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.27      0.95      0.98      0.43      0.97      0.94       150
          1       1.00      0.98      0.95      0.99      0.97      0.94     23521

avg / total       1.00      0.98      0.95      0.99      0.97      0.94     23671



# Report
Both of these new models offer a market improvement across the board over the resampling models. The random forest is very impressive not only with boosting all 3 of the precision, recall, and accuracy, but also the feature_importances_ gives a peak at what the model ended up being the strongest predictors. However all of those remarkable features do not quite measure up to EasyEnsemble. EasyEnsemble may not have anything as nifty as feature_importances_ but it more than makes up for it by out performing the random forest. Precision, recall, and accuracy are significantly greater in this final iteration, having the highest recall and accuracy is great but this is the only model that manages to bring the precision out of the single digits and all the way up to 27%. For these reason the EasyEnsemble model is not just the best of the bunch, but is a good model in its own right.