In [1]:
# Unit 11 - Risky Business

In [2]:
# resampling

In [3]:
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): a blanket 'ignore' filter also hides any solver-convergence or
# deprecation warnings raised by later cells — confirm that this is intended.
warnings.filterwarnings('ignore')

In [4]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [5]:
# Read the lending dataset from the working directory and preview its first three rows.
lending_csv = Path('lending_data.csv')
lending_data = pd.read_csv(lending_csv)
lending_data.head(3)

Unnamed: 0,loan_size,interest_rate,homeowner,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,own,52800,0.431818,5,1,22800,low_risk
1,8400.0,6.692,own,43600,0.311927,3,0,13600,low_risk
2,9000.0,6.963,rent,46100,0.349241,3,0,16100,low_risk


In [6]:
# Split the Data into Training and Testing

In [7]:
# Separate the target column from the remaining feature columns.
target_column = 'loan_status'
X = lending_data.drop(columns=[target_column])
y = lending_data[target_column]

In [8]:
# One-hot encode the non-numeric feature columns; drop_first=True omits the
# first dummy level of every encoded column.
X = pd.get_dummies(X, drop_first=True)

In [9]:
# Preview the encoded feature matrix.
X.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,homeowner_own,homeowner_rent
0,10700.0,7.672,52800,0.431818,5,1,22800,1,0
1,8400.0,6.692,43600,0.311927,3,0,13600,1,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0,1
3,10700.0,7.664,52700,0.43074,5,1,22700,1,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0,0


In [10]:
# Number of loans per class in the target.
y.value_counts()

low_risk     75036
high_risk     2500
Name: loan_status, dtype: int64

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
# Partition features and target into training and testing subsets with a fixed seed.
# No `stratify` argument is passed, so the split is not stratified by class.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [13]:
# Data Pre-Processing

In [14]:
from sklearn.preprocessing import StandardScaler

In [15]:
# Create the scaler: the feature data will be standardized in order to improve
# the algorithm's performance.
scaler = StandardScaler()

In [16]:
# Fit the StandardScaler on the training features only; the fitted object is
# reused below to transform both the training and the testing features.

X_scaler = scaler.fit(X_train)

In [17]:
# Apply the fitted scaler to the training split first, then to the testing split.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [18]:
# Simple Logistic Regression

In [19]:
from sklearn.linear_model import LogisticRegression

In [20]:
# Baseline: logistic regression fitted on the standardized training features.
model = LogisticRegression(solver='lbfgs', random_state=1).fit(
    X_train_scaled, y_train
)
model

LogisticRegression(random_state=1)

In [21]:
# Balanced accuracy of the baseline model on the scaled test features
from sklearn.metrics import balanced_accuracy_score

y_pred = model.predict(X_test_scaled)
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

0.9889115309798473

In [22]:
 # Display the confusion matrix

from sklearn.metrics import confusion_matrix

# Rows are the actual classes, columns the predicted classes.
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[  609,    10],
       [  113, 18652]], dtype=int64)

In [23]:
 # Print the imbalanced classification report
    
from imblearn.metrics import classification_report_imbalanced

# Per-class metrics for the baseline model's test predictions.
baseline_report = classification_report_imbalanced(y_true=y_test, y_pred=y_pred)
print(baseline_report)

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.84      0.98      0.99      0.91      0.99      0.98       619
   low_risk       1.00      0.99      0.98      1.00      0.99      0.98     18765

avg / total       0.99      0.99      0.98      0.99      0.99      0.98     19384



In [24]:
# Oversampling

In [25]:
# Naive Random Oversampling

In [26]:
from collections import Counter

In [27]:
# Class counts in the training split: this initial train-test split gives us
# the following imbalanced data.
Counter(y_train)

Counter({'low_risk': 56271, 'high_risk': 1881})

In [28]:
# Naive random oversampling of the training split.
# NOTE(review): this resamples the unscaled X_train, whereas the baseline model
# was fitted on X_train_scaled — confirm the comparison is intended.
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=1)
resampled_pair = ros.fit_resample(X_train, y_train)
X_resampled, y_resampled = resampled_pair

# Class counts after oversampling: the two classes should now be balanced.
Counter(y_resampled)

Counter({'low_risk': 56271, 'high_risk': 56271})

In [29]:
# Fit a logistic regression on the randomly oversampled training data
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='lbfgs', random_state=1).fit(
    X_resampled, y_resampled
)
model

LogisticRegression(random_state=1)

In [30]:
# Predict on the (unscaled) test features and show the confusion matrix
from sklearn.metrics import confusion_matrix

y_pred = model.predict(X_test)
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[  615,     4],
       [  116, 18649]], dtype=int64)

In [31]:
from sklearn.metrics import balanced_accuracy_score

# Balanced accuracy of the model trained on randomly oversampled data.
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

0.9936781215845847

In [32]:
# Print the imbalanced classification report, which includes more evaluation
# metrics and produces them separately for the two classes.
from imblearn.metrics import classification_report_imbalanced

oversampled_report = classification_report_imbalanced(y_true=y_test, y_pred=y_pred)
print(oversampled_report)

# The minority class has low precision and, therefore, a lower F1 score despite
# oversampling. This may be due to overfitting in the training set.

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.84      0.99      0.99      0.91      0.99      0.99       619
   low_risk       1.00      0.99      0.99      1.00      0.99      0.99     18765

avg / total       0.99      0.99      0.99      0.99      0.99      0.99     19384



In [33]:
# SMOTE Oversampling

In [34]:
from imblearn.over_sampling import SMOTE
from collections import Counter

# SMOTE oversampling of the training split; sampling_strategy=1.0 requests a
# 1:1 minority-to-majority ratio after resampling.
smote = SMOTE(random_state=1, sampling_strategy=1.0)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Class counts after resampling.
Counter(y_resampled)

Counter({'low_risk': 56271, 'high_risk': 56271})

In [35]:
# Fit a logistic regression on the SMOTE-resampled training data.
model = LogisticRegression(solver='lbfgs', random_state=1).fit(
    X_resampled, y_resampled
)
model

LogisticRegression(random_state=1)

In [36]:
# Predict on the test features, then compute the balanced accuracy score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

0.9936781215845847

In [37]:
# Confusion matrix for the SMOTE-trained model (rows: actual, columns: predicted)
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[  615,     4],
       [  116, 18649]], dtype=int64)

In [38]:
# Per-class metrics for the SMOTE-trained model's test predictions.
smote_report = classification_report_imbalanced(y_true=y_test, y_pred=y_pred)
print(smote_report)

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.84      0.99      0.99      0.91      0.99      0.99       619
   low_risk       1.00      0.99      0.99      1.00      0.99      0.99     18765

avg / total       0.99      0.99      0.99      0.99      0.99      0.99     19384



In [39]:
# Undersampling

In [40]:
# Fit the data using `ClusterCentroids` and check the count of each class

In [41]:
# This cell can take a long time to run.
from imblearn.under_sampling import ClusterCentroids
from collections import Counter

# Undersample the training split with ClusterCentroids.
cc = ClusterCentroids(random_state=1)
undersampled_pair = cc.fit_resample(X_train, y_train)
X_resampled, y_resampled = undersampled_pair

# Class counts after undersampling.
Counter(y_resampled)

Counter({'high_risk': 1881, 'low_risk': 1881})

In [42]:
# Fit a logistic regression on the cluster-centroid undersampled training data
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='lbfgs', random_state=1).fit(
    X_resampled, y_resampled
)
model

LogisticRegression(random_state=1)

In [43]:
# Predict on the test features and show the confusion matrix
from sklearn.metrics import confusion_matrix

y_pred = model.predict(X_test)
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[  606,    13],
       [  112, 18653]], dtype=int64)

In [44]:
# Balanced accuracy of the model trained on cluster-centroid undersampled data
from sklearn.metrics import balanced_accuracy_score

balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

0.9865149130022852

In [45]:
# Per-class metrics for the model trained on undersampled data
from imblearn.metrics import classification_report_imbalanced

undersampled_report = classification_report_imbalanced(y_true=y_test, y_pred=y_pred)
print(undersampled_report)

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.84      0.98      0.99      0.91      0.99      0.97       619
   low_risk       1.00      0.99      0.98      1.00      0.99      0.97     18765

avg / total       0.99      0.99      0.98      0.99      0.99      0.97     19384



In [46]:
#SMOTEENN combination sampling

In [47]:
import matplotlib.pyplot as plt
%matplotlib inline

In [48]:
from imblearn.combine import SMOTEENN

# Combination sampling of the training split with SMOTEENN.
sm = SMOTEENN(random_state=1)
combined_pair = sm.fit_resample(X_train, y_train)
X_resampled, y_resampled = combined_pair

# Class counts after combination sampling.
Counter(y_resampled)

Counter({'high_risk': 55630, 'low_risk': 55948})

In [49]:
# Fit a logistic regression on the SMOTEENN combination-sampled training data
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='lbfgs', random_state=1).fit(
    X_resampled, y_resampled
)
model

LogisticRegression(random_state=1)

In [50]:
# Predict on the test features and show the confusion matrix
from sklearn.metrics import confusion_matrix

y_pred = model.predict(X_test)
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[  615,     4],
       [  122, 18643]], dtype=int64)

In [51]:
# Balanced accuracy of the model trained on SMOTEENN-resampled data
from sklearn.metrics import balanced_accuracy_score

balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

0.9935182494822666

In [52]:
# Per-class metrics for the model trained on SMOTEENN-resampled data
from imblearn.metrics import classification_report_imbalanced

combined_report = classification_report_imbalanced(y_true=y_test, y_pred=y_pred)
print(combined_report)

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.83      0.99      0.99      0.91      0.99      0.99       619
   low_risk       1.00      0.99      0.99      1.00      0.99      0.99     18765

avg / total       0.99      0.99      0.99      0.99      0.99      0.99     19384



In [53]:
# Final Questions

# 1. Which model had the best balanced accuracy score?
        # Naive random oversampling and SMOTE tie for the best balanced accuracy score (0.993678); SMOTEENN is close behind at 0.993518
# 2. Which model had the best recall score?
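        # Naive random oversampling, SMOTE and SMOTEENN all reach a high_risk recall of 0.99 (cluster centroids: 0.98); the avg/total recall is 0.99 for every model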
# 3. Which model had the best geometric mean score?
        # All models have a geometric mean of 0.99

In [54]:
# Ensemble Learning

In [55]:
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): a blanket 'ignore' filter also hides any warnings raised while
# fitting the ensemble models below — confirm that this is intended.
warnings.filterwarnings('ignore')

In [56]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [57]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [58]:
# Read the loan statistics dataset from the working directory and preview it.
loan_stats_csv = Path('LoanStats.csv')
loan_stats = pd.read_csv(loan_stats_csv)
loan_stats.head()

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,dti,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,10500.0,0.1719,375.35,RENT,66000.0,Source Verified,Mar-2019,low_risk,n,27.24,...,85.7,100.0,0.0,0.0,65687.0,38199.0,2000.0,61987.0,N,N
1,25000.0,0.2,929.09,MORTGAGE,105000.0,Verified,Mar-2019,low_risk,n,20.23,...,91.2,50.0,1.0,0.0,271427.0,60641.0,41200.0,49197.0,N,N
2,20000.0,0.2,529.88,MORTGAGE,56000.0,Verified,Mar-2019,low_risk,n,24.26,...,66.7,50.0,0.0,0.0,60644.0,45684.0,7500.0,43144.0,N,N
3,10000.0,0.164,353.55,RENT,92000.0,Verified,Mar-2019,low_risk,n,31.44,...,100.0,50.0,1.0,0.0,99506.0,68784.0,19700.0,76506.0,N,N
4,22000.0,0.1474,520.39,MORTGAGE,52000.0,Not Verified,Mar-2019,low_risk,n,18.76,...,100.0,0.0,0.0,0.0,219750.0,25919.0,27600.0,20000.0,N,N


In [59]:
# Every column except the target and the 'Unnamed: 0' column is used as a feature.
excluded_cols = ('loan_status', 'Unnamed: 0')
x_cols = [name for name in loan_stats.columns if name not in excluded_cols]
y = loan_stats['loan_status']
X = loan_stats[x_cols]

In [60]:
# One-hot encode the non-numeric feature columns; drop_first=True omits the
# first dummy level of every encoded column.
X = pd.get_dummies(X, drop_first=True)

In [61]:
# Number of loans per class in the target.
Counter(y)

Counter({'low_risk': 68470, 'high_risk': 347})

In [62]:
# Normal (non-stratified) train-test split with a fixed seed
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)

In [63]:
# Data Pre-Processing

In [64]:
from sklearn.preprocessing import StandardScaler

In [65]:
# Create the scaler used to standardize the loan-stats feature columns.
scaler = StandardScaler()

In [66]:
# Fit the scaler on the training split only. Fitting on the full feature matrix
# would let test-set statistics (mean/variance) leak into preprocessing, and
# would be inconsistent with how the scaler is fitted in the resampling section.
X_scaler = scaler.fit(X_train)

In [67]:
# Scale the training data first, then the testing data, with the fitted scaler
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [68]:
# Ensemble Learners

In [69]:
# Balanced Random Forest Classifier

In [70]:
from imblearn.ensemble import BalancedRandomForestClassifier

# Fit a balanced random forest of 1000 trees (fixed seed) on the scaled training data.
brf = BalancedRandomForestClassifier(n_estimators=1000, random_state=1).fit(
    X_train_scaled, y_train
)
brf

BalancedRandomForestClassifier(n_estimators=1000, random_state=1)

In [72]:
# Predict the loan status of the scaled test features with the balanced random forest.
y_pred= brf.predict(X_test_scaled)

In [73]:
# Balanced accuracy of the balanced random forest on the test split.
balanced_accuracy_score(y_test, y_pred)

0.7758966115273829

In [74]:
# Build the confusion matrix with the class order pinned to the classifier's
# own label order, so the row/column captions name the real classes instead of
# the generic "0"/"1" placeholders.
class_names = list(brf.classes_)
cm = confusion_matrix(y_test, y_pred, labels=class_names)
cm_df = pd.DataFrame(
    cm,
    index=[f"Actual {name}" for name in class_names],
    columns=[f"Predicted {name}" for name in class_names],
)

# Calculating the balanced accuracy score (not plain accuracy); the variable
# name acc_score is kept because the reporting cell reads it.
acc_score = balanced_accuracy_score(y_test, y_pred)

In [75]:
# Print the confusion matrix, the balanced accuracy and the imbalanced
# classification report for the balanced random forest
print("Confusion Matrix")
display(cm_df)
# acc_score is computed with balanced_accuracy_score, so it is labelled as
# balanced accuracy rather than plain accuracy.
print(f"Balanced Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report_imbalanced(y_test, y_pred))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,66,35
Actual 1,1739,15365


Accuracy Score : 0.7758966115273829
Classification Report
                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.04      0.65      0.90      0.07      0.77      0.57       101
   low_risk       1.00      0.90      0.65      0.95      0.77      0.60     17104

avg / total       0.99      0.90      0.65      0.94      0.77      0.60     17205



In [76]:
# Pair every feature importance from the balanced random forest with its column
# name, sort the pairs from most to least important, and show the top ten.
importances = brf.feature_importances_
importance_pairs = zip(importances, X.columns)
importances_sorted = sorted(importance_pairs, reverse=True)
importances_sorted[:10]

[(0.07238231767592367, 'total_rec_prncp'),
 (0.062446216786166575, 'total_pymnt_inv'),
 (0.05966414370203058, 'last_pymnt_amnt'),
 (0.057734659879791816, 'total_rec_int'),
 (0.05745143689951241, 'total_pymnt'),
 (0.03334625714511916, 'int_rate'),
 (0.020006944027414035, 'issue_d_Jan-2019'),
 (0.01830336393679433, 'mths_since_recent_inq'),
 (0.017399962109694068, 'dti'),
 (0.01715033939997833, 'installment')]

In [77]:
# Easy Ensemble Classifier
from imblearn.ensemble import EasyEnsembleClassifier

In [78]:
# Train the Classifier
# random_state is fixed (matching the seed used for the balanced random forest)
# so that the reported scores can be reproduced on a fresh run.
classifier = EasyEnsembleClassifier(n_estimators=10, random_state=1)

In [79]:
# Fit the easy-ensemble classifier on the scaled training data and rebind the
# name to the value returned by fit.
classifier = classifier.fit(X_train_scaled, y_train)

In [80]:
# Predict the loan status of the scaled test features with the easy-ensemble classifier.
y_pred = classifier.predict(X_test_scaled)

In [81]:
# Build the confusion matrix with the class order pinned to the classifier's
# own label order, so the row/column captions name the real classes instead of
# the generic "0"/"1" placeholders.
class_names = list(classifier.classes_)
cm = confusion_matrix(y_test, y_pred, labels=class_names)
cm_df = pd.DataFrame(
    cm,
    index=[f"Actual {name}" for name in class_names],
    columns=[f"Predicted {name}" for name in class_names],
)

# Calculating the balanced accuracy score (not plain accuracy); the variable
# name acc_score is kept because the reporting cell reads it.
acc_score = balanced_accuracy_score(y_test, y_pred)

In [82]:
# Print the confusion matrix, the balanced accuracy and the imbalanced
# classification report for the easy-ensemble classifier
print("Confusion Matrix")
display(cm_df)
# acc_score is computed with balanced_accuracy_score, so it is labelled as
# balanced accuracy rather than plain accuracy.
print(f"Balanced Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report_imbalanced(y_test, y_pred))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,90,11
Actual 1,1098,16006


Accuracy Score : 0.9134467995443136
Classification Report
                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.08      0.89      0.94      0.14      0.91      0.83       101
   low_risk       1.00      0.94      0.89      0.97      0.91      0.84     17104

avg / total       0.99      0.94      0.89      0.96      0.91      0.84     17205



In [83]:
# Final Questions

# 1. Which model had the best balanced accuracy score?
        # The Easy Ensemble had a higher balanced accuracy score of 0.91 vs. 0.78
# 2. Which model had the best recall score?
        # The Easy Ensemble had a higher avg/total recall score of 0.94 vs. 0.90 (high_risk recall: 0.89 vs. 0.65)
# 3. Which model had the best geometric mean score?
        # The Easy Ensemble had a higher geometric mean score of 0.91 vs. 0.77
# 4. What are the top three features?
        # top three features are:
            # 1. total_rec_prncp
            # 2. total_pymnt_inv
            # 3. last_pymnt_amnt