# Credit Risk Resampling

In [58]:
# Silence all warning messages for the rest of the session so they do not
# clutter the cell outputs
import warnings
warnings.filterwarnings(action='ignore')

In [59]:
# Import the libraries we will be using for this assignment 
from collections import Counter
from pathlib import Path

import numpy as np
import pandas as pd
from imblearn.combine import SMOTEENN
from imblearn.metrics import classification_report_imbalanced
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import ClusterCentroids
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

### Part Zero - Read in the CSV file I will use for my data 

In [60]:
# Load the lending data from the CSV file.
# Prefer a copy of Lending_Data.csv in the current working directory so the
# notebook can run on any machine; if none is found, fall back to the original
# absolute location the notebook was written against.
local_csv = Path('Lending_Data.csv')
csv_path = local_csv if local_csv.exists() else Path('/Users/cg/Documents/FinTech_Bootcamp/[Unit_11]Machine_Learning/[Homework]Risky_Buisness/Lending_Data.csv')
ld_df = pd.read_csv(csv_path)

# Preview the first ten rows
ld_df.head(10)

Unnamed: 0,loan_size,interest_rate,homeowner,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,own,52800,0.431818,5,1,22800,low_risk
1,8400.0,6.692,own,43600,0.311927,3,0,13600,low_risk
2,9000.0,6.963,rent,46100,0.349241,3,0,16100,low_risk
3,10700.0,7.664,own,52700,0.43074,5,1,22700,low_risk
4,10800.0,7.698,mortgage,53000,0.433962,5,1,23000,low_risk
5,10100.0,7.438,mortgage,50600,0.407115,4,1,20600,low_risk
6,10300.0,7.49,mortgage,51100,0.412916,4,1,21100,low_risk
7,8800.0,6.857,mortgage,45100,0.334812,3,0,15100,low_risk
8,9300.0,7.096,own,47400,0.367089,3,0,17400,low_risk
9,9700.0,7.248,rent,48800,0.385246,4,0,18800,low_risk


In [61]:
# Replace the strings from 'homeowner' with binary (0/1) indicator columns.
# The membership check keeps this cell safe to re-run: once 'homeowner' has been
# encoded the column no longer exists, and asking get_dummies for it again
# would raise a KeyError.
# dtype=int keeps the indicators as 0/1 integers; newer pandas versions would
# otherwise produce True/False columns.
if 'homeowner' in ld_df.columns:
    ld_df = pd.get_dummies(ld_df, columns=['homeowner'], dtype=int)

In [62]:
# loan_status is left as a string label here; only 'homeowner' was encoded.
# Preview the first ten rows to confirm the strings were replaced.
ld_df.head(n=10)

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status,homeowner_mortgage,homeowner_own,homeowner_rent
0,10700.0,7.672,52800,0.431818,5,1,22800,low_risk,0,1,0
1,8400.0,6.692,43600,0.311927,3,0,13600,low_risk,0,1,0
2,9000.0,6.963,46100,0.349241,3,0,16100,low_risk,0,0,1
3,10700.0,7.664,52700,0.43074,5,1,22700,low_risk,0,1,0
4,10800.0,7.698,53000,0.433962,5,1,23000,low_risk,1,0,0
5,10100.0,7.438,50600,0.407115,4,1,20600,low_risk,1,0,0
6,10300.0,7.49,51100,0.412916,4,1,21100,low_risk,1,0,0
7,8800.0,6.857,45100,0.334812,3,0,15100,low_risk,1,0,0
8,9300.0,7.096,47400,0.367089,3,0,17400,low_risk,0,1,0
9,9700.0,7.248,48800,0.385246,4,0,18800,low_risk,0,0,1


### Part Two - Splitting the Data into Training and Testing

In [63]:
# Target: the loan_status label column on its own
y_feat = ld_df.loc[:, 'loan_status']

# Features: every remaining column once the label has been removed
x_feat = ld_df.drop('loan_status', axis=1)

In [64]:
# Count how many loans fall into each 'loan_status' class (raw counts, largest first)
y_feat.value_counts(normalize=False, ascending=False)

low_risk     75036
high_risk     2500
Name: loan_status, dtype: int64

In [67]:
# Split the features and target into training and testing sets.
# No test_size is passed, so scikit-learn's default split proportion applies;
# random_state=1 makes the shuffle reproducible.
x_train, x_test, y_train, y_test = train_test_split(x_feat, y_feat, random_state=1)

# Display the shape of the training features
x_train.shape

(58152, 10)

In [68]:
# Tally the low-risk and high-risk labels that landed in the training set
Counter(y_train.tolist())

Counter({'low_risk': 56271, 'high_risk': 1881})

### Part Three - Data Pre-Processing 

##### Scale the training and testing data using the ***StandardScaler*** from ***SciKit-Learn***. Remember that when scaling the data, you only scale the features data (***X_train*** and ***X_testing***).

In [17]:
# Create the StandardScaler instance that will standardise the feature columns
scaler = StandardScaler()

In [69]:
# Learn the scaling parameters from the training features only, so nothing
# about the test set influences the scaler
scaler.fit(x_train)

# fit() returns the fitted scaler itself, so x_scaler is the same object
x_scaler = scaler

In [70]:
# Apply the fitted scaler to the training features, then to the testing features
x_train_scaled, x_test_scaled = (
    scaler.transform(features) for features in (x_train, x_test)
)

### Part Four - Simple Logistic Regression

In [71]:
# Build the baseline ("simple") logistic regression that is trained on the
# original, un-resampled training data; random_state=1 keeps runs repeatable
slr_model = LogisticRegression(solver='lbfgs', random_state=1)

# Fit the model on the scaled training features. As the cell's last expression,
# the fitted estimator is echoed as the cell output.
slr_model.fit(x_train_scaled, y_train)

LogisticRegression(random_state=1)

In [72]:
# Predict the loan status of the scaled test features with the baseline model
slr_y_predict = slr_model.predict(X=x_test_scaled)

# Balanced accuracy of those predictions against the true test labels
slr_bas = balanced_accuracy_score(y_true=y_test, y_pred=slr_y_predict)

# Report the score
print(f" The Balanced Accuracy Score is : {slr_bas}")


 The Balanced Accuracy Score is : 0.9889115309798473


In [74]:
# Create the SLR confusion matrix from the baseline predictions.
# The predictions live in `slr_y_predict` (set in the prediction cell); the
# previous `slr_y_pred` name is never defined in this notebook and fails with a
# NameError on a fresh kernel.
# confusion_matrix orders classes by sorted label, so row/column 0 is
# 'high_risk' and row/column 1 is 'low_risk'.
slr_cm = confusion_matrix(y_test, slr_y_predict)
slr_cm_df = pd.DataFrame(
    slr_cm, index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"])

# Display the SLR Confusion Matrix
display(slr_cm_df)

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,609,10
Actual 1,113,18652


In [75]:
# Show the imbalanced classification report
print("This is the Classification Report:")
...
print("")
...
print(classification_report_imbalanced(y_test, slr_y_predict))

This is the Classification Report:

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.84      0.98      0.99      0.91      0.99      0.98       619
   low_risk       1.00      0.99      0.98      1.00      0.99      0.98     18765

avg / total       0.99      0.99      0.98      0.99      0.99      0.98     19384



### Part Five - Oversampling 

##### In this section, you will compare two oversampling algorithms to determine which algorithm results in the best performance. You will oversample the data using the naive random oversampling algorithm and the SMOTE algorithm. For each algorithm, be sure to complete the following steps:

1. View the count of the target classes using ***Counter*** from the collections library. 
2. Use the resampled data to train a logistic regression model.
3. Calculate the balanced accuracy score from sklearn.metrics.
4. Print the confusion matrix from sklearn.metrics.
5. Generate a classification report using ***classification_report_imbalanced*** from imbalanced-learn.

#### Naive Random Oversampling

In [76]:
# Resample the scaled training data with the RandomOverSampler
# (random_state=1 makes the random draws repeatable)
ros = RandomOverSampler(random_state=1)
ros_x_resampled, ros_y_resampled = ros.fit_resample(x_train_scaled, y_train)

# View the amount of target classes with Counter - the amount should now be equal
Counter(ros_y_resampled)

Counter({'low_risk': 56271, 'high_risk': 56271})

In [77]:
# Logistic regression for the randomly oversampled data (random state fixed at 1)
ros_model = LogisticRegression(random_state=1, solver='lbfgs')

# Train on the resampled features and labels; as the final expression, the
# fitted estimator is echoed below the cell
ros_model.fit(X=ros_x_resampled, y=ros_y_resampled)

LogisticRegression(random_state=1)

In [78]:
# Predict the scaled test features with the random-oversampling model
ros_y_predict = ros_model.predict(X=x_test_scaled)

# Balanced accuracy of those predictions against the true test labels
ros_bas = balanced_accuracy_score(y_true=y_test, y_pred=ros_y_predict)

# Report the score
print(f"The Balanced Accuracy Score is : {ros_bas}")

The Balanced Accuracy Score is : 0.9934649587814939


In [79]:
# Create the confusion matrix for the random-oversampling model.
# The predictions live in `ros_y_predict` (set in the prediction cell); the
# previous `ros_y_pred` name is never defined in this notebook and fails with a
# NameError on a fresh kernel.
# confusion_matrix orders classes by sorted label, so row/column 0 is
# 'high_risk' and row/column 1 is 'low_risk'.
ros_cm = confusion_matrix(y_test, ros_y_predict)
ros_cm_df = pd.DataFrame(
    ros_cm, index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"])

# Display the confusion matrix
display(ros_cm_df)

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,615,4
Actual 1,124,18641


In [80]:
# Print a heading, a blank spacer line, and then the imbalanced classification
# report for the random-oversampling model
print(
    "Classification Report",
    "",
    classification_report_imbalanced(y_test, ros_y_predict),
    sep="\n",
)

Classification Report

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.83      0.99      0.99      0.91      0.99      0.99       619
   low_risk       1.00      0.99      0.99      1.00      0.99      0.99     18765

avg / total       0.99      0.99      0.99      0.99      0.99      0.99     19384



#### Synthetic Minority Oversampling Technique (SMOTE) Oversampling

In [81]:
# Resample the scaled training data with SMOTE - again set the random state to one.
# sampling_strategy=1.0 asks for a 1:1 minority-to-majority ratio after resampling.
smote_x_resampled, smote_y_resampled = SMOTE(random_state=1, sampling_strategy=1.0).fit_resample(x_train_scaled, y_train)

# View the amount of target classes with Counter - they should now be equal
Counter(smote_y_resampled)

Counter({'low_risk': 56271, 'high_risk': 56271})

In [82]:
# Logistic regression for the SMOTE-resampled data (random state fixed at 1)
smote_model = LogisticRegression(random_state=1, solver='lbfgs')

# Train on the resampled features and labels; as the final expression, the
# fitted estimator is echoed below the cell
smote_model.fit(X=smote_x_resampled, y=smote_y_resampled)

LogisticRegression(random_state=1)

In [83]:
# Predict the scaled test features with the SMOTE model
smote_y_predict = smote_model.predict(X=x_test_scaled)

# Balanced accuracy of those predictions against the true test labels
smote_bas = balanced_accuracy_score(y_true=y_test, y_pred=smote_y_predict)

# Report the score
print(f"The Balanced Accuracy Score is : {smote_bas}")

The Balanced Accuracy Score is : 0.9934649587814939


In [84]:
# Confusion matrix of true test labels versus the SMOTE model's predictions,
# wrapped in a labelled DataFrame for display
smote_cm = confusion_matrix(y_true=y_test, y_pred=smote_y_predict)
smote_cm_df = pd.DataFrame(
    data=smote_cm,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)

# Render the labelled matrix
display(smote_cm_df)

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,615,4
Actual 1,124,18641


In [85]:
# Print a heading, a blank spacer line, and then the imbalanced classification
# report for the SMOTE model
print(
    "Classification Report",
    "",
    classification_report_imbalanced(y_test, smote_y_predict),
    sep="\n",
)

Classification Report

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.83      0.99      0.99      0.91      0.99      0.99       619
   low_risk       1.00      0.99      0.99      1.00      0.99      0.99     18765

avg / total       0.99      0.99      0.99      0.99      0.99      0.99     19384



### Part Six - Undersampling

##### In this section, you will test an undersampling algorithm to determine which algorithm results in the best performance compared to the oversampling algorithms above. You will undersample the data using the Cluster Centroids algorithm and complete the following steps:

1. View the count of the target classes using ***Counter*** from the collections library. 
2. Use the resampled data to train a logistic regression model.
3. Calculate the balanced accuracy score from sklearn.metrics.
4. Display the confusion matrix from sklearn.metrics.
5. Generate a classification report using ***classification_report_imbalanced*** from imbalanced-learn.

In [86]:
# Resample the scaled training data using the ClusterCentroids undersampler
# (random_state=1 keeps the result repeatable)
cc = ClusterCentroids(random_state=1)
cc_x_resampled, cc_y_resampled = cc.fit_resample(x_train_scaled, y_train)

# View the amount of target classes with Counter - they should now be equal
Counter(cc_y_resampled)

Counter({'high_risk': 1881, 'low_risk': 1881})

In [87]:
# Logistic regression for the ClusterCentroids-undersampled data (random state fixed at 1)
cc_model = LogisticRegression(random_state=1, solver='lbfgs')

# Train on the resampled features and labels; as the final expression, the
# fitted estimator is echoed below the cell
cc_model.fit(X=cc_x_resampled, y=cc_y_resampled)

LogisticRegression(random_state=1)

In [88]:
# Predict the scaled test features with the ClusterCentroids model
cc_y_predict = cc_model.predict(X=x_test_scaled)

# Balanced accuracy of those predictions against the true test labels
cc_bas = balanced_accuracy_score(y_true=y_test, y_pred=cc_y_predict)

# Report the score
print(f"The Balanced Accuracy Score is : {cc_bas}")

The Balanced Accuracy Score is : 0.9929503031930944


In [90]:
# Confusion matrix of true test labels versus the ClusterCentroids model's
# predictions, wrapped in a labelled DataFrame for display
cc_cm = confusion_matrix(y_true=y_test, y_pred=cc_y_predict)
cc_cm_df = pd.DataFrame(
    data=cc_cm,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)

# Render the labelled matrix
display(cc_cm_df)

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,614,5
Actual 1,113,18652


In [91]:
# Print a heading, a blank spacer line, and then the imbalanced classification
# report for the ClusterCentroids model
print(
    "Classification Report",
    "",
    classification_report_imbalanced(y_test, cc_y_predict),
    sep="\n",
)

Classification Report

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.84      0.99      0.99      0.91      0.99      0.99       619
   low_risk       1.00      0.99      0.99      1.00      0.99      0.99     18765

avg / total       0.99      0.99      0.99      0.99      0.99      0.99     19384



### Part Seven - A Combination of (Over & Under) Sampling

##### In this section, you will test a combination over- and under-sampling algorithm to determine if the algorithm results in the best performance compared to the other sampling algorithms above. You will resample the data using the SMOTEENN algorithm and complete the following steps:

1. View the count of the target classes using ***Counter*** from the collections library. 
2. Use the resampled data to train a logistic regression model.
3. Calculate the balanced accuracy score from sklearn.metrics.
4. Display the confusion matrix from sklearn.metrics.
5. Generate a classification report using ***classification_report_imbalanced*** from imbalanced-learn.

In [92]:
# Resample the scaled training data with SMOTEENN, the combined over- and
# under-sampling resampler (random_state=1 keeps the result repeatable)
sm = SMOTEENN(random_state=1)
sm_x_resampled, sm_y_resampled = sm.fit_resample(x_train_scaled, y_train)

# View the amount of target classes with Counter - notice this time they are not equal
Counter(sm_y_resampled)

Counter({'high_risk': 55509, 'low_risk': 55937})

In [94]:
# Train the Logistic Regression model using the SMOTEENN-resampled data.
# LogisticRegression is already imported earlier in the notebook, so the
# repeated import that used to sit in this cell is not needed.
sm_model = LogisticRegression(solver='lbfgs', random_state=1)

# Fit the model; as the cell's last expression, the fitted estimator is displayed
sm_model.fit(sm_x_resampled, sm_y_resampled)

LogisticRegression(random_state=1)

In [95]:
# Predict the scaled test features with the SMOTEENN model
sm_y_predict = sm_model.predict(X=x_test_scaled)

# Balanced accuracy of those predictions against the true test labels
sm_bas = balanced_accuracy_score(y_true=y_test, y_pred=sm_y_predict)

# Report the score
print(f"The Balanced Accuracy Score is : {sm_bas}")

The Balanced Accuracy Score is : 0.9934649587814939


In [96]:
# Confusion matrix of true test labels versus the SMOTEENN model's predictions,
# wrapped in a labelled DataFrame for display
sm_cm = confusion_matrix(y_true=y_test, y_pred=sm_y_predict)
sm_cm_df = pd.DataFrame(
    data=sm_cm,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)

# Render the labelled matrix
display(sm_cm_df)

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,615,4
Actual 1,124,18641


In [97]:
# Print a heading, a blank spacer line, and then the imbalanced classification
# report for the SMOTEENN model
print(
    "Classification Report",
    "",
    classification_report_imbalanced(y_test, sm_y_predict),
    sep="\n",
)

Classification Report

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.83      0.99      0.99      0.91      0.99      0.99       619
   low_risk       1.00      0.99      0.99      1.00      0.99      0.99     18765

avg / total       0.99      0.99      0.99      0.99      0.99      0.99     19384



### Part Eight - Final Questions

##### Which model had the best balanced accuracy score?

In [108]:
# Side-by-side summary of the balanced accuracy score of all five models:
# a heading, a blank spacer line, then one line per model
print(
    "Let's view all the results of the Balanced Accuraccy Score:",
    "",
    f"The Simple Logistic Regression had a BAS of: {slr_bas}",
    f"The Random Over Sampling had a BAS of: {ros_bas}",
    f"The Synthetic Minority Oversampling Technique had a BAS of: {smote_bas}",
    f"The Cluster Centroids had a BAS of: {cc_bas}",
    f"The Smoteenn model had a BAS of: {sm_bas}",
    sep="\n",
)

Let's view all the results of the Balanced Accuraccy Score:

The Simple Logistic Regression had a BAS of: 0.9889115309798473
The Random Over Sampling had a BAS of: 0.9934649587814939
The Synthetic Minority Oversampling Technique had a BAS of: 0.9934649587814939
The Cluster Centroids had a BAS of: 0.9929503031930944
The Smoteenn model had a BAS of: 0.9934649587814939


In this project, three different models tied for the highest Balanced Accuracy Score: the *Random Over Sampling Model*, the *Synthetic Minority Oversampling Technique (SMOTE)*, and *SMOTEENN*. Each of these models had a BAS of **0.993465**.

##### Which model had the best recall score?

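During this project each of the five models ended up with the same overall recall score. This score was **0.99** for each of the models. These results can all be seen in the classification reports for each model in the row & column that corresponds to *avg/total* & *rec* respectively. Note that this is the rounded average across both classes: for the *high_risk* class alone, the Simple Logistic Regression had a recall of **0.98**, while each of the four resampled models had **0.99**.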

##### Which model had the best geometric mean score?

Similarly, during this project each of the five models ended up with the same geometric mean score. This score was **0.99** for each of the models. These results can all be seen in the classification reports for each model in the row & column that corresponds to *avg/total* & *geo* respectively.