In [1]:
# Import the modules
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.metrics import balanced_accuracy_score, confusion_matrix
from imblearn.metrics import classification_report_imbalanced

import warnings
warnings.filterwarnings('ignore')

## Split the Data into Training and Testing Sets

In [2]:
# read the csv file from the Resources folder into a Pandas DataFrame
lending_data_df = pd.read_csv('Resources/lending_data.csv')

# Display the DataFrame
display(lending_data_df)

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.430740,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0
...,...,...,...,...,...,...,...,...
77531,19100.0,11.261,86600,0.653580,12,2,56600,1
77532,17700.0,10.662,80900,0.629172,11,2,50900,1
77533,17600.0,10.595,80300,0.626401,11,2,50300,1
77534,16300.0,10.068,75300,0.601594,10,2,45300,1


## Separate the data into labels and features

In [3]:
# Separate the y varible, the labels
y = lending_data_df["loan_status"]

# Separate the X variable, the features
X = lending_data_df.drop(columns = ["loan_status"])

In [5]:
# Review the y variable Series
print(y)

0        0
1        0
2        0
3        0
4        0
        ..
77531    1
77532    1
77533    1
77534    1
77535    1
Name: loan_status, Length: 77536, dtype: int64


In [7]:
# Review the X varible DataFrame
display(X)

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
0,10700.0,7.672,52800,0.431818,5,1,22800
1,8400.0,6.692,43600,0.311927,3,0,13600
2,9000.0,6.963,46100,0.349241,3,0,16100
3,10700.0,7.664,52700,0.430740,5,1,22700
4,10800.0,7.698,53000,0.433962,5,1,23000
...,...,...,...,...,...,...,...
77531,19100.0,11.261,86600,0.653580,12,2,56600
77532,17700.0,10.662,80900,0.629172,11,2,50900
77533,17600.0,10.595,80300,0.626401,11,2,50300
77534,16300.0,10.068,75300,0.601594,10,2,45300


In [8]:
# Check the balance of our target values
y.value_counts()

0    75036
1     2500
Name: loan_status, dtype: int64

In [9]:
# import the train_test_learn module
from sklearn.model_selection import train_test_split


In [10]:
# Split the data using train_test_split
# Assign a random_state of 1 to the function
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

## Create a Logistic Regression Model with the a Original Data 

In [11]:
# Import the logisticRegression Module from SKLearn 
from sklearn.linear_model import LogisticRegression

In [13]:
# Instantiate the Logistic Regression model, Assign a random_state parameter of 1 to the module
lr_model = LogisticRegression(random_state=1)

In [15]:
# Fit the model using training data
lr_original= lr_model.fit(X_train, y_train)

In [16]:
lr_prediction = lr_original.predict(X_test)

In [20]:
# Print the balanced_accuracy score of the model
balanced_accuracy_score(y_test, lr_prediction)

0.9520479254722232

In [24]:
# Generate a confusion matrix for the model
confusion_matrix(y_test, lr_prediction)

array([[18663,   102],
       [   56,   563]])

In [26]:
# Print the classification report for the model
print(classification_report_imbalanced(y_test, lr_prediction))

                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      0.99      0.91      1.00      0.95      0.91     18765
          1       0.85      0.91      0.99      0.88      0.95      0.90       619

avg / total       0.99      0.99      0.91      0.99      0.95      0.91     19384



### How well does the logistic regression model predict both the 0 (healthy loan) and 1 (high_risk loan) labels?

#### According to the classification report: An 85% prediction which is a decent report. And the Recall is higher than the prediction which reads there's 91% percent in fraudulant report.

## Predict a Logistic Regression Model with Resampled Training Data

In [27]:
# Import the RandomOverSampler module from imbalanced-learn
from imblearn.over_sampling import RandomOverSampler

In [40]:
# Instantiate the random oversampler model 
# Assign a random_state parameter of 1 to the model
random_oversample = RandomOverSampler(random_state=1)

In [41]:
# Fit the original training data to the random_oversampler model
X_resampled, y_resampled = random_oversample.fit_resample(X_train, y_train)

# Display of oversample module
print(oversample_fit)

(        loan_size  interest_rate  borrower_income  debt_to_income  \
0          8600.0          6.792            44500        0.325843   
1          7800.0          6.419            41000        0.268293   
2         10000.0          7.386            50100        0.401198   
3          9300.0          7.093            47300        0.365751   
4          9200.0          7.045            46900        0.360341   
...           ...            ...              ...             ...   
112537    17500.0         10.577            80100        0.625468   
112538    20000.0         11.611            89900        0.666296   
112539    19200.0         11.266            86600        0.653580   
112540    19100.0         11.245            86400        0.652778   
112541    20700.0         11.913            92700        0.676375   

        num_of_accounts  derogatory_marks  total_debt  
0                     3                 0       14500  
1                     2                 0       11000  
2 

In [44]:
# Count the distinct values of the resampled labels data
y_resampled.value_counts()

0    56271
1    56271
Name: loan_status, dtype: int64

In [46]:
# Instantiate the Logistsic Regression
# Assign a random_state parameter of 1 to the model
resampled_model = lr_model

#Displaying the model
resampled_model

LogisticRegression(random_state=1)

In [49]:
# Fit the model using the resampled training data
resampled_model = resampled_model.fit(X_resampled, y_resampled)

In [51]:
# Make a prediction using the testing data
resampled_predct = resampled_model.predict(X_test)

In [55]:
# Print the balanced_accuracy score of the model
balanced_accuracy_score(y_test, resampled_predct)

0.9936781215845847

In [56]:
# Generate a confusion
confusion_matrix(y_test, resampled_predct)

array([[18649,   116],
       [    4,   615]])

In [57]:
# Print the classification report for the model 
print(classification_report_imbalanced(y_test, resampled_predct))

                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      0.99      0.99      1.00      0.99      0.99     18765
          1       0.84      0.99      0.99      0.91      0.99      0.99       619

avg / total       0.99      0.99      0.99      0.99      0.99      0.99     19384



### How well does the logistic regression model, fit with oversampled data, predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

#### According to the resampled model: Prediction is down 1% and recall is higher by 8%, and the F1 is increased by 3%.