In [1]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report

---

## Split the Data into Training and Testing Sets

### Step 1: Read the `lending_data.csv` data from the `Resources` folder into a Pandas DataFrame.

In [2]:
# Read the CSV file from the Resources folder into a Pandas DataFrame
lending_data_csv = Path('Resources/lending_data.csv')

df_lending_data = pd.read_csv(lending_data_csv)

# Review the DataFrame
print(f'Shape: {df_lending_data.shape}')
print(df_lending_data.info())
df_lending_data[0:5]

Shape: (77536, 8)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77536 entries, 0 to 77535
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   loan_size         77536 non-null  float64
 1   interest_rate     77536 non-null  float64
 2   borrower_income   77536 non-null  int64  
 3   debt_to_income    77536 non-null  float64
 4   num_of_accounts   77536 non-null  int64  
 5   derogatory_marks  77536 non-null  int64  
 6   total_debt        77536 non-null  int64  
 7   loan_status       77536 non-null  int64  
dtypes: float64(3), int64(5)
memory usage: 4.7 MB
None


Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.43074,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0


### Step 2: Create the labels set (`y`)  from the “loan_status” column, and then create the features (`X`) DataFrame from the remaining columns.

In [3]:
# Separate the data into labels and features

# Separate the y variable, the labels
y = df_lending_data['loan_status']

# Separate the X variable, the features
X = df_lending_data.drop(columns = ['loan_status'])

In [4]:
# Review the y variable Series
print(y.shape)

(77536,)


In [5]:
# Review the X variable DataFrame
print(X.shape)
X.head(10)

(77536, 7)


Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
0,10700.0,7.672,52800,0.431818,5,1,22800
1,8400.0,6.692,43600,0.311927,3,0,13600
2,9000.0,6.963,46100,0.349241,3,0,16100
3,10700.0,7.664,52700,0.43074,5,1,22700
4,10800.0,7.698,53000,0.433962,5,1,23000
5,10100.0,7.438,50600,0.407115,4,1,20600
6,10300.0,7.49,51100,0.412916,4,1,21100
7,8800.0,6.857,45100,0.334812,3,0,15100
8,9300.0,7.096,47400,0.367089,3,0,17400
9,9700.0,7.248,48800,0.385246,4,0,18800


### Step 3: Check the balance of the labels variable (`y`) by using the `value_counts` function.

In [6]:
# Check the balance of our target values to see how many 0s and 1s or Falses and Trues, respectively
y.value_counts()

loan_status
0    75036
1     2500
Name: count, dtype: int64

### Step 4: Split the data into training and testing datasets by using `train_test_split`.

In [7]:
# Import the train_test_learn module
from sklearn.model_selection import train_test_split

# Split the data using train_test_split
# Assign a random_state of 1 to the function
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1)

In [8]:
# Preview the the splits
print(X_train.shape)
print(X_test.shape)

print(y_train.shape)
print(y_test.shape)

(58152, 7)
(19384, 7)
(58152,)
(19384,)


---

## Create a Logistic Regression Model with the Original Data

###  Step 1: Fit a logistic regression model by using the training data (`X_train` and `y_train`).

In [9]:
# Import the LogisticRegression module from SKLearn
from sklearn.linear_model import LogisticRegression

# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
# Leaving default solver will be 'lbfgs', but explicitly type here for practice
lr_classifier = LogisticRegression(solver = 'lbfgs', random_state = 1)
lr_classifier

In [10]:
# Fit the model using training data
lr_classifier.fit(X_train, y_train)

### Step 2: Save the predictions on the testing data labels by using the testing feature data (`X_test`) and the fitted model.

In [11]:
# Make a prediction using the testing data
lr_predictions = lr_classifier.predict(X_test)
pd.DataFrame({'Prediction': lr_predictions, 'Actual': y_test}).head(15)

Unnamed: 0,Prediction,Actual
60914,0,0
36843,0,0
1966,0,0
70137,0,0
27237,0,0
40013,0,0
43107,0,0
61988,0,0
57437,0,0
46757,0,0


### Step 3: Evaluate the model’s performance by doing the following:

* Calculate the accuracy score of the model.

* Generate a confusion matrix.

* Print the classification report.

In [12]:
# Print the balanced_accuracy score of the model using the balanced_accuracy_score function
# help(balanced_accuracy_score)
balanced_accuracy = balanced_accuracy_score(y_test, lr_predictions)
balanced_accuracy

0.9520479254722232

In [13]:
# Generate a confusion matrix for the model
cm_test = confusion_matrix(y_test, lr_predictions)
print(cm_test)

[[18663   102]
 [   56   563]]


In [14]:
# Print the classification report for the model
training_report = classification_report(y_test, lr_predictions)
print(training_report)

              precision    recall  f1-score   support

           0       1.00      0.99      1.00     18765
           1       0.85      0.91      0.88       619

    accuracy                           0.99     19384
   macro avg       0.92      0.95      0.94     19384
weighted avg       0.99      0.99      0.99     19384



### Step 4: Answer the following question.

**Question:** How well does the logistic regression model predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

**Answer:** The Logistic Regression model seems to have worked well with the given dataset to predict the healthy and high-risk loans. The overall accuracy is `99%` which is high along with the aggregated sums of both columns and rows that resulted in matching output which further helps to validate the model being accurate and precise. Even though the precision score for the high-risk loan is only about `85%`, the recall is still at a `91%` which is still good. The reason for this is probably due to the smaller ratio of observations in the high-risk loan from the dataset (see `value_counts()` function cell) that the model was used to train on. Additional step such as continue training the model with additional high-risk loan observation may help increase the precision of the high-risk loan.

Finally, the balanced accuracy score also yielded a 95% accuracy which further helped to reinforce that this model has worked well since we are not dealing with a high imbalance dataset.

---

## Predict a Logistic Regression Model with Resampled Training Data

### Step 1: Use the `RandomOverSampler` module from the imbalanced-learn library to resample the data. Be sure to confirm that the labels have an equal number of data points. 

In [15]:
# Install the library needed for RandomOverSampler module
# Comment out if not already installed
# Note: after installing, make sure to restart the kernel then rerun
!pip install -U imbalanced-learn



In [16]:
# Import the RandomOverSampler module form imbalanced-learn
# Note: RandomOverSampler module tries to balance the class distribution of a dataset by duplicating rows of
#       the minority class (in this case, high-risk loan or 1 under the loan_status)
from imblearn.over_sampling import RandomOverSampler

# Instantiate the random oversampler model
# Assign a random_state parameter of 1 to the model
ros_model = RandomOverSampler(random_state = 1)

# Fit the original training data to the random_oversampler model
X_res, y_res = ros_model.fit_resample(X, y)

In [17]:
# Count the distinct values of the resampled labels data
print(X_res.nunique())
print()
print(X_res.shape)
print()
print(y_res.value_counts())
display(X_res.head(15))

loan_size            182
interest_rate       4692
borrower_income      662
debt_to_income       662
num_of_accounts       17
derogatory_marks       4
total_debt           662
dtype: int64

(150072, 7)

loan_status
0    75036
1    75036
Name: count, dtype: int64


Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
0,10700.0,7.672,52800,0.431818,5,1,22800
1,8400.0,6.692,43600,0.311927,3,0,13600
2,9000.0,6.963,46100,0.349241,3,0,16100
3,10700.0,7.664,52700,0.43074,5,1,22700
4,10800.0,7.698,53000,0.433962,5,1,23000
5,10100.0,7.438,50600,0.407115,4,1,20600
6,10300.0,7.49,51100,0.412916,4,1,21100
7,8800.0,6.857,45100,0.334812,3,0,15100
8,9300.0,7.096,47400,0.367089,3,0,17400
9,9700.0,7.248,48800,0.385246,4,0,18800


### Step 2: Use the `LogisticRegression` classifier and the resampled data to fit the model and make predictions.

In [18]:
# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
lr_ros_classifier = LogisticRegression()

# Fit the model using the resampled training data
lr_ros_classifier.fit(X_res, y_res)

# Make a prediction using the testing data
lr_ros_predictions = lr_ros_classifier.predict(X_test)

In [19]:
# Display the prediction versus actual
pd.DataFrame({'Prediction': lr_ros_predictions, 'Actual': y_test}).head(15)

Unnamed: 0,Prediction,Actual
60914,0,0
36843,0,0
1966,0,0
70137,0,0
27237,0,0
40013,0,0
43107,0,0
61988,0,0
57437,0,0
46757,0,0


### Step 3: Evaluate the model’s performance by doing the following:

* Calculate the accuracy score of the model.

* Generate a confusion matrix.

* Print the classification report.

In [20]:
# Print the balanced_accuracy score of the model 
ros_balanced_accuracy = balanced_accuracy_score(y_test, lr_ros_predictions)
print('Resampled Balanced Accuracy is %0.2f' % (ros_balanced_accuracy*100))

Resampled Balanced Accuracy is 99.37


In [21]:
# Generate a confusion matrix for the model
ros_cm_test = confusion_matrix(y_test, lr_ros_predictions)
print(ros_cm_test)

[[18649   116]
 [    4   615]]


In [22]:
# Print the classification report for the model
ros_training_report = classification_report(y_test, lr_ros_predictions)
print(ros_training_report)

              precision    recall  f1-score   support

           0       1.00      0.99      1.00     18765
           1       0.84      0.99      0.91       619

    accuracy                           0.99     19384
   macro avg       0.92      0.99      0.95     19384
weighted avg       0.99      0.99      0.99     19384



### Step 4: Answer the following question

**Question:** How well does the logistic regression model, fit with oversampled data, predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

**Answer:** After resampling the data to rebalance the high-risk loans against the healthy loans, we can see an increase in recall from `91%` of originall data model to `99%` of the new resampled data model. Additionally, we can also see the balanced accuracy score has increased as well (from `95.20%` to `99.37%`), further supports Logistic Regression model as a reliable model in predicting the differences between healthy and high-risk loans as well as the previous answer statement in regard to introducing additional observations for high-risk loans into the model. However, the model using the resmampling method is the best choice to use out of the two (original logistic regression and random over sampler logistic regression).