# Credit Risk Classification

In [30]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import balanced_accuracy_score,confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from imblearn.metrics import classification_report_imbalanced
from imblearn.over_sampling import RandomOverSampler

import warnings
# Suppress every warning category for the rest of the session so cell output stays uncluttered.
# NOTE(review): this also hides any warnings emitted while fitting the models below —
# confirm that silencing them is intended rather than filtering a specific category.
warnings.filterwarnings('ignore')

### Create Training and Testing Sets for Data

In [2]:
# Load the lending data from the Resources folder into a DataFrame, then preview the first rows.
lending_path = Path('Resources/lending_data.csv')
df = pd.read_csv(lending_path)

df.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.43074,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0


In [4]:
# Separate the features from the target: every column except `loan_status` is a
# feature (X), and `loan_status` itself is the label to predict (y).
X = df.drop(columns=['loan_status'])
y = df['loan_status']

# Show the labels first and the feature table second.
display(y, X)

0        0
1        0
2        0
3        0
4        0
        ..
77531    1
77532    1
77533    1
77534    1
77535    1
Name: loan_status, Length: 77536, dtype: int64

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
0,10700.0,7.672,52800,0.431818,5,1,22800
1,8400.0,6.692,43600,0.311927,3,0,13600
2,9000.0,6.963,46100,0.349241,3,0,16100
3,10700.0,7.664,52700,0.430740,5,1,22700
4,10800.0,7.698,53000,0.433962,5,1,23000
...,...,...,...,...,...,...,...
77531,19100.0,11.261,86600,0.653580,12,2,56600
77532,17700.0,10.662,80900,0.629172,11,2,50900
77533,17600.0,10.595,80300,0.626401,11,2,50300
77534,16300.0,10.068,75300,0.601594,10,2,45300


In [5]:
# Count the loans in each class: 0 = healthy loan, 1 = loan at high risk of defaulting.
# The counts show how unevenly the two classes are represented.
y.value_counts()

0    75036
1     2500
Name: loan_status, dtype: int64

In [11]:
# Split the features and labels into training and testing sets.
# random_state=1 fixes the shuffle so the same split is produced on every run.
# NOTE(review): no `stratify=y` is passed, so the share of high-risk loans in each split is
# left to the shuffle. Given the class imbalance, consider stratifying — be aware that doing
# so would change the metrics reported further down.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [32]:
# Standardize the features. The scaler is fit on the training split only and then
# applied to both splits, so the test data never influences the scaling parameters.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)


### Create Logistic Regression Model

In [33]:
# Create the logistic regression model and fit it on the scaled training features
# (X_train_scaled) with the training labels, then predict labels for the scaled test features.
logistic_regression = LogisticRegression(random_state=1).fit(X_train_scaled, y_train)
y_pred = logistic_regression.predict(X_test_scaled)

In [34]:
# Evaluate the model on the test set: balanced accuracy, confusion matrix and the
# imbalanced classification report. Note that the figure labelled "Accuracy Score"
# comes from balanced_accuracy_score, not from plain accuracy.
print(
    f'Accuracy Score = {balanced_accuracy_score(y_test, y_pred):.3f}',
    f'Confusion Matrix = \n{confusion_matrix(y_test, y_pred)}',
    f'Classification Report = \n {classification_report_imbalanced(y_test, y_pred)}',
    sep='\n',
)

Accuracy Score = 0.989
Confusion Matrix = 
[[18652   113]
 [   10   609]]
Classification Report = 
                    pre       rec       spe        f1       geo       iba       sup

          0       1.00      0.99      0.98      1.00      0.99      0.98     18765
          1       0.84      0.98      0.99      0.91      0.99      0.98       619

avg / total       0.99      0.99      0.98      0.99      0.99      0.98     19384



**Question:** How well does the logistic regression model predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

**Answer:** For the healthy-loan class (`0`), precision rounds to 100% (although 10 high-risk loans were predicted as healthy) and recall is 99%: the confusion matrix shows that 113 healthy loans were falsely predicted as high-risk (`1`). For the high-risk class (`1`), precision is 84% and recall is 98%. Some high-risk loans are still misidentified, most likely because the data contains comparatively few high-risk examples.

### Logistic Regression with Resampled Training Data

In [35]:
# Instantiate a RandomOverSampler and resample the scaled training features together
# with their labels; the label counts in the next cell show the resulting class sizes.
rov = RandomOverSampler(random_state=1)
X_train_rov, y_train_rov = rov.fit_resample(
    X_train_scaled,
    y_train,
)

In [36]:
# Count the labels after resampling to check how many samples each class now has.
y_train_rov.value_counts()

0    56271
1    56271
Name: loan_status, dtype: int64

In [37]:
# Instantiate a second logistic regression model, fit it on the randomly oversampled
# training data, then predict labels for the same scaled test features as before.
lg_rov = LogisticRegression(random_state=1).fit(X_train_rov, y_train_rov)
y_pred_rov = lg_rov.predict(X_test_scaled)

In [38]:
# Evaluate the lg_rov model's performance on the test set: balanced accuracy, confusion
# matrix and the imbalanced classification report. Note that the figure labelled
# "Accuracy Score ROV" comes from balanced_accuracy_score, not from plain accuracy.
print(
    f'Accuracy Score ROV = {balanced_accuracy_score(y_test, y_pred_rov):.3f}',
    f'Confusion Matrix ROV = \n{confusion_matrix(y_test, y_pred_rov)}',
    f'Classification Report ROV = \n {classification_report_imbalanced(y_test, y_pred_rov)}',
    sep='\n',
)

Accuracy Score ROV = 0.993
Confusion Matrix ROV = 
[[18640   125]
 [    4   615]]
Classification Report ROV = 
                    pre       rec       spe        f1       geo       iba       sup

          0       1.00      0.99      0.99      1.00      0.99      0.99     18765
          1       0.83      0.99      0.99      0.91      0.99      0.99       619

avg / total       0.99      0.99      0.99      0.99      0.99      0.99     19384



**Question:** How well does the logistic regression model, fit with oversampled data, predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

**Answer:** For the healthy-loan class (`0`), precision rounds to 100% (although 4 high-risk loans were predicted as healthy) and recall is 99%: the confusion matrix shows that 125 healthy loans were falsely predicted as high-risk (`1`). For the high-risk class (`1`), precision is 83% and recall is 99%. Although the oversampled model still misidentified 4 high-risk loans as healthy, it let fewer through than the original model, which let 10 pass. The trade-off is that the `lg_rov` model also flagged more healthy loans as high-risk than the original model (125 compared to 113).