In [259]:
%matplotlib inline
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import pprint as pp

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier

# Preprocessing

In [260]:
# Reading csv files
filepath_2019loans = "Resources/2019loans.csv"
filepath_2020q1loans = "Resources/2020Q1loans.csv"

df_2019loan = pd.read_csv(filepath_2019loans)
df_2020loan = pd.read_csv(filepath_2020q1loans)

In [261]:
# Checking the df 2019 loan data
# df_2019loan
# print(df_2019loan.columns)
# print(len(df_2019loan.columns))

In [262]:
# Checking the df 2020 loan data
# df_2020loan
# print(df_2020loan.columns)
# print(len(df_2020loan.columns))

In [263]:
# Creating the X data without the target data
X = df_2019loan.drop("loan_status", axis = 1)
# X

In [264]:
# Creating the y data with only the target data
y = df_2019loan["loan_status"]
# y

In [265]:
# One-hot encoding entire df
X_dummies = pd.get_dummies(X)
# X_dummies
# print(X_dummies.columns)
# print(len(X_dummies.columns))

In [271]:
# Converting output/target data labels into numerical values
y_labels = LabelEncoder().fit_transform(y)
# y_labels

In [268]:
# Training set for data from all of 2019 loans
X_2019 = df_2019loan.drop("loan_status", axis = 1)
y_2019 = df_2019loan["loan_status"]
X_dummies_2019 = pd.get_dummies(X_2019)
y_labels_2019 = LabelEncoder().fit_transform(y_2019)
# print(X_dummies_2019.columns)
# print(len(X_dummies_2019.columns))
X_dummies_2019

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,...,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,debt_settlement_flag_Y
0,57107,57107,13375.0,0.1797,483.34,223000.0,29.99,0.0,0.0,15.0,...,0,1,0,1,1,0,1,0,1,0
1,141451,141451,21000.0,0.1308,478.68,123000.0,11.26,2.0,0.0,16.0,...,0,1,0,1,1,0,1,0,1,0
2,321143,321143,20000.0,0.1240,448.95,197000.0,11.28,0.0,0.0,12.0,...,0,1,0,1,1,0,1,0,1,0
3,11778,11778,3000.0,0.1240,100.22,45000.0,18.08,0.0,0.0,12.0,...,0,1,0,1,1,0,1,0,1,0
4,169382,169382,30000.0,0.1612,1056.49,133000.0,27.77,0.0,2.0,13.0,...,0,1,0,1,1,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12175,354912,354912,19975.0,0.2565,801.09,28000.0,28.42,0.0,0.0,15.0,...,0,1,0,1,1,0,1,0,1,0
12176,354944,354944,15000.0,0.1774,540.34,50000.0,23.43,4.0,0.0,16.0,...,1,1,0,1,1,0,1,0,1,0
12177,354973,354973,3600.0,0.1862,131.28,60000.0,28.80,0.0,1.0,14.0,...,0,1,0,1,1,0,1,0,1,0
12178,355002,355002,15000.0,0.0881,475.68,62000.0,11.44,0.0,0.0,5.0,...,0,1,0,1,0,1,1,0,1,0


In [269]:
# Testing set for data from 1st quart 2020 loans
X_2020 = df_2020loan.drop("loan_status", axis = 1)
y_2020 = df_2020loan["loan_status"]
X_dummies_2020 = pd.get_dummies(X_2020)
y_labels_2020 = LabelEncoder().fit_transform(y_2020)
# print(X_dummies_2020.columns)
# print(len(X_dummies_2020.columns))

# Add in the 
X_dummies_2020["debt_settlement_flag_Y"] = 0
X_dummies_2020

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,...,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,debt_settlement_flag_Y
0,67991,67991,40000.0,0.0819,814.70,140000.0,19.75,0.0,1.0,18.0,...,0,1,0,1,1,0,1,0,1,0
1,25429,25429,6000.0,0.1524,208.70,55000.0,11.52,2.0,0.0,8.0,...,0,1,0,1,1,0,1,0,1,0
2,38496,38496,3600.0,0.1695,128.27,42000.0,6.74,0.0,0.0,6.0,...,0,1,0,1,1,0,1,0,1,0
3,19667,19667,20000.0,0.1524,478.33,100000.0,12.13,0.0,2.0,7.0,...,0,1,0,1,1,0,1,0,1,0
4,37505,37505,3600.0,0.1240,120.27,50000.0,16.08,0.0,3.0,6.0,...,0,1,0,1,1,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4697,77282,77282,30000.0,0.1240,673.42,140480.0,15.74,0.0,0.0,20.0,...,0,1,1,0,1,0,1,0,1,0
4698,77291,77291,24000.0,0.0756,747.22,50000.0,26.81,0.0,0.0,9.0,...,0,1,0,1,1,0,1,0,1,0
4699,77292,77292,10000.0,0.2305,387.36,33000.0,38.51,0.0,2.0,7.0,...,1,1,1,0,1,0,1,0,1,0
4700,77297,77297,8000.0,0.1862,205.86,38000.0,16.36,0.0,1.0,8.0,...,0,1,0,1,1,0,1,0,1,0


In [272]:
# Split the data into X_train, X_test, y_train, y_test
X_train = X_dummies_2019
X_test = X_dummies_2020
y_train = y_labels_2019 
y_test = y_labels_2020

# <u>First Prediction</u>

Before running the logistics regression model and the random forest classifier, my prediction is that the random forest classifier will perform better than the logistics regression model with the unscaled data.  And for the scaled data, I would predict random forest classifier would still be better than the logistics regression model.

This is a classification problem so both logistics regression model and random forest classifier work well.  Though it takes longer and computationally heavy, random forest classifier are more accurate.  If we're measuring how fast a model performs, then logistics regression will most likely perform faster than the random forest.  However, I'm looking for accuracy therefore I'm predicting that random forest classifier will do better.

# Unscaled Logistic Regression

In [273]:
# Create the Logistic Regression model
logreg_model = LogisticRegression()

# Fit/train the model using the training data from 2019 loans
logreg_model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [274]:
# Validate the model using the test data
print(f"Training Data Score: {logreg_model.score(X_train, y_train)}")
print(f"Testing Data Score: {logreg_model.score(X_test, y_test)}")

Training Data Score: 0.6485221674876848
Testing Data Score: 0.5253083794130158


In [275]:
# Confusion Matrix
y_true_log = y_test
y_pred_log = logreg_model.predict(X_test)
confusion_matrix(y_true_log, y_pred_log)
tn, fp, fn, tp = confusion_matrix(y_true_log, y_pred_log).ravel()
tn, fp, fn, tp
print(f"True negative: {tn}")
print(f"False positive: {fp}")
print(f"False negative: {fn}")
print(f"True postive: {tp}")


True negative: 526
False positive: 1825
False negative: 407
True postive: 1944


In [284]:
# Classification Report
print(classification_report(y_true_log, y_pred_log))

              precision    recall  f1-score   support

           0       0.56      0.22      0.32      2351
           1       0.52      0.83      0.64      2351

    accuracy                           0.53      4702
   macro avg       0.54      0.53      0.48      4702
weighted avg       0.54      0.53      0.48      4702



# Unscaled Random Forest Classifier

In [278]:
# Create Random Forest Classifier
ranfor_class = RandomForestClassifier(n_estimators = 200, random_state = 1)

# Fit/train the classifier with 2019 loan data
ranfor_class.fit(X_train, y_train)

RandomForestClassifier(n_estimators=200, random_state=1)

In [279]:
# Validate the model using the test data
print(f"Training Data Score: {ranfor_class.score(X_train, y_train)}")
print(f"Testing Data Score: {ranfor_class.score(X_test, y_test)}")

Training Data Score: 1.0
Testing Data Score: 0.6210123351765207


In [280]:
# Confusion Matrix
y_true_rf = y_test
y_pred_rf = ranfor_class.predict(X_test)
confusion_matrix(y_true_rf, y_pred_rf)
tn, fp, fn, tp = confusion_matrix(y_true_rf, y_pred_rf).ravel()
tn, fp, fn, tp
print(f"True negative: {tn}")
print(f"False positive: {fp}")
print(f"False negative: {fn}")
print(f"True postive: {tp}")

True negative: 815
False positive: 1536
False negative: 246
True postive: 2105


In [283]:
# Classification Report
print(classification_report(y_true_rf, y_pred_rf))

              precision    recall  f1-score   support

           0       0.77      0.35      0.48      2351
           1       0.58      0.90      0.70      2351

    accuracy                           0.62      4702
   macro avg       0.67      0.62      0.59      4702
weighted avg       0.67      0.62      0.59      4702



# <u>Evaluating First Prediction</u>

Based on the test score, both models didn't perform that well with the random forest classifier being able to classify 62.1% of the potential loan risk and the logistic regression only did 52.5%.  Overall it's poor perform for both models, but the random forest model performed just a bit better than the logistic regression.  One thing to note is that the training of the random forest model was overfit.  We had a large number of false alarms which is not good because that means that for 1536 of the loans that were predicted to be low risk turned out to be high risk.

# Scale Data

In [288]:
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# <u>Second Prediction</u>

My second predictions after running the logistic regression model and the random forest classifier on the unscaled model still remains the same.  I believe that with scaled data the random forest classifier will do even better.  What is likely to happen is that for the random forest classifier, I might need to adjust the features to improve the scores.

# Scaled Logistic Regression

In [289]:
# Create the Logistic Regression model
logreg_model = LogisticRegression()

# Fit/train the model using the scaled training data from 2019 loans
logreg_model.fit(X_train_scaled, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [290]:
# Validate the model using the scaled test data
print(f"Training Data Score: {logreg_model.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {logreg_model.score(X_test_scaled, y_test)}")

Training Data Score: 0.713136288998358
Testing Data Score: 0.7201190982560612


In [291]:
# Confusion Matrix
y_true_log = y_test
y_pred_slog = logreg_model.predict(X_test_scaled)
confusion_matrix(y_true_log, y_pred_slog)
tn, fp, fn, tp = confusion_matrix(y_true_log, y_pred_slog).ravel()
tn, fp, fn, tp
print(f"True negative: {tn}")
print(f"False positive: {fp}")
print(f"False negative: {fn}")
print(f"True postive: {tp}")

True negative: 1242
False positive: 1109
False negative: 207
True postive: 2144


In [292]:
# Classification Report
print(classification_report(y_true_log, y_pred_slog))

              precision    recall  f1-score   support

           0       0.86      0.53      0.65      2351
           1       0.66      0.91      0.77      2351

    accuracy                           0.72      4702
   macro avg       0.76      0.72      0.71      4702
weighted avg       0.76      0.72      0.71      4702



# Scaled Random Forest Classifier

In [293]:
# Create Random Forest Classifier
ranfor_class = RandomForestClassifier(n_estimators = 200, random_state = 1)

# Fit/train the classifier with scaled 2019 loan data
ranfor_class.fit(X_train_scaled, y_train)

RandomForestClassifier(n_estimators=200, random_state=1)

In [294]:
# Validate the classifier using the scaled test data
print(f"Training Data Score: {ranfor_class.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {ranfor_class.score(X_test_scaled, y_test)}")

Training Data Score: 1.0
Testing Data Score: 0.6214376860910251


In [295]:
# Confusion Matrix
y_true_rf = y_test
y_pred_srf = ranfor_class.predict(X_test_scaled)
confusion_matrix(y_true_rf, y_pred_srf)
tn, fp, fn, tp = confusion_matrix(y_true_rf, y_pred_srf).ravel()
tn, fp, fn, tp
print(f"True negative: {tn}")
print(f"False positive: {fp}")
print(f"False negative: {fn}")
print(f"True postive: {tp}")

True negative: 814
False positive: 1537
False negative: 243
True postive: 2108


In [286]:
# Classification Report
print(classification_report(y_true_rf, y_pred_srf))

              precision    recall  f1-score   support

           0       0.77      0.35      0.48      2351
           1       0.58      0.90      0.70      2351

    accuracy                           0.62      4702
   macro avg       0.67      0.62      0.59      4702
weighted avg       0.67      0.62      0.59      4702



# <u>Evaluating Second Prediction</u>

Working with the scaled data this time and running them through the models, the logistic regression model provided a better score than the random forest classifier model.  The logistic regression scored 72.0% with the scaled data and that's significantly better than the unscaled results.  My prediction that random forest would perform better with the scaled data was not correct.  In fact, the random forest classifier seemed to perform the same with the scaled data as it did with the unscaled data.  The random forest scored 62.1% with the scaled data as it did with the unscaled data at 62.1%.