In [38]:
import numpy as np
import pandas as pd
from pathlib import Path

import matplotlib.pyplot as plt
from collections import Counter
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.datasets import make_regression, make_swiss_roll
from sklearn.metrics import accuracy_score
from sklearn.utils.random import sample_without_replacement
from sklearn.utils import resample
from sklearn.metrics import confusion_matrix, classification_report

In [20]:
train_df = pd.read_csv(Path('Resources/Generator/2019loans.csv'))
test_df = pd.read_csv(Path('Resources/Generator/2020Q1loans.csv'))

In [57]:
test_df.head()

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,pymnt_plan,dti,delinq_2yrs,inq_last_6mths,...,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag,target
0,40000.0,0.1033,856.4,RENT,128700.0,Source Verified,n,12.47,0.0,1.0,...,57.1,0.0,0.0,63915.0,49510.0,49400.0,14515.0,Y,N,low_risk
1,24450.0,0.143,572.72,MORTGAGE,44574.0,Not Verified,n,15.05,0.0,1.0,...,0.0,0.0,0.0,136425.0,19439.0,15500.0,18925.0,N,N,low_risk
2,13500.0,0.143,316.23,OWN,60000.0,Not Verified,n,28.72,0.0,0.0,...,0.0,0.0,0.0,82124.0,65000.0,5400.0,61724.0,Y,N,low_risk
3,10625.0,0.1774,268.31,RENT,60000.0,Verified,n,15.7,0.0,4.0,...,20.0,0.0,0.0,54855.0,50335.0,23200.0,26255.0,N,N,low_risk
4,6375.0,0.1862,232.46,RENT,60000.0,Source Verified,n,35.5,0.0,0.0,...,75.0,0.0,0.0,90445.0,56541.0,15300.0,72345.0,N,N,low_risk


In [60]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4702 entries, 0 to 4701
Data columns (total 84 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   loan_amnt                   4702 non-null   float64
 1   int_rate                    4702 non-null   float64
 2   installment                 4702 non-null   float64
 3   home_ownership              4702 non-null   object 
 4   annual_inc                  4702 non-null   float64
 5   verification_status         4702 non-null   object 
 6   pymnt_plan                  4702 non-null   object 
 7   dti                         4702 non-null   float64
 8   delinq_2yrs                 4702 non-null   float64
 9   inq_last_6mths              4702 non-null   float64
 10  open_acc                    4702 non-null   float64
 11  pub_rec                     4702 non-null   float64
 12  revol_bal                   4702 non-null   float64
 13  total_acc                   4702 

In [61]:
pd.get_dummies(train_df).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12180 entries, 0 to 12179
Data columns (total 94 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   loan_amnt                            12180 non-null  float64
 1   int_rate                             12180 non-null  float64
 2   installment                          12180 non-null  float64
 3   annual_inc                           12180 non-null  float64
 4   dti                                  12180 non-null  float64
 5   delinq_2yrs                          12180 non-null  float64
 6   inq_last_6mths                       12180 non-null  float64
 7   open_acc                             12180 non-null  float64
 8   pub_rec                              12180 non-null  float64
 9   revol_bal                            12180 non-null  float64
 10  total_acc                            12180 non-null  float64
 11  out_prncp                   

In [22]:
# Convert categorical data to numeric and separate target feature for training data
X_2019 = pd.get_dummies(train_df.drop("target", axis=1))

#create target for 2019 data
y_2019 = train_df["target"]

In [23]:
X_2019.shape

(12180, 92)

In [62]:
y_2019.shape

(12180,)

In [24]:
print(pd.get_dummies(test_df))

      loan_amnt  int_rate  installment  annual_inc    dti  delinq_2yrs  \
0       40000.0    0.1033       856.40    128700.0  12.47          0.0   
1       24450.0    0.1430       572.72     44574.0  15.05          0.0   
2       13500.0    0.1430       316.23     60000.0  28.72          0.0   
3       10625.0    0.1774       268.31     60000.0  15.70          0.0   
4        6375.0    0.1862       232.46     60000.0  35.50          0.0   
...         ...       ...          ...         ...    ...          ...   
4697    30000.0    0.1240       673.42    140480.0  15.74          0.0   
4698    24000.0    0.0756       747.22     50000.0  26.81          0.0   
4699    10000.0    0.2305       387.36     33000.0  38.51          0.0   
4700     8000.0    0.1862       205.86     38000.0  16.36          0.0   
4701    30000.0    0.2055      1123.34    180000.0  12.06          0.0   

      inq_last_6mths  open_acc  pub_rec  revol_bal  ...  pymnt_plan_n  \
0                1.0       8.0      0.

In [63]:
# Convert categorical data to numeric and separate target feature for testing data
X_test = pd.get_dummies(test_df.drop("target", axis=1))

#create target for 2020 data
y_test= test_df["target"]

In [64]:
# add missing dummy variables to testing set
def add_column(d, columns):
    new_column = set(columns)- set(d.columns)
    for i in new_column:
        d[i]=0
def new_added_columns(d, columns):
    add_column(d, columns)
    assert(set(columns)-set(d.columns)==set())
    d = d[columns]
    return d
X_test = new_added_columns(X_test, X_2019.columns)
X_test

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,debt_settlement_flag_Y
0,40000.0,0.1033,856.40,128700.0,12.47,0.0,1.0,8.0,0.0,38113.0,...,0,1,0,1,1,0,0,1,1,0
1,24450.0,0.1430,572.72,44574.0,15.05,0.0,1.0,6.0,0.0,1665.0,...,0,1,0,1,1,0,1,0,1,0
2,13500.0,0.1430,316.23,60000.0,28.72,0.0,0.0,8.0,0.0,13857.0,...,0,1,0,1,1,0,0,1,1,0
3,10625.0,0.1774,268.31,60000.0,15.70,0.0,4.0,17.0,0.0,6216.0,...,1,1,0,1,1,0,1,0,1,0
4,6375.0,0.1862,232.46,60000.0,35.50,0.0,0.0,13.0,0.0,12681.0,...,0,1,0,1,1,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4697,30000.0,0.1240,673.42,140480.0,15.74,0.0,0.0,20.0,0.0,23215.0,...,0,1,1,0,1,0,1,0,1,0
4698,24000.0,0.0756,747.22,50000.0,26.81,0.0,0.0,9.0,0.0,459.0,...,0,1,0,1,1,0,1,0,1,0
4699,10000.0,0.2305,387.36,33000.0,38.51,0.0,2.0,7.0,0.0,6342.0,...,1,1,1,0,1,0,1,0,1,0
4700,8000.0,0.1862,205.86,38000.0,16.36,0.0,1.0,8.0,1.0,11636.0,...,0,1,0,1,1,0,1,0,1,0


In [65]:
X_test.describe()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,debt_settlement_flag_Y
count,4702.0,4702.0,4702.0,4702.0,4702.0,4702.0,4702.0,4702.0,4702.0,4702.0,...,4702.0,4702.0,4702.0,4702.0,4702.0,4702.0,4702.0,4702.0,4702.0,4702.0
mean,16959.410889,0.148018,504.334028,89862.94,23.033847,0.207571,0.686729,12.729902,0.108039,18313.320927,...,0.110379,1.0,0.06678,0.93322,0.881752,0.118248,0.981285,0.018715,1.0,0.0
std,10155.556866,0.058096,295.725642,125378.1,21.33624,0.668909,0.897366,6.090486,0.314547,22514.741749,...,0.313394,0.0,0.249667,0.249667,0.322935,0.322935,0.135533,0.135533,0.0,0.0
min,1000.0,0.0646,31.43,100.0,0.26,0.0,0.0,2.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
25%,9062.5,0.1033,279.13,52000.0,14.8,0.0,0.0,8.0,0.0,6658.25,...,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0
50%,15000.0,0.143,446.01,75000.0,21.205,0.0,0.0,12.0,0.0,12636.0,...,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0
75%,24000.0,0.1862,689.3875,105000.0,28.22,0.0,1.0,16.0,0.0,22439.0,...,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0
max,40000.0,0.288,1604.18,6503700.0,999.0,9.0,5.0,57.0,2.0,512728.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0


In [66]:
X_test.shape

(4702, 92)

Prediction: I predict random forest would perform better because the random forest model combines randomly, multiple decision trees to generate an output.  It randomly chooses the features and doesn't depend on any specific feature.  Random forest works well on large datasets and these two csv have 92 features, I think random forest will outperform logistic regression.  

In [68]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression(max_iter=1000)
classifier

LogisticRegression(max_iter=1000)

In [70]:
classifier.fit(X_2019, y_2019)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(max_iter=1000)

In [71]:
print(f'Predict:  {classifier.predict(X_test)}')

Predict:  ['high_risk' 'low_risk' 'high_risk' ... 'low_risk' 'low_risk' 'high_risk']


In [73]:
print(f"Training Data Score: {classifier.score(X_2019, y_2019)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.696551724137931
Testing Data Score: 0.5650786899191833


In [74]:
print(f'Actual:\t\t{list(y_test[:10])}')
print(f'Predicted:\t{list(classifier.predict(X_test[:10]))}')

Actual:		['low_risk', 'low_risk', 'low_risk', 'low_risk', 'low_risk', 'low_risk', 'low_risk', 'low_risk', 'low_risk', 'low_risk']
Predicted:	['high_risk', 'low_risk', 'high_risk', 'high_risk', 'low_risk', 'low_risk', 'low_risk', 'low_risk', 'low_risk', 'low_risk']


In [75]:
y_true = y_test
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_true, y_pred)
cm

array([[ 848, 1503],
       [ 542, 1809]], dtype=int64)

In [76]:
tn, fp, fn, tp = cm.ravel()

In [77]:
tp

1809

In [78]:
# Calculate the precision of the model based on the confusion matrix
precision = tp / (tp + fp)
precision

0.5461956521739131

In [79]:
# Calculate the sensitivity of the model based on the confusion matrix
sensitivity = tp / (tp + fn)
sensitivity

0.7694598043385793

In [81]:
f1 = 2*precision*sensitivity / (precision + sensitivity)
f1

0.638883983754194

In [80]:
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

   high_risk       0.61      0.36      0.45      2351
    low_risk       0.55      0.77      0.64      2351

    accuracy                           0.57      4702
   macro avg       0.58      0.57      0.55      4702
weighted avg       0.58      0.57      0.55      4702



In [82]:
# Train a Random Forest Classifier model and print the model score
new_model = RandomForestClassifier(random_state=1, n_estimators=500)
new_model.fit(X_2019, y_2019)
print(f'Predict:  {new_model.predict(X_test)}')

Predict:  ['high_risk' 'high_risk' 'high_risk' ... 'high_risk' 'low_risk'
 'high_risk']


In [84]:
print(f'Training Score: {new_model.score(X_2019, y_2019)}')
print(f'Testing Score: {new_model.score(X_test, y_test)}')

Training Score: 1.0
Testing Score: 0.646958740961293


The logistic regression model was the better model even though the scores were not that great. We shouldn't use the random forest model because it caused an overfit of the dataset because the training score was much higher than the testing score.  

In [86]:
# Scale the data
scaler = StandardScaler().fit(X_2019)
X_train_scaled = scaler.transform(X_2019)
X_test_scaled = scaler.transform(X_test)

In [89]:
# Train the Logistic Regression model on the scaled data and print the model score
model = LogisticRegression(max_iter=1000)
model.fit(X_train_scaled, y_2019)
print(f'Predict:  {model.predict(X_test_scaled)}')

Predict:  ['high_risk' 'low_risk' 'high_risk' ... 'high_risk' 'high_risk'
 'high_risk']


In [90]:
print(f'Training Score: {model.score(X_train_scaled, y_2019)}')
print(f'Testing Score: {model.score(X_test_scaled, y_test)}')

Training Score: 0.710919540229885
Testing Score: 0.7598894087622289


In [97]:
y_pred = model.predict(X_test_scaled)

In [98]:
print(classification_report(y_test, y_pred))
print(f'Training Score: {model.score(X_train_scaled, y_2019)}')
print(f'Testing Score: {model.score(X_test_scaled, y_test)}')

              precision    recall  f1-score   support

   high_risk       0.77      0.75      0.76      2351
    low_risk       0.75      0.77      0.76      2351

    accuracy                           0.76      4702
   macro avg       0.76      0.76      0.76      4702
weighted avg       0.76      0.76      0.76      4702

Training Score: 0.710919540229885
Testing Score: 0.7598894087622289


In [91]:
# Train a Random Forest Classifier model on the scaled data and print the model score
clf = RandomForestClassifier(random_state=1, n_estimators=500)
clf.fit(X_train_scaled, y_2019)
print(f'Predict:  {clf.predict(X_test_scaled)}')

Predict:  ['high_risk' 'high_risk' 'high_risk' ... 'high_risk' 'low_risk'
 'high_risk']


In [92]:
print(f'Training Score: {clf.score(X_train_scaled, y_2019)}')
print(f'Testing Score: {clf.score(X_test_scaled, y_test)}')

Training Score: 1.0
Testing Score: 0.6480221182475542


In [99]:
print(classification_report(y_test, y_pred))
print(f'Training Score: {clf.score(X_train_scaled, y_2019)}')
print(f'Testing Score: {clf.score(X_test_scaled, y_test)}')

              precision    recall  f1-score   support

   high_risk       0.77      0.75      0.76      2351
    low_risk       0.75      0.77      0.76      2351

    accuracy                           0.76      4702
   macro avg       0.76      0.76      0.76      4702
weighted avg       0.76      0.76      0.76      4702

Training Score: 1.0
Testing Score: 0.6480221182475542


In [56]:
from sklearn.feature_selection import SelectFromModel
sel = SelectFromModel(clf)
sel.fit(X_train_scaled, y_2019_train)
sel.get_support()

array([ True,  True,  True, False, False, False, False, False, False,
       False, False,  True,  True,  True,  True,  True,  True, False,
       False, False,  True, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,  True,  True, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
        True, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False])

Conclusions:
    Logistic Regression performed better than Random forest contrary to my predictions.  The random forest was not a good model for these datasets.  From both the unscaled and scaled random forest models, we can see the models were overfit because the training and the testing scores are very far apart with the training measuring at 1.0 and the testing scores around 0.65. 
    For the Logistic Regression model, the scaled data performed better than the unscaled.  The testing socre for the unscaled model was only around 0.57 and after using scaled data, the score increased to 0.76, increasing by around 25%.  Different from the random forest model which showed an overfit, logisitic regression testing and training scores were much closer and overall showed as the better model to use.  