In [39]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt 

In [40]:
train_df = pd.read_csv(Path('Resources/2019loans.csv'))
test_df = pd.read_csv(Path('Resources/2020Q1loans.csv'))
#train_df.head()
train_df.columns

Index(['Unnamed: 0', 'index', 'loan_amnt', 'int_rate', 'installment',
       'home_ownership', 'annual_inc', 'verification_status', 'loan_status',
       'pymnt_plan', 'dti', 'delinq_2yrs', 'inq_last_6mths', 'open_acc',
       'pub_rec', 'revol_bal', 'total_acc', 'initial_list_status', 'out_prncp',
       'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp',
       'total_rec_int', 'total_rec_late_fee', 'recoveries',
       'collection_recovery_fee', 'last_pymnt_amnt',
       'collections_12_mths_ex_med', 'policy_code', 'application_type',
       'acc_now_delinq', 'tot_coll_amt', 'tot_cur_bal', 'open_acc_6m',
       'open_act_il', 'open_il_12m', 'open_il_24m', 'mths_since_rcnt_il',
       'total_bal_il', 'il_util', 'open_rv_12m', 'open_rv_24m', 'max_bal_bc',
       'all_util', 'total_rev_hi_lim', 'inq_fi', 'total_cu_tl', 'inq_last_12m',
       'acc_open_past_24mths', 'avg_cur_bal', 'bc_open_to_buy', 'bc_util',
       'chargeoff_within_12_mths', 'delinq_amnt', 'mo_sin_ol

#### Preprocessing Data
###### Our 2019 data set will be our Training Set and Our 2020 will be our Testing Set

In [41]:
# Convert categorical data to numeric and separate target feature for training data
X_dummies_train = pd.get_dummies(train_df.drop(['Unnamed: 0', 'index','loan_status'], axis=1))
y_dummies_train = train_df['loan_status']

y_labels_train = LabelEncoder().fit_transform(y_dummies_train)

In [42]:
# Convert categorical data to numeric and separate target feature for testing data
X_dummies_test = pd.get_dummies(test_df.drop(['Unnamed: 0', 'index','loan_status'], axis=1))
y_dummies_test = test_df['loan_status']

y_labels_test = LabelEncoder().fit_transform(y_dummies_test)

In [43]:
#Check the shape of the data sets
print(f" Training: {X_dummies_train.shape} Testing: {X_dummies_test.shape} ")

 Training: (12180, 92) Testing: (4702, 91) 


In [44]:
# add missing dummy variables to testing set
for col in X_dummies_train:
    if col not in X_dummies_test.columns:
        X_dummies_test[col] = 0
print(f" Training: {X_dummies_train.shape} Testing: {X_dummies_test.shape} ")

 Training: (12180, 92) Testing: (4702, 92) 


#### Predictions
Random Forest Classifier will perform better due to the dataset containing mostly categorical data. While, Logistic Regression performs superior with linear data. 

In [45]:
# Train the Logistic Regression model on the unscaled data and print the model score
model = LogisticRegression()
model.fit(X_dummies_train, y_dummies_train)

print(f"Training Score: {model.score(X_dummies_train, y_dummies_train)}")
print(f"Testing Score: {model.score(X_dummies_test, y_dummies_test)}")

Training Score: 0.6509852216748768
Testing Score: 0.5163760102084219


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [46]:
# Train a Random Forest Classifier model and print the model score
clf = RandomForestClassifier(random_state=1, n_estimators=500).fit(X_dummies_train, y_labels_train)
print(f'Training Score: {clf.score(X_dummies_train, y_labels_train)}')
print(f'Testing Score: {clf.score(X_dummies_test, y_labels_test)}')


Training Score: 1.0
Testing Score: 0.6433432581880051


#### Results
Random Forest Classifier performed better than Logistic Regression with a testing score of 0.64. However, the RFC seems to be overfitting on the training set calling for normalization of the data-set.

In [47]:
# Scale the data
scaler = StandardScaler().fit(X_dummies_train)
X_train_scaled = scaler.transform(X_dummies_train)
X_test_scaled = scaler.transform(X_dummies_test)

#### Predictions
The Random Forest Classifier testing score will not change. However, the Logistic Regression will improve due to scaling.

In [49]:
# Train the Logistic Regression model on the scaled data and print the model score
clf_lr = LogisticRegression().fit(X_train_scaled, y_labels_train)
print(f'Training Score: {clf_lr.score(X_train_scaled, y_labels_train)}')
print(f'Testing Score: {clf_lr.score(X_test_scaled, y_labels_test)}')

Training Score: 0.7078817733990148
Testing Score: 0.767333049766057


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [52]:
# Train a Random Forest Classifier model on the scaled data and print the model score
clf_rf = RandomForestClassifier(random_state=42, n_estimators=500).fit(X_train_scaled, y_labels_train)

print(f'Training Score: {clf.score(X_train_scaled, y_labels_train)}')
print(f'Testing Score: {clf_rf.score(X_test_scaled, y_labels_test)}')

Training Score: 0.5
Testing Score: 0.6458953636750319


#### Results
Ultimately, the Logistic Regression improved significantly from 0.50 to 0.75 performing better than the Random Forest Classifier with a score of 0.64. This reveals that scaled data can be more beneficial than complex data sets.