In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [2]:
train_df = pd.read_csv(Path('Resources/2019loans.csv'))
test_df = pd.read_csv(Path('Resources/2020Q1loans.csv'))

# Predictions:
- I think the Random Forest Model will be a better model because its utilizing a bunch of random samples and averaging the results together. This allows it to better account for the 'noise' within the data. The Logistic Regression Model assumes there is a linear relationship which may not yeild accurate predictions. 

In [3]:
# Convert categorical data to numeric and separate target feature for training data
X_train_df= train_df.drop(['loan_status', 'Unnamed: 0'], axis=1)
X_train_df.head()

Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,pymnt_plan,dti,delinq_2yrs,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,57107,13375.0,0.1797,483.34,MORTGAGE,223000.0,Not Verified,n,29.99,0.0,...,100.0,50.0,0.0,0.0,577150.0,122018.0,32000.0,170200.0,N,N
1,141451,21000.0,0.1308,478.68,MORTGAGE,123000.0,Source Verified,n,11.26,2.0,...,85.0,33.3,0.0,0.0,132750.0,27896.0,15900.0,35398.0,N,N
2,321143,20000.0,0.124,448.95,MORTGAGE,197000.0,Source Verified,n,11.28,0.0,...,85.7,33.3,0.0,0.0,628160.0,114043.0,22600.0,90340.0,N,N
3,11778,3000.0,0.124,100.22,RENT,45000.0,Not Verified,n,18.08,0.0,...,100.0,16.7,1.0,0.0,42006.0,20761.0,19900.0,15406.0,N,N
4,169382,30000.0,0.1612,1056.49,MORTGAGE,133000.0,Source Verified,n,27.77,0.0,...,100.0,66.7,0.0,0.0,283248.0,109056.0,79500.0,58778.0,N,N


In [4]:
X_train= pd.get_dummies(X_train_df)
X_train.head()

Unnamed: 0,index,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,...,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,debt_settlement_flag_Y
0,57107,13375.0,0.1797,483.34,223000.0,29.99,0.0,0.0,15.0,0.0,...,0,1,0,1,1,0,1,0,1,0
1,141451,21000.0,0.1308,478.68,123000.0,11.26,2.0,0.0,16.0,0.0,...,0,1,0,1,1,0,1,0,1,0
2,321143,20000.0,0.124,448.95,197000.0,11.28,0.0,0.0,12.0,0.0,...,0,1,0,1,1,0,1,0,1,0
3,11778,3000.0,0.124,100.22,45000.0,18.08,0.0,0.0,12.0,1.0,...,0,1,0,1,1,0,1,0,1,0
4,169382,30000.0,0.1612,1056.49,133000.0,27.77,0.0,2.0,13.0,0.0,...,0,1,0,1,1,0,1,0,1,0


In [5]:
y_train= train_df['loan_status']
y_train.head()

0    low_risk
1    low_risk
2    low_risk
3    low_risk
4    low_risk
Name: loan_status, dtype: object

In [6]:
# Convert categorical data to numeric and separate target feature for testing data
X_test_df= test_df.drop(['loan_status', 'Unnamed: 0'], axis=1)
X_test_df.head()

Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,pymnt_plan,dti,delinq_2yrs,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,67991,40000.0,0.0819,814.7,MORTGAGE,140000.0,Not Verified,n,19.75,0.0,...,97.7,0.0,0.0,0.0,527975.0,70914.0,74600.0,99475.0,N,N
1,25429,6000.0,0.1524,208.7,RENT,55000.0,Not Verified,n,11.52,2.0,...,66.7,0.0,0.0,0.0,34628.0,23460.0,5900.0,23628.0,N,N
2,38496,3600.0,0.1695,128.27,RENT,42000.0,Not Verified,n,6.74,0.0,...,100.0,0.0,0.0,0.0,23100.0,19183.0,7300.0,15000.0,N,N
3,19667,20000.0,0.1524,478.33,RENT,100000.0,Not Verified,n,12.13,0.0,...,100.0,50.0,0.0,0.0,56481.0,43817.0,13800.0,35981.0,N,N
4,37505,3600.0,0.124,120.27,RENT,50000.0,Not Verified,n,16.08,0.0,...,100.0,25.0,0.0,0.0,45977.0,32448.0,21000.0,24977.0,N,N


In [7]:
X_test= pd.get_dummies(X_test_df)
X_test.head()

Unnamed: 0,index,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,...,verification_status_Source Verified,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N
0,67991,40000.0,0.0819,814.7,140000.0,19.75,0.0,1.0,18.0,0.0,...,0,0,1,0,1,1,0,1,0,1
1,25429,6000.0,0.1524,208.7,55000.0,11.52,2.0,0.0,8.0,0.0,...,0,0,1,0,1,1,0,1,0,1
2,38496,3600.0,0.1695,128.27,42000.0,6.74,0.0,0.0,6.0,0.0,...,0,0,1,0,1,1,0,1,0,1
3,19667,20000.0,0.1524,478.33,100000.0,12.13,0.0,2.0,7.0,0.0,...,0,0,1,0,1,1,0,1,0,1
4,37505,3600.0,0.124,120.27,50000.0,16.08,0.0,3.0,6.0,0.0,...,0,0,1,0,1,1,0,1,0,1


In [8]:
y_test= test_df['loan_status']
y_test.head()

0    low_risk
1    low_risk
2    low_risk
3    low_risk
4    low_risk
Name: loan_status, dtype: object

In [9]:
# add missing dummy variables to testing set
for column in X_train:
    if column not in X_test:
        X_test[column] = 0
        print(column)

debt_settlement_flag_Y


In [11]:
X_test.head()

Unnamed: 0,index,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,...,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,debt_settlement_flag_Y
0,67991,40000.0,0.0819,814.7,140000.0,19.75,0.0,1.0,18.0,0.0,...,0,1,0,1,1,0,1,0,1,0
1,25429,6000.0,0.1524,208.7,55000.0,11.52,2.0,0.0,8.0,0.0,...,0,1,0,1,1,0,1,0,1,0
2,38496,3600.0,0.1695,128.27,42000.0,6.74,0.0,0.0,6.0,0.0,...,0,1,0,1,1,0,1,0,1,0
3,19667,20000.0,0.1524,478.33,100000.0,12.13,0.0,2.0,7.0,0.0,...,0,1,0,1,1,0,1,0,1,0
4,37505,3600.0,0.124,120.27,50000.0,16.08,0.0,3.0,6.0,0.0,...,0,1,0,1,1,0,1,0,1,0


In [12]:
# X_test['debt_settlement_flag_Y'] = 0
for index, value in enumerate(X_test['debt_settlement_flag_N']):
    if value == 0:
        X_test.at[index, 'debt_settlement_flag_Y'] = 1
    else:
        pass

In [13]:
X_test.head()

Unnamed: 0,index,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,...,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,debt_settlement_flag_Y
0,67991,40000.0,0.0819,814.7,140000.0,19.75,0.0,1.0,18.0,0.0,...,0,1,0,1,1,0,1,0,1,0
1,25429,6000.0,0.1524,208.7,55000.0,11.52,2.0,0.0,8.0,0.0,...,0,1,0,1,1,0,1,0,1,0
2,38496,3600.0,0.1695,128.27,42000.0,6.74,0.0,0.0,6.0,0.0,...,0,1,0,1,1,0,1,0,1,0
3,19667,20000.0,0.1524,478.33,100000.0,12.13,0.0,2.0,7.0,0.0,...,0,1,0,1,1,0,1,0,1,0
4,37505,3600.0,0.124,120.27,50000.0,16.08,0.0,3.0,6.0,0.0,...,0,1,0,1,1,0,1,0,1,0


In [14]:
# Train the Logistic Regression model on the unscaled data and print the model score
clf = LogisticRegression()
clf.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [15]:
print(f"Training Data Score: {clf.score(X_train, y_train)}")
print(f"Testing Data Score: {clf.score(X_test, y_test)}")

Training Data Score: 0.6575533661740558
Testing Data Score: 0.5204168438962143


In [16]:
# Train a Random Forest Classifier model and print the model score
rfc = RandomForestClassifier(random_state=1, n_estimators=500)
rfc.fit(X_train, y_train)

RandomForestClassifier(n_estimators=500, random_state=1)

In [17]:
print(f' Random Forest Training Score: {rfc.score(X_train, y_train)}')
print(f' Random Forest Testing Score: {rfc.score(X_test, y_test)}')

 Random Forest Training Score: 1.0
 Random Forest Testing Score: 0.6631220757124627


# Results 1:
- The Random Forest model scored higher for both the train and test data

In [18]:
# Scale the data
scaler = StandardScaler().fit(X_train)
X_train_scaled=scaler.transform(X_train)
X_train_scaled

array([[-1.31172014, -0.39311205,  0.73658452, ..., -0.17149859,
         0.02026518, -0.02026518],
       [-0.46579523,  0.35168119, -0.19171582, ..., -0.17149859,
         0.02026518, -0.02026518],
       [ 1.3364188 ,  0.25400339, -0.32080462, ..., -0.17149859,
         0.02026518, -0.02026518],
       ...,
       [ 1.67571549, -1.34791257,  0.85997823, ..., -0.17149859,
         0.02026518, -0.02026518],
       [ 1.67600634, -0.23438563, -1.00231755, ..., -0.17149859,
         0.02026518, -0.02026518],
       [ 1.67906533, -0.23438563,  0.69292214, ..., -0.17149859,
         0.02026518, -0.02026518]])

In [19]:
X_test_scaled=scaler.transform(X_test)
X_test_scaled

array([[-1.20255948,  2.20755943, -1.12001617, ..., -0.17149859,
         0.02026518, -0.02026518],
       [-1.62943343, -1.11348584,  0.21833096, ..., -0.17149859,
         0.02026518, -0.02026518],
       [-1.49837845, -1.34791257,  0.54295132, ..., -0.17149859,
         0.02026518, -0.02026518],
       ...,
       [-1.10927546, -0.72277464,  1.7009538 , ..., -0.17149859,
         0.02026518, -0.02026518],
       [-1.10922531, -0.91813024,  0.85997823, ..., -0.17149859,
         0.02026518, -0.02026518],
       [-1.1091551 ,  1.23078141,  1.22636262, ..., -0.17149859,
         0.02026518, -0.02026518]])

In [20]:
# Train the Logistic Regression model on the scaled data and print the model score
clf_scaled = LogisticRegression()
clf_scaled.fit(X_train_scaled, y_train)
print(f' Scaled Logistic Regression Score: {clf_scaled.score(X_train_scaled, y_train)}')
print(f' Scaled Logistic Regression  Score: {clf_scaled.score(X_test_scaled, y_test)}')

 Scaled Logistic Regression Score: 0.7130541871921182
 Scaled Logistic Regression  Score: 0.7216078264568269


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [21]:
# Train a Random Forest Classifier model on the scaled data and print the model score
rfc_scaled = RandomForestClassifier(random_state=1, n_estimators=500)
rfc_scaled.fit(X_train_scaled, y_train)
print(f' Scaled Random Forest Train Score: {rfc_scaled.score(X_train_scaled, y_train)}')
print(f' Scaled Random Forest Test Score: {rfc_scaled.score(X_test_scaled, y_test)}')

 Scaled Random Forest Train Score: 1.0
 Scaled Random Forest Test Score: 0.6635474266269672


# Results 2:
- The Logistical regression model out performed the unscaled model and both Random Forest Models for the test data.
- Scaling the model allowed the data to relate to itself in a way that works better in a Logistical Regression model.
- Random Forest model results were unchanged after scaling the data. This is likely due to the random nature of the sampling/aggregating of data. 