In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [2]:
# read csv data
# DataFrame.dropna() returns a new frame rather than modifying in place, so the
# result has to be assigned for rows with missing values to actually be removed.
df19 = pd.read_csv('Resources/2019loans.csv').dropna()
df20 = pd.read_csv('Resources/2020Q1loans.csv').dropna()

# last expression of the cell: display the cleaned 2020 Q1 frame
df20

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,67991,67991,40000.0,0.0819,814.70,MORTGAGE,140000.0,Not Verified,low_risk,n,...,97.7,0.0,0.0,0.0,527975.0,70914.0,74600.0,99475.0,N,N
1,25429,25429,6000.0,0.1524,208.70,RENT,55000.0,Not Verified,low_risk,n,...,66.7,0.0,0.0,0.0,34628.0,23460.0,5900.0,23628.0,N,N
2,38496,38496,3600.0,0.1695,128.27,RENT,42000.0,Not Verified,low_risk,n,...,100.0,0.0,0.0,0.0,23100.0,19183.0,7300.0,15000.0,N,N
3,19667,19667,20000.0,0.1524,478.33,RENT,100000.0,Not Verified,low_risk,n,...,100.0,50.0,0.0,0.0,56481.0,43817.0,13800.0,35981.0,N,N
4,37505,37505,3600.0,0.1240,120.27,RENT,50000.0,Not Verified,low_risk,n,...,100.0,25.0,0.0,0.0,45977.0,32448.0,21000.0,24977.0,N,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4697,77282,77282,30000.0,0.1240,673.42,RENT,140480.0,Source Verified,high_risk,n,...,100.0,28.6,0.0,0.0,159688.0,110873.0,48400.0,107388.0,N,N
4698,77291,77291,24000.0,0.0756,747.22,RENT,50000.0,Not Verified,high_risk,n,...,100.0,0.0,0.0,0.0,62375.0,18928.0,13300.0,30775.0,N,N
4699,77292,77292,10000.0,0.2305,387.36,RENT,33000.0,Verified,high_risk,n,...,100.0,0.0,0.0,0.0,43250.0,33022.0,8500.0,29550.0,N,N
4700,77297,77297,8000.0,0.1862,205.86,RENT,38000.0,Source Verified,high_risk,n,...,95.0,0.0,1.0,0.0,31357.0,19595.0,1500.0,9657.0,N,N


In [3]:
# Build the training set from the 2019 loans (df19).
# NOTE(review): the displayed 2020 frame carries 'Unnamed: 0.1', 'Unnamed: 0' and
# 'index' columns; if df19 has them too they are used as features here - confirm
# that is intended.

# features: every column except the 'loan_status' target, with categorical
# columns expanded into indicator (dummy) columns
x_train = pd.get_dummies(df19.drop(columns=['loan_status']))

# learn per-column mean/variance on the training features and standardize them;
# `scaler` is kept so the same statistics can be applied to the test features
scaler = StandardScaler()
scaler.fit(x_train)
x_train_scaled = scaler.transform(x_train)

# target: 'loan_status' labels mapped to integer codes
y_train = LabelEncoder().fit_transform(df19['loan_status'])

In [4]:
# preprocessing assign test variables to df20

# drop 'loan_status' for x data
x_test = df20.drop('loan_status', axis=1)

# convert categorical data
x_test = pd.get_dummies(x_test)

# Align the test columns with the training columns instead of patching one
# column by hand: any dummy column seen in training but absent from the test
# data (e.g. 'debt_settlement_flag_Y') is added filled with 0, columns that only
# exist in the test data are dropped, and the column order matches x_train so
# the fitted scaler and models receive the features they were trained on.
x_test = x_test.reindex(columns=x_train.columns, fill_value=0)

# scale data with the statistics learned from the training features
x_test_scaled = scaler.transform(x_test)

# encode y data from 'loan_status' using the mapping learned from the training
# labels, so a given class gets the same integer code in train and test
# (transform raises ValueError if the test data contains an unseen label)
y_test = LabelEncoder().fit(df19['loan_status']).transform(df20['loan_status'])

LogisticRegression vs RandomForestClassifier:
    I would expect RandomForestClassifier to perform better, as it can give the more informative features greater weight than the less informative ones.

In [5]:
# unscaled results
# Fit both classifiers on the raw (unscaled) features and report accuracy on
# the training and test sets. `lr` and `rf` stay bound for later cells.
lr = LogisticRegression(max_iter=300)
rf = RandomForestClassifier(random_state=1, n_estimators=300)

for label, estimator in (('LogisticRegression', lr), ('RandomForestClassifier', rf)):
    estimator.fit(x_train, y_train)
    train_score = estimator.score(x_train, y_train)
    test_score = estimator.score(x_test, y_test)
    print(f'{label} Train score: {train_score}')
    print(f'{label} Test score: {test_score}')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression Train score: 0.6622331691297209
LogisticRegression Test score: 0.5270097830710336
RandomForestClassifier Train score: 1.0
RandomForestClassifier Test score: 0.6256911952360698


LogisticRegression vs RandomForestClassifier:
    RandomForestClassifier (train: 100%, test: 63%) did indeed perform better on the test data than LogisticRegression (train: 66%, test: 53%); however, it appears to have been overfit to the training data.
    
Scaling data:
    Scaling should improve the accuracy of the predictions, because this data contains monetary values whose magnitudes differ greatly between features. The LogisticRegression model even warns us that the solver failed to converge (even after massive increases to the max iterations), so scaling will be necessary.

In [6]:
# scaled results
# `lr` and `rf` are the estimators created in the unscaled-results cell; calling
# fit() again replaces their fitted state with one learned from the scaled data.

# LogisticRegression
lr.fit(x_train_scaled, y_train)
print(f'LogisticRegression Train score: {lr.score(x_train_scaled, y_train)}')
print(f'LogisticRegression Test score: {lr.score(x_test_scaled, y_test)}')

# RandomForestClassifier
rf.fit(x_train_scaled, y_train)
print(f'RandomForestClassifier Train score: {rf.score(x_train_scaled, y_train)}')
print(f'RandomForestClassifier Test score: {rf.score(x_test_scaled, y_test)}')

LogisticRegression Train score: 0.7125615763546798
LogisticRegression Test score: 0.7205444491705657
RandomForescClassifier Train score: 1.0
RandomForescClassifier Test score: 0.6246278179498086


Scaling data:
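    Scaling the data greatly improved the LogisticRegression scores (train: 71%, test: 72%) to the extent that it now performs better than RandomForestClassifier (train: 100%, test: 62%), but it did not really affect the RandomForestClassifier scores at all. This is likely because LogisticRegression is fit by an iterative solver whose convergence is sensitive to the scale of the features, whereas the decision trees in a random forest split on per-feature thresholds and are therefore largely unaffected by rescaling.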