In [1]:
#imports
import numpy as np
import pandas as pd
from pathlib import Path
#advanced ML imports
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler , LabelEncoder
from sklearn.feature_selection import SelectFromModel

In [2]:
# Load the raw loan data: the 2019 file becomes the training frame and the
# 2020 Q1 file becomes the test frame.
train_path = Path('Resources/2019loans.csv')
test_path = Path('Resources/2020Q1loans.csv')

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

In [3]:
# One-hot encode the categorical columns of the training data, then trim the frame:
#   * drop the first two columns by position (presumably leftover index/ID
#     columns from the CSV export -- TODO confirm against the raw file);
#   * drop the 'loan_status_high_risk' dummy so that only
#     'loan_status_low_risk' remains to act as the binary target column.
# `cols` is kept at module level because the test-data cell reuses it.
# NOTE: the positional drop makes this cell unsafe to re-run on the already
# transformed frame -- restart and run from the CSV load instead.
cols = [0, 1]
train_df = pd.get_dummies(train_df)
train_df = train_df.drop(columns=train_df.columns[cols])
train_df = train_df.drop(columns='loan_status_high_risk')
train_df.head()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,loan_status_low_risk,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,debt_settlement_flag_Y
0,13375.0,0.1797,483.34,223000.0,29.99,0.0,0.0,15.0,0.0,39728.0,...,1,1,0,1,1,0,1,0,1,0
1,21000.0,0.1308,478.68,123000.0,11.26,2.0,0.0,16.0,0.0,9585.0,...,1,1,0,1,1,0,1,0,1,0
2,20000.0,0.124,448.95,197000.0,11.28,0.0,0.0,12.0,0.0,16708.0,...,1,1,0,1,1,0,1,0,1,0
3,3000.0,0.124,100.22,45000.0,18.08,0.0,0.0,12.0,1.0,8809.0,...,1,1,0,1,1,0,1,0,1,0
4,30000.0,0.1612,1056.49,133000.0,27.77,0.0,2.0,13.0,0.0,65420.0,...,1,1,0,1,1,0,1,0,1,0


In [4]:
# Apply the same encoding steps to the testing data: one-hot encode, drop the
# first two columns by position, and drop the 'loan_status_high_risk' dummy so
# 'loan_status_low_risk' is the only target column left.
# NOTE: `cols` is the positional list defined in the training-data cell, so
# that cell must have been executed first.
test_df = pd.get_dummies(test_df)
test_df = test_df.drop(columns=test_df.columns[cols])
test_df = test_df.drop(columns='loan_status_high_risk')
test_df.head()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,verification_status_Verified,loan_status_low_risk,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N
0,40000.0,0.0819,814.7,140000.0,19.75,0.0,1.0,18.0,0.0,9471.0,...,0,1,1,0,1,1,0,1,0,1
1,6000.0,0.1524,208.7,55000.0,11.52,2.0,0.0,8.0,0.0,1280.0,...,0,1,1,0,1,1,0,1,0,1
2,3600.0,0.1695,128.27,42000.0,6.74,0.0,0.0,6.0,0.0,4757.0,...,0,1,1,0,1,1,0,1,0,1
3,20000.0,0.1524,478.33,100000.0,12.13,0.0,2.0,7.0,0.0,12731.0,...,0,1,1,0,1,1,0,1,0,1
4,3600.0,0.124,120.27,50000.0,16.08,0.0,3.0,6.0,0.0,10413.0,...,0,1,1,0,1,1,0,1,0,1


In [5]:
# Number of columns in the encoded training frame (compared with the test
# frame in the next cell to spot dummy columns that exist on one side only).
train_df.shape[1]

93

In [6]:
# Number of columns in the encoded test frame; a mismatch with the training
# frame means the two one-hot encodings produced different dummy columns.
test_df.shape[1]

92

In [7]:
# Columns present in the encoded training frame but absent from the test frame.
# Encoding the two files separately can leave the test set without a dummy
# column for a category value that only occurs in the training data.
missing_cols = set(train_df.columns) - set(test_df.columns)

# Conform the test frame to the training schema in a single step: every column
# it lacks is created and filled with 0, and the columns are put in exactly the
# same order as in the training frame.
test_df = test_df.reindex(columns=train_df.columns, fill_value=0)

In [8]:
# Preview the first five rows of the training frame after column alignment.
train_df.head(5)

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,loan_status_low_risk,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,debt_settlement_flag_Y
0,13375.0,0.1797,483.34,223000.0,29.99,0.0,0.0,15.0,0.0,39728.0,...,1,1,0,1,1,0,1,0,1,0
1,21000.0,0.1308,478.68,123000.0,11.26,2.0,0.0,16.0,0.0,9585.0,...,1,1,0,1,1,0,1,0,1,0
2,20000.0,0.124,448.95,197000.0,11.28,0.0,0.0,12.0,0.0,16708.0,...,1,1,0,1,1,0,1,0,1,0
3,3000.0,0.124,100.22,45000.0,18.08,0.0,0.0,12.0,1.0,8809.0,...,1,1,0,1,1,0,1,0,1,0
4,30000.0,0.1612,1056.49,133000.0,27.77,0.0,2.0,13.0,0.0,65420.0,...,1,1,0,1,1,0,1,0,1,0


In [9]:
# Preview the first five rows of the test frame; its columns should now match
# the training frame's columns and order.
test_df.head(5)

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,loan_status_low_risk,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,debt_settlement_flag_Y
0,40000.0,0.0819,814.7,140000.0,19.75,0.0,1.0,18.0,0.0,9471.0,...,1,1,0,1,1,0,1,0,1,0
1,6000.0,0.1524,208.7,55000.0,11.52,2.0,0.0,8.0,0.0,1280.0,...,1,1,0,1,1,0,1,0,1,0
2,3600.0,0.1695,128.27,42000.0,6.74,0.0,0.0,6.0,0.0,4757.0,...,1,1,0,1,1,0,1,0,1,0
3,20000.0,0.1524,478.33,100000.0,12.13,0.0,2.0,7.0,0.0,12731.0,...,1,1,0,1,1,0,1,0,1,0
4,3600.0,0.124,120.27,50000.0,16.08,0.0,3.0,6.0,0.0,10413.0,...,1,1,0,1,1,0,1,0,1,0


In [10]:
# Split each (already column-aligned) frame into a feature matrix X and a
# target vector y. The target is the 'loan_status_low_risk' dummy column
# produced by the one-hot encoding step.
#
# `columns=` is used instead of passing the axis positionally: positional
# arguments other than `labels` were deprecated in DataFrame.drop in pandas 1.3
# and raise TypeError from pandas 2.0 onwards.

# Designate Train Variables
X_train = train_df.drop(columns='loan_status_low_risk')
y_train = train_df['loan_status_low_risk']

# Designate Test Variables
X_test = test_df.drop(columns='loan_status_low_risk')
y_test = test_df['loan_status_low_risk']


In [11]:
# Baseline: logistic regression with default settings, fitted on the raw
# (unscaled) 2019 features.
# NOTE(review): the output recorded for this cell includes a solver warning
# that the iteration limit was reached, so the scores below come from a fit
# that did not converge. Raising `max_iter` or scaling the features are the
# remedies the warning itself suggests.
classifier = LogisticRegression().fit(X_train, y_train)

# Score on the 2019 training data, then on the 2020 Q1 data as the test.
logistic_train_score = classifier.score(X_train, y_train)
logistic_test_score = classifier.score(X_test, y_test)
print(f"Logistic Model Unscaled Training Data Score: {logistic_train_score}")
print(f"Logistic Model Unscaled Testing Data Score: {logistic_test_score}")

Logistic Model Unscaled Training Data Score: 0.6508210180623973
Logistic Model Unscaled Testing Data Score: 0.5168013611229264


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [12]:
# Random forest baseline on the same unscaled features: 50 trees, with a fixed
# random_state so repeated runs build the same forest.
clf = RandomForestClassifier(n_estimators=50, random_state=1)
clf.fit(X_train, y_train)

# Score on the training data and on the 2020 Q1 test data. In the recorded
# run the training score is ~1.0 while the test score is ~0.64.
forest_train_score = clf.score(X_train, y_train)
forest_test_score = clf.score(X_test, y_test)
print(f'Random Forest Unscaled Training Score: {forest_train_score}')
print(f'Random Forest Unscaled Testing Score: {forest_test_score}')

Random Forest Unscaled Training Score: 0.9999178981937603
Random Forest Unscaled Testing Score: 0.6412165036154828
