In [31]:
import numpy as np
import pandas as pd
from pathlib import Path

In [32]:
# Load the two loan CSV files from the Resources folder:
# the 2019 file becomes train_df and the 2020 Q1 file becomes test_df.
data_dir = Path('Resources')
train_df = pd.read_csv(data_dir / '2019loans.csv')
test_df = pd.read_csv(data_dir / '2020Q1loans.csv')

In [33]:
# Preview the first two rows of train_df to check that the file loaded as expected
train_df.head(2)

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,57107,57107,13375.0,0.1797,483.34,MORTGAGE,223000.0,Not Verified,low_risk,n,...,100.0,50.0,0.0,0.0,577150.0,122018.0,32000.0,170200.0,N,N
1,141451,141451,21000.0,0.1308,478.68,MORTGAGE,123000.0,Source Verified,low_risk,n,...,85.0,33.3,0.0,0.0,132750.0,27896.0,15900.0,35398.0,N,N


In [34]:
# Preview the first two rows of test_df to check that the file loaded as expected
test_df.head(2)

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,67991,67991,40000.0,0.0819,814.7,MORTGAGE,140000.0,Not Verified,low_risk,n,...,97.7,0.0,0.0,0.0,527975.0,70914.0,74600.0,99475.0,N,N
1,25429,25429,6000.0,0.1524,208.7,RENT,55000.0,Not Verified,low_risk,n,...,66.7,0.0,0.0,0.0,34628.0,23460.0,5900.0,23628.0,N,N


In [35]:
# Compare the column labels of train_df and test_df.
# symmetric_difference lists labels that appear in only one of the two frames,
# so a column missing from either side is reported. An empty list means both
# frames have exactly the same column labels.
train_df.columns.symmetric_difference(test_df.columns).tolist()

[]

The result is empty: every column in `train_df` is also present in `test_df`.

In [36]:
# Training data: split off the target column, then one-hot encode the
# remaining categorical feature columns with get_dummies.
# NOTE(review): the index-like columns ("Unnamed: 0.1", "Unnamed: 0", "index")
# are kept as features here - confirm that is intended.
X_raw = train_df.drop("loan_status", axis=1)
y_train = train_df["loan_status"]
X_train = pd.get_dummies(X_raw)
X_train.head(2)

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,...,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,debt_settlement_flag_Y
0,57107,57107,13375.0,0.1797,483.34,223000.0,29.99,0.0,0.0,15.0,...,0,1,0,1,1,0,1,0,1,0
1,141451,141451,21000.0,0.1308,478.68,123000.0,11.26,2.0,0.0,16.0,...,0,1,0,1,1,0,1,0,1,0


In [37]:
# Testing data: split off the target column, then one-hot encode the
# remaining categorical feature columns with get_dummies.
X_raw = test_df.drop("loan_status", axis=1)
y_test = test_df["loan_status"]
X_test = pd.get_dummies(X_raw)
X_test.head(2)

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,...,verification_status_Source Verified,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N
0,67991,67991,40000.0,0.0819,814.7,140000.0,19.75,0.0,1.0,18.0,...,0,0,1,0,1,1,0,1,0,1
1,25429,25429,6000.0,0.1524,208.7,55000.0,11.52,2.0,0.0,8.0,...,0,0,1,0,1,1,0,1,0,1


In [38]:
# Missing columns: dummy columns that exist in X_train but not in X_test
missing_in_test = X_train.columns.difference(X_test.columns)
result = list(missing_in_test)
print(f'After converting the data, a missing column emerged{result}')

After converting the data, a missing column emerged['debt_settlement_flag_Y']


In [50]:
# Align the testing features with the training features.
# reindex adds every dummy column that exists only in X_train (filled with 0,
# meaning that category does not occur in the testing data), drops any column
# X_train does not have, and puts the columns in the same order as X_train so
# the fitted models see the features in the layout they were trained on.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)
X_test.head(2)

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,...,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,debt_settlement_flag_Y
0,67991,67991,40000.0,0.0819,814.7,140000.0,19.75,0.0,1.0,18.0,...,0,1,0,1,1,0,1,0,1,0
1,25429,25429,6000.0,0.1524,208.7,55000.0,11.52,2.0,0.0,8.0,...,0,1,0,1,1,0,1,0,1,0


In [54]:
# Train the Logistic Regression model on the unscaled data and print the model score
from sklearn.linear_model import LogisticRegression
# max_iter is set to 10000, presumably to give the solver room to converge on
# unscaled features - TODO confirm.
classifier = LogisticRegression(max_iter = 10000)
classifier.fit(X_train, y_train)

# Score the model that was just fitted. The previous code scored an undefined
# `clf_logistic`, which fails on a fresh kernel.
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.6485221674876848
Testing Data Score: 0.5253083794130158


In [55]:
# Fit a 50-tree Random Forest (fixed random_state) on the unscaled features
# and report its accuracy on the training and testing sets.
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=50, random_state=1)
clf = clf.fit(X_train, y_train)
train_score = clf.score(X_train, y_train)
test_score = clf.score(X_test, y_test)
print(f'Training Score: {train_score}')
print(f'Testing Score: {test_score}')

Training Score: 0.9999178981937603
Testing Score: 0.6352615908124203


# Prediction - Scaled
Prediction: The score for Logistic Regression will improve due to scaling, whereas the score for Random Forest will remain essentially the same. Tree-based classifiers like Random Forest are invariant to feature scaling, because each split depends only on the ordering of a feature's values, not on their magnitude.

In [57]:
# Standardize the features: the scaler is fitted on the training features only,
# and the same fitted scaler is then applied to the testing features.
from sklearn.preprocessing import StandardScaler, LabelEncoder
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [59]:
# Preview X_train_scaled, the scaled training features (displayed as an array)
X_train_scaled

array([[-1.31172014, -1.31172014, -0.39311205, ..., -0.17149859,
         0.02026518, -0.02026518],
       [-0.46579523, -0.46579523,  0.35168119, ..., -0.17149859,
         0.02026518, -0.02026518],
       [ 1.3364188 ,  1.3364188 ,  0.25400339, ..., -0.17149859,
         0.02026518, -0.02026518],
       ...,
       [ 1.67571549,  1.67571549, -1.34791257, ..., -0.17149859,
         0.02026518, -0.02026518],
       [ 1.67600634,  1.67600634, -0.23438563, ..., -0.17149859,
         0.02026518, -0.02026518],
       [ 1.67906533,  1.67906533, -0.23438563, ..., -0.17149859,
         0.02026518, -0.02026518]])

In [60]:
# Preview X_test_scaled, the scaled testing features (displayed as an array)
X_test_scaled

array([[-1.20255948, -1.20255948,  2.20755943, ..., -0.17149859,
         0.02026518, -0.02026518],
       [-1.62943343, -1.62943343, -1.11348584, ..., -0.17149859,
         0.02026518, -0.02026518],
       [-1.49837845, -1.49837845, -1.34791257, ..., -0.17149859,
         0.02026518, -0.02026518],
       ...,
       [-1.10927546, -1.10927546, -0.72277464, ..., -0.17149859,
         0.02026518, -0.02026518],
       [-1.10922531, -1.10922531, -0.91813024, ..., -0.17149859,
         0.02026518, -0.02026518],
       [-1.1091551 , -1.1091551 ,  1.23078141, ..., -0.17149859,
         0.02026518, -0.02026518]])

In [62]:
# Refit the existing Logistic Regression model on the scaled features and
# report its accuracy on the scaled training and testing sets.
classifier.fit(X_train_scaled, y_train)
scaled_train_score = classifier.score(X_train_scaled, y_train)
scaled_test_score = classifier.score(X_test_scaled, y_test)
print(f"Training Data Score: {scaled_train_score}")
print(f"Testing Data Score: {scaled_test_score}")

Training Data Score: 0.712807881773399
Testing Data Score: 0.7203317737133135


In [67]:
# Fit a 50-tree Random Forest (fixed random_state) on the scaled features
# and report its accuracy on the scaled training and testing sets.
clf = RandomForestClassifier(n_estimators=50, random_state=1)
clf = clf.fit(X_train_scaled, y_train)
scaled_train_score = clf.score(X_train_scaled, y_train)
scaled_test_score = clf.score(X_test_scaled, y_test)
print(f'Training Score: {scaled_train_score}')
print(f'Testing Score: {scaled_test_score}')

Training Score: 0.9999178981937603
Testing Score: 0.6337728626116547
