In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier

In [3]:
train_df = pd.read_csv(Path('../Resources/2019loans.csv'))
test_df = pd.read_csv(Path('../Resources/2020Q1loans.csv'))

### Prediction. Based on some reading at https://www.numpyninja.com/post/feature-scaling

The models which calculate some kind of distance as part of the algorithm needs the data to be scaled.

Example: Linear Regression, Logistic Regression, SVM, KNN, K-Means clustering, PCA etc.

Tree based models where each node is split based on the condition doesn’t need the features to be scaled because the model accuracy don’t depend on the range. Moreover, if we scale the features here to the range 0 to 1 then many values are decimal values near to each other and constructing the tree takes more time.

Example: Decision Trees, Random Forest, XGBoost etc.


#### Therefore the scaling will significantly affect the training of the logistic regression model but not the random forest model.

* Random Forest Model will work better on unscaled data.
* Logistic Regression Model will work better on scaled data.


In [3]:
#preview data
train_df.head()

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,57107,57107,13375.0,0.1797,483.34,MORTGAGE,223000.0,Not Verified,low_risk,n,...,100.0,50.0,0.0,0.0,577150.0,122018.0,32000.0,170200.0,N,N
1,141451,141451,21000.0,0.1308,478.68,MORTGAGE,123000.0,Source Verified,low_risk,n,...,85.0,33.3,0.0,0.0,132750.0,27896.0,15900.0,35398.0,N,N
2,321143,321143,20000.0,0.124,448.95,MORTGAGE,197000.0,Source Verified,low_risk,n,...,85.7,33.3,0.0,0.0,628160.0,114043.0,22600.0,90340.0,N,N
3,11778,11778,3000.0,0.124,100.22,RENT,45000.0,Not Verified,low_risk,n,...,100.0,16.7,1.0,0.0,42006.0,20761.0,19900.0,15406.0,N,N
4,169382,169382,30000.0,0.1612,1056.49,MORTGAGE,133000.0,Source Verified,low_risk,n,...,100.0,66.7,0.0,0.0,283248.0,109056.0,79500.0,58778.0,N,N


In [4]:
test_df.head()

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,67991,67991,40000.0,0.0819,814.7,MORTGAGE,140000.0,Not Verified,low_risk,n,...,97.7,0.0,0.0,0.0,527975.0,70914.0,74600.0,99475.0,N,N
1,25429,25429,6000.0,0.1524,208.7,RENT,55000.0,Not Verified,low_risk,n,...,66.7,0.0,0.0,0.0,34628.0,23460.0,5900.0,23628.0,N,N
2,38496,38496,3600.0,0.1695,128.27,RENT,42000.0,Not Verified,low_risk,n,...,100.0,0.0,0.0,0.0,23100.0,19183.0,7300.0,15000.0,N,N
3,19667,19667,20000.0,0.1524,478.33,RENT,100000.0,Not Verified,low_risk,n,...,100.0,50.0,0.0,0.0,56481.0,43817.0,13800.0,35981.0,N,N
4,37505,37505,3600.0,0.124,120.27,RENT,50000.0,Not Verified,low_risk,n,...,100.0,25.0,0.0,0.0,45977.0,32448.0,21000.0,24977.0,N,N


In [5]:
## in train set drop the unnamed and index columns which appear to be identifiers and not features
train_df = train_df.drop(["Unnamed: 0","index"], axis=1)
train_df.head()

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,dti,delinq_2yrs,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,13375.0,0.1797,483.34,MORTGAGE,223000.0,Not Verified,low_risk,n,29.99,0.0,...,100.0,50.0,0.0,0.0,577150.0,122018.0,32000.0,170200.0,N,N
1,21000.0,0.1308,478.68,MORTGAGE,123000.0,Source Verified,low_risk,n,11.26,2.0,...,85.0,33.3,0.0,0.0,132750.0,27896.0,15900.0,35398.0,N,N
2,20000.0,0.124,448.95,MORTGAGE,197000.0,Source Verified,low_risk,n,11.28,0.0,...,85.7,33.3,0.0,0.0,628160.0,114043.0,22600.0,90340.0,N,N
3,3000.0,0.124,100.22,RENT,45000.0,Not Verified,low_risk,n,18.08,0.0,...,100.0,16.7,1.0,0.0,42006.0,20761.0,19900.0,15406.0,N,N
4,30000.0,0.1612,1056.49,MORTGAGE,133000.0,Source Verified,low_risk,n,27.77,0.0,...,100.0,66.7,0.0,0.0,283248.0,109056.0,79500.0,58778.0,N,N


In [6]:
## in test set drop the unnamed and index columns which appear to be identifiers and not features
test_df = test_df.drop(["Unnamed: 0","index"], axis=1)
test_df.head()

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,dti,delinq_2yrs,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,40000.0,0.0819,814.7,MORTGAGE,140000.0,Not Verified,low_risk,n,19.75,0.0,...,97.7,0.0,0.0,0.0,527975.0,70914.0,74600.0,99475.0,N,N
1,6000.0,0.1524,208.7,RENT,55000.0,Not Verified,low_risk,n,11.52,2.0,...,66.7,0.0,0.0,0.0,34628.0,23460.0,5900.0,23628.0,N,N
2,3600.0,0.1695,128.27,RENT,42000.0,Not Verified,low_risk,n,6.74,0.0,...,100.0,0.0,0.0,0.0,23100.0,19183.0,7300.0,15000.0,N,N
3,20000.0,0.1524,478.33,RENT,100000.0,Not Verified,low_risk,n,12.13,0.0,...,100.0,50.0,0.0,0.0,56481.0,43817.0,13800.0,35981.0,N,N
4,3600.0,0.124,120.27,RENT,50000.0,Not Verified,low_risk,n,16.08,0.0,...,100.0,25.0,0.0,0.0,45977.0,32448.0,21000.0,24977.0,N,N


In [7]:
# Convert categorical data to numeric and separate target feature for training data

X_train = train_df.drop("loan_status", axis=1)
X_train_encoded = pd.get_dummies(X_train)
y_train_encoded = LabelEncoder().fit_transform(train_df["loan_status"])
X_train_encoded.head()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,debt_settlement_flag_Y
0,13375.0,0.1797,483.34,223000.0,29.99,0.0,0.0,15.0,0.0,39728.0,...,0,1,0,1,1,0,1,0,1,0
1,21000.0,0.1308,478.68,123000.0,11.26,2.0,0.0,16.0,0.0,9585.0,...,0,1,0,1,1,0,1,0,1,0
2,20000.0,0.124,448.95,197000.0,11.28,0.0,0.0,12.0,0.0,16708.0,...,0,1,0,1,1,0,1,0,1,0
3,3000.0,0.124,100.22,45000.0,18.08,0.0,0.0,12.0,1.0,8809.0,...,0,1,0,1,1,0,1,0,1,0
4,30000.0,0.1612,1056.49,133000.0,27.77,0.0,2.0,13.0,0.0,65420.0,...,0,1,0,1,1,0,1,0,1,0


In [8]:
# Convert categorical data to numeric and separate target feature for testing data
X_test = test_df.drop("loan_status", axis=1)
X_test_encoded = pd.get_dummies(X_test)
y_test_encoded = LabelEncoder().fit_transform(test_df["loan_status"])
X_test_encoded.head()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,verification_status_Source Verified,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N
0,40000.0,0.0819,814.7,140000.0,19.75,0.0,1.0,18.0,0.0,9471.0,...,0,0,1,0,1,1,0,1,0,1
1,6000.0,0.1524,208.7,55000.0,11.52,2.0,0.0,8.0,0.0,1280.0,...,0,0,1,0,1,1,0,1,0,1
2,3600.0,0.1695,128.27,42000.0,6.74,0.0,0.0,6.0,0.0,4757.0,...,0,0,1,0,1,1,0,1,0,1
3,20000.0,0.1524,478.33,100000.0,12.13,0.0,2.0,7.0,0.0,12731.0,...,0,0,1,0,1,1,0,1,0,1
4,3600.0,0.124,120.27,50000.0,16.08,0.0,3.0,6.0,0.0,10413.0,...,0,0,1,0,1,1,0,1,0,1


In [9]:
# add missing dummy variables to testing set
cols = X_train_encoded.columns.tolist()
X_test_encoded = X_test_encoded.reindex(columns=cols).fillna(0)
X_test_encoded.head()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,debt_settlement_flag_Y
0,40000.0,0.0819,814.7,140000.0,19.75,0.0,1.0,18.0,0.0,9471.0,...,0,1,0,1,1,0,1,0,1,0.0
1,6000.0,0.1524,208.7,55000.0,11.52,2.0,0.0,8.0,0.0,1280.0,...,0,1,0,1,1,0,1,0,1,0.0
2,3600.0,0.1695,128.27,42000.0,6.74,0.0,0.0,6.0,0.0,4757.0,...,0,1,0,1,1,0,1,0,1,0.0
3,20000.0,0.1524,478.33,100000.0,12.13,0.0,2.0,7.0,0.0,12731.0,...,0,1,0,1,1,0,1,0,1,0.0
4,3600.0,0.124,120.27,50000.0,16.08,0.0,3.0,6.0,0.0,10413.0,...,0,1,0,1,1,0,1,0,1,0.0


In [10]:
# Train the Logistic Regression model on the unscaled data and print the model score
# create logistic regression model
lr_clf = LogisticRegression(max_iter = 400)

#fit model using the training data
lr_clf.fit(X_train_encoded, y_train_encoded)

print(f"Training Data Score: {lr_clf.score(X_train_encoded, y_train_encoded)}")
print(f"Testing Data Score: {lr_clf.score(X_test_encoded, y_test_encoded)}")

Training Data Score: 0.6850574712643678
Testing Data Score: 0.5493407060825181


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [11]:
# Train a Random Forest Classifier model and print the model score
rf_clf = RandomForestClassifier(random_state=1, n_estimators=500)
rf_clf.fit(X_train_encoded, y_train_encoded)

print(f'Training Score: {rf_clf.score(X_train_encoded, y_train_encoded)}')
print(f'Testing Score: {rf_clf.score(X_test_encoded, y_test_encoded)}')

Training Score: 1.0
Testing Score: 0.6433432581880051


In [12]:
# Scale the train data
# Scaling the X data by using StandardScaler()
scaler = StandardScaler().fit(X_train_encoded)
X_train_scaled = scaler.transform(X_train_encoded)
X_train_scaled

array([[-0.39311205,  0.73658452, -0.08760946, ..., -0.17149859,
         0.02026518, -0.02026518],
       [ 0.35168119, -0.19171582, -0.10342722, ..., -0.17149859,
         0.02026518, -0.02026518],
       [ 0.25400339, -0.32080462, -0.20434179, ..., -0.17149859,
         0.02026518, -0.02026518],
       ...,
       [-1.34791257,  0.85997823, -1.28263075, ..., -0.17149859,
         0.02026518, -0.02026518],
       [-0.23438563, -1.00231755, -0.11361032, ..., -0.17149859,
         0.02026518, -0.02026518],
       [-0.23438563,  0.69292214,  0.10586953, ..., -0.17149859,
         0.02026518, -0.02026518]])

In [13]:
#scale test data
scaler = StandardScaler().fit(X_test_encoded)
X_test_scaled = scaler.transform(X_test_encoded)
X_test_scaled

array([[ 2.27394814, -1.13282185,  1.06131775, ..., -0.1323959 ,
         0.        ,  0.        ],
       [-1.08518753,  0.08480141, -1.00546758, ..., -0.1323959 ,
         0.        ,  0.        ],
       [-1.32230299,  0.38013981, -1.27977706, ..., -0.1323959 ,
         0.        ,  0.        ],
       ...,
       [-0.6899951 ,  1.43368618, -0.39614106, ..., -0.1323959 ,
         0.        ,  0.        ],
       [-0.88759132,  0.66856972, -1.0151535 , ..., -0.1323959 ,
         0.        ,  0.        ],
       [ 1.28596706,  1.00190488,  2.11394584, ..., -0.1323959 ,
         0.        ,  0.        ]])

In [14]:
# Train the Logistic Regression model on the scaled data and print the model score
lr_clf = LogisticRegression(max_iter = 400)

#fit model using the training data
lr_clf.fit(X_train_scaled, y_train_encoded)

print(f"Training Data Score: {lr_clf.score(X_train_scaled, y_train_encoded)}")
print(f"Testing Data Score: {lr_clf.score(X_test_scaled, y_test_encoded)}")

Training Data Score: 0.7079638752052545
Testing Data Score: 0.662484049340706


In [15]:
# Train a Random Forest Classifier model on the scaled data and print the model score
rf_clf = RandomForestClassifier(random_state=1, n_estimators=500)
rf_clf.fit(X_train_scaled, y_train_encoded)

print(f'Training Score: {rf_clf.score(X_train_scaled, y_train_encoded)}')
print(f'Testing Score: {rf_clf.score(X_test_scaled, y_test_encoded)}')

Training Score: 1.0
Testing Score: 0.560187154402382


### Predicted vs Actual - on Unscaled Data and Scaled Data

#### Logistic Regression
Unscaled Scores:
* Training Data Score: 0.6850574712643678
* Testing Data Score: 0.5493407060825181

Scaled Scores:
* Training Data Score: 0.7079638752052545
* Testing Data Score: 0.662484049340706

######  For the logsitic regression model, we can see the scaled data produced a better training result and was able to accurately predict the label on test data more often compared to the unscaled data.

#### Random Forest Classifier
Unscaled Scores:
* Training Score: 1.0
* Testing Score: 0.6433432581880051

Scaled Scores:
* Training Score: 1.0
* Testing Score: 0.560187154402382

######  For the random forest model, we can see the scaled data had no effect on the training result and actually reduced the % of accurate label predictions on the test data