In [27]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

In [28]:
train_df = pd.read_csv(Path('Resources/2019loans.csv'))
test_df = pd.read_csv(Path('Resources/2020Q1loans.csv'))

In [29]:
# combining dataframes
combined=pd.concat([train_df,test_df])
combined

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,57107,57107,13375.0,0.1797,483.34,MORTGAGE,223000.0,Not Verified,low_risk,n,...,100.0,50.0,0.0,0.0,577150.0,122018.0,32000.0,170200.0,N,N
1,141451,141451,21000.0,0.1308,478.68,MORTGAGE,123000.0,Source Verified,low_risk,n,...,85.0,33.3,0.0,0.0,132750.0,27896.0,15900.0,35398.0,N,N
2,321143,321143,20000.0,0.1240,448.95,MORTGAGE,197000.0,Source Verified,low_risk,n,...,85.7,33.3,0.0,0.0,628160.0,114043.0,22600.0,90340.0,N,N
3,11778,11778,3000.0,0.1240,100.22,RENT,45000.0,Not Verified,low_risk,n,...,100.0,16.7,1.0,0.0,42006.0,20761.0,19900.0,15406.0,N,N
4,169382,169382,30000.0,0.1612,1056.49,MORTGAGE,133000.0,Source Verified,low_risk,n,...,100.0,66.7,0.0,0.0,283248.0,109056.0,79500.0,58778.0,N,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4697,77282,77282,30000.0,0.1240,673.42,RENT,140480.0,Source Verified,high_risk,n,...,100.0,28.6,0.0,0.0,159688.0,110873.0,48400.0,107388.0,N,N
4698,77291,77291,24000.0,0.0756,747.22,RENT,50000.0,Not Verified,high_risk,n,...,100.0,0.0,0.0,0.0,62375.0,18928.0,13300.0,30775.0,N,N
4699,77292,77292,10000.0,0.2305,387.36,RENT,33000.0,Verified,high_risk,n,...,100.0,0.0,0.0,0.0,43250.0,33022.0,8500.0,29550.0,N,N
4700,77297,77297,8000.0,0.1862,205.86,RENT,38000.0,Source Verified,high_risk,n,...,95.0,0.0,1.0,0.0,31357.0,19595.0,1500.0,9657.0,N,N


In [30]:
# Convert categorical data to numeric and separate target feature for training data
cnvt_num=pd.get_dummies(combined)
cnvt_num.columns

Index(['Unnamed: 0', 'index', 'loan_amnt', 'int_rate', 'installment',
       'annual_inc', 'dti', 'delinq_2yrs', 'inq_last_6mths', 'open_acc',
       'pub_rec', 'revol_bal', 'total_acc', 'out_prncp', 'out_prncp_inv',
       'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int',
       'total_rec_late_fee', 'recoveries', 'collection_recovery_fee',
       'last_pymnt_amnt', 'collections_12_mths_ex_med', 'policy_code',
       'acc_now_delinq', 'tot_coll_amt', 'tot_cur_bal', 'open_acc_6m',
       'open_act_il', 'open_il_12m', 'open_il_24m', 'mths_since_rcnt_il',
       'total_bal_il', 'il_util', 'open_rv_12m', 'open_rv_24m', 'max_bal_bc',
       'all_util', 'total_rev_hi_lim', 'inq_fi', 'total_cu_tl', 'inq_last_12m',
       'acc_open_past_24mths', 'avg_cur_bal', 'bc_open_to_buy', 'bc_util',
       'chargeoff_within_12_mths', 'delinq_amnt', 'mo_sin_old_il_acct',
       'mo_sin_old_rev_tl_op', 'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl',
       'mort_acc', 'mths_since_recent_bc'

In [31]:
# Convert categorical data to numeric and separate target feature for testing data
X=cnvt_num.drop(columns='loan_status_low_risk')
y=cnvt_num['loan_status_low_risk']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=38)

In [32]:
# Train the Logistic Regression model on the unscaled data and print the model score
clf = LogisticRegression(random_state=38,max_iter=1200000).fit(X_train, y_train)
clf_acc=clf.score(X_test,y_test)
print(f"Trained Data Score: {clf_acc}")

Trained Data Score: 0.7374371859296482


In [33]:
# Train a Random Forest Classifier model and print the model score
rfclf = RandomForestClassifier(max_depth=2, random_state=0).fit(X_train, y_train)
rfclf_acc=rfclf.score(X_test,y_test)
print(f"Trained Data Score: {rfclf_acc}")

Trained Data Score: 0.9847451543431442


In [34]:
# Scale the data
X_train_scaled=StandardScaler().fit_transform(X_train)
X_test_scaled=StandardScaler().fit_transform(X_test)

In [35]:
# Train the Logistic Regression model on the scaled data and print the model score
clf_scaled = LogisticRegression(random_state=38,max_iter=1200000).fit(X_train_scaled, y_train)
clf_acc_scaled=clf.score(X_test_scaled,y_test)
print(f"Trained Data Score: {clf_acc_scaled}")

Trained Data Score: 0.6956209619526202




In [36]:
# Train a Random Forest Classifier model on the scaled data and print the model score
rfclf = RandomForestClassifier(max_depth=2, random_state=0).fit(X_train_scaled, y_train)
rfclf_acc=rfclf.score(X_test_scaled,y_test)
print(f"Trained Data Score: {rfclf_acc}")

Trained Data Score: 0.9868987796123474
