#### Simple logistic regression with accuracy of ~.80 with an a FP of 221,552

#### key next step: how to improve the FP score? Change the proba threshold value, check the ROC curve for assistance
#### note: df['days_cr_line'] used in this model

In [5]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
# pd.set_option('display.height', 500)
pd.set_option('display.max_rows', 25)
pd.set_option('display.max_columns', 50)

In [6]:
%%time
df = pd.read_parquet('../data/approved.parquet', engine='fastparquet')

Wall time: 6.2 s


In [7]:
df.sample(5)

Unnamed: 0_level_0,addr_state,annual_inc,application_type,disbursement_method,dti,earliest_cr_line,emp_length,emp_title,fico_range_high,fico_range_low,grade,home_ownership,initial_list_status,installment,int_rate,issue_d,loan_amnt,open_acc,pub_rec,pub_rec_bankruptcies,purpose,sub_grade,term,verification_status,zip_code,loan_status
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
37640560,TX,35000.0,Individual,Cash,24.04,2011-07-01,3,Office Manager,704.0,700.0,C,RENT,f,207.97,14.99,2014-12-01,6000.0,8.0,0.0,0.0,debt_consolidation,C5,36,Source Verified,770xx,1
5645070,FL,33000.0,Individual,Cash,29.75,2002-01-01,10,wal mart,689.0,685.0,C,RENT,f,315.1,15.31,2013-06-01,9050.0,8.0,0.0,0.0,credit_card,C2,36,Verified,323xx,0
61438898,CA,85000.0,Individual,Cash,14.68,1991-02-01,10,Sales manager,679.0,675.0,C,RENT,w,1207.3,14.65,2015-10-01,35000.0,7.0,0.0,0.0,debt_consolidation,C5,36,Source Verified,945xx,1
94255259,NY,93000.0,Individual,Cash,2.98,1988-03-01,10,Executive assistant,674.0,670.0,C,RENT,f,34.18,13.99,2016-11-01,1000.0,5.0,0.0,0.0,other,C3,36,Verified,113xx,1
35663407,SC,50000.0,Individual,Cash,15.07,2001-06-01,10,Medical Line Operator,789.0,785.0,B,MORTGAGE,f,160.15,9.49,2014-11-01,5000.0,11.0,0.0,0.0,other,B2,36,Not Verified,293xx,1


### Feature Engineering

In [8]:
df['days_cr_line']=df['issue_d']-df['earliest_cr_line']
#convert to int (https://www.codegrepper.com/code-examples/python/pandas+timedelta64+ns+to+float)
df['days_cr_line'] = (df['days_cr_line'] / np.timedelta64(1,'D')).astype(int)

### Convert Ordinals

In [9]:
# sub_grade
df.sub_grade.replace({'A1':1, 'A2':2, 'A3':3, 'A4':4, 'A5':5, 'B1':6, 'B2':7, 
                      'B3':8, 'B4':9, 'B5':10, 'C1':11, 'C2':12, 'C3':13, 'C4':14, 
                      'C5':15, 'D1':16, 'D2':17, 'D3':18, 'D4':19, 'D5':20, 'E1':21, 
                      'E2':22, 'E3':23, 'E4':24, 'E5':25}, inplace=True)

# disbursement_method
df.disbursement_method.replace({'Cash':0,'DirectPay':1},inplace=True)

# verification_status
df.verification_status.replace({'Source Verified':'Verified'},inplace=True)
df.verification_status.replace({'Not Verified':0,'Verified':1}, inplace=True)

# initial_list_status
df.initial_list_status.replace({'w':0,'f':1}, inplace=True)

# application_type
df.application_type.replace({'Individual':0,'Joint App':1}, inplace=True)

### Dummification

In [10]:
dummies=['purpose','addr_state','home_ownership']

for d in dummies:
    temp = pd.get_dummies(df[d], prefix=d, prefix_sep='__')
    temp = temp.drop(d+'__'+str(df[d].mode()[0]), axis=1)
    #drop the most common one (mode), making it the base case
    df = pd.concat([df.drop(d, axis=1),temp], axis=1)
    #merge back with main df

### Drop Columns

In [11]:
df.drop(['emp_title','zip_code','grade','issue_d','earliest_cr_line'],axis=1, inplace=True)
#remove earliest_cr_line when fixed

### Drop Rows

In [12]:
df.drop(df[df['emp_length']==-1].index, inplace=True)

# Modelling

### create target and features

In [13]:
target = df['loan_status']

In [14]:
features = df.drop(['loan_status'], axis=1)

### note the reduced train_size of .10 = ~120,000 obs

In [15]:
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split, KFold

Xtrain, Xtest, ytrain, ytest = train_test_split(features, target, train_size=.10 )
logr = LogisticRegression()
logr_bal = LogisticRegression(class_weight='balanced')

In [38]:
# propotion of train set that is the default class
print(f'default (train): {ytrain.value_counts()[0]/np.sum(ytrain.value_counts()):.2f}')
print(f'paid-down (train): {ytrain.value_counts()[1]/np.sum(ytrain.value_counts()):.2f}')
print(f'default (test): {ytest.value_counts()[0]/np.sum(ytest.value_counts()):.2f}')
print(f'paid-down (test): {ytest.value_counts()[1]/np.sum(ytest.value_counts()):.2f}')

default (train): 0.19
paid-down (train): 0.81
default (test): 0.19
paid-down (test): 0.81


In [41]:
print(ytrain.value_counts())
print(ytest.value_counts())

1    99928
0    22774
Name: loan_status, dtype: int64
1    897706
0    206614
Name: loan_status, dtype: int64


In [17]:
%%time
logr.fit(Xtrain, ytrain)

Wall time: 1.59 s


LogisticRegression()

In [19]:
print(logr.score(Xtrain,ytrain))
print(logr.score(Xtest,ytest))

0.8137112679499927
0.8126593740944654


In [20]:
coefs = pd.DataFrame(abs(logr.coef_).T, index=features.columns)

In [21]:
coefs.sort_values(0, ascending=False).head(10)

Unnamed: 0,0
sub_grade,0.038166
term,0.034149
int_rate,0.027782
dti,0.022185
open_acc,0.006699
fico_range_low,0.003223
fico_range_high,0.002659
installment,0.001806
verification_status,0.000914
home_ownership__RENT,0.000902


### Confusion Matrix

In [22]:
from sklearn.metrics import confusion_matrix

In [51]:
cmat = confusion_matrix(y,logr.predict(features))

#Label the confusion matrix
pd.DataFrame(cmat, columns = [f'Pred_{label}' for label in logr.classes_],
             index = [f'True_{label}' for label in logr.classes_])


Unnamed: 0,Pred_0,Pred_1
True_0,7836,221552
True_1,8403,989231


In [79]:
best_model.classes_

array([0, 1])

In [80]:
target.value_counts()

1    997634
0    229388
Name: loan_status, dtype: int64

### change the threshold to improve the FP score

In [43]:
probs_positive_class = logr.predict_proba(Xtest)[:, 1]
prediction = probs_positive_class > .9

In [47]:
cmat2 = confusion_matrix(ytest, prediction, labels=[0, 1])
pd.DataFrame(cmat2, columns = [f'Pred_{label}' for label in logr.classes_],
             index = [f'True_{label}' for label in logr.classes_])

Unnamed: 0,Pred_0,Pred_1
True_0,192577,14037
True_1,685377,212329


In [50]:
logr.predict_proba(Xtest)[:5]
# [:, 1]

array([[0.19569081, 0.80430919],
       [0.36302736, 0.63697264],
       [0.04864295, 0.95135705],
       [0.12750868, 0.87249132],
       [0.089395  , 0.910605  ]])

In [51]:
logr.predict(Xtest)[:5]

array([1, 1, 1, 1, 1])

In [52]:
prediction[:5]

array([False, False,  True, False,  True])

### SGD classifier

### Save model in pickle

In [222]:
filename = 'grid_logr_model.sav'
pickle.dump(grid_logr, open(filename, 'wb'))

NameError: name 'pickle' is not defined

In [None]:
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.score(Xtest, ytest)
print(result)