### Model notes:
- Only 36 mo product
- Use SMOTE to upsample minority class and gridsearch

#### Results:
loan approval rate: 0.60\
precision pred_1: 0.90\
recall pred_1: 0.64

In [76]:
#Installing imblearn
#!pip install -U imbalanced-learn

In [77]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
# pd.set_option('display.height', 500)
pd.set_option('display.max_rows', 25)
pd.set_option('display.max_columns', 50)

In [78]:
%%time
df = pd.read_parquet('../data/approved.parquet', engine='fastparquet')

Wall time: 3.4 s


In [79]:
df.sample(5)

Unnamed: 0_level_0,addr_state,annual_inc,application_type,disbursement_method,dti,earliest_cr_line,emp_length,emp_title,fico_range_high,fico_range_low,grade,home_ownership,initial_list_status,installment,int_rate,issue_d,loan_amnt,open_acc,pub_rec,pub_rec_bankruptcies,purpose,sub_grade,term,verification_status,zip_code,loan_status
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
1345430,VA,65000.0,Individual,Cash,26.4,1994-09-01,10,U S Army,689.0,685.0,C,RENT,f,626.72,15.31,2012-06-01,18000.0,6.0,1.0,0.0,debt_consolidation,C3,36,Verified,238xx,1
66006534,TX,65000.0,Individual,Cash,0.81,2008-12-01,1,SAFE Finance Mgr,679.0,675.0,C,MORTGAGE,f,102.06,13.67,2015-12-01,3000.0,6.0,0.0,0.0,home_improvement,C4,36,Source Verified,797xx,0
6815121,GA,36000.0,Individual,Cash,12.0,2005-01-01,3,Armstrong Atlantic,679.0,675.0,C,OWN,f,86.94,15.22,2013-08-01,2500.0,6.0,0.0,0.0,major_purchase,C3,36,Not Verified,314xx,1
22381900,OH,115000.0,Individual,Cash,10.52,1996-09-01,3,Divisional Director,669.0,665.0,C,RENT,w,456.67,14.99,2014-07-01,19200.0,13.0,2.0,2.0,credit_card,C5,60,Source Verified,430xx,1
55079869,GA,85000.0,Individual,Cash,15.87,1999-11-01,10,recruiter,669.0,665.0,C,RENT,w,503.18,12.69,2015-07-01,15000.0,15.0,1.0,1.0,debt_consolidation,C2,36,Source Verified,301xx,1


### Feature Engineering

In [80]:
df['days_cr_line']=df['issue_d']-df['earliest_cr_line']
#convert to int (https://www.codegrepper.com/code-examples/python/pandas+timedelta64+ns+to+float)
df['days_cr_line'] = (df['days_cr_line'] / np.timedelta64(1,'D')).astype(int)

### Convert Ordinals

In [81]:
# sub_grade
df.sub_grade.replace({'A1':1, 'A2':2, 'A3':3, 'A4':4, 'A5':5, 'B1':6, 'B2':7, 
                      'B3':8, 'B4':9, 'B5':10, 'C1':11, 'C2':12, 'C3':13, 'C4':14, 
                      'C5':15, 'D1':16, 'D2':17, 'D3':18, 'D4':19, 'D5':20, 'E1':21, 
                      'E2':22, 'E3':23, 'E4':24, 'E5':25}, inplace=True)

# disbursement_method
df.disbursement_method.replace({'Cash':0,'DirectPay':1},inplace=True)

# verification_status
df.verification_status.replace({'Source Verified':'Verified'},inplace=True)
df.verification_status.replace({'Not Verified':0,'Verified':1}, inplace=True)

# initial_list_status
df.initial_list_status.replace({'w':0,'f':1}, inplace=True)

# application_type
df.application_type.replace({'Individual':0,'Joint App':1}, inplace=True)

### Dummification

In [82]:
dummies=['purpose','addr_state','home_ownership']

for d in dummies:
    temp = pd.get_dummies(df[d], prefix=d, prefix_sep='__')
    temp = temp.drop(d+'__'+str(df[d].mode()[0]), axis=1)
    #drop the most common one (mode), making it the base case
    df = pd.concat([df.drop(d, axis=1),temp], axis=1)
    #merge back with main df

### Drop Columns

In [83]:
df.drop(['emp_title','zip_code','grade','issue_d','earliest_cr_line'],axis=1, inplace=True)
#remove earliest_cr_line when fixed

### Drop Rows

In [84]:
df.drop(df[df['emp_length']==-1].index, inplace=True)

### Filter for years

In [85]:
# df= df.loc[df['issue_d'].dt.year.between(2007,2014)]

### Drop the 60month product

In [86]:
df = df.loc[df['term']==36]
df.drop('term', axis=1, inplace=True)

# Modelling

### create target and features

In [87]:
target = df['loan_status']

In [88]:
target.value_counts()

1    801834
0    144112
Name: loan_status, dtype: int64

In [89]:
features = df.drop(['loan_status'], axis=1)

### note the reduced train_size of .10 = ~120,000 obs

In [90]:
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split, KFold

Xtrain, Xtest, ytrain, ytest = train_test_split(features, target, train_size=.10 )
logr = LogisticRegression()

In [91]:
# logr.get_params()

### SMOTE to oversample minority class

In [92]:
%%time
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state = 101)
X_oversample, y_oversample = smote.fit_resample(Xtrain, ytrain)

Wall time: 14 s


In [93]:
print(y_oversample.value_counts())
print(ytest.value_counts())

1    80115
0    80115
Name: loan_status, dtype: int64
1    721719
0    129633
Name: loan_status, dtype: int64


In [94]:
X_oversample, y_oversample = smote.fit_resample(Xtrain, ytrain)

### gridsearch

In [95]:
Cs = [0.0001,0.001,0.01,1,10,100,1000,10000]
params = [{'C':Cs, 'penalty':['l1','l2']}]
grid_logr = GridSearchCV(estimator=logr, param_grid=params, cv=3)

In [96]:
%%time
grid_logr.fit(X_oversample, y_oversample)

Wall time: 34.6 s


GridSearchCV(cv=3, estimator=LogisticRegression(),
             param_grid=[{'C': [0.0001, 0.001, 0.01, 1, 10, 100, 1000, 10000],
                          'penalty': ['l1', 'l2']}])

In [97]:
print(grid_logr.best_params_)
print(grid_logr.best_score_)

{'C': 100, 'penalty': 'l2'}
0.6288709979404605


In [98]:
best_model = grid_logr.best_estimator_
best_model.score(Xtest,ytest)

0.6397071951437243

In [99]:
coefs = pd.DataFrame(abs(best_model.coef_).T, index=features.columns)

In [100]:
coefs.sort_values(0, ascending=False).head(10)

Unnamed: 0,0
installment,0.019272
sub_grade,0.003677
int_rate,0.003144
dti,0.002632
open_acc,0.00081
loan_amnt,0.000637
emp_length,0.000393
initial_list_status,0.000239
purpose__credit_card,0.000229
home_ownership__RENT,0.000149


### Confusion Matrix

In [101]:
from sklearn.metrics import confusion_matrix

In [102]:
cmat = confusion_matrix(ytest,best_model.predict(Xtest))

#Label the confusion matrix
pd.DataFrame(cmat, columns = [f'Pred_{label}' for label in best_model.classes_],
             index = [f'True_{label}' for label in best_model.classes_])


Unnamed: 0,Pred_0,Pred_1
True_0,76703,52930
True_1,253806,467913


### Best model is one that strikes good balance of a high "pred_1 precision" and a high "loan approval rate"

In [103]:
cmat=cmat
print(f'loan approval rate: {sum(cmat[:,1])/sum(sum(cmat)):.2f}')
print(f'precision pred_1: {cmat[1][1]/sum(cmat[:,1]):.2f}')
print(f'recall pred_1: {cmat[1][1]/sum(cmat[1,:]):.2f}')

loan approval rate: 0.61
precision pred_1: 0.90
recall pred_1: 0.65


In [104]:
from sklearn.metrics import classification_report
print(classification_report(ytest, best_model.predict(Xtest)))

              precision    recall  f1-score   support

           0       0.23      0.59      0.33    129633
           1       0.90      0.65      0.75    721719

    accuracy                           0.64    851352
   macro avg       0.57      0.62      0.54    851352
weighted avg       0.80      0.64      0.69    851352



### change the threshold to improve the FP score

In [105]:
probs_positive_class = best_model.predict_proba(Xtest)[:, 1]
prediction = probs_positive_class > .6

In [106]:
cmat2 = confusion_matrix(ytest, prediction, labels=[0, 1])
pd.DataFrame(cmat2, columns = [f'Pred_{label}' for label in best_model.classes_],
             index = [f'True_{label}' for label in best_model.classes_])

Unnamed: 0,Pred_0,Pred_1
True_0,114197,15436
True_1,510447,211272


In [107]:
from sklearn.metrics import classification_report
print(classification_report(ytest, prediction))

              precision    recall  f1-score   support

           0       0.18      0.88      0.30    129633
           1       0.93      0.29      0.45    721719

    accuracy                           0.38    851352
   macro avg       0.56      0.59      0.37    851352
weighted avg       0.82      0.38      0.42    851352



In [108]:
cmat=cmat2
print(f'loan approval rate: {sum(cmat[:,1])/sum(sum(cmat)):.2f}')
print(f'precision pred_1: {cmat[1][1]/sum(cmat[:,1]):.2f}')
print(f'recall pred_1: {cmat[1][1]/sum(cmat[1,:]):.2f}')

loan approval rate: 0.27
precision pred_1: 0.93
recall pred_1: 0.29


## ROC Curve

In [None]:
from sklearn.metrics import roc_curve

In [None]:
# predict probabilities
yhat = logr.predict_proba(Xtest)

In [None]:
# keep probabilities for the positive outcome only
yhat = yhat[:, 1]

In [None]:
# calculate roc curves
fpr, tpr, thresholds = roc_curve(ytest, yhat)

In [None]:
# plot the roc curve for the model
plt.plot([0,1], [0,1], linestyle='--', label='No Skill')
plt.plot(fpr, tpr, marker='.', label='Logistic')
# axis labels
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
# show the plot
plt.show()

In [None]:
# calculate roc curves
fpr, tpr, thresholds = roc_curve(ytest, prediction)

In [None]:
# plot the roc curve for the model
plt.plot([0,1], [0,1], linestyle='--', label='No Skill')
plt.plot(fpr, tpr, marker='.', label='Logistic')
# axis labels
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
# show the plot
plt.show()

### SGD classifier

### Save model in pickle

In [None]:
filename = 'grid_logr_model.sav'
pickle.dump(grid_logr, open(filename, 'wb'))

In [None]:
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.score(Xtest, ytest)
print(result)