### Model notes:
- Only 36 mo product
- Use SMOTE to upsample minority class and gridsearch

#### Results:
loan approval rate: 0.60\
precision pred_1: 0.90\
recall pred_1: 0.64

In [1]:
#Installing imblearn
#!pip install -U imbalanced-learn

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
# pd.set_option('display.height', 500)
pd.set_option('display.max_rows', 25)
pd.set_option('display.max_columns', 50)

In [3]:
%%time
df = pd.read_parquet('../data/approved.parquet', engine='fastparquet')

Wall time: 13 s


In [4]:
df.sample(5)

Unnamed: 0_level_0,addr_state,annual_inc,application_type,disbursement_method,dti,earliest_cr_line,emp_length,emp_title,fico_range_high,fico_range_low,grade,home_ownership,initial_list_status,installment,int_rate,issue_d,loan_amnt,open_acc,pub_rec,pub_rec_bankruptcies,purpose,sub_grade,term,verification_status,zip_code,loan_status
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
55707197,FL,47000.0,Individual,Cash,22.88,2002-09-01,10,Teacher,669.0,665.0,D,MORTGAGE,w,406.47,15.61,2015-07-01,11625.0,8.0,0.0,0.0,debt_consolidation,D1,36,Not Verified,337xx,1
88999857,OK,45000.0,Individual,Cash,14.77,2007-04-01,5,Claims clerical supervisor,694.0,690.0,B,MORTGAGE,f,305.24,8.99,2016-09-01,9600.0,8.0,0.0,0.0,credit_card,B1,36,Source Verified,731xx,1
50605719,KY,104000.0,Individual,Cash,24.83,1993-04-01,10,Captain,679.0,675.0,A,MORTGAGE,w,563.15,7.89,2015-06-01,18000.0,8.0,0.0,0.0,debt_consolidation,A5,36,Not Verified,401xx,1
2054569,GA,45600.0,Individual,Cash,5.05,1999-12-01,-1,,704.0,700.0,B,MORTGAGE,f,199.3,11.14,2012-11-01,6075.0,6.0,0.0,0.0,debt_consolidation,B2,36,Source Verified,301xx,1
120247566,MO,68508.0,Individual,Cash,11.75,2004-09-01,4,Web Development Manager,674.0,670.0,C,RENT,f,169.9,13.59,2017-10-01,5000.0,9.0,0.0,0.0,credit_card,C2,36,Not Verified,651xx,1


### Feature Engineering

In [5]:
df['days_cr_line']=df['issue_d']-df['earliest_cr_line']
#convert to int (https://www.codegrepper.com/code-examples/python/pandas+timedelta64+ns+to+float)
df['days_cr_line'] = (df['days_cr_line'] / np.timedelta64(1,'D')).astype(int)

### Convert Ordinals

In [6]:
# sub_grade
df.sub_grade.replace({'A1':1, 'A2':2, 'A3':3, 'A4':4, 'A5':5, 'B1':6, 'B2':7, 
                      'B3':8, 'B4':9, 'B5':10, 'C1':11, 'C2':12, 'C3':13, 'C4':14, 
                      'C5':15, 'D1':16, 'D2':17, 'D3':18, 'D4':19, 'D5':20, 'E1':21, 
                      'E2':22, 'E3':23, 'E4':24, 'E5':25}, inplace=True)

# disbursement_method
df.disbursement_method.replace({'Cash':0,'DirectPay':1},inplace=True)

# verification_status
df.verification_status.replace({'Source Verified':'Verified'},inplace=True)
df.verification_status.replace({'Not Verified':0,'Verified':1}, inplace=True)

# initial_list_status
df.initial_list_status.replace({'w':0,'f':1}, inplace=True)

# application_type
df.application_type.replace({'Individual':0,'Joint App':1}, inplace=True)

### Dummification

In [7]:
dummies=['purpose','addr_state','home_ownership']

for d in dummies:
    temp = pd.get_dummies(df[d], prefix=d, prefix_sep='__')
    temp = temp.drop(d+'__'+str(df[d].mode()[0]), axis=1)
    #drop the most common one (mode), making it the base case
    df = pd.concat([df.drop(d, axis=1),temp], axis=1)
    #merge back with main df

### Filter for years

In [8]:
df= df.loc[df['issue_d'].dt.year.between(2007,2015)]

In [9]:
df.issue_d.dt.year.count()

800364

### Drop Columns

In [10]:
df.drop(['emp_title','zip_code','grade','issue_d','earliest_cr_line'],axis=1, inplace=True)
#remove earliest_cr_line when fixed

### Drop Rows

In [11]:
df.drop(df[df['emp_length']==-1].index, inplace=True)

### Drop the 60month product

In [12]:
df = df.loc[df['term']==36]
df.drop('term', axis=1, inplace=True)

# Modelling

### create target and features

In [13]:
target = df['loan_status']

In [14]:
target.value_counts()

1    500426
0     77022
Name: loan_status, dtype: int64

In [15]:
features = df.drop(['loan_status'], axis=1)

### note the reduced train_size of .10 = ~120,000 obs

In [16]:
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split, KFold

Xtrain, Xtest, ytrain, ytest = train_test_split(features, target, train_size=.10 )
logr = LogisticRegression()

In [17]:
# logr.get_params()

### SMOTE to oversample minority class

In [18]:
%%time
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state = 101)
X_oversample, y_oversample = smote.fit_resample(Xtrain, ytrain)

Wall time: 3.22 s


In [19]:
print(y_oversample.value_counts())
print(ytest.value_counts())

1    49860
0    49860
Name: loan_status, dtype: int64
1    450566
0     69138
Name: loan_status, dtype: int64


In [20]:
X_oversample, y_oversample = smote.fit_resample(Xtrain, ytrain)

### gridsearch

In [21]:
Cs = [0.0001,0.001,0.01,1,10,100,1000,10000]
params = [{'C':Cs, 'penalty':['l1','l2']}]
grid_logr = GridSearchCV(estimator=logr, param_grid=params, cv=3)

In [22]:
%%time
grid_logr.fit(X_oversample, y_oversample)

Wall time: 18.9 s


GridSearchCV(cv=3, estimator=LogisticRegression(),
             param_grid=[{'C': [0.0001, 0.001, 0.01, 1, 10, 100, 1000, 10000],
                          'penalty': ['l1', 'l2']}])

In [23]:
print(grid_logr.best_params_)
print(grid_logr.best_score_)

{'C': 0.0001, 'penalty': 'l2'}
0.6258724428399519


In [24]:
best_model = grid_logr.best_estimator_
best_model.score(Xtest,ytest)

0.6108477133137324

In [25]:
coefs = pd.DataFrame(abs(best_model.coef_).T, index=features.columns)

In [26]:
coefs.sort_values(0, ascending=False).head(10)

Unnamed: 0,0
installment,0.020047
sub_grade,0.004245
int_rate,0.003359
dti,0.003063
open_acc,0.000953
loan_amnt,0.000664
emp_length,0.0005
purpose__credit_card,0.000297
initial_list_status,0.000274
home_ownership__RENT,0.000185


### Confusion Matrix

In [27]:
from sklearn.metrics import confusion_matrix

In [28]:
cmat = confusion_matrix(ytest,best_model.predict(Xtest))

#Label the confusion matrix
pd.DataFrame(cmat, columns = [f'Pred_{label}' for label in best_model.classes_],
             index = [f'True_{label}' for label in best_model.classes_])


Unnamed: 0,Pred_0,Pred_1
True_0,43414,25724
True_1,176520,274046


### Best model is one that strikes good balance of a high "pred_1 precision" and a high "loan approval rate"

In [29]:
cmat=cmat
print(f'loan approval rate: {sum(cmat[:,1])/sum(sum(cmat)):.2f}')
print(f'precision pred_1: {cmat[1][1]/sum(cmat[:,1]):.2f}')
print(f'recall pred_1: {cmat[1][1]/sum(cmat[1,:]):.2f}')

loan approval rate: 0.58
precision pred_1: 0.91
recall pred_1: 0.61


In [30]:
from sklearn.metrics import classification_report
print(classification_report(ytest, best_model.predict(Xtest)))

              precision    recall  f1-score   support

           0       0.20      0.63      0.30     69138
           1       0.91      0.61      0.73    450566

    accuracy                           0.61    519704
   macro avg       0.56      0.62      0.52    519704
weighted avg       0.82      0.61      0.67    519704



### change the threshold to improve the pred_1 precision

In [31]:
probs_positive_class = best_model.predict_proba(Xtest)[:, 1]
prediction = probs_positive_class > .6

In [32]:
cmat2 = confusion_matrix(ytest, prediction, labels=[0, 1])
pd.DataFrame(cmat2, columns = [f'Pred_{label}' for label in best_model.classes_],
             index = [f'True_{label}' for label in best_model.classes_])

Unnamed: 0,Pred_0,Pred_1
True_0,60518,8620
True_1,317610,132956


In [33]:
from sklearn.metrics import classification_report
print(classification_report(ytest, prediction))

              precision    recall  f1-score   support

           0       0.16      0.88      0.27     69138
           1       0.94      0.30      0.45    450566

    accuracy                           0.37    519704
   macro avg       0.55      0.59      0.36    519704
weighted avg       0.84      0.37      0.43    519704



In [34]:
cmat=cmat2
print(f'loan approval rate: {sum(cmat[:,1])/sum(sum(cmat)):.2f}')
print(f'precision pred_1: {cmat[1][1]/sum(cmat[:,1]):.2f}')
print(f'recall pred_1: {cmat[1][1]/sum(cmat[1,:]):.2f}')

loan approval rate: 0.27
precision pred_1: 0.94
recall pred_1: 0.30


### Loop to check thresholds

In [35]:
thresholds = np.linspace(0.5,1,19)

for i in thresholds:
    probs_positive_class = best_model.predict_proba(Xtest)[:, 1]
    prediction = probs_positive_class > i
    cmat = confusion_matrix(ytest, prediction, labels=[0, 1])
    print(f'threshold:{i:.2f}')
    print(f'loan approval rate: {sum(cmat[:,1])/sum(sum(cmat)):.2f}')
    print(f'precision pred_1: {cmat[1][1]/sum(cmat[:,1]):.2f}')
    print(f'recall pred_1: {cmat[1][1]/sum(cmat[1,:]):.2f}')
    print("-"*50)

threshold:0.50
loan approval rate: 0.58
precision pred_1: 0.91
recall pred_1: 0.61
--------------------------------------------------
threshold:0.53
loan approval rate: 0.47
precision pred_1: 0.92
recall pred_1: 0.50
--------------------------------------------------
threshold:0.56
loan approval rate: 0.39
precision pred_1: 0.93
recall pred_1: 0.41
--------------------------------------------------
threshold:0.58
loan approval rate: 0.31
precision pred_1: 0.94
recall pred_1: 0.34
--------------------------------------------------
threshold:0.61
loan approval rate: 0.25
precision pred_1: 0.94
recall pred_1: 0.27
--------------------------------------------------
threshold:0.64
loan approval rate: 0.19
precision pred_1: 0.94
recall pred_1: 0.21
--------------------------------------------------
threshold:0.67
loan approval rate: 0.15
precision pred_1: 0.95
recall pred_1: 0.17
--------------------------------------------------
threshold:0.69
loan approval rate: 0.12
precision pred_1: 0.95

## ROC Curve

In [None]:
from sklearn.metrics import roc_curve

In [None]:
# predict probabilities
yhat = logr.predict_proba(Xtest)

In [None]:
# keep probabilities for the positive outcome only
yhat = yhat[:, 1]

In [None]:
# calculate roc curves
fpr, tpr, thresholds = roc_curve(ytest, yhat)

In [None]:
# plot the roc curve for the model
plt.plot([0,1], [0,1], linestyle='--', label='No Skill')
plt.plot(fpr, tpr, marker='.', label='Logistic')
# axis labels
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
# show the plot
plt.show()

In [None]:
# calculate roc curves
fpr, tpr, thresholds = roc_curve(ytest, prediction)

In [None]:
# plot the roc curve for the model
plt.plot([0,1], [0,1], linestyle='--', label='No Skill')
plt.plot(fpr, tpr, marker='.', label='Logistic')
# axis labels
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
# show the plot
plt.show()

### SGD classifier

### Save model in pickle

In [None]:
filename = 'grid_logr_model.sav'
pickle.dump(grid_logr, open(filename, 'wb'))

In [None]:
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.score(Xtest, ytest)
print(result)