# Import File

In [6]:
import pandas as pd
import numpy as np

In [7]:
import sys
# Add ../lending_club to the module search path so the project-local modules
# imported below can be resolved from this notebook's working directory.
sys.path.append('../lending_club')
import config
from data_prep import get_lending_club_data, refine_features

In [8]:
%%time
# Load the pre-built `approved` DataFrame from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code -- only load files
# produced by a trusted pipeline.
approved = pd.read_pickle('../data/approved.pickle')

Wall time: 6.36 s


In [9]:
# Display the dtype of every column to see which ones are numeric, datetime or object.
approved.dtypes

addr_state                         object
annual_inc                        float64
application_type                   object
disbursement_method                object
dti                               float64
earliest_cr_line           datetime64[ns]
emp_length                          int32
emp_title                          object
fico_range_high                   float64
fico_range_low                    float64
grade                              object
home_ownership                     object
initial_list_status                object
inq_last_6mths                    float64
installment                       float64
int_rate                          float64
issue_d                    datetime64[ns]
last_pymnt_d               datetime64[ns]
loan_amnt                         float64
loan_status                         int32
open_acc                          float64
pub_rec                           float64
pub_rec_bankruptcies              float64
purpose                           

### Remove NAs

In [10]:
# Remove, in place, the columns that are not used as model inputs.
# After this cell none of the listed columns exist on `approved` any more.
approved.drop(
    columns=[
        'emp_title',
        'earliest_cr_line',
        'grade',
        'zip_code',
        'last_pymnt_d',
        'fico_range_high',
        'fico_range_low',
        'total_pymnt',
        'PnL',
    ],
    inplace=True,
)

In [11]:
# Discard, in place, every row that still has at least one missing value.
approved.dropna(axis='index', how='any', inplace=True)

### Dummification

In [14]:
%%time
for d in ['addr_state', 'application_type', 'disbursement_method', 'home_ownership', \
          'initial_list_status', 'purpose','verification_status']:
    temp = pd.get_dummies(approved[d], prefix=d, prefix_sep='__')
    temp = temp.drop(d+'__'+str(approved[d].mode()[0]), axis=1)
    #drop the most common one (mode), making it the base case
    approved = pd.concat([approved.drop(d, axis=1),temp], axis=1)
    #merge back with main df

Wall time: 9.31 s


### Get 5 year loan DataFrame


In [15]:
from model_prep import divide_by_term

# NOTE(review): presumably keeps only the loans with a 60-month term --
# confirm against model_prep.divide_by_term.
five_year = divide_by_term(approved, 60)

In [16]:
# Remove the issue date so it is not used as a model feature.
# Rebinding the result of a non-inplace drop (instead of inplace=True) avoids
# the SettingWithCopyWarning pandas raises when `five_year` is a slice of
# another DataFrame, and leaves `five_year` as an independent frame.
five_year = five_year.drop(columns='issue_d')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


### Linear 

In [17]:
# (rows, columns) of the five-year frame after dummification and column drops.
five_year.shape

(47783, 88)

In [18]:
# Split the frame into the predictor columns and the label column.
five_year_features = five_year.drop(columns='loan_status')
target = five_year.loc[:, 'loan_status']

In [19]:
# Count the rows per label value to check how imbalanced the two classes are.
five_year['loan_status'].value_counts()

1    36525
0    11258
Name: loan_status, dtype: int64

In [20]:
from model_prep import split_data

# NOTE(review): the full frame (including `loan_status`) is passed in, so
# split_data presumably separates the label itself -- confirm in model_prep.
# NOTE(review): test_size = 0.9 looks like it holds out 90% of the rows and
# trains on only 10%; confirm this is intentional (e.g. to keep SVM fits fast).
Xtrain,Xtest,ytrain,ytest = split_data(five_year, test_size = 0.9)

In [21]:
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import confusion_matrix

In [22]:
%%time
svm = LinearSVC()
svm.set_params(max_iter = 1e8)

Wall time: 0 ns


LinearSVC(max_iter=100000000.0)

In [None]:
%%time
# Fit the linear SVM on the training split.
svm.fit(Xtrain, ytrain)

In [None]:
%%time
# Mean accuracy on the data the model was trained on.
svm.score(Xtrain, ytrain)

In [None]:
%%time
# Mean accuracy on the held-out split.
svm.score(Xtest, ytest)

In [None]:
%%time
# Confusion matrix on the held-out split: rows are true labels, columns are
# predicted labels.
confusion_matrix(ytest, svm.predict(Xtest))

In [12]:
from joblib import dump, load

In [None]:
# Persist the fitted linear model to disk with joblib.
dump(svm, 'linear5yr.model')

In [13]:
# For loading model back in: restores a previously saved model so the
# (slow) fit cell does not have to be re-run.
# NOTE(review): joblib.load unpickles the file -- only load trusted files.
svm = load('linear5yr.model')

### Linear GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
%%time
# Candidate regularisation strengths: 10 evenly spaced values of C from 1 to 10000.
param_dict = {'C':np.linspace(1, 10000, 10)}
# Cross-validated search over C using the linear SVM as the base estimator.
# return_train_score = True also records the training-fold scores;
# n_jobs = -1 runs the fits on all available processors.
grid = GridSearchCV(svm, param_dict, return_train_score = True, n_jobs = -1)
grid.fit(Xtrain, ytrain)

In [None]:
%%time
# Parameter setting that achieved the best mean cross-validated score.
grid.best_params_

In [None]:
%%time
# Mean cross-validated score of that best setting.
grid.best_score_

In [None]:
%%time
# Estimator refit with the best parameters found by the search.
svc_best = grid.best_estimator_

In [None]:
%%time
# Mean accuracy of the tuned model on the held-out split.
svc_best.score(Xtest, ytest)

In [None]:
%%time
# Confusion matrix of the tuned model on the held-out split
# (rows: true labels, columns: predicted labels).
confusion_matrix(ytest, svc_best.predict(Xtest))

In [None]:
# Persist the tuned linear model to disk.
dump(svc_best, 'linear5yr_best.model')

In [None]:
# Reload the tuned linear model from disk.
# NOTE(review): joblib.load unpickles the file -- only load trusted files.
svc_best = load('linear5yr_best.model')

# Merge test results back into the df to investigate the 'fully paid fails'

In [23]:
# Predict a label for every five-year loan and keep the frame's index so the
# predictions can be joined back onto the features row by row.
model_results = pd.Series(
    data=svm.predict(five_year_features),
    index=five_year_features.index,
    name='loan_status_predict',
)

In [24]:
# Place the features, the true label and the predicted label side by side,
# aligned on the shared row index.
df2 = pd.concat(
    [five_year_features, target, model_results],
    axis='columns',
)

In [25]:
# The "model portfolio": only the loans the model predicts as class 1
# (presumably "fully paid" -- confirm the label encoding).
model_port = df2[df2['loan_status_predict'].eq(1)]

In [26]:
# Profit and loss per loan = total payments received - amount lent.
# `total_pymnt` was dropped from `approved` during cleaning, so reading it from
# `approved` raises KeyError on a fresh run. Recompute PnL from the raw pickle
# instead; the column assignment aligns on the row index, so only the rows still
# present in `approved` receive a value (assumes the raw index is unique).
# NOTE(review): unpickling can execute arbitrary code -- trusted files only.
raw_payments = pd.read_pickle('../data/approved.pickle')[['total_pymnt', 'loan_amnt']]
approved['PnL'] = raw_payments['total_pymnt'] - raw_payments['loan_amnt']

In [27]:
# Independent deep copy of `approved`, so later changes to either frame do not
# affect the other.
df_full = approved.copy(deep = True)

In [28]:
# Per-loan PnL as a Series indexed like `df_full`.
PnL = df_full['PnL']

In [29]:
# Attach PnL to the model portfolio by matching row indexes; the inner join
# keeps only loans present on both sides.
# NOTE(review): not idempotent -- re-running this cell merges PnL a second
# time and produces suffixed PnL_x / PnL_y columns.
model_port = model_port.merge(PnL, how = 'inner', left_index = True, right_index = True)

In [30]:
# Summarise the model portfolio: total amount lent, total profit/loss, and the
# return expressed as PnL divided by amount lent, in percent.
print(f'Total investment for Model Portfolio: {model_port.loan_amnt.sum():.0f}')
print(f'Total PnL for Model Portfolio: {model_port.PnL.sum():.0f}')
print(f'Return for Model Portfolio: {(model_port.PnL.sum()/model_port.loan_amnt.sum())*100:.2f}%')

Total investment for Model Portfolio: 759669350
Total PnL for Model Portfolio: 247602631
Return for Model Portfolio: 32.59%


### Polynomial

In [None]:
%%time
svmPoly = SVC(kernel = 'poly', max_iter = 1e8)

In [None]:
%%time
# Fit the polynomial-kernel SVM on the training split.
svmPoly.fit(Xtrain, ytrain)

In [None]:
%%time
# Mean accuracy on the data the model was trained on.
svmPoly.score(Xtrain, ytrain)

In [None]:
# Mean accuracy on the held-out split.
svmPoly.score(Xtest, ytest)

In [None]:
%%time
# Confusion matrix on the held-out split (rows: true labels, columns: predicted labels).
confusion_matrix(ytest, svmPoly.predict(Xtest))

### Polynomial GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
%%time
# Candidate regularisation strengths: 10 evenly spaced values of C from 1700 to 1800.
param_dict = {'C':np.linspace(1700, 1800, 10)}
# Cross-validated search over C for the polynomial-kernel SVM.
# return_train_score = True also records the training-fold scores;
# n_jobs = -1 runs the fits on all available processors.
gridPoly = GridSearchCV(svmPoly, param_dict, return_train_score = True, n_jobs = -1)
gridPoly.fit(Xtrain, ytrain)

In [None]:
%%time
# Parameter setting that achieved the best mean cross-validated score.
gridPoly.best_params_

In [None]:
%%time
# Mean cross-validated score of that best setting.
gridPoly.best_score_

In [None]:
%%time
# Estimator refit with the best parameters found by the search.
Poly_best = gridPoly.best_estimator_

In [None]:
%%time
# Mean accuracy of the tuned model on the held-out split.
Poly_best.score(Xtest, ytest)

In [None]:
%%time
# Confusion matrix of the tuned model on the held-out split
# (rows: true labels, columns: predicted labels).
confusion_matrix(ytest, Poly_best.predict(Xtest))

In [None]:
# Persist the tuned polynomial model to disk.
dump(Poly_best, 'poly_best.model')

### Radial

In [None]:
%%time
svmRbf = SVC(kernel = 'rbf', max_iter = 1e8)

In [None]:
%%time
# Fit the RBF-kernel SVM on the training split.
svmRbf.fit(Xtrain, ytrain)

In [None]:
%%time
# Mean accuracy on the data the model was trained on.
svmRbf.score(Xtrain, ytrain)

In [None]:
%%time
# Confusion matrix on the held-out split (rows: true labels, columns: predicted labels).
confusion_matrix(ytest, svmRbf.predict(Xtest))

### Radial GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
%%time
# Search grid: 5 values of C in [0.001, 0.1] crossed with 10 values of gamma
# in [0.0001, 0.1], i.e. 50 parameter combinations.
param_dict = {'C':np.linspace(0.001,0.1,5), 'gamma':np.linspace(0.0001,0.1, 10)}
# Cross-validated search over (C, gamma) for the RBF-kernel SVM.
# return_train_score = True also records the training-fold scores;
# n_jobs = -1 runs the fits on all available processors.
gridRbf = GridSearchCV(svmRbf, param_dict, return_train_score = True, n_jobs = -1)
gridRbf.fit(Xtrain, ytrain)

In [None]:
%%time
# Parameter setting that achieved the best mean cross-validated score.
gridRbf.best_params_

In [None]:
%%time
# Mean cross-validated score of that best setting.
gridRbf.best_score_

In [None]:
%%time
# Estimator refit with the best parameters found by the search.
Rbf_best = gridRbf.best_estimator_

In [None]:
%%time
# Confusion matrix of the tuned model on the held-out split
# (rows: true labels, columns: predicted labels).
confusion_matrix(ytest, Rbf_best.predict(Xtest))

In [None]:
# Persist the tuned RBF model to disk.
dump(Rbf_best, 'rbf_best.model')