# Import File

In [1]:
import pandas as pd
import numpy as np

In [2]:
import sys
sys.path.append('../lending_club')
import config
from data_prep import get_lending_club_data, refine_features

In [3]:
%%time
approved = pd.read_pickle('../data/approved.pickle')

Wall time: 4.73 s


In [None]:
approved.dtypes

### Removed NA's

In [4]:
approved.drop(['emp_title', 'earliest_cr_line', 'grade', 'zip_code', 'last_pymnt_d', 'fico_range_high', 'fico_range_low', 'total_pymnt', 'PnL'], axis = 1, inplace = True)

In [5]:
approved.dropna(axis = 0, inplace = True)

### Dummification

In [6]:
%%time
for d in ['addr_state', 'application_type', 'disbursement_method', 'home_ownership', \
          'initial_list_status', 'purpose','verification_status']:
    temp = pd.get_dummies(approved[d], prefix=d, prefix_sep='__')
    temp = temp.drop(d+'__'+str(approved[d].mode()[0]), axis=1)
    #drop the most common one (mode), making it the base case
    approved = pd.concat([approved.drop(d, axis=1),temp], axis=1)
    #merge back with main df

Wall time: 11.6 s


### Get 5 year loan DataFrame


In [7]:
from model_prep import divide_by_term

five_year = divide_by_term(approved, 60)

In [8]:
five_year.drop('issue_d', axis = 1, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [9]:
five_year_sample = five_year.sample(1000)

### Linear 

In [10]:
five_year.shape

(47783, 88)

In [11]:
target = five_year_sample['loan_status']
five_year_sample_features = five_year_sample.drop('loan_status', axis = 1)

In [12]:
five_year_sample['loan_status'].value_counts()

1    761
0    239
Name: loan_status, dtype: int64

In [13]:
from model_prep import split_data

Xtrain,Xtest,ytrain,ytest = split_data(five_year_sample, test_size = 0.9)

In [14]:
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import confusion_matrix

In [15]:
%%time
svm = LinearSVC(class_weight = 'balanced')
svm.set_params(max_iter = 1e8)

Wall time: 0 ns


LinearSVC(class_weight='balanced', max_iter=100000000.0)

In [16]:
%%time
svm.fit(Xtrain, ytrain)

Wall time: 23min




LinearSVC(class_weight='balanced', max_iter=100000000.0)

In [17]:
%%time
svm.score(Xtrain, ytrain)

Wall time: 36.1 ms


0.52

In [18]:
%%time
svm.score(Xtest, ytest)

Wall time: 33.5 ms


0.3477777777777778

In [19]:
%%time
confusion_matrix(ytest, svm.predict(Xtest))

Wall time: 44 ms


array([[144,  70],
       [517, 169]], dtype=int64)

In [19]:
from joblib import dump, load

In [20]:
dump(svm, 'linear5yr.model')

['linear5yr.model']

In [21]:
# For loading model back in
svm = load('linear5yr.model')

### Linear GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
%%time
param_dict = {'C':np.linspace(1, 10000, 5)}
grid = GridSearchCV(svm, param_dict, return_train_score = True, n_jobs = -1)
grid.fit(Xtrain, ytrain)

In [None]:
%%time
grid.best_params_

In [None]:
%%time
grid.best_score_

In [None]:
%%time
svc_best = grid.best_estimator_

In [None]:
%%time
svc_best.score(Xtest, ytest)

In [None]:
%%time
confusion_matrix(ytest, svc_best.predict(Xtest))

In [None]:
dump(svc_best, 'linear5yr_best.model')

In [None]:
svc_best = load('linear5yr_best.model')

# Merge test results back into the df to investigate the 'fully paid fails'

In [None]:
model_results = pd.Series(svm.predict(five_year_features), index = five_year_features.index, name = 'loan_status_predict')

In [None]:
df2 = pd.concat([five_year_features, target, model_results], axis = 1)

In [None]:
model_port = df2.loc[df2['loan_status_predict'] == 1]

In [None]:
approved['PnL'] = approved['total_pymnt'] - approved['loan_amnt']

In [None]:
df_full = approved.copy(deep = True)

In [None]:
PnL = df_full['PnL']

In [None]:
model_port = model_port.merge(PnL, how = 'inner', left_index = True, right_index = True)

In [None]:
print(f'Total investment for Model Portfolio: {model_port.loan_amnt.sum():.0f}')
print(f'Total PnL for Model Portfolio: {model_port.PnL.sum():.0f}')
print(f'Return for Model Portfolio: {(model_port.PnL.sum()/model_port.loan_amnt.sum())*100:.2f}%')

### Polynomial

In [None]:
%%time
svmPoly = SVC(kernel = 'poly', max_iter = 1e8)

In [None]:
%%time
svmPoly.fit(Xtrain, ytrain)

In [None]:
%%time
svmPoly.score(Xtrain, ytrain)

In [None]:
svmPoly.score(Xtest, ytest)

In [None]:
%%time
confusion_matrix(ytest, svmPoly.predict(Xtest))

### Polynomial GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
%%time
param_dict = {'C':np.linspace(1700, 1800, 10)}
gridPoly = GridSearchCV(svmPoly, param_dict, return_train_score = True, n_jobs = -1)
gridPoly.fit(Xtrain, ytrain)

In [None]:
%%time
gridPoly.best_params_

In [None]:
%%time
gridPoly.best_score_

In [None]:
%%time
Poly_best = gridPoly.best_estimator_

In [None]:
%%time
Poly_best.score(Xtest, ytest)

In [None]:
%%time
confusion_matrix(ytest, Poly_best.predict(Xtest))

In [None]:
dump(Poly_best, 'poly_best.model')

### Radial

In [None]:
%%time
svmRbf = SVC(kernel = 'rbf', max_iter = 1e8)

In [None]:
%%time
svmRbf.fit(Xtrain, ytrain)

In [None]:
%%time
svmRbf.score(Xtrain, ytrain)

In [None]:
%%time
confusion_matrix(ytest, svmRbf.predict(Xtest))

### Radial GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
%%time
param_dict = {'C':np.linspace(0.001,0.1,5), 'gamma':np.linspace(0.0001,0.1, 10)}
gridRbf = GridSearchCV(svmRbf, param_dict, return_train_score = True, n_jobs = -1)
gridRbf.fit(Xtrain, ytrain)

In [None]:
%%time
gridRbf.best_params_

In [None]:
%%time
gridRbf.best_score_

In [None]:
%%time
Rbf_best = gridRbf.best_estimator_

In [None]:
%%time
confusion_matrix(ytest, Rbf_best.predict(Xtest))

In [None]:
dump(Rbf_best, 'rbf_best.model')