# Import File

In [1]:
import pandas as pd
import numpy as np

In [2]:
import sys
sys.path.append('../lending_club')
import config
from data_prep import get_lending_club_data, refine_features

In [None]:
%%time
approved_loans = get_lending_club_data(config.APPROVED_LOANS_CSV, 
                                       clean_file=True)
approved_loans_df = approved_loans.compute()

In [None]:
%%time
approved_loans_df = refine_features(approved_loans_df)

In [3]:
%%time
approved = pd.read_pickle('../data/approved.pickle')

Wall time: 6.31 s


In [None]:
approved.dtypes

### Removed NA's

In [4]:
approved.drop(['emp_title', 'earliest_cr_line', 'grade', 'zip_code', 'last_pymnt_d'], axis = 1, inplace = True)

### Dummification

In [5]:
%%time
for d in ['addr_state', 'application_type', 'disbursement_method', 'home_ownership', \
          'initial_list_status', 'purpose', 'sub_grade','verification_status']:
    temp = pd.get_dummies(approved[d], prefix=d, prefix_sep='__')
    temp = temp.drop(d+'__'+str(approved[d].mode()[0]), axis=1)
    #drop the most common one (mode), making it the base case
    approved = pd.concat([approved.drop(d, axis=1),temp], axis=1)
    #merge back with main df

Wall time: 23.6 s


### Get 5 year loan DataFrame


In [6]:
from model_prep import divide_by_term

three_year = divide_by_term(approved, 36)

In [7]:
three_year.drop('issue_d', axis = 1, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


### Linear 

In [8]:
three_year.shape

(613120, 114)

In [9]:
target = three_year['loan_status']
three_year_features = three_year.drop('loan_status', axis = 1)

In [10]:
three_year['loan_status'].value_counts()

1    528884
0     84236
Name: loan_status, dtype: int64

In [12]:
from model_prep import split_data

Xtrain,Xtest,ytrain,ytest = split_data(three_year, test_size = 0.9)

In [14]:
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import confusion_matrix

In [None]:
%%time
svm = LinearSVC()
svm.set_params(max_iter = 1e8)

In [None]:
%%time
svm.fit(Xtrain, ytrain)

In [17]:
%%time
svm.score(Xtrain, ytrain)

Wall time: 2min 32s


0.9474491127348643

In [None]:
%%time
svm.score(Xtest, ytest)

In [None]:
%%time
confusion_matrix(ytest, svm.predict(Xtest))

In [15]:
from joblib import dump, load

In [None]:
dump(svm, 'svm.model')

In [16]:
# For loading model back in
svm = load('svm.model')

### Linear GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
%%time
param_dict = {'C':np.linspace(8500, 8900, 10)}
grid = GridSearchCV(svm, param_dict, return_train_score = True, n_jobs = -1)
grid.fit(Xtrain, ytrain)

In [None]:
%%time
grid.best_params_

In [None]:
%%time
grid.best_score_

In [None]:
%%time
svc_best = grid.best_estimator_

In [None]:
%%time
svc_best.score(Xtest, ytest)

In [None]:
%%time
confusion_matrix(ytest, svc_best.predict(Xtest))

In [None]:
dump(svc_best, 'svc_best.model')

In [None]:
svc_best = load('svc_best.model')

### Polynomial

In [None]:
%%time
svmPoly = SVC(kernel = 'poly', max_iter = 1e8)

In [None]:
%%time
svmPoly.fit(Xtrain, ytrain)

In [None]:
%%time
svmPoly.score(Xtrain, ytrain)

In [None]:
svmPoly.score(Xtest, ytest)

In [None]:
%%time
confusion_matrix(ytest, svmPoly.predict(Xtest))

### Polynomial GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
%%time
param_dict = {'C':np.linspace(1700, 1800, 10)}
gridPoly = GridSearchCV(svmPoly, param_dict, return_train_score = True, n_jobs = -1)
gridPoly.fit(Xtrain, ytrain)

In [None]:
%%time
gridPoly.best_params_

In [None]:
%%time
gridPoly.best_score_

In [None]:
%%time
Poly_best = gridPoly.best_estimator_

In [None]:
%%time
Poly_best.score(Xtest, ytest)

In [None]:
%%time
confusion_matrix(ytest, Poly_best.predict(Xtest))

In [None]:
dump(Poly_best, 'poly_best.model')

### Radial

In [None]:
%%time
svmRbf = SVC(kernel = 'rbf', max_iter = 1e8)

In [None]:
%%time
svmRbf.fit(Xtrain, ytrain)

In [None]:
%%time
svmRbf.score(Xtrain, ytrain)

In [None]:
%%time
confusion_matrix(ytest, svmRbf.predict(Xtest))

### Radial GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
%%time
param_dict = {'C':np.linspace(0.001,0.1,5), 'gamma':np.linspace(0.0001,0.1, 10)}
gridRbf = GridSearchCV(svmRbf, param_dict, return_train_score = True, n_jobs = -1)
gridRbf.fit(Xtrain, ytrain)

In [None]:
%%time
gridRbf.best_params_

In [None]:
%%time
gridRbf.best_score_

In [None]:
%%time
Rbf_best = gridRbf.best_estimator_

In [None]:
%%time
confusion_matrix(ytest, Rbf_best.predict(Xtest))

In [None]:
dump(Rbf_best, 'rbf_best.model')