# Import File

In [1]:
import pandas as pd
import numpy as np

In [2]:
import sys
sys.path.append('../lending_club')
import config
from data_prep import get_lending_club_data, refine_features

In [3]:
%%time
approved = pd.read_parquet(config.DATAPATH / 'approved.parquet', engine='fastparquet')

Wall time: 6.76 s


In [4]:
approved.dtypes

addr_state                         object
annual_inc                        float64
application_type                   object
disbursement_method                object
dti                               float64
earliest_cr_line           datetime64[ns]
emp_length                          int32
emp_title                          object
fico_range_high                   float64
fico_range_low                    float64
grade                              object
home_ownership                     object
initial_list_status                object
installment                       float64
int_rate                          float64
issue_d                    datetime64[ns]
last_pymnt_d               datetime64[ns]
loan_amnt                         float64
loan_status                         int32
open_acc                          float64
pub_rec                           float64
pub_rec_bankruptcies              float64
purpose                            object
sub_grade                         

### Removed NA's

In [5]:
approved.drop(['emp_title', 'earliest_cr_line', 'grade', 'zip_code', 'last_pymnt_d'], axis = 1, inplace = True)

### Dummification

In [6]:
%%time
for d in ['addr_state', 'application_type', 'disbursement_method', 'home_ownership', \
          'initial_list_status', 'purpose', 'sub_grade','verification_status']:
    temp = pd.get_dummies(approved[d], prefix=d, prefix_sep='__')
    temp = temp.drop(d+'__'+str(approved[d].mode()[0]), axis=1)
    #drop the most common one (mode), making it the base case
    approved = pd.concat([approved.drop(d, axis=1),temp], axis=1)
    #merge back with main df

Wall time: 7.03 s


### Split into 3 and 5 year DataFrames

In [7]:
from model_prep import divide_by_term
five_year = divide_by_term(approved, 60)

In [8]:
five_year.drop('issue_d', axis = 1, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


### 5 Year Loans

In [9]:
five_year.shape

(47783, 113)

In [10]:
target = five_year['loan_status']
five_year_features = five_year.drop('loan_status', axis = 1)

In [11]:
five_year['loan_status'].value_counts()

1    36525
0    11258
Name: loan_status, dtype: int64

In [12]:
from model_prep import split_data

Xtrain,Xtest,ytrain,ytest = split_data(five_year, test_size = 0.3, random_state = 10)

In [13]:
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import confusion_matrix

In [14]:
%%time
svm = SVC()
svm.set_params(max_iter = 1e8)

Wall time: 0 ns


SVC(max_iter=100000000.0)

In [15]:
%%time
svm.fit(Xtrain, ytrain)

Wall time: 1min 15s


SVC(max_iter=100000000.0)

In [16]:
%%time
svm.score(Xtrain, ytrain)

Wall time: 39.4 s


0.9231643147572351

In [17]:
%%time
confusion_matrix(ytest, svm.predict(Xtest))

Wall time: 20.8 s


array([[ 2450,   928],
       [  128, 10829]], dtype=int64)

In [17]:
from joblib import dump, load
dump(svm, 'svm60.model')

['svm.model']

In [None]:
# For loading model back in
svm = load('svm60.model')

### GridSearchCV

In [18]:
from sklearn.model_selection import GridSearchCV

## Polynomial and Radial Kernels for SVC

In [21]:
grid2 = [
    {'C': np.linspace(0.001,0.1,5),
     'kernel': ['poly'],
     'degree': [1, 2, 3]},
    {'C': np.linspace(0.001,0.1,5),
     'gamma': np.linspace(0.0001,0.1, 5),
     'kernel': ['rbf']}
]

In [None]:
%%time 
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits = 5)
svm = SVC(class_weight='balanced', max_iter=-1)

grid_svc = GridSearchCV(estimator = svm,
                        param_grid = grid2,
                        cv = skf,
                        n_jobs = -1)

grid_svc.fit(Xtrain, ytrain)

In [None]:
grid_svc.best_params_

In [None]:
grid_svc.best_score_

In [None]:
confusion_matrix(ytrain, grid_svc.best_estimator_.predict(Xtest))

In [None]:
bestgridsvm = grid_svc.best_estimator_

In [None]:
dump(bestgridsvm, 'bestgridsvm60.model')