In [None]:
import pandas as pd
import numpy as np

In [None]:
# Put the project's source directory on the import path so the local modules can
# be imported below. The path is relative, so it is resolved against the kernel's
# current working directory.
# NOTE(review): presumably `config` and `data_prep` live in ../lending_club - confirm.
import sys
sys.path.append('../lending_club')
import config
from data_prep import get_lending_club_data, refine_features

In [None]:
%%time
# Read the approved-loans CSV through the project loader, then call .compute()
# to obtain the in-memory frame used by the rest of the notebook.
# NOTE(review): .compute() suggests the loader returns a lazy (dask) frame - confirm.
approved_loans = get_lending_club_data(
    config.APPROVED_LOANS_CSV,
    filename_to_save='dd.approved.parquet',
    clean_file=True,
)
approved_loans_df = approved_loans.compute()

In [None]:
%%time
# Apply the project's feature refinement (data_prep.refine_features). The result
# is bound back to the same name, so re-running this cell refines the already
# refined frame again.
approved_loans_df = refine_features(approved_loans_df)

In [None]:
# Persist the refined frame as a gzip-compressed parquet file under config.DATAPATH.
approved_parquet_path = config.DATAPATH / 'approved.parquet'
approved_loans_df.to_parquet(
    approved_parquet_path,
    engine='fastparquet',
    compression='GZIP',
)

In [None]:
%%time
# Reload the refined frame from the same location the save cell writes to
# (config.DATAPATH / 'approved.parquet') instead of a separately hard-coded
# relative path, so the two cells cannot drift apart.
approved = pd.read_parquet(config.DATAPATH / 'approved.parquet', engine='fastparquet')

### Removed NA's

In [None]:
# Discard the columns that are not carried forward into the modelling frame.
approved = approved.drop(
    columns=['emp_title', 'earliest_cr_line', 'grade', 'zip_code'],
)

### Dummification

In [None]:
%%time
# One-hot encode the categorical columns. For every column the indicator of its
# most frequent level (the mode) is dropped, so that level acts as the base case.
categorical_cols = [
    'addr_state', 'application_type', 'disbursement_method', 'home_ownership',
    'initial_list_status', 'purpose', 'sub_grade', 'verification_status',
]

# The base-case indicator names must be computed before encoding, while the raw
# categorical columns are still present in the frame.
base_case_cols = [
    col + '__' + str(approved[col].mode()[0]) for col in categorical_cols
]

# get_dummies(columns=...) removes the encoded columns and appends their
# indicators (named <column>__<level>) after the remaining columns.
approved = (
    pd.get_dummies(approved, columns=categorical_cols, prefix_sep='__')
    .drop(columns=base_case_cols)
)

### Split into 3-year and 5-year DataFrames

In [None]:
from model_prep import divide_by_term

# Build one frame per loan term via the project helper.
# NOTE(review): 36 / 60 are presumably the term length in months - confirm in model_prep.
three_year, five_year = (divide_by_term(approved, term) for term in (36, 60))

In [None]:
# Remove the issue_d column from both term-specific frames.
three_year = three_year.drop(columns='issue_d')
five_year = five_year.drop(columns='issue_d')

### Train Test Split

In [None]:
# Separate the three-year frame into the feature matrix and the loan_status label.
three_year_features = three_year.drop(columns='loan_status')
target = three_year['loan_status']

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# take sample / train_test_split small randomized sample

### Trim down to a small sample and train_test_split

In [None]:
# Work on a random subsample to keep SVM training tractable.
# - random_state pins the draw so Restart & Run All reproduces the same rows.
# - min(...) avoids the ValueError that DataFrame.sample raises when asked for
#   more rows than exist (sampling is without replacement).
three_year_sample = three_year.sample(
    n=min(100_000, len(three_year)),
    random_state=42,
)

In [None]:
%%time
from model_prep import split_data

# Split the sampled three-year frame with the project helper; 30% of the rows are
# requested for the test side via test_size.
# NOTE(review): presumably split_data separates the label column itself and
# returns (X_train, X_test, y_train, y_test) - confirm in model_prep.
Xtrain,Xtest,ytrain,ytest = split_data(three_year_sample, test_size = 0.3)

In [None]:
%%time
# Standardize the features. The scaler is fitted on the training split only and
# the same fitted transformation is then applied to the test split, so no
# test-set statistics leak into training.
#
# The scaled values are bound back to Xtrain / Xtest (as DataFrames with the
# original index and columns) because a discarded fit_transform result would
# leave every later cell working on unscaled data.
# NOTE(review): assumes split_data returns pandas DataFrames for Xtrain / Xtest.
ss = StandardScaler()
Xtrain = pd.DataFrame(
    ss.fit_transform(Xtrain),
    index=Xtrain.index,
    columns=Xtrain.columns,
)
Xtest = pd.DataFrame(
    ss.transform(Xtest),
    index=Xtest.index,
    columns=Xtest.columns,
)

In [None]:
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import confusion_matrix

In [None]:
%%time
# Base SVC estimator (default RBF kernel) reused by the fits and grid searches.
# - max_iter must be an integer: scikit-learn's parameter validation rejects the
#   float 1e8 when fit() is called.
# - verbose is documented as a boolean flag, so pass True rather than a level.
svm = SVC(verbose=True, max_iter=100_000_000)
svm

In [None]:
%%time
# Fit the SVC on the training split.
svm.fit(Xtrain, ytrain)

In [None]:
%%time
# Mean accuracy of `svm` on the training split (Xtrain, ytrain).
svm.score(Xtrain, ytrain)

In [None]:
%%time
# Evaluate on the test split WITHOUT refitting: calling svm.fit(Xtest, ytest)
# here would train the model on the test rows and turn every later "test" metric
# into a training metric. Show the test confusion matrix instead
# (rows = actual class, columns = predicted class).
confusion_matrix(ytest, svm.predict(Xtest))

In [None]:
%%time
# Mean accuracy of `svm` on the test split (Xtest, ytest).
svm.score(Xtest,ytest)

In [None]:
%%time
# Training-set confusion matrix. confusion_matrix expects (y_true, y_pred), so
# the actual labels go first: rows are actual classes, columns predicted classes.
confusion_matrix(ytrain, svm.predict(Xtrain))

### GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Search space for the first grid search: 10 evenly spaced values each for the
# regularization strength C and the kernel coefficient gamma.
param_dict = dict(
    C=np.linspace(0.001, 0.1, num=10),
    gamma=np.linspace(0.0001, 0.1, num=10),
)
# Exhaustive 3-fold cross-validated search over param_dict, run on all available
# cores; training scores are recorded alongside the validation scores.
grid = GridSearchCV(
    estimator=svm,
    param_grid=param_dict,
    cv=3,
    n_jobs=-1,
    return_train_score=True,
)
grid.fit(Xtrain, ytrain)

In [None]:
# Parameter combination with the best mean cross-validated score.
grid.best_params_

In [None]:
# Mean cross-validated score achieved by the best parameter combination.
grid.best_score_

In [None]:
# Keep a handle on the estimator refitted with the best parameters found by `grid`.
svc_best = grid.best_estimator_

In [None]:
# Training-set confusion matrix for the tuned model. confusion_matrix expects
# (y_true, y_pred): rows are actual classes, columns are predicted classes.
confusion_matrix(ytrain, svc_best.predict(Xtrain))

In [None]:
# Second search space, one sub-grid per kernel:
#   * polynomial kernel - vary C and the polynomial degree
#   * RBF kernel        - vary C and gamma
grid2 = [
    dict(
        kernel=['poly'],
        degree=[1, 2, 3],
        C=np.linspace(0.001, 0.1, num=10),
    ),
    dict(
        kernel=['rbf'],
        C=np.linspace(0.001, 0.1, num=10),
        gamma=np.linspace(0.0001, 0.1, num=10),
    ),
]

In [None]:
from sklearn.model_selection import StratifiedKFold

# Two-fold stratified cross-validation splitter for the kernel search.
skf = StratifiedKFold(n_splits=2)

# Search both kernel sub-grids in grid2 on all available cores. fit() returns the
# search object itself, so grid_svc is the fitted search.
grid_svc = GridSearchCV(svm, grid2, cv=skf, n_jobs=-1).fit(Xtrain, ytrain)
grid_svc

In [None]:
# Best kernel / hyper-parameter combination found by the second search.
grid_svc.best_params_

In [None]:
# Mean cross-validated score of the best combination from the second search.
grid_svc.best_score_

In [None]:
# Training-set confusion matrix for the best estimator of the second search.
# confusion_matrix expects (y_true, y_pred): rows are actual classes, columns
# are predicted classes.
confusion_matrix(ytrain, grid_svc.best_estimator_.predict(Xtrain))

### 5 Year Loans

In [None]:
# Separate the five-year frame into the feature matrix and the loan_status label.
# Note: this rebinds `target`, which previously held the three-year labels.
five_year_features = five_year.drop(columns='loan_status')
target = five_year['loan_status']

In [None]:
# Local imports keep this cell runnable on its own.
from sklearn.base import clone
from sklearn.model_selection import train_test_split

# Hold out 30% of the five-year loans so the reported score is measured on rows
# the model has not seen; scoring on the rows used for fitting only measures how
# well the model memorized them.
X5_train, X5_test, y5_train, y5_test = train_test_split(
    five_year_features, target, test_size=0.3, random_state=42
)

# Standardize with statistics learned from the five-year training rows only.
five_year_scaler = StandardScaler().fit(X5_train)

# Fit an unfitted copy of the base estimator so the three-year `svm` is not
# overwritten by the five-year fit.
svm_five_year = clone(svm)
svm_five_year.fit(five_year_scaler.transform(X5_train), y5_train)

# Mean accuracy on the held-out five-year rows.
svm_five_year.score(five_year_scaler.transform(X5_test), y5_test)