In [None]:
%%time
import coiled
cluster = coiled.Cluster(n_workers = 20)

from dask.distributed import Client
client = Client(cluster)
print('Dashboard:', client.dashboard_link)

In [None]:
import dask.dataframe as dd

In [None]:
# Columns forced to the generic object dtype instead of an inferred one.
object_columns = ('desc', 'id', 'sec_app_earliest_cr_line')

# Lazily read the accepted-loans CSV from the public bucket in 16 MiB blocks;
# "anon": True is passed through as a storage option for the S3 filesystem.
raw_data = dd.read_csv(
    "s3://lending-club/accepted_2007_to_2018Q4.csv",
    blocksize="16 MiB",
    storage_options={"anon": True},
    dtype={column: 'object' for column in object_columns},
    parse_dates=['issue_d', 'earliest_cr_line'],
    low_memory=False,
)


In [None]:
import pandas as pd
import numpy as np

In [None]:
import sys
# Extend the import path with the sibling project directory before importing
# the local modules below (presumably where config and data_prep live — verify).
sys.path.append('../lending_club')
import config
from data_prep import get_lending_club_data, refine_features, clean

In [None]:
# Keep only the columns named in config.RAW_FEATURES (all rows retained).
raw_data = raw_data.loc[:, config.RAW_FEATURES]

In [None]:
# Run the project's clean() helper (imported from data_prep) on the Dask frame.
# NOTE(review): what clean() changes is not visible here — see data_prep.
raw_data = clean(raw_data)

In [None]:
# Execute the deferred Dask work and pull the result into the notebook process.
# Presumably a pandas DataFrame from here on — confirm clean() returns a Dask frame.
approved = raw_data.compute()

In [None]:
# Apply the project's refine_features() helper (imported from data_prep).
# NOTE(review): the transformation itself is defined outside this notebook.
approved = refine_features(approved)

In [None]:
# Display the column index to inspect what is available before dropping columns.
approved.columns

In [None]:
# Remove columns that are not carried forward into encoding and modelling.
# The frame is rebound rather than mutated with inplace=True, and
# errors='ignore' makes a second execution of this cell a no-op instead of a
# KeyError on the already-removed labels.
approved = approved.drop(
    columns=[
        'fico_range_high',
        'fico_range_low',
        'emp_title',
        'earliest_cr_line',
        'grade',
        'zip_code',
        'last_pymnt_d',
    ],
    errors='ignore',
)

In [None]:
# Display the column index again to confirm the result of the drop above.
approved.columns

In [None]:
%%time
for d in ['addr_state', 'application_type', 'disbursement_method', 'home_ownership', \
          'initial_list_status', 'purpose', 'sub_grade','verification_status']:
    temp = pd.get_dummies(approved[d], prefix=d, prefix_sep='__')
    temp = temp.drop(d+'__'+str(approved[d].mode()[0]), axis=1)
    #drop the most common one (mode), making it the base case
    approved = pd.concat([approved.drop(d, axis=1),temp], axis=1)
    #merge back with main df

In [None]:
from model_prep import divide_by_term

# Call the project helper with 60 as the term argument; judging by the names
# this selects 60-month (five-year) loans — TODO confirm against model_prep.
five_year = divide_by_term(approved, 60)

In [None]:
# Remove the issue date column before modelling.
# Rebinding avoids mutating whatever object divide_by_term() returned (which
# may be a slice of `approved`), and errors='ignore' lets the cell be
# re-executed without a KeyError once the column is already gone.
five_year = five_year.drop(columns='issue_d', errors='ignore')

In [None]:
# Separate the 'loan_status' column from the remaining columns.
# NOTE(review): neither name is referenced by the later cells shown here —
# split_data() is handed `five_year` itself; confirm these are still needed.
five_year_features = five_year.drop(columns=['loan_status'])
target = five_year['loan_status']

In [None]:
from model_prep import split_data

# Split via the project helper, unpacking four results as train/test features and labels.
# test_size=0.3 and random_state=10 are forwarded to split_data; presumably a
# 70/30 seeded split that separates the label internally — verify in model_prep.
Xtrain,Xtest,ytrain,ytest = split_data(five_year, test_size = 0.3, random_state = 10)

In [None]:
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import confusion_matrix

In [None]:
%%time
svm = LinearSVC(max_iter = -1)

In [None]:
%%time
# Fit the baseline linear SVM on the training split (timed by the cell magic).
svm.fit(Xtrain, ytrain)

In [None]:
%%time
# Score the fitted model on the same data it was trained on (training score,
# not a held-out estimate).
svm.score(Xtrain, ytrain)

In [None]:
# Training-set confusion matrix for the baseline model.
# confusion_matrix expects (y_true, y_pred): rows are true classes, columns
# are predicted classes. Passing predictions first would transpose the table.
confusion_matrix(ytrain, svm.predict(Xtrain))

In [None]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold

In [None]:
# Five-fold stratified cross-validation splitter, passed as `cv` to the grid search below.
skf = StratifiedKFold(n_splits = 5)

In [None]:
# Linear SVM with class weights balanced against class frequencies.
# max_iter must be an integer; the float literal 1e8 is rejected by
# scikit-learn's parameter validation, so the same cap is written as an int.
svm_bal = LinearSVC(class_weight='balanced', max_iter=100_000_000)

In [None]:
%%time
param_dict = {'C':np.linspace(0.001,0.1,5)}
grid = GridSearchCV(svm_bal, param_dict, cv=skf, return_train_score = True, n_jobs = -1)

In [None]:
import joblib

In [None]:
# Run the grid search with joblib's 'dask' backend so the fits are dispatched
# to the Dask cluster; the training data is listed in `scatter` so it is sent
# to the workers up front.
with joblib.parallel_backend('dask', n_jobs=-1, scatter=[Xtrain, ytrain]):
    print('fitting now')
    grid.fit(Xtrain, ytrain)

In [None]:
# Held-out confusion matrix for the best estimator found by the grid search.
# The fitted search object is `grid`, and predictions on Xtest must be
# compared with the matching ytest labels.
confusion_matrix(ytest, grid.best_estimator_.predict(Xtest))

In [None]:
# Keep a handle on the refit best estimator from the fitted search object `grid`.
bestgridsvm = grid.best_estimator_

In [None]:
# Persist the selected model to disk. Only the `joblib` module is imported in
# this notebook, so dump is called through it.
joblib.dump(bestgridsvm, 'bestgridsvm.model')