In [1]:
import pandas as pd
import numpy as np

In [2]:
import sys
sys.path.append('../lending_club')
import config
from data_prep import get_lending_club_data, refine_features

In [None]:
%%time
approved_loans = get_lending_club_data(config.APPROVED_LOANS_CSV, 
                                       clean_file=True, 
                                       filename_to_save='dd.approved.parquet')
approved_loans_df = approved_loans.compute()

In [None]:
%%time
approved_loans_df = refine_features(approved_loans_df)

In [None]:
approved_loans_df.to_parquet(config.DATAPATH / 'approved.parquet', engine='fastparquet', compression = 'GZIP')

In [3]:
%%time
approved = pd.read_parquet('../data/approved.parquet', engine='fastparquet')

Wall time: 12 s


### Removed NA's

In [4]:
approved.drop(['emp_title', 'earliest_cr_line', 'grade', 'zip_code'], axis = 1, inplace = True)

### Dummification

In [7]:
%%time
for d in ['addr_state', 'application_type', 'disbursement_method', 'home_ownership', \
          'initial_list_status', 'purpose', 'sub_grade','verification_status']:
    temp = pd.get_dummies(approved[d], prefix=d, prefix_sep='__')
    temp = temp.drop(d+'__'+str(approved[d].mode()[0]), axis=1)
    #drop the most common one (mode), making it the base case
    approved = pd.concat([approved.drop(d, axis=1),temp], axis=1)
    #merge back with main df

Wall time: 11.9 s


### Split into 3 and 5 year DataFrames

In [8]:
from model_prep import divide_by_term

three_year = divide_by_term(approved, 36)
five_year = divide_by_term(approved, 60)

In [9]:
three_year.drop('issue_d', axis = 1, inplace = True)
five_year.drop('issue_d', axis = 1, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


### Train Test Split

In [10]:
target = three_year['loan_status']
three_year_features = three_year.drop('loan_status', axis = 1)

In [11]:
from sklearn.preprocessing import StandardScaler

In [None]:
# take sample / train_test_split small randomized sample

### Trim down to a small sample and train_test_split

In [30]:
three_year_sample = three_year.sample(200)

In [31]:
%%time
from model_prep import split_data

Xtrain,Xtest,ytrain,ytest = split_data(three_year_sample, test_size = 0.95)

Wall time: 6 ms


In [32]:
%%time
ss = StandardScaler()
ss.fit_transform(Xtrain.values, ytrain)

Wall time: 6.01 ms


array([[-1.25157495,  1.46717913,  0.27109463, ...,  0.        ,
        -0.74535599,  1.45296631],
       [ 0.71626842, -0.03622906,  1.05095589, ...,  0.        ,
         1.34164079, -0.6882472 ],
       [-0.04059442, -0.01718351,  1.05095589, ...,  0.        ,
        -0.74535599, -0.6882472 ],
       ...,
       [-0.3649642 ,  1.91474959, -0.50876663, ...,  0.        ,
        -0.74535599,  1.45296631],
       [-0.04059442,  1.38980658,  1.05095589, ...,  0.        ,
        -0.74535599,  1.45296631],
       [ 0.64821564,  1.75762379,  1.05095589, ...,  0.        ,
        -0.74535599,  1.45296631]])

In [33]:
from sklearn.svm import SVC, SVR, LinearSVC
from sklearn.metrics import confusion_matrix

In [34]:
%%time
svm = LinearSVC()
svm.set_params(verbose = 5, max_iter = 1e8)

Wall time: 0 ns


LinearSVC(max_iter=100000000.0, verbose=5)

In [35]:
svm.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': True,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'loss': 'squared_hinge',
 'max_iter': 100000000.0,
 'multi_class': 'ovr',
 'penalty': 'l2',
 'random_state': None,
 'tol': 0.0001,
 'verbose': 5}

In [36]:
%%time
svm.fit(Xtrain, ytrain)

[LibLinear]Wall time: 13min 39s




LinearSVC(max_iter=100000000.0, verbose=5)

In [37]:
%%time
svm.score(Xtrain, ytrain)

Wall time: 7 ms


0.8428571428571429

In [38]:
svm.fit(Xtest, ytest)

[LibLinear]



LinearSVC(max_iter=100000000.0, verbose=5)

In [39]:
svm.score(Xtest,ytest)

0.8666666666666667

In [None]:
target = five_year['loan_status']
five_year_features = five_year.drop('loan_status', axis = 1)

In [None]:
svm.fit(five_year_features, target)
svm.score(five_year_features, target)