# Import File

In [1]:
import pandas as pd
import numpy as np

In [2]:
import sys
sys.path.append('../lending_club')
import config
from data_prep import get_lending_club_data, refine_features

In [32]:
%%time
approved = pd.read_pickle('../data/approved.pickle')

Wall time: 3 s


In [53]:
approved.dtypes

addr_state                         object
annual_inc                        float64
application_type                   object
disbursement_method                object
dti                               float64
earliest_cr_line           datetime64[ns]
emp_length                          int32
emp_title                          object
fico_range_high                   float64
fico_range_low                    float64
grade                              object
home_ownership                     object
initial_list_status                object
inq_last_6mths                    float64
installment                       float64
int_rate                          float64
issue_d                    datetime64[ns]
last_pymnt_d               datetime64[ns]
loan_amnt                         float64
loan_status                         int32
open_acc                          float64
pub_rec                           float64
pub_rec_bankruptcies              float64
purpose                           

### Removed NA's

In [33]:
approved.drop(['emp_title', 'earliest_cr_line', 'grade', 'zip_code', 'last_pymnt_d', 'fico_range_high', 'fico_range_low', 'total_pymnt'], axis = 1, inplace = True)

In [5]:
approved.dtypes

addr_state                         object
annual_inc                        float64
application_type                   object
disbursement_method                object
dti                               float64
emp_length                          int32
home_ownership                     object
initial_list_status                object
inq_last_6mths                    float64
installment                       float64
int_rate                          float64
issue_d                    datetime64[ns]
loan_amnt                         float64
loan_status                         int32
open_acc                          float64
pub_rec                           float64
pub_rec_bankruptcies              float64
purpose                            object
sub_grade                           int32
term                                int32
verification_status                object
days_since_first_credit             int64
fico_score_average                float64
dtype: object

In [34]:
approved.dropna(axis = 0, inplace = True)

### Dummification

In [35]:
%%time
for d in ['addr_state', 'application_type', 'disbursement_method', 'home_ownership', \
          'initial_list_status', 'purpose','verification_status']:
    temp = pd.get_dummies(approved[d], prefix=d, prefix_sep='__')
    temp = temp.drop(d+'__'+str(approved[d].mode()[0]), axis=1)
    #drop the most common one (mode), making it the base case
    approved = pd.concat([approved.drop(d, axis=1),temp], axis=1)
    #merge back with main df

Wall time: 10.2 s


### Split into 3 and 5 year loan DataFrame

In [36]:
from model_prep import divide_by_term

three_year = divide_by_term(approved, 36)
five_year = divide_by_term(approved, 60)
combined = pd.concat([three_year,five_year],axis=0)

In [8]:
combined.shape

(660903, 89)

In [40]:
combined.drop('issue_d', axis = 1, inplace = True)

In [10]:
combined_sample = combined.sample(1000)

### Linear 

In [12]:
target = combined_sample['loan_status']
combined_features = combined_sample.drop('loan_status', axis = 1)

In [30]:
combined_sample['loan_status'].value_counts()

1    861
0    139
Name: loan_status, dtype: int64

In [22]:
from model_prep import split_data

Xtrain,Xtest,ytrain,ytest = split_data(combined_sample, test_size = 0.3)

In [23]:
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import confusion_matrix

In [24]:
%%time
svm_full = LinearSVC()
svm_full.set_params(max_iter = 1e8)

Wall time: 0 ns


LinearSVC(max_iter=100000000.0)

In [25]:
%%time
svm_full.fit(Xtrain, ytrain)

Wall time: 1h 43min 1s




LinearSVC(max_iter=100000000.0)

In [26]:
%%time
svm_full.score(Xtrain, ytrain)

Wall time: 13.5 ms


0.8557142857142858

In [27]:
%%time
confusion_matrix(ytrain, svm_full.predict(Xtrain))

Wall time: 22 ms


array([[  1, 101],
       [  0, 598]], dtype=int64)

In [28]:
%%time
svm_full.score(Xtest, ytest)

Wall time: 12 ms


0.8766666666666667

In [29]:
%%time
confusion_matrix(ytest, svm_full.predict(Xtest))

Wall time: 18 ms


array([[  0,  37],
       [  0, 263]], dtype=int64)

### Saving model

In [44]:
from joblib import dump, load

In [45]:
dump(svm_full, 'svm_full.model')

['svm_full.model']

In [6]:
# For loading model back in
svm_full = load('svm_full.model')

### Testing model on full model

In [41]:
target = combined['loan_status']
combinedfull_features = combined.drop('loan_status', axis = 1)

In [42]:
combined['loan_status'].value_counts()

1    565409
0     95494
Name: loan_status, dtype: int64

In [43]:
confusion_matrix(target, svm_full.predict(combinedfull_features))

array([[   250,  95244],
       [   565, 564844]], dtype=int64)

# Merge test results back into the df to investigate the 'fully paid fails'

In [47]:
model_results = pd.Series(svm_full.predict(combinedfull_features), index = combinedfull_features.index, name = 'loan_status_predict')

In [48]:
df2 = pd.concat([combinedfull_features, target, model_results], axis = 1)

# PnL analysis

In [49]:
model_port = df2.loc[df2['loan_status_predict'] == 1]

In [50]:
approved['PnL'] = approved['total_pymnt'] - approved['loan_amnt']

KeyError: 'total_pymnt'

In [None]:
df_full = approved.copy(deep = True)

In [None]:
PnL = df_full['PnL']

In [None]:
model_port = model_port.merge(PnL, how = 'inner', left_index = True, right_index = True)

In [None]:
print(f'Total investment for Model Portfolio: {model_port.loan_amnt.sum():.0f}')
print(f'Total PnL for Model Portfolio: {model_port.PnL.sum():.0f}')
print(f'Return for Model Portfolio: {(model_port.PnL.sum()/model_port.loan_amnt.sum())*100:.2f}%')

### Linear GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
%%time
param_dict = {'C':np.linspace(8500, 8900, 10)}
grid = GridSearchCV(svm, param_dict, return_train_score = True, n_jobs = -1)
grid.fit(Xtrain, ytrain)

In [None]:
%%time
grid.best_params_

In [None]:
%%time
grid.best_score_

In [None]:
%%time
svc_best = grid.best_estimator_

In [None]:
%%time
svc_best.score(Xtest, ytest)

In [None]:
%%time
confusion_matrix(ytest, svc_best.predict(Xtest))

In [None]:
dump(svc_best, 'svc_best.model')

In [None]:
svc_best = load('svc_best.model')