# 05 Fit and Save Complex Models Separately

### Purpose of Notebook
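- Try some of the more complex models: random forest, SVM, logistic regression, bagged decision trees, gradient boosting, AdaBoost, and a voting ensemble
- Export the fitted models for later review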

## Imports

In [2]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
import random
from matplotlib import pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.svm import SVC
import pickle

# Seed both the stdlib and the NumPy global generators. scikit-learn falls
# back to NumPy's global RNG whenever an estimator is created without an
# explicit `random_state`, so seeding `random` alone does not make those
# fits repeatable.
random.seed(42)
np.random.seed(42)

## Pull in train and test data

In [10]:
def _read_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Load the four pickled train/test objects saved under ../Data.
X_train = _read_pickle('../Data/X_train_clean.pkl')
X_test = _read_pickle('../Data/X_test_clean.pkl')
y_train = _read_pickle('../Data/y_train.pkl')
y_test = _read_pickle('../Data/y_test.pkl')

## Random Forest with GridSearch

In [80]:
%%time
# Exhaustive search over forest size and the split/leaf minimums:
# 4 x 2 x 2 = 16 candidate settings, evaluated with three parallel jobs.
rf_params = dict(
    n_estimators=[10, 30, 50, 100],
    min_samples_split=[2, 3],
    min_samples_leaf=[1, 2],
)
rf = RandomForestClassifier(
    min_samples_leaf=1,
    min_samples_split=2,
    random_state=42,
)
rf_gs = GridSearchCV(estimator=rf, param_grid=rf_params, n_jobs=3, verbose=2)
rf_gs.fit(X_train, y_train)

Fitting 3 folds for each of 16 candidates, totalling 48 fits
[CV] min_samples_leaf=1, min_samples_split=2, n_estimators=10 ........
[CV] min_samples_leaf=1, min_samples_split=2, n_estimators=10 ........
[CV] min_samples_leaf=1, min_samples_split=2, n_estimators=10 ........
[CV]  min_samples_leaf=1, min_samples_split=2, n_estimators=10, total=   3.3s
[CV]  min_samples_leaf=1, min_samples_split=2, n_estimators=10, total=   3.4s
[CV] min_samples_leaf=1, min_samples_split=2, n_estimators=30 ........
[CV]  min_samples_leaf=1, min_samples_split=2, n_estimators=10, total=   3.4s
[CV] min_samples_leaf=1, min_samples_split=2, n_estimators=30 ........
[CV] min_samples_leaf=1, min_samples_split=2, n_estimators=30 ........
[CV]  min_samples_leaf=1, min_samples_split=2, n_estimators=30, total=   3.1s
[CV]  min_samples_leaf=1, min_samples_split=2, n_estimators=30, total=   3.1s
[CV] min_samples_leaf=1, min_samples_split=2, n_estimators=50 ........
[CV]  min_samples_leaf=1, min_samples_split=2, n_est

[Parallel(n_jobs=3)]: Done  35 tasks      | elapsed:   55.8s


[CV] min_samples_leaf=2, min_samples_split=3, n_estimators=10 ........
[CV]  min_samples_leaf=2, min_samples_split=2, n_estimators=100, total=   3.7s
[CV] min_samples_leaf=2, min_samples_split=3, n_estimators=10 ........
[CV]  min_samples_leaf=2, min_samples_split=3, n_estimators=10, total=   3.2s
[CV] min_samples_leaf=2, min_samples_split=3, n_estimators=30 ........
[CV]  min_samples_leaf=2, min_samples_split=3, n_estimators=10, total=   3.2s
[CV] min_samples_leaf=2, min_samples_split=3, n_estimators=30 ........
[CV]  min_samples_leaf=2, min_samples_split=3, n_estimators=10, total=   2.9s
[CV] min_samples_leaf=2, min_samples_split=3, n_estimators=30 ........
[CV]  min_samples_leaf=2, min_samples_split=3, n_estimators=30, total=   3.3s
[CV] min_samples_leaf=2, min_samples_split=3, n_estimators=50 ........
[CV]  min_samples_leaf=2, min_samples_split=3, n_estimators=30, total=   3.3s
[CV] min_samples_leaf=2, min_samples_split=3, n_estimators=50 ........
[CV]  min_samples_leaf=2, min_samp

[Parallel(n_jobs=3)]: Done  48 out of  48 | elapsed:  1.2min finished


CPU times: user 29.6 s, sys: 767 ms, total: 30.4 s
Wall time: 1min 16s


In [84]:
# Persist the fitted random-forest grid search so it can be reviewed later.
with open('../Models/RandomForest_gs.pkl', 'wb') as model_file:
    pickle.dump(rf_gs, model_file)

## Try SVM

In [14]:
%%time
# Tune gamma only: 20 log-spaced candidates between 1e-5 and 1e2,
# ranked by cross-validated accuracy.
param_grid = dict(gamma=np.logspace(-5, 2, 20))
sv = SVC()
gs_sv = GridSearchCV(
    estimator=sv,
    param_grid=param_grid,
    scoring='accuracy',
    n_jobs=3,
    verbose=1,
)
gs_sv.fit(X_train, y_train)

# Report the winning candidate, then its score on the test split.
print('Best Score:', gs_sv.best_score_)
print('Best Parameters:', gs_sv.best_params_)
print('Test:', gs_sv.score(X_test, y_test))

Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:  5.4min
[Parallel(n_jobs=3)]: Done  60 out of  60 | elapsed:  7.3min finished


Best Score: 0.8278388278388278
Best Parameters: {'gamma': 0.6158482110660255}
Test: 0.8444444444444444
CPU times: user 56.7 s, sys: 943 ms, total: 57.6 s
Wall time: 7min 39s


## Logistic Regression with Grid Search

In [15]:
%%time
# Grid search over three values of C for an L2-penalised logistic regression.
lr_params = dict(C=[0.1, 1, 10], penalty=['l2'])
lr = LogisticRegression(random_state=42)
lr_gs = GridSearchCV(estimator=lr, param_grid=lr_params, n_jobs=3, verbose=2)
lr_gs.fit(X_train, y_train)

Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] C=0.1, penalty=l2 ...............................................
[CV] C=0.1, penalty=l2 ...............................................
[CV] C=0.1, penalty=l2 ...............................................
[CV] ................................ C=0.1, penalty=l2, total=   3.3s
[CV] ................................ C=0.1, penalty=l2, total=   3.2s
[CV] C=1, penalty=l2 .................................................
[CV] C=1, penalty=l2 .................................................
[CV] ................................ C=0.1, penalty=l2, total=   3.2s
[CV] C=1, penalty=l2 .................................................
[CV] .................................. C=1, penalty=l2, total=   3.1s
[CV] .................................. C=1, penalty=l2, total=   3.1s
[CV] C=10, penalty=l2 ................................................
[CV] C=10, penalty=l2 ................................................
[CV] .............

[Parallel(n_jobs=3)]: Done   9 out of   9 | elapsed:   15.1s remaining:    0.0s
[Parallel(n_jobs=3)]: Done   9 out of   9 | elapsed:   15.1s finished


CPU times: user 5.59 s, sys: 268 ms, total: 5.86 s
Wall time: 15.9 s


In [16]:
# Score the best logistic-regression candidate on each split, then report both.
lr_train_score = lr_gs.score(X_train, y_train)
lr_test_score = lr_gs.score(X_test, y_test)
print('Train:', lr_train_score)
print('Test:', lr_test_score)

Train: 1.0
Test: 0.9094017094017094


In [17]:
# Display the parameter combination selected by the logistic-regression grid search.
lr_gs.best_params_

{'C': 10, 'penalty': 'l2'}

In [18]:
# Persist the fitted logistic-regression grid search so it can be reviewed later.
with open('../Models/LogisticRegression_gs.pkl', 'wb') as model_file:
    pickle.dump(lr_gs, model_file)

## Bagging Classifier with Decision Trees

In [4]:
%%time
# Bagged decision trees: search the ensemble size together with the depth and
# split minimum of the underlying tree (2 x 3 x 2 = 12 candidates).
bag_params = dict(
    n_estimators=[10, 15],
    base_estimator__max_depth=[4, 6, 8],
    base_estimator__min_samples_split=[2, 3],
)
bag = BaggingClassifier(base_estimator=DecisionTreeClassifier(), random_state=42)
bag_gs = GridSearchCV(estimator=bag, param_grid=bag_params, n_jobs=3, verbose=2)
bag_gs.fit(X_train, y_train)

Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV] base_estimator__max_depth=4, base_estimator__min_samples_split=2, n_estimators=10 
[CV] base_estimator__max_depth=4, base_estimator__min_samples_split=2, n_estimators=10 
[CV] base_estimator__max_depth=4, base_estimator__min_samples_split=2, n_estimators=10 
[CV]  base_estimator__max_depth=4, base_estimator__min_samples_split=2, n_estimators=10, total=   4.1s
[CV]  base_estimator__max_depth=4, base_estimator__min_samples_split=2, n_estimators=10, total=   4.2s
[CV] base_estimator__max_depth=4, base_estimator__min_samples_split=2, n_estimators=15 
[CV]  base_estimator__max_depth=4, base_estimator__min_samples_split=2, n_estimators=10, total=   4.3s
[CV] base_estimator__max_depth=4, base_estimator__min_samples_split=2, n_estimators=15 
[CV] base_estimator__max_depth=4, base_estimator__min_samples_split=2, n_estimators=15 
[CV]  base_estimator__max_depth=4, base_estimator__min_samples_split=2, n_estimators=15, total=   4.6s

[Parallel(n_jobs=3)]: Done  36 out of  36 | elapsed:  1.3min finished


CPU times: user 23.7 s, sys: 704 ms, total: 24.4 s
Wall time: 1min 21s


In [5]:
# Report the selected bagging parameters, then the score on each split.
bag_best = bag_gs.best_params_
print('Best Params:', bag_best)
bag_train_score = bag_gs.score(X_train, y_train)
print('Train:', bag_train_score)
bag_test_score = bag_gs.score(X_test, y_test)
print('Test:', bag_test_score)

Best Params: {'base_estimator__max_depth': 8, 'base_estimator__min_samples_split': 2, 'n_estimators': 10}
Train: 0.9457875457875458
Test: 0.8854700854700854


In [7]:
# Persist the fitted bagging grid search so it can be reviewed later.
with open('../Models/Bag_Trees_gs.pkl', 'wb') as model_file:
    pickle.dump(bag_gs, model_file)

## Gradient Boosting with Grid Search

In [95]:
%%time
# Gradient boosting: 3 learning rates x 3 tree depths = 9 candidates.
gb_params = dict(
    learning_rate=[0.05, 0.1, 0.3],
    max_depth=[3, 4, 5],
)
gb = GradientBoostingClassifier(random_state=42)
gb_gs = GridSearchCV(estimator=gb, param_grid=gb_params, n_jobs=3, verbose=2)
gb_gs.fit(X_train, y_train)

Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV] learning_rate=0.05, max_depth=3 .................................
[CV] learning_rate=0.05, max_depth=3 .................................
[CV] learning_rate=0.05, max_depth=3 .................................
[CV] .................. learning_rate=0.05, max_depth=3, total=  15.4s
[CV] .................. learning_rate=0.05, max_depth=3, total=  15.5s
[CV] learning_rate=0.05, max_depth=4 .................................
[CV] .................. learning_rate=0.05, max_depth=3, total=  15.7s
[CV] learning_rate=0.05, max_depth=4 .................................
[CV] learning_rate=0.05, max_depth=4 .................................
[CV] .................. learning_rate=0.05, max_depth=4, total=  19.8s
[CV] learning_rate=0.05, max_depth=5 .................................
[CV] .................. learning_rate=0.05, max_depth=4, total=  19.8s
[CV] learning_rate=0.05, max_depth=5 .................................
[CV] ............

[Parallel(n_jobs=3)]: Done  27 out of  27 | elapsed:  2.8min finished


CPU times: user 38.9 s, sys: 597 ms, total: 39.5 s
Wall time: 3min 12s


In [99]:
# Report the selected gradient-boosting parameters, then the score on each split.
gb_best = gb_gs.best_params_
print('Best Params:', gb_best)
gb_train_score = gb_gs.score(X_train, y_train)
print('Train:', gb_train_score)
gb_test_score = gb_gs.score(X_test, y_test)
print('Test:', gb_test_score)

Best Params: {'learning_rate': 0.1, 'max_depth': 4}
Train: 1.0
Test: 0.9145299145299145


In [100]:
# Persist the fitted gradient-boosting grid search so it can be reviewed later.
with open('../Models/GradientBoost_gs.pkl', 'wb') as model_file:
    pickle.dump(gb_gs, model_file)

## AdaBoost with Grid Search

In [11]:
%%time
# AdaBoost: 2 ensemble sizes x 2 learning rates = 4 candidates.
ada_params = dict(
    n_estimators=[50, 100],
    learning_rate=[0.5, 1.0],
)
ada = AdaBoostClassifier(random_state=42)
ada_gs = GridSearchCV(estimator=ada, param_grid=ada_params, n_jobs=3, verbose=2)
ada_gs.fit(X_train, y_train)


Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV] learning_rate=0.5, n_estimators=50 ..............................
[CV] learning_rate=0.5, n_estimators=50 ..............................
[CV] learning_rate=0.5, n_estimators=50 ..............................
[CV] ............... learning_rate=0.5, n_estimators=50, total=   4.6s
[CV] learning_rate=0.5, n_estimators=100 .............................
[CV] ............... learning_rate=0.5, n_estimators=50, total=   4.6s
[CV] learning_rate=0.5, n_estimators=100 .............................
[CV] ............... learning_rate=0.5, n_estimators=50, total=   4.7s
[CV] learning_rate=0.5, n_estimators=100 .............................
[CV] .............. learning_rate=0.5, n_estimators=100, total=   6.2s
[CV] .............. learning_rate=0.5, n_estimators=100, total=   6.2s
[CV] learning_rate=1.0, n_estimators=50 ..............................
[CV] learning_rate=1.0, n_estimators=50 ..............................
[CV] ............

[Parallel(n_jobs=3)]: Done  12 out of  12 | elapsed:   29.4s finished


CPU times: user 9.29 s, sys: 272 ms, total: 9.56 s
Wall time: 32.3 s


In [12]:
# Report the cross-validated winner first, then its score on each split.
ada_cv_score = ada_gs.best_score_
ada_best = ada_gs.best_params_
print('Best Score:', ada_cv_score)
print('Best Parameters:', ada_best)
ada_train_score = ada_gs.score(X_train, y_train)
print('Train:', ada_train_score)
ada_test_score = ada_gs.score(X_test, y_test)
print('Test:', ada_test_score)

Best Score: 0.906959706959707
Best Parameters: {'learning_rate': 0.5, 'n_estimators': 50}
Train: 0.9472527472527472
Test: 0.9128205128205128


In [14]:
# Persist the fitted AdaBoost grid search so it can be reviewed later.
with open('../Models/AdaBoost_gs.pkl', 'wb') as model_file:
    pickle.dump(ada_gs, model_file)

## Final Try Before Moving On: Voting Classifier with the Best Models

In [76]:
%%time
# Voting ensemble built from four of the model families tuned earlier in this
# notebook. Every member is given random_state=42, matching the other
# estimators in the notebook, so re-running the cell builds the same ensemble.
vote = VotingClassifier([
    ('ada', AdaBoostClassifier(learning_rate=0.5, random_state=42)),
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('lr', LogisticRegression(C=10, random_state=42)),
    # max_depth=4 is the depth chosen by the gradient-boosting grid search
    # (best params: learning_rate=0.1, max_depth=4); it was previously 5.
    ('gb', GradientBoostingClassifier(max_depth=4, random_state=42))
])
# Candidate weightings in member order (ada, rf, lr, gb): equal weights,
# then emphasis on ada/rf, then emphasis on lr/gb.
vote_params = {
    'weights': [[0.25, 0.25, 0.25, 0.25],
                [0.4, 0.4, 0.1, 0.1],
                [0.1, 0.1, 0.4, 0.4]]
}
vote_gs = GridSearchCV(vote, param_grid=vote_params, verbose=2, n_jobs=2)
vote_gs.fit(X_train, y_train)

Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] weights=[0.25, 0.25, 0.25, 0.25] ................................
[CV] weights=[0.25, 0.25, 0.25, 0.25] ................................
[CV] weights=[0.25, 0.25, 0.25, 0.25] ................................
[CV] weights=[0.4, 0.4, 0.1, 0.1] ....................................


Process ForkPoolWorker-17:
Process ForkPoolWorker-16:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/Users/yibingchen/anaconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/Users/yibingchen/anaconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/Users/yibingchen/anaconda3/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/yibingchen/anaconda3/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/yibingchen/anaconda3/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/Users/yibingchen/anaconda3/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/Users/yibingchen/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/pool.py", line 360, in get
    racquire()
 

KeyboardInterrupt: 

In [75]:
# Summarise the voting-ensemble search. Each score is labelled with the split
# it was computed on (the train-set score was previously printed as 'Test:').
print('Best Score:', vote_gs.best_score_)
print('Best Parameters:', vote_gs.best_params_)
print('Train:',vote_gs.score(X_train,y_train))
print('Test:',vote_gs.score(X_test,y_test))

Best Score: 0.906959706959707
Best Parameters: {'learning_rate': 0.5, 'n_estimators': 50}
Test: 0.9472527472527472
Test: 0.9128205128205128
