In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV
import numpy as np
import datetime as dt
from sklearn.metrics import auc

In [3]:
# Load the labelled training data and split it into train / eval / test.
# A fixed seed makes the three partitions identical on every
# Restart-and-Run-All, so scores from different runs are comparable.
SEED = 1011

train_df = pd.read_csv('train.csv')
# 'ID_code' is dropped along with the label so only feature columns remain.
train_x = train_df.drop(columns=['ID_code', 'target'])
train_y = train_df['target']

# Step 1: hold out 30% of all rows as the final test set.
# stratify keeps the class ratio of `target` the same in both parts,
# consistent with the StratifiedKFold used for cross-validation later on.
x_dev, x_test, y_dev, y_test = train_test_split(
    train_x, train_y, test_size=0.3, stratify=train_y, random_state=SEED)

# Step 2: split the remaining 70% again (70/30) into train and eval sets.
# The intermediate frames get their own names (x_dev / y_dev) so that
# x_train / y_train are assigned exactly once in this cell.
x_train, x_eval, y_train, y_eval = train_test_split(
    x_dev, y_dev, test_size=0.3, stratify=y_dev, random_state=SEED)

print('Train Size:{}, {}\nTest Size:{}, {}\nEval Size:{}, {}'
      .format(x_train.shape, y_train.shape, x_test.shape, y_test.shape, 
              x_eval.shape, y_eval.shape))

Train Size:(98000, 200), (98000,)
Test Size:(60000, 200), (60000,)
Eval Size:(42000, 200), (42000,)


In [7]:
# Hyper-parameters held constant for every candidate forest.
# This dict used to be defined but never handed to the estimator; it is now
# actually applied. random_state makes each forest (and therefore the whole
# search) reproducible. The former 'oob_score' and 'verbose' entries are left
# out on purpose: the cross-validation below already supplies a held-out
# estimate, and per-tree logging inside every one of the CV fits would bury
# the GridSearchCV progress output.
params = {'criterion':'gini', 'min_samples_split':2, 'min_samples_leaf':1,
          'random_state':1011}
rf_model = RandomForestClassifier(**params)

# Search space: n_estimators in {200, 400, 600, 800} and
# max_depth in {4, 8, 12, 16} -> 16 candidates.
# NOTE(review): a previous run selected max_depth=16 / n_estimators=800, i.e.
# the upper edge of both ranges, so the optimum may lie outside this grid --
# consider extending the ranges before the next (long) run.
params_grid = {'n_estimators':np.arange(200,1000,200), 'max_depth':np.arange(4,20,4)}

# 10-fold stratified CV, scored by ROC AUC; 16 candidates x 10 folds = 160 fits.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=1011)
grid_search = GridSearchCV(rf_model, param_grid=params_grid, cv=kfold, scoring='roc_auc', 
                           n_jobs=4, verbose=20)
# The recorded run of this cell took roughly ten hours with 4 workers.
grid_search = grid_search.fit(x_train, y_train)

Fitting 10 folds for each of 16 candidates, totalling 160 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:  4.5min
[Parallel(n_jobs=4)]: Done   2 tasks      | elapsed:  4.5min
[Parallel(n_jobs=4)]: Done   3 tasks      | elapsed:  4.5min
[Parallel(n_jobs=4)]: Done   4 tasks      | elapsed:  4.5min
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:  8.9min
[Parallel(n_jobs=4)]: Done   6 tasks      | elapsed:  8.9min
[Parallel(n_jobs=4)]: Done   7 tasks      | elapsed:  8.9min
[Parallel(n_jobs=4)]: Done   8 tasks      | elapsed:  8.9min
[Parallel(n_jobs=4)]: Done   9 tasks      | elapsed: 13.1min
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed: 13.2min
[Parallel(n_jobs=4)]: Done  11 tasks      | elapsed: 17.7min
[Parallel(n_jobs=4)]: Done  12 tasks      | elapsed: 17.7min
[Parallel(n_jobs=4)]: Done  13 tasks      | elapsed: 21.8min
[Parallel(n_jobs=4)]: Done  14 tasks      | elapsed: 21.9min
[Parallel(n_jobs=4)]: Done  15 tasks      | elapsed: 26.1min
[Parallel(

[Parallel(n_jobs=4)]: Done 133 tasks      | elapsed: 548.9min
[Parallel(n_jobs=4)]: Done 134 tasks      | elapsed: 548.9min
[Parallel(n_jobs=4)]: Done 135 tasks      | elapsed: 558.7min
[Parallel(n_jobs=4)]: Done 136 tasks      | elapsed: 558.9min
[Parallel(n_jobs=4)]: Done 137 tasks      | elapsed: 562.5min
[Parallel(n_jobs=4)]: Done 138 tasks      | elapsed: 562.5min
[Parallel(n_jobs=4)]: Done 139 tasks      | elapsed: 572.3min
[Parallel(n_jobs=4)]: Done 140 tasks      | elapsed: 572.6min
[Parallel(n_jobs=4)]: Done 141 tasks      | elapsed: 582.6min
[Parallel(n_jobs=4)]: Done 142 tasks      | elapsed: 582.6min
[Parallel(n_jobs=4)]: Done 143 tasks      | elapsed: 592.3min
[Parallel(n_jobs=4)]: Done 144 tasks      | elapsed: 592.5min
[Parallel(n_jobs=4)]: Done 145 tasks      | elapsed: 602.9min
[Parallel(n_jobs=4)]: Done 146 tasks      | elapsed: 602.9min
[Parallel(n_jobs=4)]: Done 147 tasks      | elapsed: 612.8min
[Parallel(n_jobs=4)]: Done 148 tasks      | elapsed: 613.0min
[Paralle

In [8]:
# Display the best mean cross-validated score of the search together with the
# parameter combination that produced it.
grid_search.best_score_, grid_search.best_params_

(0.8328370149204536, {'max_depth': 16, 'n_estimators': 800})

In [13]:
def summarize_cv_results(grid_result):
    """Print a plain-text report of a finished hyper-parameter search.

    The first line shows the best score and the parameters that achieved it;
    each following line shows the mean test score, its standard deviation
    (in parentheses) and the parameter dict of one candidate.

    Parameters
    ----------
    grid_result : object
        Anything exposing ``best_score_``, ``best_params_`` and a
        ``cv_results_`` mapping with the keys ``'mean_test_score'``,
        ``'std_test_score'`` and ``'params'`` (e.g. a fitted GridSearchCV).

    Returns
    -------
    None
        The report is written to stdout.
    """
    # Headline: overall winner of the search.
    print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

    # One (mean, std, params) triple per candidate, in cv_results_ order.
    cv_results = grid_result.cv_results_
    candidates = zip(cv_results['mean_test_score'],
                     cv_results['std_test_score'],
                     cv_results['params'])
    for candidate in candidates:
        print("%f (%f) with: %r" % candidate)

In [14]:
# Print the best result followed by mean (std) test score for every candidate
# evaluated by the grid search.
summarize_cv_results(grid_search)

Best: 0.832837 using {'max_depth': 16, 'n_estimators': 800}
0.779388 (0.006674) with: {'max_depth': 4, 'n_estimators': 200}
0.782735 (0.007561) with: {'max_depth': 4, 'n_estimators': 400}
0.783249 (0.007458) with: {'max_depth': 4, 'n_estimators': 600}
0.783844 (0.008355) with: {'max_depth': 4, 'n_estimators': 800}
0.799945 (0.008123) with: {'max_depth': 8, 'n_estimators': 200}
0.804778 (0.007770) with: {'max_depth': 8, 'n_estimators': 400}
0.805350 (0.008884) with: {'max_depth': 8, 'n_estimators': 600}
0.805731 (0.007818) with: {'max_depth': 8, 'n_estimators': 800}
0.815799 (0.008312) with: {'max_depth': 12, 'n_estimators': 200}
0.821043 (0.006710) with: {'max_depth': 12, 'n_estimators': 400}
0.820412 (0.008178) with: {'max_depth': 12, 'n_estimators': 600}
0.822805 (0.007790) with: {'max_depth': 12, 'n_estimators': 800}
0.824525 (0.007524) with: {'max_depth': 16, 'n_estimators': 200}
0.831391 (0.006990) with: {'max_depth': 16, 'n_estimators': 400}
0.832656 (0.006855) with: {'max_depth'

In [1]:
import pickle

# Persist the fitted search object so the long-running fit does not have to be
# repeated. The file is opened in a `with` block so the handle is flushed and
# closed even if pickling fails part-way.
# NOTE: this cell needs `grid_search` from the fitting cell in the *same*
# kernel session -- running it on a freshly restarted kernel raises NameError.
with open('rf_model.modelcv', 'wb') as model_file:
    pickle.dump(grid_search, model_file)

NameError: name 'grid_search' is not defined