In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import learning_curve
from sklearn import preprocessing
from datetime import datetime

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

In [2]:
X_train = pd.read_csv("X_train_df.csv")
y_train = pd.read_csv("y_train_df.csv")

  * n_estimator:  number of trees in the forest
  * max_features: maximum number of features considered for splitting a node
  * max_depth:    maximum number of levels in each decision trees
  * min_sample_leafs: min. number of data point allowed in a leaf node
  * min_sample_spilts: min. number of data points placed in a leaf before before the node is split
  * bootstrap: method of sampling data point (with or without replacement)

In [3]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 700, stop = 1000, num = 2)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt', 'log2']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(700, 1000, num = 2)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [ 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [ 4, 8]
# Method of selecting samples for training each tree
bootstrap = [True]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}


In [6]:
from pprint import pprint
pprint(random_grid)

{'bootstrap': [True],
 'max_depth': [700, 1000, None],
 'max_features': ['auto', 'sqrt', 'log2'],
 'min_samples_leaf': [4, 8],
 'min_samples_split': [5, 10],
 'n_estimators': [700, 1000]}


In [7]:
rf_cls = RandomForestClassifier(random_state=0, oob_score=True)
# Random search of parameters, using 5 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf_cls, param_distributions = random_grid, n_iter = 10, cv = 5, random_state=0) 

In [8]:
t_start = datetime.now()
# Fit the random search model
rf_random.fit(X_train, y_train.values.ravel())
t_end = datetime.now()
print("Execution time: {}".format(t_end  - t_start))

Execution time: 0:12:37.209220


In [9]:
rf_random.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=4, min_samples_split=5,
            min_weight_fraction_leaf=0.0, n_estimators=700, n_jobs=None,
            oob_score=True, random_state=0, verbose=0, warm_start=False)

In [10]:
rf_random.best_score_

0.8456040909370957

In [11]:
rf_random.best_params_

{'n_estimators': 700,
 'min_samples_split': 5,
 'min_samples_leaf': 4,
 'max_features': 'auto',
 'max_depth': None,
 'bootstrap': True}

In [12]:
import joblib

In [13]:
joblib.dump(rf_random, "random_forest.pkl")

['random_forest.pkl']