In [2]:
"""
Resources:
https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
https://stats.stackexchange.com/questions/27730/choice-of-k-in-k-fold-cross-validation
https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74
"""
import time
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from IPython.core.display import display, HTML                                    
display(HTML("<style>.container { width:100% !important; }</style>"))                                  #change width of Jupyer Notebook to use the whole window resolution available

def code_timer(tstart,tstop):
    process_time = (tstop-tstart)
    mins, sec = divmod(process_time, 60)                                                        # split to hours and seconds
    return '{:02.0f}m:{:02.0f}s'.format(mins,sec) 


dataset = pd.read_csv('./dataset.csv',header=None)
#display(dataset)
X_train = dataset.iloc[:, 1:] 
y_train = dataset.iloc[:, 0]
#display(X)
#display(y)
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

"""
X_reduced = PCA(n_components=3).fit_transform(X_train)
pca = PCA(4)
pca.fit(X_train)
print(pca.explained_variance_ratio_)
"""

timer_start = time.monotonic()
rfc = RandomForestClassifier()
param_random = { 
    'n_estimators': [100, 200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None], 
    'bootstrap': [True, False]}

rfc_random_search = RandomizedSearchCV(estimator = rfc, param_distributions = param_random, 
                                       n_iter = 100, cv = 5, verbose=2, random_state=42, n_jobs = -1)
rfc_random_search.fit(X_train, y_train)
best_random_search = list(rfc_random_search.best_params_.values())
print(rfc_random_search.best_params_)

param_grid = { 
    'n_estimators': [int(best_random_search[0]-100), int(best_random_search[0]-50), int(best_random_search[0]-25), int(best_random_search[0]),
                                                int(best_random_search[0]+25), int(best_random_search[0])+50, int(best_random_search[0])+100],
    'min_samples_split': [int(best_random_search[1]/3), int(best_random_search[1]/2), int(best_random_search[1]), 
                                                     int(best_random_search[1]*2), int(best_random_search[1]*3)],
    'min_samples_leaf': [best_random_search[2]-1, best_random_search[1], best_random_search[1]+1],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [best_random_search[4]-10, best_random_search[4]-5, best_random_search[4], 
                                        best_random_search[4]+5, best_random_search[4]+10],
    'bootstrap': [best_random_search[5]]}

rfc_grid_search = GridSearchCV(estimator = rfc, param_grid = param_grid, 
                               cv = 5, n_jobs = -1, verbose = 2)
rfc_grid_search.fit(X_train, y_train)
best_grid_search = list(rfc_grid_search.best_params_.values())
print(rfc_grid_search.best_params_)

rfc_optimised = RandomForestClassifier(n_jobs=-1, bootstrap=best_grid_search[0], max_depth=best_grid_search[1], max_features=best_grid_search[2],
                                       min_samples_leaf=best_grid_search[3], min_samples_split=best_grid_search[4], n_estimators=best_grid_search[5])
rfc_optimised.fit(X_train,y_train)
pickle.dump(rfc_optimised, open("randomforest_clf_optimised.p", "wb"))

timer_stop = time.monotonic()
print("The entire optimisation process took: ", code_timer(timer_start, timer_stop))


Fitting 8 folds for each of 100 candidates, totalling 800 fits




{'n_estimators': 400, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 90, 'bootstrap': True}
Fitting 8 folds for each of 1575 candidates, totalling 12600 fits




{'bootstrap': True, 'max_depth': 85, 'max_features': 'auto', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 400}
The entire optimisation process took:  04m:30s
