Random Search Cross Validation in Scikit-Learn


```
`# This is formatted as code`
```
![alt text](https://)


In [6]:
import pandas as pd
import numpy as np

data = pd.read_csv('outband_QoS_ITU-QoE.csv')

X = data[['outbandQoS_DL_TP (kbps)', 'outbandQoS_UL_TP (kbps)', 'outbandQoS_RTT (ms)', 'outbandQoS_LOSS (ratio)']]
y = data[['QoE_ITU_JT_046_VP9_1280x780']]


In [7]:
from sklearn.model_selection import train_test_split

# Setting random seed
seed = 100
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.34, random_state = seed )
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.05, random_state = seed)

display(X_train.head(5))
display(y_train.shape)
display(y_test.shape)

Unnamed: 0,outbandQoS_DL_TP (kbps),outbandQoS_UL_TP (kbps),outbandQoS_RTT (ms),outbandQoS_LOSS (ratio)
40301,3203.772705,2828.515869,23.46121,0.008517
88730,4243.652344,3920.080078,144.101181,0.015958
34734,994.548096,75.91349,5.422786,0.014154
12143,861.184021,473.565552,28.700308,0.019957
37671,1480.579102,38.121124,6.428771,0.001927


(99278, 1)

(5226, 1)

In [8]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

rf = RandomForestRegressor(random_state = 100)
from pprint import pprint
# Look at parameters used by our current forest
print('Parameters currently in use:\n')
pprint(rf.get_params())


# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 1000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(4, 20, num = 10)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 7]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 3]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Altogether, there are 2 * 12 * 2 * 3 * 3 * 10 = 4320 settings
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

Parameters currently in use:

{'bootstrap': True,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 10,
 'n_jobs': 1,
 'oob_score': False,
 'random_state': 100,
 'verbose': 0,
 'warm_start': False}
{'bootstrap': [True, False],
 'max_depth': [4, 5, 7, 9, 11, 12, 14, 16, 18, 20, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 3],
 'min_samples_split': [2, 5, 7],
 'n_estimators': [10, 120, 230, 340, 450, 560, 670, 780, 890, 1000]}


  from numpy.core.umath_tests import inner1d


In [9]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 5, verbose=2, random_state=100, n_jobs = -1)
# Fit the random search model
y_train = np.ravel(y_train)
rf_random.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] n_estimators=670, min_samples_leaf=2, min_samples_split=5, max_depth=12, max_features=auto, bootstrap=True 
[CV] n_estimators=670, min_samples_leaf=2, min_samples_split=5, max_depth=12, max_features=auto, bootstrap=True 
[CV] n_estimators=670, min_samples_leaf=2, min_samples_split=5, max_depth=12, max_features=auto, bootstrap=True 
[CV] n_estimators=670, min_samples_leaf=2, min_samples_split=5, max_depth=12, max_features=auto, bootstrap=True 
[CV] n_estimators=670, min_samples_leaf=2, min_samples_split=5, max_depth=12, max_features=auto, bootstrap=True 
[CV] n_estimators=890, min_samples_leaf=1, min_samples_split=5, max_depth=12, max_features=auto, bootstrap=True 
[CV] n_estimators=890, min_samples_leaf=1, min_samples_split=5, max_depth=12, max_features=auto, bootstrap=True 
[CV] n_estimators=890, min_samples_leaf=1, min_samples_split=5, max_depth=12, max_features=auto, bootstrap=True 
[CV] n_estimators=890, min_sample

[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  1.4min


[CV]  n_estimators=120, min_samples_leaf=1, min_samples_split=2, max_depth=18, max_features=auto, bootstrap=True, total= 1.3min
[CV] n_estimators=230, min_samples_leaf=3, min_samples_split=2, max_depth=18, max_features=auto, bootstrap=False 
[CV]  n_estimators=120, min_samples_leaf=1, min_samples_split=2, max_depth=18, max_features=auto, bootstrap=True, total= 1.3min
[CV] n_estimators=230, min_samples_leaf=3, min_samples_split=2, max_depth=18, max_features=auto, bootstrap=False 
[CV]  n_estimators=450, min_samples_leaf=2, min_samples_split=5, max_depth=5, max_features=sqrt, bootstrap=True, total=  53.2s
[CV] n_estimators=230, min_samples_leaf=3, min_samples_split=2, max_depth=18, max_features=auto, bootstrap=False 
[CV]  n_estimators=450, min_samples_leaf=2, min_samples_split=5, max_depth=5, max_features=sqrt, bootstrap=True, total=  53.6s
[CV] n_estimators=230, min_samples_leaf=3, min_samples_split=2, max_depth=18, max_features=auto, bootstrap=False 
[CV]  n_estimators=450, min_sample

[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed: 28.9min


[CV]  n_estimators=10, min_samples_leaf=1, min_samples_split=2, max_depth=20, max_features=auto, bootstrap=False, total=  11.3s
[CV] n_estimators=10, min_samples_leaf=1, min_samples_split=2, max_depth=20, max_features=auto, bootstrap=False 
[CV]  n_estimators=120, min_samples_leaf=1, min_samples_split=5, max_depth=14, max_features=sqrt, bootstrap=True, total=  34.3s
[CV] n_estimators=10, min_samples_leaf=1, min_samples_split=2, max_depth=20, max_features=auto, bootstrap=False 
[CV]  n_estimators=450, min_samples_leaf=1, min_samples_split=2, max_depth=16, max_features=sqrt, bootstrap=False, total= 3.7min
[CV] n_estimators=10, min_samples_leaf=1, min_samples_split=2, max_depth=20, max_features=auto, bootstrap=False 
[CV]  n_estimators=10, min_samples_leaf=1, min_samples_split=2, max_depth=20, max_features=auto, bootstrap=False, total=  11.3s
[CV] n_estimators=10, min_samples_leaf=1, min_samples_split=2, max_depth=20, max_features=auto, bootstrap=False 
[CV]  n_estimators=10, min_samples_

[Parallel(n_jobs=-1)]: Done 333 tasks      | elapsed: 86.2min


[CV]  n_estimators=230, min_samples_leaf=2, min_samples_split=2, max_depth=20, max_features=sqrt, bootstrap=True, total= 1.4min
[CV] n_estimators=230, min_samples_leaf=2, min_samples_split=2, max_depth=9, max_features=sqrt, bootstrap=False 
[CV]  n_estimators=230, min_samples_leaf=2, min_samples_split=2, max_depth=20, max_features=sqrt, bootstrap=True, total= 1.4min
[CV] n_estimators=560, min_samples_leaf=3, min_samples_split=5, max_depth=11, max_features=auto, bootstrap=False 
[CV]  n_estimators=230, min_samples_leaf=2, min_samples_split=2, max_depth=9, max_features=sqrt, bootstrap=False, total= 1.1min
[CV] n_estimators=560, min_samples_leaf=3, min_samples_split=5, max_depth=11, max_features=auto, bootstrap=False 
[CV]  n_estimators=230, min_samples_leaf=2, min_samples_split=2, max_depth=9, max_features=sqrt, bootstrap=False, total= 1.1min
[CV] n_estimators=560, min_samples_leaf=3, min_samples_split=5, max_depth=11, max_features=auto, bootstrap=False 
[CV]  n_estimators=230, min_sampl

[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 132.2min finished


RandomizedSearchCV(cv=5, error_score='raise',
          estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
          fit_params=None, iid=True, n_iter=100, n_jobs=-1,
          param_distributions={'n_estimators': [10, 120, 230, 340, 450, 560, 670, 780, 890, 1000], 'min_samples_leaf': [1, 2, 3], 'min_samples_split': [2, 5, 7], 'max_depth': [4, 5, 7, 9, 11, 12, 14, 16, 18, 20, None], 'max_features': ['auto', 'sqrt'], 'bootstrap': [True, False]},
          pre_dispatch='2*n_jobs', random_state=100, refit=True,
          return_train_score='warn', scoring=None, verbose=2)

In [10]:
rf_random.best_params_

{'bootstrap': True,
 'max_depth': 9,
 'max_features': 'sqrt',
 'min_samples_leaf': 2,
 'min_samples_split': 5,
 'n_estimators': 230}

In [6]:
def evaluate(model, X_test, y_test):
    predictions = model.predict(X_test)
    y_test = np.ravel(y_test)
    residules = predictions - y_test
    errors = abs(residules)
    mape = 100 * np.mean(errors / y_test)
    accuracy = 100 - mape
    print('Model Performance')
    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    
    MSE = (1/(len(y_test)))*np.sum((residules)**2)
    RMSE_1234= np.sqrt(MSE)
    print('MSE = {:.4f}, RMSE = {:.4f}'.format(MSE,RMSE_1234))
    #print('R^2 = {}'.format(linreg_1234.score(X, y)))

    return accuracy

# rfr = RandomForestRegressor(max_depth=best_params["max_depth"], n_estimators=best_params["n_estimators"], random_state=False, verbose=False)
base_model = RandomForestRegressor(n_estimators = 10, random_state = 100)
base_model.fit(X_train, y_train)
base_accuracy = evaluate(base_model, X_test, y_test)

random_grid = rf_random.best_estimator_
random_grid_accuracy = evaluate(random_grid, X_test, y_test)

print('Improvement of {:0.2f}%.'.format( 100 * (random_grid_accuracy - base_accuracy) / base_accuracy))

Model Performance
Average Error: 0.3526 degrees.
Accuracy = 89.52%.
MSE = 0.2348, RMSE = 0.4845
Model Performance
Average Error: 0.3314 degrees.
Accuracy = 90.11%.
MSE = 0.2036, RMSE = 0.4512
Improvement of 0.65%.


**Grid Search with Cross Validation**


In [11]:
from sklearn.model_selection import GridSearchCV
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [3, 5, 6, 10, 15, 20],
    'max_features': [2, 3],
    'min_samples_leaf': [2, 3, 4],
    'min_samples_split': [3, 5, 7],
    'n_estimators': [200, 300, 450, 1000]
}
# Create a based model
rf = RandomForestRegressor()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, cv = 5, n_jobs = -1, verbose = 2)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)


Fitting 5 folds for each of 432 candidates, totalling 2160 fits
[CV] n_estimators=200, bootstrap=True, min_samples_split=3, max_depth=3, max_features=2, min_samples_leaf=2 
[CV] n_estimators=200, bootstrap=True, min_samples_split=3, max_depth=3, max_features=2, min_samples_leaf=2 
[CV] n_estimators=200, bootstrap=True, min_samples_split=3, max_depth=3, max_features=2, min_samples_leaf=2 
[CV] n_estimators=200, bootstrap=True, min_samples_split=3, max_depth=3, max_features=2, min_samples_leaf=2 
[CV] n_estimators=200, bootstrap=True, min_samples_split=3, max_depth=3, max_features=2, min_samples_leaf=2 
[CV] n_estimators=300, bootstrap=True, min_samples_split=3, max_depth=3, max_features=2, min_samples_leaf=2 
[CV] n_estimators=300, bootstrap=True, min_samples_split=3, max_depth=3, max_features=2, min_samples_leaf=2 
[CV] n_estimators=300, bootstrap=True, min_samples_split=3, max_depth=3, max_features=2, min_samples_leaf=2 
[CV] n_estimators=300, bootstrap=True, min_samples_split=3, max_

[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   23.6s


[CV]  n_estimators=200, bootstrap=True, min_samples_split=5, max_depth=3, max_features=2, min_samples_leaf=2, total=  15.4s
[CV] n_estimators=300, bootstrap=True, min_samples_split=5, max_depth=3, max_features=2, min_samples_leaf=2 
[CV]  n_estimators=450, bootstrap=True, min_samples_split=3, max_depth=3, max_features=2, min_samples_leaf=2, total=  34.2s
[CV]  n_estimators=450, bootstrap=True, min_samples_split=3, max_depth=3, max_features=2, min_samples_leaf=2, total=  34.2s
[CV] n_estimators=300, bootstrap=True, min_samples_split=5, max_depth=3, max_features=2, min_samples_leaf=2 
[CV]  n_estimators=450, bootstrap=True, min_samples_split=3, max_depth=3, max_features=2, min_samples_leaf=2, total=  34.2s
[CV] n_estimators=300, bootstrap=True, min_samples_split=5, max_depth=3, max_features=2, min_samples_leaf=2 
[CV] n_estimators=300, bootstrap=True, min_samples_split=5, max_depth=3, max_features=2, min_samples_leaf=2 
[CV]  n_estimators=450, bootstrap=True, min_samples_split=3, max_dep

[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:  5.3min


[CV]  n_estimators=450, bootstrap=True, min_samples_split=3, max_depth=3, max_features=2, min_samples_leaf=4, total=  34.1s
[CV] n_estimators=300, bootstrap=True, min_samples_split=5, max_depth=3, max_features=2, min_samples_leaf=4 
[CV]  n_estimators=450, bootstrap=True, min_samples_split=3, max_depth=3, max_features=2, min_samples_leaf=4, total=  34.2s
[CV] n_estimators=300, bootstrap=True, min_samples_split=5, max_depth=3, max_features=2, min_samples_leaf=4 
[CV]  n_estimators=1000, bootstrap=True, min_samples_split=7, max_depth=3, max_features=2, min_samples_leaf=3, total= 1.3min
[CV] n_estimators=300, bootstrap=True, min_samples_split=5, max_depth=3, max_features=2, min_samples_leaf=4 
[CV]  n_estimators=1000, bootstrap=True, min_samples_split=7, max_depth=3, max_features=2, min_samples_leaf=3, total= 1.3min
[CV] n_estimators=300, bootstrap=True, min_samples_split=5, max_depth=3, max_features=2, min_samples_leaf=4 
[CV]  n_estimators=1000, bootstrap=True, min_samples_split=7, max_

[Parallel(n_jobs=-1)]: Done 333 tasks      | elapsed: 15.6min


[CV]  n_estimators=1000, bootstrap=True, min_samples_split=3, max_depth=3, max_features=3, min_samples_leaf=4, total= 1.7min
[CV] n_estimators=300, bootstrap=True, min_samples_split=7, max_depth=3, max_features=3, min_samples_leaf=4 
[CV]  n_estimators=1000, bootstrap=True, min_samples_split=3, max_depth=3, max_features=3, min_samples_leaf=4, total= 1.8min
[CV] n_estimators=450, bootstrap=True, min_samples_split=7, max_depth=3, max_features=3, min_samples_leaf=4 
[CV]  n_estimators=200, bootstrap=True, min_samples_split=7, max_depth=3, max_features=3, min_samples_leaf=4, total=  21.5s
[CV] n_estimators=450, bootstrap=True, min_samples_split=7, max_depth=3, max_features=3, min_samples_leaf=4 
[CV]  n_estimators=1000, bootstrap=True, min_samples_split=3, max_depth=3, max_features=3, min_samples_leaf=4, total= 1.8min
[CV] n_estimators=450, bootstrap=True, min_samples_split=7, max_depth=3, max_features=3, min_samples_leaf=4 
[CV]  n_estimators=200, bootstrap=True, min_samples_split=7, max_

[Parallel(n_jobs=-1)]: Done 616 tasks      | elapsed: 34.6min


[CV]  n_estimators=200, bootstrap=True, min_samples_split=5, max_depth=5, max_features=3, min_samples_leaf=3, total=  33.4s
[CV] n_estimators=450, bootstrap=True, min_samples_split=5, max_depth=5, max_features=3, min_samples_leaf=3 
[CV]  n_estimators=200, bootstrap=True, min_samples_split=5, max_depth=5, max_features=3, min_samples_leaf=3, total=  33.2s
[CV] n_estimators=450, bootstrap=True, min_samples_split=5, max_depth=5, max_features=3, min_samples_leaf=3 
[CV]  n_estimators=200, bootstrap=True, min_samples_split=5, max_depth=5, max_features=3, min_samples_leaf=3, total=  33.2s
[CV] n_estimators=450, bootstrap=True, min_samples_split=5, max_depth=5, max_features=3, min_samples_leaf=3 
[CV]  n_estimators=1000, bootstrap=True, min_samples_split=7, max_depth=5, max_features=3, min_samples_leaf=2, total= 2.8min
[CV] n_estimators=1000, bootstrap=True, min_samples_split=5, max_depth=5, max_features=3, min_samples_leaf=3 
[CV]  n_estimators=300, bootstrap=True, min_samples_split=5, max_d

[Parallel(n_jobs=-1)]: Done 981 tasks      | elapsed: 64.8min


[CV]  n_estimators=300, bootstrap=True, min_samples_split=5, max_depth=6, max_features=3, min_samples_leaf=3, total=  59.7s
[CV] n_estimators=1000, bootstrap=True, min_samples_split=5, max_depth=6, max_features=3, min_samples_leaf=3 
[CV]  n_estimators=300, bootstrap=True, min_samples_split=5, max_depth=6, max_features=3, min_samples_leaf=3, total=  58.8s
[CV] n_estimators=1000, bootstrap=True, min_samples_split=5, max_depth=6, max_features=3, min_samples_leaf=3 
[CV]  n_estimators=300, bootstrap=True, min_samples_split=5, max_depth=6, max_features=3, min_samples_leaf=3, total=  59.2s
[CV] n_estimators=1000, bootstrap=True, min_samples_split=5, max_depth=6, max_features=3, min_samples_leaf=3 
[CV]  n_estimators=300, bootstrap=True, min_samples_split=5, max_depth=6, max_features=3, min_samples_leaf=3, total=  59.1s
[CV] n_estimators=200, bootstrap=True, min_samples_split=7, max_depth=6, max_features=3, min_samples_leaf=3 
[CV]  n_estimators=450, bootstrap=True, min_samples_split=5, max_

KeyboardInterrupt: 

In [12]:
grid_search.best_params_

AttributeError: 'GridSearchCV' object has no attribute 'best_params_'

In [None]:
def evaluate(model, X_test, y_test):
    predictions = model.predict(X_test)
    y_test = np.ravel(y_test)
    residules = predictions - y_test
    errors = abs(residules)
    mape = 100 * np.mean(errors / y_test)
    accuracy = 100 - mape
    print('Model Performance')
    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    
    MSE = (1/(len(y_test)))*np.sum((residules)**2)
    RMSE_1234= np.sqrt(MSE)
    print('MSE = {:.4f}, RMSE = {:.4f}'.format(MSE,RMSE_1234))
    #print('R^2 = {}'.format(linreg_1234.score(X, y)))

    return accuracy

# rfr = RandomForestRegressor(max_depth=best_params["max_depth"], n_estimators=best_params["n_estimators"], random_state=False, verbose=False)
base_model = RandomForestRegressor(n_estimators = 10, random_state = 100)
base_model.fit(X_train, y_train)
base_accuracy = evaluate(base_model, X_test, y_test)

best_grid = grid_search.best_estimator_
grid_accuracy = evaluate(best_grid, X_test, y_test)

print('Improvement of {:0.2f}%.'.format( 100 * (grid_accuracy - base_accuracy) / base_accuracy))

In [None]:

# def save_model(file, model):
#   """
#   The pickle module implements binary protocols
#   for serializing and de-serializing a Python object structure.
#   """
#   fluff, id = file.split('=')
#   #filename = file

#   import pickle
#   _model_str = pickle.dumps(grid_search.best_estimator_)

#   upload = drive.CreateFile({'id': id})
#   upload.SetContentString(str(_model_str))
#   upload.Upload()


# def load_model(file):
#   """
#   The pickle module implements binary protocols
#   for serializing and de-serializing a Python object structure.
#   """
#   fluff, id = file.split('=')

#   import pickle

#   downloaded = drive.CreateFile({'id':id}) 
#   path_to_pickle = 'itu_model.pickle'
#   downloaded.GetContentFile(path_to_pickle)
#   try:
#       with open(path_to_pickle, 'rb') as handle:
#         my_str_as_bytes = str.encode(handle)
#         model = pickle.load(my_str_as_bytes)
#         return model
#   except pickle.UnpicklingError as error:
#       raise Exception(error)
  
#   return None

# file = 'https://drive.google.com/open?id=16ZEqlBFr_5__mN0gV-zxPBiJi9lnRsCR'
# save_model(file, grid_search.best_estimator_)

# model = load_model(file)
# display(model)