### Random Search Cross Validation in Scikit-Learn

In [20]:
import pandas as pd
import numpy as np

# Load the full measurement file. For a quick smoke run, pass `nrows=10` to
# read_csv; 'outband_QoS_ITU-QoE.csv' is an alternative source file.
data = pd.read_csv('Dataset/20000_outband_QoS_ITU-QoE.csv')

# Predictors: the four out-of-band QoS measurements.
X = data.loc[:, [
    'outbandQoS_DL_TP (kbps)',
    'outbandQoS_UL_TP (kbps)',
    'outbandQoS_RTT (ms)',
    'outbandQoS_LOSS (ratio)',
]]
# Target, kept as a single-column DataFrame (later cells flatten it when needed).
y = data.loc[:, ['QoE_ITU_JT_046_VP9_1280x780']]

In [21]:
from sklearn.model_selection import train_test_split

# Fixed seed so the train/test partition is the same on every run.
seed = 100

# Hold out 10% of the rows for testing (an earlier experiment used test_size=0.34).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.10,
    random_state=seed,
)

# Quick sanity check: preview the training features and both target shapes.
display(X_train.head(5))
display(y_train.shape, y_test.shape)

Unnamed: 0,outbandQoS_DL_TP (kbps),outbandQoS_UL_TP (kbps),outbandQoS_RTT (ms),outbandQoS_LOSS (ratio)
3906,4599.458008,867.078674,14.489874,0.015907
18164,4744.419434,558.679443,27.36326,0.024002
3829,4705.260742,1084.92749,39.144135,0.016064
14589,9713.149414,1555.960815,48.297508,0.008926
391,3951.575195,1042.235596,36.710537,0.024447


(18000, 1)

(2000, 1)

In [23]:
from sklearn.ensemble import RandomForestRegressor

def evaluate(model, X_test, y_test):
    """Print regression error metrics for ``model`` and return its accuracy score.

    Parameters
    ----------
    model : object
        Any fitted estimator exposing ``predict(X)``.
    X_test : array-like
        Features passed unchanged to ``model.predict``.
    y_test : array-like
        True targets; flattened to 1-D before comparison.

    Returns
    -------
    float
        ``100 - MAPE`` where MAPE is the mean absolute percentage error (in %).

    Notes
    -----
    Prints the mean absolute error, the accuracy, and MSE / RMSE.
    A target equal to zero makes the percentage error undefined (division by zero).
    """
    # Flatten both sides: a column-shaped (n, 1) prediction would otherwise
    # broadcast against the (n,) targets into an (n, n) matrix of residuals.
    y_true = np.ravel(y_test)
    predictions = np.ravel(model.predict(X_test))

    residuals = predictions - y_true
    errors = np.abs(residuals)

    # MAPE is defined relative to |y|; dividing by the signed target let
    # negative targets *reduce* the reported error.
    mape = 100 * np.mean(errors / np.abs(y_true))
    accuracy = 100 - mape
    print('Model Performance')
    # NOTE(review): the 'degrees' unit label looks like a leftover from another
    # example; kept as-is so it matches previously recorded output - confirm.
    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))

    mse = np.mean(residuals ** 2)
    rmse = np.sqrt(mse)
    print('MSE = {:.4f}, RMSE = {:.4f}'.format(mse, rmse))

    return accuracy

# Flatten the single-column targets to 1-D arrays before fitting.
y_test = np.ravel(y_test)
y_train = np.ravel(y_train)

# Hyper-parameters for the tuned forest (presumably the result of the random
# search named in the notebook title - confirm). Every entry is forwarded to
# the estimator below, so this dict is the single source of truth.
best_params = {
    'bootstrap': True,
    'max_depth': 10,
    'max_features': 2,
    'min_samples_leaf': 2,
    'min_samples_split': 5,
    'n_estimators': 1000,
}

# Baseline: a small forest with otherwise default settings.
# `seed` is the value defined alongside the train/test split, reused here so
# one constant controls all randomness.
base_model = RandomForestRegressor(n_estimators=10, random_state=seed)
base_model.fit(X_train, y_train)
base_accuracy = evaluate(base_model, X_test, y_test)

# Tuned model: unpack the whole dict so no parameter (e.g. 'bootstrap') is
# silently dropped and left to the library default.
rfr = RandomForestRegressor(**best_params, random_state=seed, verbose=False)
print(rfr)

rfr_model = rfr.fit(X_train, y_train)
random_grid_accuracy = evaluate(rfr_model, X_test, y_test)

# Relative gain of the tuned model over the baseline, in percent.
print('Improvement of {:0.2f}%.'.format( 100 * (random_grid_accuracy - base_accuracy) / base_accuracy))

Model Performance
Average Error: 0.3438 degrees.
Accuracy = 89.74%.
MSE = 0.2234, RMSE = 0.4726
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=10,
                      max_features=2, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=2, min_samples_split=5,
                      min_weight_fraction_leaf=0.0, n_estimators=1000,
                      n_jobs=None, oob_score=False, random_state=100,
                      verbose=False, warm_start=False)
Model Performance
Average Error: 0.3246 degrees.
Accuracy = 90.24%.
MSE = 0.1983, RMSE = 0.4453
Improvement of 0.56%.


In [24]:
import os
import pickle

def save_model(file, model):
    """
    Serialize ``model`` to ``file`` using pickle's binary protocol.

    Parameters
    ----------
    file : str
        Destination path. A relative path is resolved against the current
        working directory; an absolute path is used as given.
    model : object
        Any picklable Python object.

    Returns
    -------
    bool
        ``True`` when the object was written.

    Raises
    ------
    Exception
        Wrapping the underlying ``pickle.PickleError`` if serialization fails.
    OSError
        If the folder or file cannot be created or written.
    """
    path_to_pickle = os.path.join(os.getcwd(), file)
    # Create missing parent folders first; otherwise open() fails with
    # FileNotFoundError when e.g. the 'Dataset/' folder does not exist yet.
    os.makedirs(os.path.dirname(path_to_pickle), exist_ok=True)
    try:
        with open(path_to_pickle, 'wb') as handle:
            pickle.dump(model, handle, protocol=pickle.HIGHEST_PROTOCOL)
    except pickle.PickleError as error:
        # Chain explicitly so the original pickling error stays attached as the cause.
        raise Exception(error) from error
    else:
        return True

def load_model(file):
    """
    Deserialize and return the object stored in a pickle file.

    ``file`` is resolved against the current working directory (an absolute
    path is used as given). A ``pickle.UnpicklingError`` is re-raised as a
    plain ``Exception``; any other error (e.g. a missing file) propagates
    unchanged.

    Security: unpickling can execute arbitrary code, so only load files
    that come from a trusted source.
    """
    source_path = os.path.join(os.getcwd(), file)
    with open(source_path, 'rb') as stream:
        try:
            return pickle.load(stream)
        except pickle.UnpicklingError as failure:
            raise Exception(failure)

### Save Model to Pickle

In [25]:
# Relative path; save_model resolves it against the current working directory.
file_name = 'Dataset/20000_rfr_model.pickle'
# Persist the tuned forest. As the cell's last expression, the returned True is displayed.
save_model(file_name, rfr_model)

True

### Load Model and Evaluate

In [26]:
# Restore the model from disk and score *that* object, so this cell checks the
# persisted artifact rather than the in-memory `rfr_model` it was saved from.
model = load_model(file_name)
random_grid_accuracy = evaluate(model, X_test, y_test)

# Relative gain of the reloaded model over the baseline, in percent.
print('Improvement of {:0.2f}%.'.format( 100 * (random_grid_accuracy - base_accuracy) / base_accuracy))

Model Performance
Average Error: 0.3246 degrees.
Accuracy = 90.24%.
MSE = 0.1983, RMSE = 0.4453
Improvement of 0.56%.
