# Energy Dataset Model Validation and Selection

In [71]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
%matplotlib inline

In [72]:
# Read the dataset written by the Part 5 notebook (see path below).
df_loaded = pd.read_csv("../Part5_Feature_Selection/energydata_complete_hyperpara.csv")
# `df` is a second name for the same frame, not a copy.
df = df_loaded

# Empty results table: one row per metric; print_metrics() later adds one
# column per fitted model.
metrics_df = pd.DataFrame(
    index = [
        'n_estimators',
        'RSquared_train', 'RSquared_test',
        'RMS_train', 'RMS_test',
        'MAE_train', 'MAE_test',
        'MAPE_train', 'MAPE_test',
    ]
)

In [73]:
# Features are every column except the target; the target is 'Appliances'.
X = df.drop(columns=['Appliances'])
y = df['Appliances']

# Hold out 20% of the rows for testing. The split is seeded so that
# Restart Kernel -> Run All reproduces the same train/test partition (and
# therefore comparable metrics) on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [74]:
# Function to print metrics
def print_training_metrics(**kwargs):
    """Print each keyword metric for the training set, rounded to 2 decimals.

    Every ``name=value`` pair produces one line of the form
    ``<name> for Training Dataset is: <value>``.
    """
    template = '{0} for Training Dataset is: {1}'
    for metric_name in kwargs:
        # Round via a 2-decimal string, then back to float so that trailing
        # zeros are not printed (29.30 -> 29.3).
        rounded = float(format(kwargs[metric_name], '.2f'))
        print(template.format(metric_name, rounded))
def print_testing_metrics(**kwargs):
    """Print each keyword metric for the testing set, rounded to 2 decimals.

    Every ``name=value`` pair produces one line of the form
    ``<name> for Testing Dataset is: <value>``.
    """
    template = '{0} for Testing Dataset is: {1}'
    for metric_name in kwargs:
        # Round via a 2-decimal string, then back to float so that trailing
        # zeros are not printed (70.30 -> 70.3).
        rounded = float(format(kwargs[metric_name], '.2f'))
        print(template.format(metric_name, rounded))

# Function to print and add metrics to dataframe
def print_metrics(df, model, estimators, r2_train, rms_train, mae_train, mape_train, r2_test, rms_test, mae_test, mape_test):
    """Store one model's metrics as a new column of ``df`` and return ``df``.

    Despite its name this function prints nothing; it only records values.

    Parameters
    ----------
    df : DataFrame
        Results table; it is modified in place. Its rows must line up with the
        nine values written here: estimators, then R2, RMS, MAE and MAPE, each
        as a (train, test) pair.
    model : str
        Model label; the column name is ``"<model>_<estimators>"``.
    estimators : int
        Number of estimators; stored unrounded as the first value.
    r2_train, rms_train, mae_train, mape_train : float
        Training-set metrics, rounded to two decimals before storing.
    r2_test, rms_test, mae_test, mape_test : float
        Testing-set metrics, rounded to two decimals before storing.

    Returns
    -------
    DataFrame
        The same ``df`` object that was passed in.
    """
    column = '_'.join((model, str(estimators)))

    # (train, test) pairs in the order the rows are written.
    paired = (
        (r2_train, r2_test),
        (rms_train, rms_test),
        (mae_train, mae_test),
        (mape_train, mape_test),
    )

    values = [estimators]
    for train_value, test_value in paired:
        values.append(float(format(train_value, '.2f')))
        values.append(float(format(test_value, '.2f')))

    # Re-using an existing column name overwrites that column.
    df[column] = values
    return df

## First Run of Random Forest Model

In [75]:
from sklearn.ensemble import RandomForestRegressor
from math import sqrt
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error

def _regression_metrics(y_true, y_pred):
    """Return ``(R2, RMS error, MAE, MAPE in percent)`` for one set of predictions."""
    r2 = r2_score(y_true, y_pred)
    rms = sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    # NOTE: MAPE divides by the true values, so it is inf/nan if any target is 0.
    mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    return r2, rms, mae, mape


def model_randomforest(X_train, y_train, metrics_df, estimators = 10, X_eval = None, y_eval = None, random_state = None):
    """Fit a random forest and record its train/test metrics in ``metrics_df``.

    Parameters
    ----------
    X_train, y_train
        Training features and target used to fit the forest.
    metrics_df : DataFrame
        Results table; a column ``RF_Model_<estimators>`` is added (or
        overwritten) by ``print_metrics``.
    estimators : int, default 10
        Number of trees in the forest.
    X_eval, y_eval : optional
        Held-out features and target to evaluate on. When omitted, the
        notebook-level ``X_test`` / ``y_test`` are used, which is what this
        function always did before these parameters existed.
    random_state : optional
        Passed to ``RandomForestRegressor``; set it for reproducible fits.
        The default ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    DataFrame
        ``metrics_df`` with the new column added.
    """
    # Fall back to the notebook globals so existing four-argument calls keep working.
    if X_eval is None:
        X_eval = X_test
    if y_eval is None:
        y_eval = y_test

    rf = RandomForestRegressor(n_estimators = estimators, random_state = random_state)
    rf.fit(X_train, y_train)

    # Metrics on the held-out data
    r2_test_rf, rms_test_rf, mae_test_rf, mape_test_rf = _regression_metrics(y_eval, rf.predict(X_eval))

    # Metrics on the data the model was fitted on
    r2_train_rf, rms_train_rf, mae_train_rf, mape_train_rf = _regression_metrics(y_train, rf.predict(X_train))

    # Announce the model and store the rounded metrics in the results table
    print('Random Forest Model\n')
    metrics_df = print_metrics(metrics_df, 'RF_Model', estimators, r2_train_rf, rms_train_rf, mae_train_rf, mape_train_rf, r2_test_rf, rms_test_rf, mae_test_rf, mape_test_rf)
    return metrics_df

In [76]:
# Baseline: fit a 10-tree forest and record its train/test metrics as column RF_Model_10.
metrics_df = model_randomforest(X_train, y_train, metrics_df, estimators = 10)
# Bare last expression so the notebook renders the table.
metrics_df

Random Forest Model



Unnamed: 0,RF_Model_10
n_estimators,10.0
RSquared_train,0.92
RSquared_test,0.51
RMS_train,29.3
RMS_test,70.3
MAE_train,12.77
MAE_test,33.69
MAPE_train,12.94
MAPE_test,34.58


## Model Validation via Cross-Validation
> In the basic approach, called k-fold CV, the training set is split into k smaller sets (other approaches are described below, but generally follow the same principles). The following procedure is followed for each of the k “folds”:
 * A model is trained using k-1 of the folds as training data;
 * the resulting model is validated on the remaining part of the data (i.e., it is used as a test set to compute a performance measure such as accuracy).
The performance measure reported by k-fold cross-validation is then the average of the values computed in the loop. This approach can be computationally expensive, but does not waste too much data (as is the case when fixing an arbitrary test set), which is a major advantage in problems such as inverse inference where the number of samples is very small.

![Cross-Validation](./Images/05.03-5-fold-CV.png)

In [6]:
# `sklearn.cross_validation` was removed in scikit-learn 0.20; cross_val_score
# lives in `sklearn.model_selection` (the module this notebook already uses
# for train_test_split).
from sklearn.model_selection import cross_val_score

# The forest fitted inside model_randomforest() is local to that function, so
# no `rf` exists at notebook scope. Build the estimator to validate here;
# cross_val_score fits a fresh clone of it on every fold.
rf = RandomForestRegressor(n_estimators = 10)

# For a regressor the default score is R^2, not accuracy.
# NOTE(review): cv=5 uses contiguous, unshuffled folds; if the rows are
# time-ordered that may explain very low or negative scores - confirm.
scores = cross_val_score(rf, X, y, cv=5)
print("R^2: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))



Accuracy: -0.14 (+/- 0.49)


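Above is the mean cross-validated score (for a regressor this is $R^2$, not accuracy) together with an approximate 95% interval for the score estimate, taken as the mean ± 2 standard deviations across the folds.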

In [7]:
# Read the dataset written by the Part 3 notebook (see path below) so it can
# be cross-validated in the same way as the Part 5 dataset.
df_loaded_original = pd.read_csv("../Part3_Feature_Engineering/energydata_complete_transformed.csv")
# `df_original` is a second name for the same frame, not a copy.
df_original = df_loaded_original

# Target is 'Appliances'; every other column is a feature.
y_original = df_original['Appliances']
X_original = df_original.drop(columns=['Appliances'])

In [8]:
# Cross-validate on the Part 3 dataset. A fresh estimator is passed instead of
# relying on an `rf` variable left over from another cell; cross_val_score
# clones the estimator for every fold, so nothing fitted is reused either way.
# NOTE(review): n_estimators=10 mirrors the notebook's first run - confirm this
# matches the model originally validated here.
scores = cross_val_score(RandomForestRegressor(n_estimators = 10), X_original, y_original, cv=5)
# For a regressor the default score is R^2, not accuracy.
print("R^2: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: -0.71 (+/- 2.38)


## The Bias-variance trade-off ~ Selecting the Best Model
>Fundamentally, the question of "the best model" is about finding a sweet spot in the tradeoff between bias and variance. Consider the following figure, which presents two regression fits to the same dataset:
* The model on the left attempts to find a straight-line fit through the data. 
* The model on the right attempts to fit a high-order polynomial through the data.

>The score here is the $R^2$ score, or coefficient of determination, which measures how well a model performs relative to a simple mean of the target values. $R^2=1$ indicates a perfect match, $R^2=0$ indicates the model does no better than simply taking the mean of the data, and negative values mean even worse models. From the scores associated with these two models, we can make an observation that holds more generally:
* For high-bias models, the performance of the model on the validation set is similar to the performance on the training set.
* For high-variance models, the performance of the model on the validation set is far worse than the performance on the training set.

![Bias-variance trade-off](./Images/05.03-bias-variance-2.png)

If we imagine that we have some ability to tune the model complexity, we would expect the training score and validation score to behave as illustrated in the following figure:

![Bias-variance trade-off](./Images/05.03-validation-curve.png)

The diagram shown here is often called a validation curve, and we see the following essential features:
* The training score is everywhere higher than the validation score. This is generally the case: the model will be a better fit to data it has seen than to data it has not seen.
* For very low model complexity (a high-bias model), the training data is under-fit, which means that the model is a poor predictor both for the training data and for any previously unseen data.
* For very high model complexity (a high-variance model), the training data is over-fit, which means that the model predicts the training data very well, but fails for any previously unseen data.
* For some intermediate value, the validation curve has a maximum. This level of complexity indicates a suitable trade-off between bias and variance.

In [77]:
# Sweep the forest size from 10 to 500 trees in steps of 10, adding one
# RF_Model_<n> column to the metrics table per fit. The first iteration
# (10 trees) overwrites any RF_Model_10 column recorded earlier.
for n_trees in range(10, 510, 10):
    metrics_df = model_randomforest(X_train, y_train, metrics_df, estimators = n_trees)

Random Forest Model

Random Forest Model

Random Forest Model

Random Forest Model

Random Forest Model

Random Forest Model

Random Forest Model

Random Forest Model

Random Forest Model

Random Forest Model

Random Forest Model

Random Forest Model

Random Forest Model

Random Forest Model

Random Forest Model

Random Forest Model

Random Forest Model

Random Forest Model

Random Forest Model

Random Forest Model

Random Forest Model

Random Forest Model

Random Forest Model

Random Forest Model

Random Forest Model

Random Forest Model

Random Forest Model

Random Forest Model

Random Forest Model

Random Forest Model

Random Forest Model

Random Forest Model

Random Forest Model

Random Forest Model

Random Forest Model

Random Forest Model

Random Forest Model

Random Forest Model

Random Forest Model

Random Forest Model

Random Forest Model

Random Forest Model

Random Forest Model

Random Forest Model

Random Forest Model

Random Forest Model

Random Forest Model

Random Forest

In [79]:
# Show the metrics table accumulated so far (one column per fitted model).
# NOTE(review): the recorded output below contains columns such as RF_Model_1
# and RF_Model_11 that the visible sweep, range(10, 510, 10), cannot produce -
# the output looks stale; re-run the notebook to refresh it.
metrics_df

Unnamed: 0,RF_Model_10,RF_Model_1,RF_Model_11,RF_Model_21,RF_Model_31,RF_Model_41,RF_Model_51,RF_Model_61,RF_Model_71,RF_Model_81,...,RF_Model_401,RF_Model_411,RF_Model_421,RF_Model_431,RF_Model_441,RF_Model_451,RF_Model_461,RF_Model_471,RF_Model_481,RF_Model_491
n_estimators,10.0,1.0,11.0,21.0,31.0,41.0,51.0,61.0,71.0,81.0,...,401.0,411.0,421.0,431.0,441.0,451.0,461.0,471.0,481.0,491.0
RSquared_train,0.92,0.71,0.92,0.94,0.94,0.94,0.94,0.95,0.94,0.95,...,0.95,0.95,0.95,0.95,0.95,0.95,0.95,0.95,0.95,0.95
RSquared_test,0.51,0.02,0.52,0.53,0.54,0.55,0.56,0.55,0.55,0.56,...,0.56,0.56,0.56,0.56,0.56,0.56,0.56,0.56,0.56,0.56
RMS_train,29.3,55.43,28.74,25.88,25.67,24.65,24.1,24.01,24.38,23.95,...,23.47,23.46,23.32,23.55,23.42,23.52,23.36,23.4,23.46,23.44
RMS_test,70.3,99.16,69.11,68.68,67.99,67.12,66.41,67.16,67.2,66.4,...,66.28,66.61,66.3,66.57,66.37,66.41,66.48,66.5,66.48,66.54
MAE_train,12.77,14.52,12.62,11.97,11.83,11.62,11.39,11.31,11.47,11.33,...,11.11,11.12,11.04,11.13,11.06,11.12,11.1,11.08,11.1,11.09
MAE_test,33.69,42.91,32.95,33.08,32.4,31.94,31.58,32.01,31.99,31.7,...,31.54,31.71,31.59,31.68,31.55,31.58,31.71,31.64,31.59,31.71
MAPE_train,12.94,14.58,12.65,12.23,12.01,11.99,11.67,11.64,11.74,11.6,...,11.34,11.37,11.3,11.37,11.29,11.39,11.37,11.38,11.36,11.34
MAPE_test,34.58,41.92,34.09,34.33,33.14,32.81,32.7,32.92,33.08,32.66,...,32.61,32.68,32.63,32.74,32.54,32.58,32.75,32.65,32.63,32.76


In [82]:
# Show the metrics table again.
# NOTE(review): the recorded output below differs from the previous display
# (no RF_Model_1, extra RF_Model_501) although no visible cell changes the
# table in between - presumably a since-removed cell did; confirm and re-run.
metrics_df

Unnamed: 0,RF_Model_10,RF_Model_11,RF_Model_21,RF_Model_31,RF_Model_41,RF_Model_51,RF_Model_61,RF_Model_71,RF_Model_81,RF_Model_91,...,RF_Model_411,RF_Model_421,RF_Model_431,RF_Model_441,RF_Model_451,RF_Model_461,RF_Model_471,RF_Model_481,RF_Model_491,RF_Model_501
n_estimators,10.0,11.0,21.0,31.0,41.0,51.0,61.0,71.0,81.0,91.0,...,411.0,421.0,431.0,441.0,451.0,461.0,471.0,481.0,491.0,501.0
RSquared_train,0.92,0.92,0.94,0.94,0.94,0.94,0.95,0.94,0.95,0.95,...,0.95,0.95,0.95,0.95,0.95,0.95,0.95,0.95,0.95,0.95
RSquared_test,0.51,0.52,0.53,0.54,0.55,0.56,0.55,0.55,0.56,0.55,...,0.56,0.56,0.56,0.56,0.56,0.56,0.56,0.56,0.56,0.56
RMS_train,29.3,28.74,25.88,25.67,24.65,24.1,24.01,24.38,23.95,23.94,...,23.46,23.32,23.55,23.42,23.52,23.36,23.4,23.46,23.44,23.38
RMS_test,70.3,69.11,68.68,67.99,67.12,66.41,67.16,67.2,66.4,66.89,...,66.61,66.3,66.57,66.37,66.41,66.48,66.5,66.48,66.54,66.37
MAE_train,12.77,12.62,11.97,11.83,11.62,11.39,11.31,11.47,11.33,11.29,...,11.12,11.04,11.13,11.06,11.12,11.1,11.08,11.1,11.09,11.09
MAE_test,33.69,32.95,33.08,32.4,31.94,31.58,32.01,31.99,31.7,31.96,...,31.71,31.59,31.68,31.55,31.58,31.71,31.64,31.59,31.71,31.59
MAPE_train,12.94,12.65,12.23,12.01,11.99,11.67,11.64,11.74,11.6,11.5,...,11.37,11.3,11.37,11.29,11.39,11.37,11.38,11.36,11.34,11.34
MAPE_test,34.58,34.09,34.33,33.14,32.81,32.7,32.92,33.08,32.66,32.98,...,32.68,32.63,32.74,32.54,32.58,32.75,32.65,32.63,32.76,32.62


## Validation in Practice: Grid Search

In [70]:
# Scratch check of the "<model>_<estimators>" column-name construction used by
# print_metrics(). NOTE(review): this cell sits under the Grid Search heading
# but performs no grid search - looks like leftover experimentation.
model = 'hello'
estimator = 1
yo = '_'.join([model, str(estimator)])
yo

'hello_1'