We would like to explore Random Forest Regressors and Support Vector Machines along with some input variables to see if we can make a model that accurately forecasts energy demand. We aim to achieve better results than simply saying 'The demand in 30 minutes time will be the same as it is right now'. This is calculated below as having a mean average loss of 218.

In [27]:

# Suppress annoying deprecation warnings
from warnings import filterwarnings  # noqa
filterwarnings(action='ignore',
                        category=DeprecationWarning,
                        module='sklearn')  # noqa

# Import libraries
import os, sys
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_squared_error
import math
from pprint import pprint
from sklearn.metrics import make_scorer
from datetime import datetime, timedelta
import forestci as fci




# Set path to current directory
sys.path.append(os.path.abspath(os.getcwd()))

# read data
data = pd.read_csv('./Cleaned_Data_mkII.csv')

# Set random state
STATE = 69

# If set to true, models will perform hyperparameter tuning. If false they will use
# pre-defined values
SEARCH = False

# Makes sure datetime is in datetime format
data['DATETIME'] = pd.to_datetime(data['DATETIME'])

# We want to test a time column as an input
data['time'] = (data['DATETIME'].dt.strftime("%H%M%S"))

# Also would like to test demand and tmeperature 30, 60 and 90 mins before the current time as input
# These were found to have diminishing return after 60 mins out for demand and 30 mins out for temp
data['demand_30'] = data.TOTALDEMAND.shift(1)
data['demand_60'] = data.TOTALDEMAND.shift(2)
data['demand_90'] = data.TOTALDEMAND.shift(3)

data['temp_30'] = data.TEMPERATURE.shift(1)
data['temp_60'] = data.TEMPERATURE.shift(2)
data['temp_90'] = data.TEMPERATURE.shift(3)

######### Change the number of years
# Select only data from the past 1 years to make hyperparameter tuning take less time
mask = (data['DATETIME'] >= '2020-03-16') & (data['DATETIME'] < '2021-03-16')
data = data.loc[mask]

# The loss that we aim to beat
#round(math.sqrt(mean_squared_error(data['TOTALDEMAND'], data['demand_30'])))


221

First we will try a random forrest regressor with just demand and temperature data for a benchmark

In [43]:
# Create features and target sets
base_features = data[['time', 'temp_30', 'demand_30', 'demand_60']]
base_target = data['TOTALDEMAND']

# Convert to numpy arrays and split training/test data
base_features_np = pd.DataFrame(base_features).to_numpy()
base_target_np = np.ravel(pd.DataFrame(base_target).to_numpy())

base_features_train, base_features_test, base_target_train, base_target_test = train_test_split(base_features_np,
                                                                            base_target_np, random_state = STATE)

# Implement Random Forest
base_rnd_clf = RandomForestRegressor(random_state = STATE)
base_rnd_clf.fit(base_features_train, base_target_train)

# Print error
base_rf_predicted = base_rnd_clf.predict(base_features_test)
base_rf_error = round(math.sqrt(mean_squared_error(base_target_test, base_rf_predicted)))
#print("Baseline Random Forest Error: ", base_rf_error)

# Look at parameters used by our current forest
#print('Parameters currently in use:\n')
#pprint(base_rnd_clf.get_params())

Next, try to include an indicator of if it is a weekday or not, and weather data to see if these increase performance

In [42]:
# And an indictor if it is a weekend or not
data['is_weekday'] = data['DATETIME'].dt.weekday
data['is_weekday'] = np.where(data['is_weekday'] < 5, 1, 0)

# import weather data
weather = pd.read_csv('weather.csv')

# Convert datetime to date
weather['DATE'] = pd.to_datetime(weather[['Year','Month','Day']]).dt.date

# Extract just day and rain columns
weather.columns = ['drop1','drop2','drop3','drop4','drop5','rain','drop6','drop7','DAY']
weather = weather[['DAY','rain']]

# Fill NA values
weather['rain'].fillna(method='ffill', inplace=True)

# Create dummy variables for rain for better model use
weather['RAIN'] = pd.cut(weather['rain'],bins=[-1,0.2,4,10,999999],labels=['NONE','LIGHT','MODERATE','HEAVY'])
weather_dummy = pd.get_dummies(weather,columns=['RAIN'])

# Create day column in data and merge using it
data['DAY'] = data['DATETIME'].dt.date

data = data.merge(weather_dummy, on='DAY')

# Create features and target sets
features = data[['time', 'temp_30', 'demand_30', 'demand_60', 
                 'RAIN_NONE','RAIN_LIGHT','RAIN_MODERATE','RAIN_HEAVY', 'is_weekday']]
target = data['TOTALDEMAND']

# Convert to numpy arrays and split training/test data
features_np = pd.DataFrame(features).to_numpy()
target_np = np.ravel(pd.DataFrame(target).to_numpy())

features_train, features_test, target_train, target_test = train_test_split(features_np, target_np, random_state = STATE)

# Implement Random Forest
rnd_clf = RandomForestRegressor(random_state = STATE)
rnd_clf.fit(features_train, target_train)

# Print error
rf_predicted = rnd_clf.predict(features_test)
rf_error = round(math.sqrt(mean_squared_error(target_test, rf_predicted)))

# Look at parameters used by our current forest
#print('Parameters currently in use:\n')
#pprint(rnd_clf.get_params())


KeyError: "['RAIN_NONE', 'RAIN_LIGHT', 'RAIN_MODERATE', 'RAIN_HEAVY'] not in index"

Weekday and weather data aren't contributing to the basline model. Try random search of optimizing hyperparamters as a quick way to tell if our baseline model can perform any better.

In [39]:
# Tune hyperparameters

# Create the random grid
random_grid = {'n_estimators': list(range(2, 20, 2)),
               'max_features': ['auto', 'sqrt'],
               'max_depth': list(range(1, 20, 1)),
               'min_samples_split': list(range(2, 10, 1)),
               'min_samples_leaf': list(range(2, 10, 1)),
               'bootstrap': [True, False]}

# Random parameter search (3 fold CV)

if SEARCH:
    rf_random = RandomizedSearchCV(estimator = RandomForestRegressor, param_distributions = random_grid,
                                n_iter = 10, cv = 5, verbose = 2, random_state = STATE, n_jobs = -1)
    rf_random.fit(features_train, target_train)
    best_random_pred = rf_random.best_estimator_.predict(base_features_test)
    best_random_error = round(math.sqrt(mean_squared_error(base_target_test,best_random_pred)))
else:
    rf_random = RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=19,
                                      max_features='sqrt', max_leaf_nodes=None,
                                      min_impurity_decrease=0.0, min_impurity_split=None,
                                      min_samples_leaf=7, min_samples_split=5,
                                      min_weight_fraction_leaf=0.0, n_estimators=18, n_jobs=None,
                                      oob_score=False, random_state=STATE, verbose=0, warm_start=False)
    rf_random.fit(base_features_train, base_target_train)
    best_random_pred = rf_random.predict(base_features_test)
    best_random_error = round(math.sqrt(mean_squared_error(base_target_test,best_random_pred)))

# Print error
#print("Randomized Hyp Tuning Error: ", best_random_error)

# Look at parameters used
#print('Parameters currently in use:\n')
#pprint(rf_random.best_estimator_)

This tuning is only trivially better than our base model. No point in continuing hyperparpmeter tuning further, or in continuing with the model, unless we can think of other inputs.

We will now start with SVM modelling

In [31]:
# Implement Support Vector Machine Regressor
from sklearn.svm import SVR

regressor = SVR(kernel = 'rbf')
regressor.fit(base_features_train, base_target_train)

svm_predicted = regressor.predict(base_features_test)
svm_error = round(math.sqrt(mean_squared_error(base_target_test, svm_predicted)))

#print("SVM Error: ", svm_error)


In [32]:
SVM_param_grid = {'C' : [0.1, 1, 10, 100, 1000],
                 'gamma' : [1, 0.1, 0.01, 0.001, 0.0001],
                 'kernel' : ['rbf']}

scorer = make_scorer(mean_squared_error, greater_is_better=False)

if SEARCH:
    svr_gs = GridSearchCV(SVR(), SVM_param_grid, scoring=scorer, random_state=STATE)
else:
    svr_gs = SVR(C=1000, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma=0.0001,
                 kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

svr_gs.fit(base_features_train, base_target_train)

grid_preds = svr_gs.predict(base_features_test)
best_SVM_grid_error = round(math.sqrt(mean_squared_error(base_target_test, grid_preds)))

# Print error
#print("SVM Grid Search Tuning Error: ", best_SVM_grid_error)

# Look at parameters used
#print('Parameters currently in use:\n')
#pprint(svr_gs.get_params())


In [40]:
# Calculate RMSE for each model and benchmark
# Lower = better model

rf_error = round(math.sqrt(mean_squared_error(target_test, rf_predicted)))
svm_error = round(math.sqrt(mean_squared_error(target_test, svm_predicted)))
benchmark_error = round(math.sqrt(mean_squared_error(data['TOTALDEMAND'], data['demand_30'])))

print("FINAL SCORES")
print("Untuned Random Forest Error: ", rf_error)
print("Tuned Random Forest Error: ", best_random_error)
print("Untuned SVM Error: ", svm_error)
print("Tuned SVM Error: ", best_SVM_grid_error)
print("Benchmark Error: ", benchmark_error)

FINAL SCORES
Untuned Random Forest Error:  89
Tuned Random Forest Error:  93
Untuned SVM Error:  1056
Tuned SVM Error:  457
Benchmark Error:  221


In [57]:

predictions = list(rf_random.predict(base_features_np))
targets = list(base_target_np)
output = pd.DataFrame({'prediction':predictions,'target':targets})
output.to_csv('output_rf.csv')