We would like to explore Random Forest Regressors and Support Vector Machines along with some input variables to see if we can make a model that accurately forecasts energy demand. We aim to achieve better results than simply saying 'The demand in 30 minutes time will be the same as it is right now'. This is calculated below as having a mean average loss of 218.

In [1]:
# Import libraries
import os, sys
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVR
from sklearn.metrics import mean_squared_error
import math
from pprint import pprint
from datetime import datetime, timedelta

# Set path. Change as required
os.chdir('F:/Users/Dave/Desktop/UNSW-Capstone-Group-E/src/chrisdavid')

# read data
data = pd.read_csv('./Cleaned_Data_mkII.csv')

# Set random state
STATE = 2

# Makes sure datetime is in datetime format
data['DATETIME'] = pd.to_datetime(data['DATETIME'])

# We want to test a time column as an input
data['time'] = (data['DATETIME'].dt.strftime("%H%M%S"))

# Also would like to test demand and tmeperature 30, 60 and 90 mins before the current time as input
# These were found to have diminishing return after 60 mins out for demand and 30 mins out for temp
data['demand_30'] = data.TOTALDEMAND.shift(1)
data['demand_60'] = data.TOTALDEMAND.shift(2)

data['temp_30'] = data.TEMPERATURE.shift(1)

# Select only data from the past 3 years to make hyperparameter tuning take less time
mask = (data['DATETIME'] >= '2018-03-16') & (data['DATETIME'] < '2021-03-16')
data = data.loc[mask]

# The loss that we aim to beat
round(math.sqrt(mean_squared_error(data['TOTALDEMAND'], data['demand_30'])))

  LARGE_SPARSE_SUPPORTED = LooseVersion(scipy_version) >= '0.14.0'
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  from ._gradient_boosting import predict_stages
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  from ._gradient_boosting import predict_stages
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devd

First we will try a random forrest regressor with just demand and temperature data for a benchmark

In [2]:
# Create features and target sets
base_features = data[['time', 'temp_30', 'demand_30', 'demand_60']]
base_target = data['TOTALDEMAND']

# Convert to numpy arrays and split training/test data
base_features_np = pd.DataFrame(base_features).to_numpy()
base_target_np = np.ravel(pd.DataFrame(base_target).to_numpy())

base_features_train, base_features_test, base_target_train, base_target_test = train_test_split(base_features_np,
                                                                            base_target_np, random_state = STATE)

# Implement Random Forest
base_rnd_clf = RandomForestRegressor(random_state = STATE)
base_rnd_clf.fit(base_features_train, base_target_train)

# Print error
base_rf_predicted = base_rnd_clf.predict(base_features_test)
base_rf_error = round(math.sqrt(mean_squared_error(base_target_test, base_rf_predicted)))
print("Baseline Random Forest Error: ", base_rf_error)

# Look at parameters used by our current forest
print('Parameters currently in use:\n')
pprint(base_rnd_clf.get_params())

  if _joblib.__version__ >= LooseVersion('0.12'):


Baseline Random Forest Error:  83
Parameters currently in use:

{'bootstrap': True,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 10,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 2,
 'verbose': 0,
 'warm_start': False}


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  dtype=np.int)
  if _joblib.__version__ >= LooseVersion('0.12'):


Next, try to include weather data to see if it increases performance

In [3]:
# import weather data
weather = pd.read_csv('weather.csv')

# Convert datetime to date
weather['DATE'] = pd.to_datetime(weather[['Year','Month','Day']]).dt.date

# Extract just day and rain columns
weather.columns = ['drop1','drop2','drop3','drop4','drop5','rain','drop6','drop7','DAY']
weather = weather[['DAY','rain']]

# Fill NA values
weather['rain'].fillna(method='ffill', inplace=True)

# Create dummy variables for rain for better model use
weather['RAIN'] = pd.cut(weather['rain'],bins=[-1,0.2,4,10,999999],labels=['NONE','LIGHT','MODERATE','HEAVY'])
weather_dummy = pd.get_dummies(weather,columns=['RAIN'])

# Create day column in data and merge using it
data['DAY'] = data['DATETIME'].dt.date

data = data.merge(weather_dummy, on='DAY')

# Create features and target sets
features = data[['time', 'temp_30', 'demand_30', 'demand_60', 'RAIN_NONE','RAIN_LIGHT','RAIN_MODERATE','RAIN_HEAVY']]
target = data['TOTALDEMAND']

# Convert to numpy arrays and split training/test data
features_np = pd.DataFrame(features).to_numpy()
target_np = np.ravel(pd.DataFrame(target).to_numpy())

features_train, features_test, target_train, target_test = train_test_split(features_np, target_np, random_state = STATE)

# Implement Random Forest
rnd_clf = RandomForestRegressor(random_state = STATE)
rnd_clf.fit(features_train, target_train)

# Print error
rf_predicted = rnd_clf.predict(features_test)
rf_error = round(math.sqrt(mean_squared_error(target_test, rf_predicted)))
print("Random Forest Error: ", rf_error)

# Look at parameters used by our current forest
print('Parameters currently in use:\n')
pprint(rnd_clf.get_params())


  if _joblib.__version__ >= LooseVersion('0.12'):


Random Forest Error:  83
Parameters currently in use:

{'bootstrap': True,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 10,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 2,
 'verbose': 0,
 'warm_start': False}


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  dtype=np.int)
  if _joblib.__version__ >= LooseVersion('0.12'):


Weather data isn't contributing to the basline model. Try random search of optimizing hyperparamters as a quick way to tell if our baseline model can perform any better.

In [4]:
# Tune hyperparameters

# Create the random grid
random_grid = {'n_estimators': list(range(2, 20, 2)),
               'max_features': ['auto', 'sqrt'],
               'max_depth': list(range(1, 20, 1)),
               'min_samples_split': list(range(2, 10, 1)),
               'min_samples_leaf': list(range(2, 10, 1)),
               'bootstrap': [True, False]}

# Create the base model
rf = RandomForestRegressor()

# Random parameter search (3 fold CV)
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid,
                               n_iter = 10, cv = 5, verbose = 2, random_state = STATE, n_jobs = -1)

# Fit the random search model
rf_random.fit(base_features_train, base_target_train)
best_random_pred = rf_random.best_estimator_.predict(base_features_test)
best_random_error = round(math.sqrt(mean_squared_error(base_target_test, best_random_pred)))

# Print error
print("Randomized Hyp Tuning Error: ", best_random_error)

# Look at parameters used
print('Parameters currently in use:\n')
pprint(rf_random.best_estimator_)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  random_state=rnd):
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  fold_sizes = np.full(n_splits, n_samples // n_splits, dtype=np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  test_mask = np.zeros(_num_samples(X), dtype=np.bool)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  test_mask = np.zeros(_num_samples(X), dtype=np.bool)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  test_mask = np.zeros(_num_samples(X), dtype=np.bool)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  test_mask 

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    6.1s finished
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  dtype=np.int)
  if _joblib.__version__ >= LooseVersion('0.12'):


Randomized Hyp Tuning Error:  79
Parameters currently in use:

RandomForestRegressor(bootstrap=False, criterion='mse', max_depth=19,
           max_features='sqrt', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=7, min_samples_split=5,
           min_weight_fraction_leaf=0.0, n_estimators=18, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  dtype=np.int)
  if _joblib.__version__ >= LooseVersion('0.12'):


This tuning is only trivially better than our base model. No point in continuing hyperparpmeter tuning further, or in continuing with the model, unless we can think of other inputs.

We will now start with SVM modelling

In [5]:
# Implement Support Vector Machine Regressor
# TODO: Make it better

svm_reg = LinearSVR(epsilon=1.5)
svm_reg.fit(base_features_train, base_target_train)
svm_predicted = svm_reg.predict(base_features_test)


# Calculate RMSE for each model and benchmark
# Lower = better model


svm_error = round(math.sqrt(mean_squared_error(base_target_test, svm_predicted)))
#benchmark_error = round(math.sqrt(mean_squared_error(data['TOTALDEMAND'], data['FORECASTDEMAND'])))


print("SVM Error: ", svm_error)
#print("Benchmark Error: ", benchmark_error)

SVM Error:  139




In [None]:
# Implement Support Vector Machine Regressor
# TODO: Make it better

svm_reg = LinearSVR(epsilon=1.5)
svm_reg.fit(features_train, target_train)
svm_predicted = svm_reg.predict(features_test)

In [None]:
# Calculate RMSE for each model and benchmark
# Lower = better model

rf_error = round(math.sqrt(mean_squared_error(target_test, rf_predicted)))
svm_error = round(math.sqrt(mean_squared_error(target_test, svm_predicted)))
#benchmark_error = round(math.sqrt(mean_squared_error(data['TOTALDEMAND'], data['FORECASTDEMAND'])))

print("Random Forest Error: ", rf_error)
print("SVM Error: ", svm_error)
#print("Benchmark Error: ", benchmark_error)