In [23]:
import os, sys
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVR
from sklearn.metrics import mean_squared_error
import math
from datetime import datetime, timedelta

# Set path to current directory
sys.path.append(os.path.abspath(os.getcwd()))

In [24]:
# Load data and split target and features

# Set this to the percentage of the full data file we want to use. Smaller percentages will reduce computation time for testing
PROPORTION = 1
#Random state for repeatable runs
STATE = 42

data = pd.read_csv('./full_data.csv', index_col=0)
data = data[data['STATE'] == 'NSW'] # We are only looking at NSW here

data = data.sample(frac = PROPORTION)

data['DATE'] = pd.to_datetime(data['TIMESTAMP'])

data['DATE'] = data['DATE'].dt.floor(freq='4H')
data = data.groupby('DATE').agg({'FORECASTDEMAND':'mean',
                                'TEMPERATURE':'mean',
                                'TOTALDEMAND':'mean'}).reset_index()



In [25]:
weather = pd.read_csv('weather.csv')

weather['DATE'] = pd.to_datetime(weather[['Year','Month','Day']]).dt.date
weather.columns = ['drop1','drop2','drop3','drop4','drop5','rain','drop6','drop7','DAY']
weather = weather[['DAY','rain']]
weather['rain'].fillna(method='ffill', inplace=True)
weather['RAIN'] = pd.cut(weather['rain'],bins=[-1,0.2,4,10,999999],labels=['NONE','LIGHT','MODERATE','HEAVY'])

weather_dummy = pd.get_dummies(weather,columns=['RAIN'])


In [26]:
# Dummy coding for states
# Not needed because states not used

def get_delay_temp(row, delay):
    delay_date = row['DATE_SAVE'] - timedelta(hours=delay)
    try:
        return data.loc[delay_date]['TOTALDEMAND']
    except KeyError:
        return 0

data['DAY'] = data['DATE'].dt.date
data['DAILY_MAX_TEMP'] = data.groupby('DAY')['TEMPERATURE'].transform('max')

data = data.merge(weather_dummy, on='DAY')

data['DATE_SAVE'] = data['DATE']
data.set_index('DATE', inplace=True)

data['DEMAND_4HOURDELAY'] = data.apply(get_delay_temp, delay=4, axis=1)
data['DEMAND_8HOURDELAY'] = data.apply(get_delay_temp, delay=8, axis=1)
data['DEMAND_24HOURDELAY'] = data.apply(get_delay_temp, delay=24, axis=1)

data.reset_index(inplace=True)


features = data[['TEMPERATURE','DAILY_MAX_TEMP','RAIN_NONE','RAIN_LIGHT','RAIN_MODERATE','RAIN_HEAVY','DEMAND_4HOURDELAY','DEMAND_8HOURDELAY','DEMAND_24HOURDELAY']]
target = data['TOTALDEMAND']



In [27]:
# Convert to numpy arrays and split training/test data

features_np = pd.DataFrame(features).to_numpy()
target_np = np.ravel(pd.DataFrame(target).to_numpy())

features_train, features_test, target_train, target_test = train_test_split(features_np, target_np, random_state= STATE)

In [28]:
# Implement Random Forest
# TODO: Make it better

#Hyperparameter tuning grid

rnd_clf = RandomForestRegressor(random_state = STATE)
rnd_clf.fit(features_train, target_train)

rf_predicted = rnd_clf.predict(features_test)



In [29]:
# Implement Support Vector Machine Regressor
# TODO: Make it better

svm_reg = LinearSVR(epsilon=1.5)
svm_reg.fit(features_train, target_train)
svm_predicted = svm_reg.predict(features_test)



In [30]:
# Calculate RMSE for each model and benchmark
# Lower = better model

rf_error = round(math.sqrt(mean_squared_error(target_test, rf_predicted)))
svm_error = round(math.sqrt(mean_squared_error(target_test, svm_predicted)))
benchmark_error = round(math.sqrt(mean_squared_error(data['TOTALDEMAND'], data['FORECASTDEMAND'])))

print("Random Forest Error: ", rf_error)
print("SVM Error: ", svm_error)
print("Benchmark Error: ", benchmark_error)

Random Forest Error:  458
SVM Error:  896
Benchmark Error:  755
