In [9]:
import os, sys
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVR
from sklearn.metrics import mean_squared_error
import math

# Set path to current directory
sys.path.append(os.path.abspath(os.getcwd()))

In [10]:
# Load data and split target and features

# Set this to the percentage of the full data file we want to use. Smaller percentages will reduce computation time for testing
PROPORTION = .1
#Random state for repeatable runs
STATE = 42

data = pd.read_csv('./full_data.csv', index_col=0)
data = data[data['STATE'] == 'NSW'] # We are only looking at NSW here
data = data.sample(frac = PROPORTION)

features = data.iloc[:, 1 : -1 ]
target = data.iloc[:,  -1 ]

In [11]:
# Dummy coding for states
# Not needed because states not used

#features = pd.get_dummies(features,columns=['STATE'])
features = features[['TEMPERATURE']]


In [12]:
# Convert to numpy arrays and split training/test data

features_np = pd.DataFrame(features).to_numpy()
target_np = np.ravel(pd.DataFrame(target).to_numpy())

features_train, features_test, target_train, target_test = train_test_split(features_np, target_np, random_state= STATE)

In [13]:
# Implement Random Forest
# TODO: Make it better

#Hyperparameter tuning grid
random_grid = {'n_estimators': [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)],
               'max_features': ['auto','sqrt'],
               'max_depth': [int(x) for x in np.linspace(10, 110, num = 11)],
               'min_samples_split': [2, 5, 10],
               'min_samples_leaf': [1, 2, 4],
               'bootstrap': [True, False]}

rnd_clf = RandomForestRegressor(random_state = STATE)

rf_random = RandomizedSearchCV(estimator = rnd_clf, param_distributions = random_grid, n_iter = 25, cv = 3, verbose=2, random_state=STATE, n_jobs = -1)# Fit the random search model
rf_random.fit(features_train, target_train)

rf_predicted = rf_random.predict(features_test)



Fitting 3 folds for each of 25 candidates, totalling 75 fits
[CV] END bootstrap=False, max_depth=70, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=1000; total time=15.0min
[CV] END bootstrap=True, max_depth=70, max_features=auto, min_samples_leaf=2, min_samples_split=10, n_estimators=200; total time= 2.2min
[CV] END bootstrap=True, max_depth=70, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=1200; total time=12.6min
[CV] END bootstrap=True, max_depth=100, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=1800; total time=19.1min
[CV] END bootstrap=True, max_depth=70, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=1800; total time=18.9min
[CV] END bootstrap=False, max_depth=70, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=400; total time= 5.6min
[CV] END bootstrap=False, max_depth=60, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=

In [14]:
# Implement Support Vector Machine Regressor
# TODO: Make it better

svm_reg = LinearSVR(epsilon=1.5)
svm_reg.fit(features_train, target_train)
svm_predicted = svm_reg.predict(features_test)

In [15]:
# Calculate RMSE for each model and benchmark
# Lower = better model

rf_error = round(math.sqrt(mean_squared_error(target_test, rf_predicted)))
svm_error = round(math.sqrt(mean_squared_error(target_test, svm_predicted)))
benchmark_error = round(math.sqrt(mean_squared_error(data['TOTALDEMAND'], data['FORECASTDEMAND'])))

print("Random Forest Error: ", rf_error)
print("SVM Error: ", svm_error)
print("Benchmark Error: ", benchmark_error)

Random Forest Error:  1201
SVM Error:  1304
Benchmark Error:  790
