In [13]:
import pandas as pd
import numpy as np


from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout
from tensorflow.keras.optimizers import RMSprop

from tensorflow.keras.wrappers.scikit_learn import KerasRegressor
from tensorflow.keras.optimizers import Adam
from datetime import datetime


from sklearn.preprocessing import MinMaxScaler,OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.metrics import mean_squared_error, r2_score

In [14]:
#Load dataset into a pandas dataframe
df = pd.read_csv("../data/Cleaned data/data.csv") 

df.dropna(inplace = True)

df.head()

Unnamed: 0,YEAR,MONTH,DAY,HUMIDITY,WINDSPEED,DATE,TOTALDEMAND,HOLIDAY,MIN,MAX,RAIN,SOLAR,RRP,FORECASTDEMAND,OUTPUT,MONTHDATE,WEEKDAY,WEEKEND,TEMPAVE
0,2016,1,1,0.656341,15.902439,2016-01-01,6853.633437,2.0,15.3,28.6,0.0,32.2,38.472917,6665.366167,23.465,01-2016,4,0,21.95
1,2016,1,2,0.656341,15.902439,2016-01-02,6727.613958,0.0,15.9,26.1,0.0,21.7,36.907292,6236.849955,23.465,01-2016,5,1,21.0
2,2016,1,3,0.688837,14.488372,2016-01-03,6616.406076,0.0,17.5,25.6,0.0,10.3,31.997083,6551.924748,23.465,01-2016,6,1,21.55
3,2016,1,4,0.679545,22.477273,2016-01-04,7367.750278,0.0,18.2,23.6,14.0,6.4,33.424583,6729.993123,23.465,01-2016,0,0,20.9
4,2016,1,5,0.768837,22.581395,2016-01-05,7462.242014,0.0,17.6,20.5,39.0,4.4,33.053958,7333.898202,23.465,01-2016,1,0,19.05


In [15]:
#X = df.iloc[:,[3,5,9,16]]
#y = df.iloc[:,10]

df = df.drop(['MIN','MAX','FORECASTDEMAND', 'MONTHDATE','WEEKEND'], axis=1)

#y = df['FORECASTDEMAND']

df.head()


Unnamed: 0,YEAR,MONTH,DAY,HUMIDITY,WINDSPEED,DATE,TOTALDEMAND,HOLIDAY,RAIN,SOLAR,RRP,OUTPUT,WEEKDAY,TEMPAVE
0,2016,1,1,0.656341,15.902439,2016-01-01,6853.633437,2.0,0.0,32.2,38.472917,23.465,4,21.95
1,2016,1,2,0.656341,15.902439,2016-01-02,6727.613958,0.0,0.0,21.7,36.907292,23.465,5,21.0
2,2016,1,3,0.688837,14.488372,2016-01-03,6616.406076,0.0,0.0,10.3,31.997083,23.465,6,21.55
3,2016,1,4,0.679545,22.477273,2016-01-04,7367.750278,0.0,14.0,6.4,33.424583,23.465,0,20.9
4,2016,1,5,0.768837,22.581395,2016-01-05,7462.242014,0.0,39.0,4.4,33.053958,23.465,1,19.05


In [16]:
# Onehot Encoding for categorial data (Weekday)


# Select the "WEEKDAY" column and create a new dataframe
weekday_df = df[['WEEKDAY']]

# Create a one-hot encoder object

encoder = OneHotEncoder(categories='auto')

# Fit and transform the weekday data
weekday_encoded = encoder.fit_transform(df[['WEEKDAY']]).toarray()

# Create a new dataframe with the encoded weekday data
weekday_df = pd.DataFrame(weekday_encoded, columns=['MON', 'TUE', 'WED', 'THU', 'FRI', 'SAT', 'SUN'])

#Drop weekday column
df = df.drop(['WEEKDAY'], axis=1)

# Concatenate the original dataframe with the encoded weekday dataframe
df = pd.concat([df, weekday_df], axis=1)


df.head()

Unnamed: 0,YEAR,MONTH,DAY,HUMIDITY,WINDSPEED,DATE,TOTALDEMAND,HOLIDAY,RAIN,SOLAR,RRP,OUTPUT,TEMPAVE,MON,TUE,WED,THU,FRI,SAT,SUN
0,2016,1,1,0.656341,15.902439,2016-01-01,6853.633437,2.0,0.0,32.2,38.472917,23.465,21.95,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,2016,1,2,0.656341,15.902439,2016-01-02,6727.613958,0.0,0.0,21.7,36.907292,23.465,21.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,2016,1,3,0.688837,14.488372,2016-01-03,6616.406076,0.0,0.0,10.3,31.997083,23.465,21.55,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,2016,1,4,0.679545,22.477273,2016-01-04,7367.750278,0.0,14.0,6.4,33.424583,23.465,20.9,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2016,1,5,0.768837,22.581395,2016-01-05,7462.242014,0.0,39.0,4.4,33.053958,23.465,19.05,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# Convert the 'date' column to a datetime object
df['DATE'] = pd.to_datetime(df['DATE'])

# identify response variable and predictors
X = df.drop(['TOTALDEMAND', 'DATE'], axis=1).values
y = df['TOTALDEMAND'].values

# Split the dataset into training and testing sets. 

train_set = (df['DATE'] >= datetime(2018, 1, 1)) & (df['DATE'] < datetime(2022, 1, 1))
test_set = df['DATE'] >= datetime(2022, 1, 1)



In [18]:
# Normalize the data

scaler = MinMaxScaler(feature_range=(0, 1))
X_train_scaled = scaler.fit_transform(X[train_set.index])
X_test_scaled = scaler.transform(X[test_set.index])

train_set_scaled = np.hstack((X_train_scaled, y[train_set.index][:, None]))
test_set_scaled = np.hstack((X_test_scaled, y[test_set.index][:, None]))


print(train_set_scaled[:5])


[[0.00000000e+00 0.00000000e+00 0.00000000e+00 6.79161283e-01
  3.56595002e-01 1.00000000e+00 0.00000000e+00 1.00000000e+00
  2.35697584e-01 2.49868284e-02 6.38081395e-01 0.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00
  0.00000000e+00 0.00000000e+00 6.85363344e+03]
 [0.00000000e+00 0.00000000e+00 3.33333333e-02 6.79161283e-01
  3.56595002e-01 0.00000000e+00 0.00000000e+00 6.73913043e-01
  2.26106057e-01 2.49868284e-02 6.10465116e-01 0.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  1.00000000e+00 0.00000000e+00 6.72761396e+03]
 [0.00000000e+00 0.00000000e+00 6.66666667e-02 7.12786847e-01
  3.24886080e-01 0.00000000e+00 0.00000000e+00 3.19875776e-01
  1.96024526e-01 2.49868284e-02 6.26453488e-01 0.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 1.00000000e+00 6.61640608e+03]
 [0.00000000e+00 0.00000000e+00 1.00000000e-01 7.03172035e-01
  5.04028540e-01 0.00000000e+00 8.77192982e-02 1.9

In [19]:
#Reshape the data for input to the LSTM model
#(This step is fit the data to 3D tensor format for LSTM model to process the sequential data efficiently 
#and capture any temporal dependencies in the data)

#using sliding window approach to create input-output pairs with
#timesteps n = 1

X_train = []
y_train = []
for i in range(1, len(train_set)):
    X_train.append(train_set_scaled[i-1:i, :])
    y_train.append(train_set_scaled[i, 0])
X_train, y_train = np.array(X_train), np.array(y_train)
X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], X_train.shape[2]))


X_test = []
y_test = []
for i in range(1, len(test_set)):
    X_test.append(test_set_scaled[i-1:i, :])
    y_test.append(test_set_scaled[i, 0])
X_test, y_test = np.array(X_test), np.array(y_test)
X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], X_test.shape[2]))





In [20]:
#Build LSTM model with 3 layers and 1 Dense layer


model = Sequential()
model.add(LSTM(units=64, activation='relu', return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(LSTM(units=32, activation='tanh', return_sequences=True))
model.add(LSTM(units=16, activation='relu'))
model.add(Dense(units=1, activation='linear'))


optimizer = RMSprop(learning_rate=0.0001)
model.compile(loss='mean_squared_error', optimizer=optimizer)

In [None]:
# Extract the best hyperparameters
best_batch_size = grid_search.best_params_['batch_size']
best_epochs = grid_search.best_params_['epochs']
best_dropout = grid_search.best_params_['dropout']
best_optimizer = grid_search.best_params_['optimizer']
best_neurons = grid_search.best_params_['neurons']

# Build and compile the model with the best hyperparameters

model = Sequential()
model.add(LSTM(units=best_neurons, input_shape=(X_train.shape[1], X_train.shape[2]), return_sequences=True))
model.add(Dropout(best_dropout))
model.add(LSTM(units=best_neurons, return_sequences=True))
model.add(Dropout(best_dropout))
model.add(LSTM(units=best_neurons))
model.add(Dropout(best_dropout))
model.add(Dense(units=1, activation='linear'))


# Compile the model with the best optimizer found using grid search
optimizer = best_optimizer
model.compile(loss='mean_squared_error', optimizer=optimizer)


# Train the model with the best number of epochs and batch size found using grid search
history = model.fit(X_train, y_train, epochs=best_epochs, batch_size=best_batch_size, validation_data=(X_test, y_test))



In [None]:

# Make predictions on the test set
y_pred = model.predict(X_test)

# Inverse transform the scaled data back to the original scale
y_pred = scaler.inverse_transform(y_pred)
y_test = scaler.inverse_transform(y_test.reshape(-1, 1))


In [None]:
# Evaluate the LSTM model on the test set
# Calculate MSE and R-squared

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print('MSE: %.4f' % mse)
print('R-squared: %.4f' % r2)

In [None]:
# Plot the loss

import matplotlib.pyplot as plt 
import matplotlib.dates as mdates
import datetime as dt

plt.plot(history.history['loss'], label='train')
plt.plot(history.history['test_loss'], label='validation')
plt.title('Model Loss')
plt.ylabel('MSE')
plt.xlabel('Epoch')
plt.legend()
plt.show()


# Plot actual and forecast demand values
plt.plot(y_test_inv, label='Actual Demand')
plt.plot(y_pred_inv, label='Forecast Demand')
plt.legend()
plt.show()




df_lim = X_all.tail(test_len)
df_lim.columns = df.columns
df_lim["YTEST"] = y_test
df_lim["YPRED"] = y_pred
df_lim["DATE"] = pd.to_datetime(dict(year=df.YEAR, month=df.MONTH, day=df.DAY))
df_lim["FORECAST"] = data["FORECASTDEMAND"].tail(test_len).shift(-1).replace(0, np.nan)
df_lim = df_lim.dropna()
df_lim = df_lim[["DATE","TOTALDEMAND","YTEST","YPRED","FORECAST"]]
df_lim.head()


Date = [d.date() for d in df_lim.DATE]
Demand = df_lim.TOTALDEMAND
Forecast = df_lim.FORECAST
Pred = df_lim.YPRED
Test = df_lim.YTEST


fig, ax = plt.subplots(figsize=(20,5))
forecast = ax.plot(Date, Forecast, linewidth=1, color='red')
test = ax.plot(Date, Test, linewidth=1, color='blue')
ax.legend([test, forecast], labels=["Demand", "Forecast Demand"])
ax.xaxis.set_major_locator(mdates.MonthLocator())
ax.xaxis.set_major_formatter(mdates.DateFormatter('%b %Y'))
ax.set_title("Prior Forecast")
plt.show()


# Print prior model performance of the data used

mae1 = mean_absolute_error(Test, Forecast)
mse1 = mean_squared_error(Test, Forecast)
rmse1 = np.sqrt(mse1)
print("Mean Absolute Error: {:.2f}".format(mae1))
print("Root Mean Squared Error: {:.2f}".format(rmse1))


fig, ax = plt.subplots(figsize=(20,5))
forecast = ax.plot(Date, Pred, linewidth=1, color='red')
test = ax.plot(Date, Test, linewidth=1, color='blue')
ax.legend([test, forecast], labels=["Demand", "Forecast Demand"])
ax.xaxis.set_major_locator(mdates.MonthLocator())
ax.xaxis.set_major_formatter(mdates.DateFormatter('%b %Y'))
ax.set_title("Current Forecast")
plt.show()


# print this current model performance

mae2 = mean_absolute_error(Test, Pred)
mse2 = mean_squared_error(Test, Pred)
rmse2 = np.sqrt(mse2)

print("Mean Absolute Error: {:.2f}".format(mae2))
print("Root Mean Squared Error: {:.2f}".format(rmse2))