# Energy Consumption Forecasting
## Table of Contents
1. [Model Selection](#model-selection)
2. [Feature Engineering](#feature-engineering)
3. [Hyperparameter Tuning](#hyperparameter-tuning)
4. [Implementation](#implementation)



In [None]:
# Import data from google drive
# Mounts Google Drive at /content/drive; the google.colab module is only
# available inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
import holidays
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, GRU, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import LearningRateScheduler
from tensorflow.keras.optimizers.schedules import ExponentialDecay
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.regularizers import l1, l2
import matplotlib.pyplot as plt

## Model Selection

Long Short-Term Memory (LSTM) and Gated Recurrent Unit (GRU) are two types of Recurrent Neural Networks (RNNs) that are widely used for sequence prediction tasks, including electricity consumption prediction.

* LSTM (Long Short-Term Memory) networks are designed to overcome the vanishing gradient problem of traditional RNNs by incorporating memory cells that can maintain information over long sequences. LSTMs are excellent at capturing temporal dependencies and patterns within time-series data, making them suitable for predicting future values based on past consumption. LSTMs can model complex nonlinear relationships in electricity usage patterns, accounting for factors like time of day, temperature, and seasonal variations.
* GRU (Gated Recurrent Unit) networks are a variation of LSTMs with a simpler structure. GRUs have fewer parameters than LSTMs, which can lead to faster training times and reduced computational complexity.



## Feature Engineering

* Electricity consumption exhibits multilevel seasonality, which is captured with the following features:  
  * month $(M_i)$, where $i$ = 1 (Jan.), 2 (Feb.), ..., 11 (Nov.), 0 (Dec.)
  * week $(W_j)$, where $j$ = 1 (Mon),..., 6 (Sat), 0 (Sun)
  * Hours of Daylight ($HDL_t$)
  * Holidays ($H_t$)
  * Temperature ($T_t$)

In [None]:
# Load the dataset
# 'Date (UTC)' is parsed as datetimes and used as the index so the calendar
# features below can be derived directly from it.
energy_data = pd.read_csv("drive/MyDrive/data/energy.csv", parse_dates=['Date (UTC)'], index_col='Date (UTC)')

# Feature Engineering
# dummy variables for month and week
# dtype=int keeps the dummies numeric. pandas >= 2.0 returns bool columns by
# default, and mixing bool with float columns turns DataFrame.values into an
# object array that the models cannot consume.
energy_data['Month'] = energy_data.index.month
month_dummies = pd.get_dummies(energy_data['Month'], prefix='Month', dtype=int)
energy_data = pd.concat([energy_data, month_dummies], axis=1)

# DatetimeIndex.weekday numbers the days Monday=0 ... Sunday=6.
energy_data['Week'] = energy_data.index.weekday
week_dummies = pd.get_dummies(energy_data['Week'], prefix='Week', dtype=int)
energy_data = pd.concat([energy_data, week_dummies], axis=1)

# HDL (Hours of Daylight)
energy_data['d_t'] = energy_data.index.dayofyear
# Sun's inclination angle (lambda_t) calculation
lambda_t = 0.4102 * np.sin((2 * np.pi / 365) * (energy_data['d_t'] - 80.25))
# Latitude for Germany (degrees; converted to radians inside the formula)
delta = 52
HDL = 7.722 * np.arccos(-np.tan(2 * np.pi * delta / 360) * np.tan(lambda_t))
energy_data['HDL'] = HDL

# dummy variables for holidays
H = holidays.Germany()
# Function to check if a date is a holiday
def is_holiday(date):
    return date in H

# Boolean holiday flag per timestamp, stored as 0/1.
energy_data['holiday'] = energy_data.index.to_series().apply(is_holiday)
energy_data['holiday'] = energy_data['holiday'].astype(int)

# select dataset for train model
# Both bounds are inclusive.
start_date = '2018-03-01'
end_date = '2019-03-01'
filtered_data = energy_data[(energy_data.index >= start_date) & (energy_data.index <= end_date)]
# Drop the raw helper columns; their information lives in the dummies / HDL.
filtered_data = filtered_data.drop(['Month', 'Week', 'd_t'], axis=1)
features_to_scale = ['Load', 'Temperature', 'HDL']
# Initialize the scaler
scaler = MinMaxScaler()
# Scale the features
# NOTE(review): the scaler is fitted on the whole selected period, before any
# train/test split, so held-out rows influence the scaling - confirm intended.
filtered_data[features_to_scale] = scaler.fit_transform(filtered_data[features_to_scale])
filtered_data.head()

* Group 1: Feature (Load)
* Group 2: Feature (Load, Temperature)
* Group 3: Feature (Load, Temperature, HDL)
* Group 4: Feature (Load, Temperature, HDL, holiday)
* Group 5: Feature (Load, Temperature, HDL, holiday, month, week)
### One Hour Forecasting

In [None]:
# Reshape data for LSTM and GRU
def create_dataset(data, n_steps):
    """Build sliding-window samples for one-step-ahead forecasting.

    Parameters:
    data (DataFrame): Rows in time order; column 0 is used as the target.
    n_steps (int): Number of consecutive rows in each input window.

    Returns:
    (X, y): X stacks the windows of n_steps rows (all columns); y holds,
    for each window, the column-0 value of the row right after it.
    """
    window_count = len(data) - n_steps
    # Each window covers rows [start, start + n_steps) across every column.
    windows = [data.iloc[start:start + n_steps].values for start in range(window_count)]
    # The target is the first-column value immediately following the window.
    targets = [data.iloc[start + n_steps, 0] for start in range(window_count)]
    return np.array(windows), np.array(targets)

# set LSTM and GRU model for training
def create_model_h(model_type, input_shape, hidden_units=50, dense_units=1, activation='linear'):
    """
    Creates and returns a Sequential model with either an LSTM or GRU layer.

    Parameters:
    model_type (str): Type of the model - 'LSTM' or 'GRU'.
    input_shape (tuple): Shape of the input data (time steps, features).
    hidden_units (int): Number of units in the LSTM/GRU layer.
    dense_units (int): Number of units in the Dense output layer.
    activation (str): 'linear' is a common choice for regression problems.

    Returns:
    Sequential model

    Raises:
    ValueError: If model_type is neither 'LSTM' nor 'GRU'.
    """
    # Fail fast: an unrecognised type used to fall through silently and
    # return a model consisting of the Dense layer only.
    if model_type not in ('LSTM', 'GRU'):
        raise ValueError(f"model_type must be 'LSTM' or 'GRU', got {model_type!r}")

    model = Sequential()

    if model_type == 'LSTM':
        model.add(LSTM(hidden_units, input_shape=input_shape))
    else:
        model.add(GRU(hidden_units, input_shape=input_shape))

    model.add(Dense(dense_units, activation=activation))

    return model

In [None]:
# Nested feature groups 1-4. 'Load' stays the first column in every group
# because the windowing helpers read the target from column 0.
group_columns = ['Load', 'Temperature', 'HDL', 'holiday']
filtered_data_1 = filtered_data[group_columns[:1]]
filtered_data_2 = filtered_data[group_columns[:2]]
filtered_data_3 = filtered_data[group_columns[:3]]
filtered_data_4 = filtered_data[group_columns[:4]]

In [None]:
# Group 1 (Load only): one-step-ahead forecast from a window of n_steps rows.
n_steps = 1
n_features = 1  # number of columns in filtered_data_1
X, y = create_dataset(filtered_data_1, n_steps)

# Splitting the dataset into training and testing sets
# NOTE(review): train_test_split shuffles by default, so neighbouring time
# windows can land on both sides of the split - confirm this is acceptable
# for time-series evaluation. X_test / y_test are not used in this cell.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# LSTM
model_LSTM = create_model_h('LSTM', input_shape = (n_steps, n_features))
optimizer = Adam(learning_rate=0.01)
model_LSTM.compile(loss='mean_squared_error', optimizer=optimizer)
history_LSTM = model_LSTM.fit(X_train, y_train, epochs=100, validation_split=0.2, verbose=0, batch_size=32)

# GRU
# Same configuration as the LSTM above, with a fresh optimizer instance.
model_GRU = create_model_h('GRU', input_shape = (n_steps, n_features))
optimizer = Adam(learning_rate=0.01)
model_GRU.compile(loss='mean_squared_error', optimizer=optimizer)
history_GRU = model_GRU.fit(X_train, y_train, epochs=100, validation_split=0.2, verbose=0, batch_size=32)

# Evaluate
# Score both models on the last 20% of the training arrays (by position).
# NOTE(review): presumably the same samples Keras holds out for
# validation_split=0.2 - confirm.
X_val, y_val = X_train[int(X_train.shape[0]*0.8):], y_train[int(y_train.shape[0]*0.8):]
lstm_eval = model_LSTM.evaluate(X_val, y_val, verbose=0)
gru_eval = model_GRU.evaluate(X_val, y_val, verbose=0)
lstm_eval, gru_eval  # cell output: (LSTM loss, GRU loss), both mean squared error

In [None]:
# Group 2 (Load, Temperature): one-step-ahead forecast from a window of n_steps rows.
n_steps = 1
n_features = 2  # number of columns in filtered_data_2
X, y = create_dataset(filtered_data_2, n_steps)

# Splitting the dataset into training and testing sets
# NOTE(review): train_test_split shuffles by default, so neighbouring time
# windows can land on both sides of the split - confirm this is acceptable
# for time-series evaluation. X_test / y_test are not used in this cell.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# LSTM
model_LSTM = create_model_h('LSTM', input_shape = (n_steps, n_features))
optimizer = Adam(learning_rate=0.01)
model_LSTM.compile(loss='mean_squared_error', optimizer=optimizer)
history_LSTM = model_LSTM.fit(X_train, y_train, epochs=100, validation_split=0.2, verbose=0, batch_size=32)

# GRU
# Same configuration as the LSTM above, with a fresh optimizer instance.
model_GRU = create_model_h('GRU', input_shape = (n_steps, n_features))
optimizer = Adam(learning_rate=0.01)
model_GRU.compile(loss='mean_squared_error', optimizer=optimizer)
history_GRU = model_GRU.fit(X_train, y_train, epochs=100, validation_split=0.2, verbose=0, batch_size=32)

# Evaluate
# Score both models on the last 20% of the training arrays (by position).
# NOTE(review): presumably the same samples Keras holds out for
# validation_split=0.2 - confirm.
X_val, y_val = X_train[int(X_train.shape[0]*0.8):], y_train[int(y_train.shape[0]*0.8):]
lstm_eval = model_LSTM.evaluate(X_val, y_val, verbose=0)
gru_eval = model_GRU.evaluate(X_val, y_val, verbose=0)
lstm_eval, gru_eval  # cell output: (LSTM loss, GRU loss), both mean squared error

In [None]:
# Group 3 (Load, Temperature, HDL): one-step-ahead forecast from a window of n_steps rows.
n_steps = 1
n_features = 3  # number of columns in filtered_data_3
# Re-created here so the cell can run without the feature-group cell above.
filtered_data_3 = filtered_data[['Load', 'Temperature', 'HDL']]
X, y = create_dataset(filtered_data_3, n_steps)

# Splitting the dataset into training and testing sets
# NOTE(review): train_test_split shuffles by default, so neighbouring time
# windows can land on both sides of the split - confirm this is acceptable
# for time-series evaluation. X_test / y_test are not used in this cell.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# LSTM
model_LSTM = create_model_h('LSTM', input_shape = (n_steps, n_features))
optimizer = Adam(learning_rate=0.01)
model_LSTM.compile(loss='mean_squared_error', optimizer=optimizer)
history_LSTM = model_LSTM.fit(X_train, y_train, epochs=100, validation_split=0.2, verbose=0, batch_size=32)

# GRU
# Same configuration as the LSTM above, with a fresh optimizer instance.
model_GRU = create_model_h('GRU', input_shape = (n_steps, n_features))
optimizer = Adam(learning_rate=0.01)
model_GRU.compile(loss='mean_squared_error', optimizer=optimizer)
history_GRU = model_GRU.fit(X_train, y_train, epochs=100, validation_split=0.2, verbose=0, batch_size=32)

# Evaluate
# Score both models on the last 20% of the training arrays (by position).
# NOTE(review): presumably the same samples Keras holds out for
# validation_split=0.2 - confirm.
X_val, y_val = X_train[int(X_train.shape[0]*0.8):], y_train[int(y_train.shape[0]*0.8):]
lstm_eval = model_LSTM.evaluate(X_val, y_val, verbose=0)
gru_eval = model_GRU.evaluate(X_val, y_val, verbose=0)
lstm_eval, gru_eval  # cell output: (LSTM loss, GRU loss), both mean squared error

In [None]:
# Side-by-side training vs. validation loss curves, one panel per model.
# Reads whatever history_LSTM / history_GRU currently hold.
fig, axes = plt.subplots(1, 2, figsize=(20, 5))

panels = [
    (axes[0], history_LSTM, 'Train Loss - LSTM', 'Validation Loss - LSTM', 'LSTM Model Loss'),
    (axes[1], history_GRU, 'Train Loss - GRU', 'ValidationLoss - GRU', 'GRU Model Loss'),
]
for ax, hist, train_label, val_label, title in panels:
    ax.plot(hist.history['loss'], label=train_label)
    ax.plot(hist.history['val_loss'], label=val_label)
    ax.set_title(title)
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Loss')
    ax.legend(loc='upper right')

plt.show()

In [None]:
# Group 4 (Load, Temperature, HDL, holiday): one-step-ahead forecast from a window of n_steps rows.
n_steps = 1
n_features = 4  # number of columns in filtered_data_4
X, y = create_dataset(filtered_data_4, n_steps)

# Splitting the dataset into training and testing sets
# NOTE(review): train_test_split shuffles by default, so neighbouring time
# windows can land on both sides of the split - confirm this is acceptable
# for time-series evaluation. X_test / y_test are not used in this cell.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# LSTM
model_LSTM = create_model_h('LSTM', input_shape = (n_steps, n_features))
optimizer = Adam(learning_rate=0.01)
model_LSTM.compile(loss='mean_squared_error', optimizer=optimizer)
history_LSTM = model_LSTM.fit(X_train, y_train, epochs=100, validation_split=0.2, verbose=0, batch_size=32)

# GRU
# Same configuration as the LSTM above, with a fresh optimizer instance.
model_GRU = create_model_h('GRU', input_shape = (n_steps, n_features))
optimizer = Adam(learning_rate=0.01)
model_GRU.compile(loss='mean_squared_error', optimizer=optimizer)
history_GRU = model_GRU.fit(X_train, y_train, epochs=100, validation_split=0.2, verbose=0, batch_size=32)

# Evaluate
# Score both models on the last 20% of the training arrays (by position).
# NOTE(review): presumably the same samples Keras holds out for
# validation_split=0.2 - confirm.
X_val, y_val = X_train[int(X_train.shape[0]*0.8):], y_train[int(y_train.shape[0]*0.8):]
lstm_eval = model_LSTM.evaluate(X_val, y_val, verbose=0)
gru_eval = model_GRU.evaluate(X_val, y_val, verbose=0)
lstm_eval, gru_eval  # cell output: (LSTM loss, GRU loss), both mean squared error

In [None]:
# Group 5 (every column of filtered_data): one-step-ahead forecast from a window of n_steps rows.
n_steps = 1
n_features = 23  # hard-coded; must equal the number of columns in filtered_data
X, y = create_dataset(filtered_data, n_steps)

# Splitting the dataset into training and testing sets
# NOTE(review): train_test_split shuffles by default, so neighbouring time
# windows can land on both sides of the split - confirm this is acceptable
# for time-series evaluation. X_test / y_test are not used in this cell.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# LSTM
model_LSTM = create_model_h('LSTM', input_shape = (n_steps, n_features))
optimizer = Adam(learning_rate=0.01)
model_LSTM.compile(loss='mean_squared_error', optimizer=optimizer)
history_LSTM = model_LSTM.fit(X_train, y_train, epochs=100, validation_split=0.2, verbose=0, batch_size=32)

# GRU
# Same configuration as the LSTM above, with a fresh optimizer instance.
model_GRU = create_model_h('GRU', input_shape = (n_steps, n_features))
optimizer = Adam(learning_rate=0.01)
model_GRU.compile(loss='mean_squared_error', optimizer=optimizer)
history_GRU = model_GRU.fit(X_train, y_train, epochs=100, validation_split=0.2, verbose=0, batch_size=32)

# Evaluate
# Score both models on the last 20% of the training arrays (by position).
# NOTE(review): presumably the same samples Keras holds out for
# validation_split=0.2 - confirm.
X_val, y_val = X_train[int(X_train.shape[0]*0.8):], y_train[int(y_train.shape[0]*0.8):]
lstm_eval = model_LSTM.evaluate(X_val, y_val, verbose=0)
gru_eval = model_GRU.evaluate(X_val, y_val, verbose=0)
lstm_eval, gru_eval  # cell output: (LSTM loss, GRU loss), both mean squared error

* Impact of Feature Complexity:

Increasing the number of features introduces more complexity and potentially more noise into the models. This might be why the MSE increases for both models in the Group 4 and Group 5 scenario.
More features do not always lead to better performance, especially if some of the added features do not have a strong predictive relationship with the target variable or are highly correlated with each other.
* Model Sensitivity to Feature Set:

The LSTM model seems to perform better with a simpler feature set, while the GRU model is more robust to the increased complexity of the feature set.
This could be due to the inherent architectural differences between LSTM and GRU. GRUs, with fewer parameters, might be better at handling the added complexity without overfitting.
* Feature Selection and Engineering:

The results highlight the importance of careful feature selection and engineering. It suggests that merely adding more features does not guarantee better performance and can sometimes degrade the model's accuracy.
Feature engineering should be guided by domain knowledge and feature importance analysis.
* For one-hour forecasting, the month, week and holiday dummy variables do not have a strong predictive relationship with electricity consumption.

### One Day Forecasting

In [None]:
def create_sequences(data, n_input_steps, n_output_steps):
    """Build sliding-window samples for multi-step forecasting.

    Parameters:
    data (DataFrame): Rows in time order; column 0 is used as the target.
    n_input_steps (int): Number of consecutive rows in each input window.
    n_output_steps (int): Number of column-0 values to predict after each window.

    Returns:
    (X, y): X stacks the input windows (all columns); y stacks, for each
    window, the next n_output_steps values of column 0.
    """
    sample_count = len(data) - n_input_steps - n_output_steps + 1
    inputs, targets = [], []
    for start in range(sample_count):
        split = start + n_input_steps
        # Input: rows [start, split); target: column 0 of rows [split, split + n_output_steps).
        inputs.append(data.iloc[start:split].values)
        targets.append(data.iloc[split:split + n_output_steps, 0])
    return np.array(inputs), np.array(targets)

def create_model_d(model_type, input_shape, hidden_units=50, dense_units=24, activation='linear'):
    """
    Creates and returns a Sequential model with either an LSTM or GRU layer
    followed by a Dense output layer (24 outputs by default).

    Parameters:
    model_type (str): Type of the model - 'LSTM' or 'GRU'.
    input_shape (tuple): Shape of the input data (time steps, features).
    hidden_units (int): Number of units in the LSTM/GRU layer.
    dense_units (int): Number of units in the Dense output layer.
    activation (str): Activation of the Dense output layer.

    Returns:
    Sequential model

    Raises:
    ValueError: If model_type is neither 'LSTM' nor 'GRU'.
    """
    # Fail fast: an unrecognised type used to fall through silently and
    # return a model consisting of the Dense layer only.
    if model_type not in ('LSTM', 'GRU'):
        raise ValueError(f"model_type must be 'LSTM' or 'GRU', got {model_type!r}")

    model = Sequential()

    if model_type == 'LSTM':
        model.add(LSTM(hidden_units, input_shape=input_shape))
    else:
        model.add(GRU(hidden_units, input_shape=input_shape))

    model.add(Dense(dense_units, activation=activation))

    return model

In [None]:
# Group 1 (Load only): predict the next n_output_steps Load values from the previous n_input_steps rows.
n_input_steps = 24
n_output_steps = 24
n_features = 1  # number of columns in filtered_data_1
X, y = create_sequences(filtered_data_1, n_input_steps, n_output_steps)
# NOTE(review): train_test_split shuffles by default, so overlapping windows
# can land on both sides of the split - confirm. X_test / y_test are unused here.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# LSTM
model_LSTM = create_model_d('LSTM', input_shape = (n_input_steps, n_features))
optimizer = Adam(learning_rate=0.001)
model_LSTM.compile(loss='mean_squared_error', optimizer=optimizer)
# `history` is reassigned by the GRU fit below, so the LSTM history is not
# retained; history_LSTM / history_GRU are not updated by this cell.
history = model_LSTM.fit(X_train, y_train, epochs=100, validation_split=0.2, verbose=0, batch_size=32)

# GRU
model_GRU = create_model_d('GRU', input_shape = (n_input_steps, n_features))
optimizer = Adam(learning_rate=0.001)
model_GRU.compile(loss='mean_squared_error', optimizer=optimizer)
history = model_GRU.fit(X_train, y_train, epochs=100, validation_split=0.2, verbose=0, batch_size=32)

# Evaluate
# Score both models on the last 20% of the training arrays (by position).
# NOTE(review): presumably the same samples Keras holds out for
# validation_split=0.2 - confirm.
X_val, y_val = X_train[int(X_train.shape[0]*0.8):], y_train[int(y_train.shape[0]*0.8):]
lstm_eval = model_LSTM.evaluate(X_val, y_val, verbose=0)
gru_eval = model_GRU.evaluate(X_val, y_val, verbose=0)
lstm_eval, gru_eval  # cell output: (LSTM loss, GRU loss), both mean squared error

In [None]:
# Group 2 (Load, Temperature): predict the next n_output_steps Load values from the previous n_input_steps rows.
n_input_steps = 24
n_output_steps = 24
n_features = 2  # number of columns in filtered_data_2
X, y = create_sequences(filtered_data_2, n_input_steps, n_output_steps)
# NOTE(review): train_test_split shuffles by default, so overlapping windows
# can land on both sides of the split - confirm. X_test / y_test are unused here.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# LSTM
model_LSTM = create_model_d('LSTM', input_shape = (n_input_steps, n_features))
optimizer = Adam(learning_rate=0.001)
model_LSTM.compile(loss='mean_squared_error', optimizer=optimizer)
# `history` is reassigned by the GRU fit below, so the LSTM history is not
# retained; history_LSTM / history_GRU are not updated by this cell.
history = model_LSTM.fit(X_train, y_train, epochs=100, validation_split=0.2, verbose=0, batch_size=32)

# GRU
model_GRU = create_model_d('GRU', input_shape = (n_input_steps, n_features))
optimizer = Adam(learning_rate=0.001)
model_GRU.compile(loss='mean_squared_error', optimizer=optimizer)
history = model_GRU.fit(X_train, y_train, epochs=100, validation_split=0.2, verbose=0, batch_size=32)

# Evaluate
# Score both models on the last 20% of the training arrays (by position).
# NOTE(review): presumably the same samples Keras holds out for
# validation_split=0.2 - confirm.
X_val, y_val = X_train[int(X_train.shape[0]*0.8):], y_train[int(y_train.shape[0]*0.8):]
lstm_eval = model_LSTM.evaluate(X_val, y_val, verbose=0)
gru_eval = model_GRU.evaluate(X_val, y_val, verbose=0)
lstm_eval, gru_eval  # cell output: (LSTM loss, GRU loss), both mean squared error

In [None]:
# Group 3 (Load, Temperature, HDL): predict the next n_output_steps Load values from the previous n_input_steps rows.
n_input_steps = 24
n_output_steps = 24
n_features = 3  # number of columns in filtered_data_3
X, y = create_sequences(filtered_data_3, n_input_steps, n_output_steps)
# NOTE(review): train_test_split shuffles by default, so overlapping windows
# can land on both sides of the split - confirm. X_test / y_test are unused here.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# LSTM
model_LSTM = create_model_d('LSTM', input_shape = (n_input_steps, n_features))
optimizer = Adam(learning_rate=0.001)
model_LSTM.compile(loss='mean_squared_error', optimizer=optimizer)
# `history` is reassigned by the GRU fit below, so the LSTM history is not
# retained; history_LSTM / history_GRU are not updated by this cell.
history = model_LSTM.fit(X_train, y_train, epochs=100, validation_split=0.2, verbose=0, batch_size=32)

# GRU
model_GRU = create_model_d('GRU', input_shape = (n_input_steps, n_features))
optimizer = Adam(learning_rate=0.001)
model_GRU.compile(loss='mean_squared_error', optimizer=optimizer)
history = model_GRU.fit(X_train, y_train, epochs=100, validation_split=0.2, verbose=0, batch_size=32)

# Evaluate
# Score both models on the last 20% of the training arrays (by position).
# NOTE(review): presumably the same samples Keras holds out for
# validation_split=0.2 - confirm.
X_val, y_val = X_train[int(X_train.shape[0]*0.8):], y_train[int(y_train.shape[0]*0.8):]
lstm_eval = model_LSTM.evaluate(X_val, y_val, verbose=0)
gru_eval = model_GRU.evaluate(X_val, y_val, verbose=0)
lstm_eval, gru_eval  # cell output: (LSTM loss, GRU loss), both mean squared error

In [None]:
# Group 4 (Load, Temperature, HDL, holiday): predict the next n_output_steps Load values from the previous n_input_steps rows.
n_input_steps = 24
n_output_steps = 24
n_features = 4  # number of columns in filtered_data_4
# Re-created here so the cell can run without the feature-group cell above.
filtered_data_4 = filtered_data[['Load', 'Temperature', 'HDL', 'holiday']]
X, y = create_sequences(filtered_data_4, n_input_steps, n_output_steps)
# NOTE(review): train_test_split shuffles by default, so overlapping windows
# can land on both sides of the split - confirm. X_test / y_test are unused here.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# LSTM
model_LSTM = create_model_d('LSTM', input_shape = (n_input_steps, n_features))
optimizer = Adam(learning_rate=0.001)
model_LSTM.compile(loss='mean_squared_error', optimizer=optimizer)
# `history` is reassigned by the GRU fit below, so the LSTM history is not
# retained; history_LSTM / history_GRU are not updated by this cell.
history = model_LSTM.fit(X_train, y_train, epochs=100, validation_split=0.2, verbose=0, batch_size=32)

# GRU
model_GRU = create_model_d('GRU', input_shape = (n_input_steps, n_features))
optimizer = Adam(learning_rate=0.001)
model_GRU.compile(loss='mean_squared_error', optimizer=optimizer)
history = model_GRU.fit(X_train, y_train, epochs=100, validation_split=0.2, verbose=0, batch_size=32)

# Evaluate
# Score both models on the last 20% of the training arrays (by position).
# NOTE(review): presumably the same samples Keras holds out for
# validation_split=0.2 - confirm.
X_val, y_val = X_train[int(X_train.shape[0]*0.8):], y_train[int(y_train.shape[0]*0.8):]
lstm_eval = model_LSTM.evaluate(X_val, y_val, verbose=0)
gru_eval = model_GRU.evaluate(X_val, y_val, verbose=0)
lstm_eval, gru_eval  # cell output: (LSTM loss, GRU loss), both mean squared error

In [None]:
# Group 5 (every column of filtered_data): predict the next n_output_steps
# Load values from the previous n_input_steps rows.
n_input_steps = 24
n_output_steps = 24
n_features = 23  # hard-coded; must equal the number of columns in filtered_data
X, y = create_sequences(filtered_data, n_input_steps, n_output_steps)
# NOTE(review): train_test_split shuffles by default, so overlapping windows
# can land on both sides of the split - confirm. X_test / y_test are unused here.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# LSTM
model_LSTM = create_model_d('LSTM', input_shape = (n_input_steps, n_features))
optimizer = Adam(learning_rate=0.001)
model_LSTM.compile(loss='mean_squared_error', optimizer=optimizer)
# Keep one history per model. The loss plot that follows reads history_LSTM
# and history_GRU; previously both fits wrote to `history`, so the plot showed
# curves left over from an earlier experiment.
history_LSTM = model_LSTM.fit(X_train, y_train, epochs=100, validation_split=0.2, verbose=0, batch_size=32)

# GRU
model_GRU = create_model_d('GRU', input_shape = (n_input_steps, n_features))
optimizer = Adam(learning_rate=0.001)
model_GRU.compile(loss='mean_squared_error', optimizer=optimizer)
history_GRU = model_GRU.fit(X_train, y_train, epochs=100, validation_split=0.2, verbose=0, batch_size=32)
# `history` used to end up holding the GRU history; keep that name defined.
history = history_GRU

# Evaluate
# Score both models on the last 20% of the training arrays (by position).
# NOTE(review): presumably the same samples Keras holds out for
# validation_split=0.2 - confirm.
X_val, y_val = X_train[int(X_train.shape[0]*0.8):], y_train[int(y_train.shape[0]*0.8):]
lstm_eval = model_LSTM.evaluate(X_val, y_val, verbose=0)
gru_eval = model_GRU.evaluate(X_val, y_val, verbose=0)
lstm_eval, gru_eval  # cell output: (LSTM loss, GRU loss), both mean squared error

In [None]:
# Side-by-side training vs. validation loss curves, one panel per model.
# NOTE(review): this reads history_LSTM / history_GRU - make sure they hold
# the histories of the models that are meant to be plotted here.
fig, axes = plt.subplots(1, 2, figsize=(20, 5))

panels = [
    (axes[0], history_LSTM, 'Train Loss - LSTM', 'Validation Loss - LSTM', 'LSTM Model Loss'),
    (axes[1], history_GRU, 'Train Loss - GRU', 'Validation Loss - GRU', 'GRU Model Loss'),
]
for ax, hist, train_label, val_label, title in panels:
    ax.plot(hist.history['loss'], label=train_label)
    ax.plot(hist.history['val_loss'], label=val_label)
    ax.set_title(title)
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Loss')
    ax.legend(loc='upper right')

plt.show()

* For one-day forecasting, including all features leads to better performance.
* Same situation for one week forecasting

## Hyperparameter Tuning

Here we tune the models along two hyperparameters: n_steps and the learning rate.
* n_steps

n_steps refers to the number of previous time steps of data that the model uses to make a prediction. It defines the size of the input sequence window for the LSTM or GRU. A larger n_steps value allows the model to learn from longer sequences, which can be beneficial for capturing long-term dependencies in the data.
* Learning Rate

The learning rate is a hyperparameter that determines the size of the steps the model takes during the optimization process. It influences how quickly the model learns. Modern optimization algorithms like Adam adjust the learning rate during training, which can help mitigate the risks of choosing an inappropriate static learning rate.

Here, we consider using learning rate decay or cyclical learning rates to dynamically adjust the learning rate during training, which can lead to better convergence properties.



In [None]:
# Initial learning rate
initial_learning_rate = 0.01
# Decay rate and step
# decay_steps = 50 * (len(X_train) / 32): with batch_size=32 this is roughly
# 50 passes over X_train, expressed in optimizer steps.
# NOTE(review): X_train is whatever the most recently executed cell left
# behind, the value is a float, and it does not account for the 20% that
# validation_split holds out of each fit - confirm this is intended.
decay_steps = 50 * (len(X_train) / 32)
decay_rate = 0.96
# staircase=True: presumably the rate drops in discrete steps every
# decay_steps rather than continuously - confirm against the Keras docs.
lr_schedule = ExponentialDecay(initial_learning_rate, decay_steps, decay_rate, staircase=True)

### One Hour Forecasting


In [None]:
# n_steps tuning: Group 3 features with a 24-row input window, constant learning rate.
n_steps = 24
n_features = 3  # number of columns in filtered_data_3
X, y = create_dataset(filtered_data_3, n_steps)

# Splitting the dataset into training and testing sets
# NOTE(review): train_test_split shuffles by default, so overlapping windows
# can land on both sides of the split - confirm. X_test / y_test are unused here.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# LSTM
model_LSTM = create_model_h('LSTM', input_shape = (n_steps, n_features))
optimizer = Adam(learning_rate=0.01)
model_LSTM.compile(loss='mean_squared_error', optimizer=optimizer)
history_LSTM = model_LSTM.fit(X_train, y_train, epochs=100, validation_split=0.2, verbose=0, batch_size=32)

# GRU
# Same configuration as the LSTM above, with a fresh optimizer instance.
model_GRU = create_model_h('GRU', input_shape = (n_steps, n_features))
optimizer = Adam(learning_rate=0.01)
model_GRU.compile(loss='mean_squared_error', optimizer=optimizer)
history_GRU = model_GRU.fit(X_train, y_train, epochs=100, validation_split=0.2, verbose=0, batch_size=32)

# Evaluate
# Score both models on the last 20% of the training arrays (by position).
# NOTE(review): presumably the same samples Keras holds out for
# validation_split=0.2 - confirm.
X_val, y_val = X_train[int(X_train.shape[0]*0.8):], y_train[int(y_train.shape[0]*0.8):]
lstm_eval = model_LSTM.evaluate(X_val, y_val, verbose=0)
gru_eval = model_GRU.evaluate(X_val, y_val, verbose=0)
lstm_eval, gru_eval  # cell output: (LSTM loss, GRU loss), both mean squared error

In [None]:
# n_steps tuning: Group 3 features with a 72-row input window, constant learning rate.
n_steps = 72
n_features = 3  # number of columns in filtered_data_3
X, y = create_dataset(filtered_data_3, n_steps)

# Splitting the dataset into training and testing sets
# NOTE(review): train_test_split shuffles by default, so overlapping windows
# can land on both sides of the split - confirm. X_test / y_test are unused here.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# LSTM
model_LSTM = create_model_h('LSTM', input_shape = (n_steps, n_features))
optimizer = Adam(learning_rate=0.01)
model_LSTM.compile(loss='mean_squared_error', optimizer=optimizer)
history_LSTM = model_LSTM.fit(X_train, y_train, epochs=100, validation_split=0.2, verbose=0, batch_size=32)

# GRU
# Same configuration as the LSTM above, with a fresh optimizer instance.
model_GRU = create_model_h('GRU', input_shape = (n_steps, n_features))
optimizer = Adam(learning_rate=0.01)
model_GRU.compile(loss='mean_squared_error', optimizer=optimizer)
history_GRU = model_GRU.fit(X_train, y_train, epochs=100, validation_split=0.2, verbose=0, batch_size=32)

# Evaluate
# Score both models on the last 20% of the training arrays (by position).
# NOTE(review): presumably the same samples Keras holds out for
# validation_split=0.2 - confirm.
X_val, y_val = X_train[int(X_train.shape[0]*0.8):], y_train[int(y_train.shape[0]*0.8):]
lstm_eval = model_LSTM.evaluate(X_val, y_val, verbose=0)
gru_eval = model_GRU.evaluate(X_val, y_val, verbose=0)
lstm_eval, gru_eval  # cell output: (LSTM loss, GRU loss), both mean squared error

* The performance of both models improves significantly compared to n_steps = 1, which is expected as they now have more context for making predictions.
* Interestingly, the GRU model outperforms the LSTM model in this case, as indicated by its lower MSE.
* This might suggest that for capturing longer-term dependencies (24 steps back in this case), the GRU model is more effective or efficient.

In [None]:
# Learning-rate tuning: same 24-row window and Group 3 features as the
# constant-rate run, but the optimizers use lr_schedule instead of 0.01.
n_steps = 24
n_features = 3  # number of columns in filtered_data_3
X, y = create_dataset(filtered_data_3, n_steps)

# Splitting the dataset into training and testing sets
# NOTE(review): train_test_split shuffles by default, so overlapping windows
# can land on both sides of the split - confirm. X_test / y_test are unused here.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# LSTM
model_LSTM = create_model_h('LSTM', input_shape = (n_steps, n_features))
optimizer = Adam(learning_rate=lr_schedule)
model_LSTM.compile(loss='mean_squared_error', optimizer=optimizer)
history_LSTM = model_LSTM.fit(X_train, y_train, epochs=100, validation_split=0.2, verbose=0, batch_size=32)

# GRU
# Same configuration as the LSTM above; a fresh optimizer is built from the same schedule object.
model_GRU = create_model_h('GRU', input_shape = (n_steps, n_features))
optimizer = Adam(learning_rate=lr_schedule)
model_GRU.compile(loss='mean_squared_error', optimizer=optimizer)
history_GRU = model_GRU.fit(X_train, y_train, epochs=100, validation_split=0.2, verbose=0, batch_size=32)

# Evaluate
# Score both models on the last 20% of the training arrays (by position).
# NOTE(review): presumably the same samples Keras holds out for
# validation_split=0.2 - confirm.
X_val, y_val = X_train[int(X_train.shape[0]*0.8):], y_train[int(y_train.shape[0]*0.8):]
lstm_eval = model_LSTM.evaluate(X_val, y_val, verbose=0)
gru_eval = model_GRU.evaluate(X_val, y_val, verbose=0)
lstm_eval, gru_eval  # cell output: (LSTM loss, GRU loss), both mean squared error

In [None]:
# Predictions of both models on the held-back slice X_val.
lstm_validation = model_LSTM.predict(X_val)
gru_validation = model_GRU.predict(X_val)

# Overlay the actual values and both prediction series on a single axis.
plt.figure(figsize=(20, 5))
curves = (
    (y_val, 'Actual Data', 'red', '-'),
    (lstm_validation, 'LSTM Validation', 'blue', '--'),
    (gru_validation, 'GRU Validation', 'green', '-.'),
)
for series, label, color, linestyle in curves:
    plt.plot(series, label=label, color=color, linestyle=linestyle)
plt.title('Comparison of LSTM and GRU Validation with Actual Data for one-hour forecasting')
plt.xlabel('Sample Index')
plt.ylabel('Normalized Load Value')
plt.legend()
plt.show()

* Improved Performance with Learning Rate Decay

In both models (LSTM and GRU), implementing an exponential decay learning rate schedule led to a significant improvement in MSE. The MSE is lower for both models when using the decaying learning rate compared to a constant learning rate.
* Impact on LSTM and GRU Models

While both models benefit from the learning rate schedule, the improvement is more pronounced in the LSTM model. This indicates that the LSTM might be more sensitive to learning rate adjustments for this specific task and dataset.


### One Day Forecasting

In [None]:
# n_steps tuning: all features, 72-row input window, 24-value forecast, constant learning rate.
n_input_steps = 72
n_output_steps = 24
n_features = 23  # hard-coded; must equal the number of columns in filtered_data
X, y = create_sequences(filtered_data, n_input_steps, n_output_steps)
# NOTE(review): train_test_split shuffles by default, so overlapping windows
# can land on both sides of the split - confirm. X_test / y_test are unused here.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# LSTM
model_LSTM = create_model_d('LSTM', input_shape = (n_input_steps, n_features))
optimizer = Adam(learning_rate=0.001)
model_LSTM.compile(loss='mean_squared_error', optimizer=optimizer)
# `history` is reassigned by the GRU fit below, so the LSTM history is not
# retained; history_LSTM / history_GRU are not updated by this cell.
history = model_LSTM.fit(X_train, y_train, epochs=100, validation_split=0.2, verbose=0, batch_size=32)

# GRU
model_GRU = create_model_d('GRU', input_shape = (n_input_steps, n_features))
optimizer = Adam(learning_rate=0.001)
model_GRU.compile(loss='mean_squared_error', optimizer=optimizer)
history = model_GRU.fit(X_train, y_train, epochs=100, validation_split=0.2, verbose=0, batch_size=32)

# Evaluate
# Score both models on the last 20% of the training arrays (by position).
# NOTE(review): presumably the same samples Keras holds out for
# validation_split=0.2 - confirm.
X_val, y_val = X_train[int(X_train.shape[0]*0.8):], y_train[int(y_train.shape[0]*0.8):]
lstm_eval = model_LSTM.evaluate(X_val, y_val, verbose=0)
gru_eval = model_GRU.evaluate(X_val, y_val, verbose=0)
lstm_eval, gru_eval  # cell output: (LSTM loss, GRU loss), both mean squared error

In [None]:
# n_steps tuning: all features, 168-row input window, 24-value forecast, constant learning rate.
n_input_steps = 168
n_output_steps = 24
n_features = 23  # hard-coded; must equal the number of columns in filtered_data
X, y = create_sequences(filtered_data, n_input_steps, n_output_steps)
# NOTE(review): train_test_split shuffles by default, so overlapping windows
# can land on both sides of the split - confirm. X_test / y_test are unused here.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# LSTM
model_LSTM = create_model_d('LSTM', input_shape = (n_input_steps, n_features))
optimizer = Adam(learning_rate=0.001)
model_LSTM.compile(loss='mean_squared_error', optimizer=optimizer)
# `history` is reassigned by the GRU fit below, so the LSTM history is not
# retained; history_LSTM / history_GRU are not updated by this cell.
history = model_LSTM.fit(X_train, y_train, epochs=100, validation_split=0.2, verbose=0, batch_size=32)

# GRU
model_GRU = create_model_d('GRU', input_shape = (n_input_steps, n_features))
optimizer = Adam(learning_rate=0.001)
model_GRU.compile(loss='mean_squared_error', optimizer=optimizer)
history = model_GRU.fit(X_train, y_train, epochs=100, validation_split=0.2, verbose=0, batch_size=32)

# Evaluate
# Score both models on the last 20% of the training arrays (by position).
# NOTE(review): presumably the same samples Keras holds out for
# validation_split=0.2 - confirm.
X_val, y_val = X_train[int(X_train.shape[0]*0.8):], y_train[int(y_train.shape[0]*0.8):]
lstm_eval = model_LSTM.evaluate(X_val, y_val, verbose=0)
gru_eval = model_GRU.evaluate(X_val, y_val, verbose=0)
lstm_eval, gru_eval  # cell output: (LSTM loss, GRU loss), both mean squared error

* Effectiveness of Longer Sequences: Both LSTM and GRU models generally benefit from longer input sequences up to a certain point, as they can capture more contextual information from the data.
* Diminishing Returns: There seems to be a point where adding more past data (beyond 72 hours in this case) does not significantly improve or might even slightly worsen the performance for LSTM. This could be due to the model's increasing complexity and potential overfitting.
* Model Selection: The choice between LSTM and GRU should consider not only performance but also computational efficiency. GRUs, with fewer parameters, might be more efficient and still offer comparable performance to LSTMs, especially when dealing with longer sequences.
* *Optimal* n_steps: The optimal number of past time steps (n_steps) depends on the specific characteristics of the time series data. It's important to find a balance between providing enough contextual information and avoiding overfitting or unnecessary computational complexity.

In [None]:
# Learning-rate tuning: all features, 72-row input window, 24-value forecast;
# the optimizers use lr_schedule instead of a constant rate.
n_input_steps = 72
n_output_steps = 24
n_features = 23  # hard-coded; must equal the number of columns in filtered_data
X, y = create_sequences(filtered_data, n_input_steps, n_output_steps)
# NOTE(review): train_test_split shuffles by default, so overlapping windows
# can land on both sides of the split - confirm. X_test / y_test are unused here.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# LSTM
model_LSTM = create_model_d('LSTM', input_shape = (n_input_steps, n_features))
optimizer = Adam(learning_rate=lr_schedule)
model_LSTM.compile(loss='mean_squared_error', optimizer=optimizer)
# `history` is reassigned by the GRU fit below, so the LSTM history is not
# retained; history_LSTM / history_GRU are not updated by this cell.
history = model_LSTM.fit(X_train, y_train, epochs=100, validation_split=0.2, verbose=0, batch_size=32)

# GRU
model_GRU = create_model_d('GRU', input_shape = (n_input_steps, n_features))
optimizer = Adam(learning_rate=lr_schedule)
model_GRU.compile(loss='mean_squared_error', optimizer=optimizer)
history = model_GRU.fit(X_train, y_train, epochs=100, validation_split=0.2, verbose=0, batch_size=32)

# Evaluate
# Score both models on the last 20% of the training arrays (by position).
# NOTE(review): presumably the same samples Keras holds out for
# validation_split=0.2 - confirm.
X_val, y_val = X_train[int(X_train.shape[0]*0.8):], y_train[int(y_train.shape[0]*0.8):]
lstm_eval = model_LSTM.evaluate(X_val, y_val, verbose=0)
gru_eval = model_GRU.evaluate(X_val, y_val, verbose=0)
lstm_eval, gru_eval  # cell output: (LSTM loss, GRU loss), both mean squared error

In [None]:
# Predictions of both models on the held-back slice X_val.
lstm_validation = model_LSTM.predict(X_val)
gru_validation = model_GRU.predict(X_val)

# Collapse every group of 24 values into a single total per row.
y_daily_val = y_val.reshape(-1, 24).sum(axis=1)
lstm_daily_val = lstm_validation.reshape(-1, 24).sum(axis=1)
gru_daily_val = gru_validation.reshape(-1, 24).sum(axis=1)

# Overlay the actual totals and both predicted totals on a single axis.
plt.figure(figsize=(20, 5))
curves = (
    (y_daily_val, 'Actual Data', 'red', '-'),
    (lstm_daily_val, 'LSTM Validation', 'blue', '--'),
    (gru_daily_val, 'GRU Validation', 'green', '-.'),
)
for series, label, color, linestyle in curves:
    plt.plot(series, label=label, color=color, linestyle=linestyle)
plt.title('Comparison of LSTM and GRU Validation with Actual Data for one-day forecasting')
plt.xlabel('Sample Index')
plt.ylabel('Normalized Load Value')
plt.legend()
plt.show()

### One Week Forecasting

In [None]:
def create_model_w(model_type, input_shape, hidden_units=50, dense_units=168, activation='linear'):
    """Build an uncompiled single-recurrent-layer model for one-week forecasting.

    The network is one recurrent layer (LSTM or GRU) followed by a Dense
    output layer that emits ``dense_units`` values per sample.

    Args:
        model_type: ``'LSTM'`` or ``'GRU'``; selects the recurrent layer.
        input_shape: Shape of one input sample, passed to the recurrent
            layer's ``input_shape`` (the notebook uses ``(n_input_steps, n_features)``).
        hidden_units: Number of units in the recurrent layer.
        dense_units: Number of outputs of the final Dense layer
            (168 by default, matching ``n_output_steps`` in the one-week cells).
        activation: Activation of the final Dense layer.

    Returns:
        The assembled ``Sequential`` model; the caller is responsible for
        compiling it.

    Raises:
        ValueError: If ``model_type`` is neither ``'LSTM'`` nor ``'GRU'``.
    """
    # Fail fast: previously an unknown type silently produced a model with no
    # recurrent layer at all (just the Dense head).
    if model_type not in ('LSTM', 'GRU'):
        raise ValueError(
            f"model_type must be 'LSTM' or 'GRU', got {model_type!r}"
        )

    recurrent_layer = LSTM if model_type == 'LSTM' else GRU

    model = Sequential()
    model.add(recurrent_layer(hidden_units, input_shape=input_shape))
    model.add(Dense(dense_units, activation=activation))

    return model

In [None]:
# One-week horizon, shortest look-back: 72 input steps -> 168 output steps.
n_input_steps, n_output_steps, n_features = 72, 168, 23
X, y = create_sequences(filtered_data, n_input_steps, n_output_steps)
# NOTE(review): the split below shuffles by default; with overlapping windows this
# may mix near-duplicate samples across train/test -- confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Shared settings so both architectures are trained under the same conditions.
sample_shape = (n_input_steps, n_features)
fit_options = dict(epochs=100, validation_split=0.2, verbose=0, batch_size=32)

# LSTM
model_LSTM = create_model_w('LSTM', input_shape=sample_shape)
optimizer = Adam(learning_rate=0.001)
model_LSTM.compile(loss='mean_squared_error', optimizer=optimizer)
history = model_LSTM.fit(X_train, y_train, **fit_options)

# GRU (rebinds `optimizer` and `history`, so only the GRU run's objects remain)
model_GRU = create_model_w('GRU', input_shape=sample_shape)
optimizer = Adam(learning_rate=0.001)
model_GRU.compile(loss='mean_squared_error', optimizer=optimizer)
history = model_GRU.fit(X_train, y_train, **fit_options)

# Evaluate both models on the last 20% of the training arrays.
holdout_from = int(X_train.shape[0] * 0.8)
X_val, y_val = X_train[holdout_from:], y_train[holdout_from:]
lstm_eval = model_LSTM.evaluate(X_val, y_val, verbose=0)
gru_eval = model_GRU.evaluate(X_val, y_val, verbose=0)
lstm_eval, gru_eval

In [None]:
# One-week horizon, look-back equal to the horizon: 168 input steps -> 168 output steps.
n_input_steps, n_output_steps, n_features = 168, 168, 23
X, y = create_sequences(filtered_data, n_input_steps, n_output_steps)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train the LSTM and then the GRU with the same optimizer settings and fit schedule.
trained = {}
for arch in ('LSTM', 'GRU'):
    candidate = create_model_w(arch, input_shape=(n_input_steps, n_features))
    optimizer = Adam(learning_rate=0.001)
    candidate.compile(loss='mean_squared_error', optimizer=optimizer)
    history = candidate.fit(
        X_train,
        y_train,
        epochs=100,
        validation_split=0.2,
        verbose=0,
        batch_size=32,
    )
    trained[arch] = candidate
model_LSTM, model_GRU = trained['LSTM'], trained['GRU']

# Evaluate on the trailing 20% of the training arrays.
val_offset = int(X_train.shape[0] * 0.8)
X_val = X_train[val_offset:]
y_val = y_train[val_offset:]
lstm_eval = model_LSTM.evaluate(X_val, y_val, verbose=0)
gru_eval = model_GRU.evaluate(X_val, y_val, verbose=0)
lstm_eval, gru_eval

In [None]:
# One-week horizon, longest look-back: 336 input steps -> 168 output steps.
# The arrays created here are reused by the learning-rate cells that follow.
n_input_steps, n_output_steps, n_features = 336, 168, 23
X, y = create_sequences(filtered_data, n_input_steps, n_output_steps)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

sample_shape = (n_input_steps, n_features)

# LSTM
model_LSTM = create_model_w('LSTM', input_shape=sample_shape)
optimizer = Adam(learning_rate=0.001)
model_LSTM.compile(optimizer=optimizer, loss='mean_squared_error')
history = model_LSTM.fit(
    X_train, y_train,
    batch_size=32, epochs=100, validation_split=0.2, verbose=0,
)

# GRU
model_GRU = create_model_w('GRU', input_shape=sample_shape)
optimizer = Adam(learning_rate=0.001)
model_GRU.compile(optimizer=optimizer, loss='mean_squared_error')
history = model_GRU.fit(
    X_train, y_train,
    batch_size=32, epochs=100, validation_split=0.2, verbose=0,
)

# Evaluate on the last 20% of the training arrays.
val_offset = int(X_train.shape[0] * 0.8)
X_val, y_val = X_train[val_offset:], y_train[val_offset:]
lstm_eval = model_LSTM.evaluate(X_val, y_val, verbose=0)
gru_eval = model_GRU.evaluate(X_val, y_val, verbose=0)
lstm_eval, gru_eval

* Model Sensitivity to Input Length: Both models show varying performance based on the length of the input sequence, indicating sensitivity to how much historical data is used.
* Performance Trade-offs: There's a trade-off between input sequence length and model performance. More historical data doesn't always translate to better performance, as seen with the 168 steps result.
* Model Efficiency: GRU, with a simpler structure, seems to handle longer sequences (336 steps) more efficiently than LSTM, which might be beneficial for computational efficiency.
* Optimal Input Length: The optimal number of past time steps (n_steps) is not always straightforward and may depend on the specific dynamics of the electricity load data.

In [None]:
# Learning-rate experiment: retrain both models with a fixed rate of 0.01.
# Reuses n_input_steps, n_features, X_train and y_train from the previous data cell.
sample_shape = (n_input_steps, n_features)
fit_options = dict(epochs=100, validation_split=0.2, verbose=0, batch_size=32)

# LSTM
model_LSTM = create_model_w('LSTM', input_shape=sample_shape)
optimizer = Adam(learning_rate=0.01)
model_LSTM.compile(loss='mean_squared_error', optimizer=optimizer)
history = model_LSTM.fit(X_train, y_train, **fit_options)

# GRU
model_GRU = create_model_w('GRU', input_shape=sample_shape)
optimizer = Adam(learning_rate=0.01)
model_GRU.compile(loss='mean_squared_error', optimizer=optimizer)
history = model_GRU.fit(X_train, y_train, **fit_options)

# Evaluate on the trailing 20% of the training arrays.
holdout_from = int(X_train.shape[0] * 0.8)
X_val = X_train[holdout_from:]
y_val = y_train[holdout_from:]
lstm_eval = model_LSTM.evaluate(X_val, y_val, verbose=0)
gru_eval = model_GRU.evaluate(X_val, y_val, verbose=0)
lstm_eval, gru_eval

* Learning Rate Sensitivity: Both LSTM and GRU models are sensitive to the learning rate. The right learning rate can significantly improve model performance.
* Model Performance: At a higher learning rate of 0.01, the GRU model not only surpasses its performance at the lower learning rate but also outperforms the LSTM model. This indicates that GRU might be more responsive to a higher learning rate in this context.
* Balance in Learning Rate Selection: A higher learning rate can speed up training but may cause instability or overshooting. Conversely, a too-low learning rate might result in slow convergence or getting stuck in local minima.
* Experimentation is Key: These results highlight the importance of hyperparameter tuning, especially for learning rates. The optimal setting can vary depending on the model architecture and the specific task.

In [None]:
# Learning-rate experiment: retrain both models with the `lr_schedule` schedule
# instead of a constant rate. Reuses the data arrays from the previous data cell.
trained = {}
for arch in ('LSTM', 'GRU'):
    candidate = create_model_w(arch, input_shape=(n_input_steps, n_features))
    # A fresh optimizer per model; both are given the same schedule object.
    optimizer = Adam(learning_rate=lr_schedule)
    candidate.compile(loss='mean_squared_error', optimizer=optimizer)
    history = candidate.fit(
        X_train,
        y_train,
        epochs=100,
        validation_split=0.2,
        verbose=0,
        batch_size=32,
    )
    trained[arch] = candidate
model_LSTM, model_GRU = trained['LSTM'], trained['GRU']

# Evaluate on the trailing 20% of the training arrays.
val_offset = int(X_train.shape[0] * 0.8)
X_val, y_val = X_train[val_offset:], y_train[val_offset:]
lstm_eval = model_LSTM.evaluate(X_val, y_val, verbose=0)
gru_eval = model_GRU.evaluate(X_val, y_val, verbose=0)
lstm_eval, gru_eval

In [None]:
# Predict on the validation slice with both models.
lstm_validation = model_LSTM.predict(X_val)
gru_validation = model_GRU.predict(X_val)

# Sum each 168-step window into one total per sample.
# (The *_daily_* names are kept, although each value spans 168 steps.)
y_daily_val = y_val.reshape(-1, 168).sum(axis=1)
lstm_daily_val = lstm_validation.reshape(-1, 168).sum(axis=1)
gru_daily_val = gru_validation.reshape(-1, 168).sum(axis=1)

# Plotting: actual totals against both models' totals.
plt.figure(figsize=(20, 5))
for series, label, color, linestyle in (
    (y_daily_val, 'Actual Data', 'red', '-'),
    (lstm_daily_val, 'LSTM Validation', 'blue', '--'),
    (gru_daily_val, 'GRU Validation', 'green', '-.'),
):
    plt.plot(series, label=label, color=color, linestyle=linestyle)
plt.title('Comparison of LSTM and GRU Validation with Actual Data for one-week forecasting')
plt.xlabel('Sample Index')
plt.ylabel('Normalized Load Value')
plt.legend()
plt.show()

## Implementation

For one-hour forecasting, we set n_features = 3 ('Load', 'Temperature', 'HDL') and n_steps = 24, with a dynamic learning rate.


In [None]:
# One-hour-ahead forecast: 24 past steps of 3 features per sample.
n_steps = 24
n_features = 3
X, y = create_dataset(filtered_data_3, n_steps)

# Splitting the dataset into training and testing sets.
# Chronological hold-out (shuffle=False): the last 20% of the samples form the
# test set. train_test_split shuffles by default, which would (a) spread
# neighbouring windows across train and test, letting the models see data
# adjacent to the test targets, and (b) put the test samples in random order,
# making the line plots over "Sample Index" below unreadable as a time series.
# NOTE(review): assumes create_dataset returns samples in time order -- confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# LSTM
model_LSTM = create_model_h('LSTM', input_shape=(n_steps, n_features))
optimizer = Adam(learning_rate=lr_schedule)
model_LSTM.compile(loss='mean_squared_error', optimizer=optimizer)
history_LSTM = model_LSTM.fit(X_train, y_train, epochs=100, validation_split=0.2, verbose=0, batch_size=32)

# GRU
model_GRU = create_model_h('GRU', input_shape=(n_steps, n_features))
optimizer = Adam(learning_rate=lr_schedule)
model_GRU.compile(loss='mean_squared_error', optimizer=optimizer)
history_GRU = model_GRU.fit(X_train, y_train, epochs=100, validation_split=0.2, verbose=0, batch_size=32)

# Predictions on the held-out test set.
lstm_pre = model_LSTM.predict(X_test)
gru_pre = model_GRU.predict(X_test)

# Plotting: actual test targets against both models' predictions.
plt.figure(figsize=(20, 5))
plt.plot(y_test, label='Actual Data', color='red', linestyle='-')
plt.plot(lstm_pre, label='LSTM Prediction', color='blue', linestyle='--')
plt.plot(gru_pre, label='GRU Prediction', color='green', linestyle='-.')
plt.title('Comparison of LSTM and GRU Prediction with Actual Data for one-hour forecasting')
plt.xlabel('Sample Index')
plt.ylabel('Normalized Load Value')
plt.legend()
plt.show()

In [None]:
# Zoomed view: the last 240 actual values, with the last 72 predictions of each
# model drawn over the right-hand end of that window.
y_test_subset = y_test[-240:]
lstm_pre_subset = lstm_pre[-72:]
gru_pre_subset = gru_pre[-72:]

# x-positions: actuals start at 0; predictions occupy the final 72 positions
# so that their right edges line up with the actuals.
n_shown = len(y_test_subset)
index_y_test = range(n_shown)
index_lstm_pre = range(n_shown - 72, n_shown)
index_gru_pre = range(n_shown - 72, n_shown)

# Plotting
plt.figure(figsize=(20, 5))
for xs, ys, label, color, linestyle in (
    (index_y_test, y_test_subset, 'Actual Data', 'red', '-'),
    (index_lstm_pre, lstm_pre_subset, 'LSTM Prediction', 'blue', '--'),
    (index_gru_pre, gru_pre_subset, 'GRU Prediction', 'green', '-.'),
):
    plt.plot(xs, ys, label=label, color=color, linestyle=linestyle)

plt.title('Comparison of LSTM and GRU Prediction with Actual Data for one-hour forecasting')
plt.xlabel('Sample Index')
plt.ylabel('Normalized Load Value')
plt.legend()
plt.show()

For one-day forecasting, we set n_features = 23 ('Load', 'Temperature', 'HDL', 'Month', 'Week', 'holiday') and n_steps = 72, with a dynamic learning rate.



In [None]:
# One-day-ahead forecast: 72 past steps in, 24 future steps out, 23 features per step.
n_input_steps = 72
n_output_steps = 24
n_features = 23
X, y = create_sequences(filtered_data, n_input_steps, n_output_steps)

# Chronological hold-out (shuffle=False): the last 20% of the windows form the
# test set. train_test_split shuffles by default, which would mix neighbouring
# windows across train and test and put the test samples in random order, so the
# line plot over "Sample Index" below would not be a time series.
# NOTE(review): assumes create_sequences returns windows in time order -- confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# LSTM
model_LSTM = create_model_d('LSTM', input_shape=(n_input_steps, n_features))
optimizer = Adam(learning_rate=lr_schedule)
model_LSTM.compile(loss='mean_squared_error', optimizer=optimizer)
history = model_LSTM.fit(X_train, y_train, epochs=100, validation_split=0.2, verbose=0, batch_size=32)

# GRU (rebinds `optimizer` and `history`)
model_GRU = create_model_d('GRU', input_shape=(n_input_steps, n_features))
optimizer = Adam(learning_rate=lr_schedule)
model_GRU.compile(loss='mean_squared_error', optimizer=optimizer)
history = model_GRU.fit(X_train, y_train, epochs=100, validation_split=0.2, verbose=0, batch_size=32)

# Predictions on the held-out test set.
lstm_pre = model_LSTM.predict(X_test)
gru_pre = model_GRU.predict(X_test)

# Sum each forecast window into one total per sample. The window length comes
# from n_output_steps rather than a repeated literal, so it cannot drift from
# the value used to build the sequences.
y_daily_pre = np.sum(y_test.reshape(-1, n_output_steps), axis=1)
lstm_daily_pre = np.sum(lstm_pre.reshape(-1, n_output_steps), axis=1)
gru_daily_pre = np.sum(gru_pre.reshape(-1, n_output_steps), axis=1)

# Plotting: actual totals against both models' totals.
plt.figure(figsize=(20, 5))
plt.plot(y_daily_pre, label='Actual Data', color='red', linestyle='-')
plt.plot(lstm_daily_pre, label='LSTM Prediction', color='blue', linestyle='--')
plt.plot(gru_daily_pre, label='GRU Prediction', color='green', linestyle='-.')
plt.title('Comparison of LSTM and GRU Prediction with Actual Data for one-day forecasting')
plt.xlabel('Sample Index')
plt.ylabel('Normalized Load Value')
plt.legend()
plt.show()

For one-week forecasting, we set n_features = 23 ('Load', 'Temperature', 'HDL', 'Month', 'Week', 'holiday') and n_steps = 336, with a dynamic learning rate.

In [None]:
# One-week-ahead forecast: 336 past steps in, 168 future steps out, 23 features per step.
n_input_steps = 336
n_output_steps = 168
n_features = 23
X, y = create_sequences(filtered_data, n_input_steps, n_output_steps)

# Chronological hold-out (shuffle=False): the last 20% of the windows form the
# test set. train_test_split shuffles by default, which would mix neighbouring
# windows across train and test and put the test samples in random order, so the
# line plot over "Sample Index" below would not be a time series.
# NOTE(review): assumes create_sequences returns windows in time order -- confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# LSTM
model_LSTM = create_model_w('LSTM', input_shape=(n_input_steps, n_features))
optimizer = Adam(learning_rate=lr_schedule)
model_LSTM.compile(loss='mean_squared_error', optimizer=optimizer)
history = model_LSTM.fit(X_train, y_train, epochs=100, validation_split=0.2, verbose=0, batch_size=32)

# GRU (rebinds `optimizer` and `history`)
model_GRU = create_model_w('GRU', input_shape=(n_input_steps, n_features))
optimizer = Adam(learning_rate=lr_schedule)
model_GRU.compile(loss='mean_squared_error', optimizer=optimizer)
history = model_GRU.fit(X_train, y_train, epochs=100, validation_split=0.2, verbose=0, batch_size=32)

# Predictions on the held-out test set.
lstm_pre = model_LSTM.predict(X_test)
gru_pre = model_GRU.predict(X_test)

# Sum each forecast window into one total per sample. The window length comes
# from n_output_steps rather than a repeated literal. (The *_daily_* names are
# kept, although each value spans a full week here.)
y_daily_pre = np.sum(y_test.reshape(-1, n_output_steps), axis=1)
lstm_daily_pre = np.sum(lstm_pre.reshape(-1, n_output_steps), axis=1)
gru_daily_pre = np.sum(gru_pre.reshape(-1, n_output_steps), axis=1)

# Plotting: actual totals against both models' totals.
plt.figure(figsize=(20, 5))
plt.plot(y_daily_pre, label='Actual Data', color='red', linestyle='-')
plt.plot(lstm_daily_pre, label='LSTM Prediction', color='blue', linestyle='--')
plt.plot(gru_daily_pre, label='GRU Prediction', color='green', linestyle='-.')
plt.title('Comparison of LSTM and GRU Prediction with Actual Data for one-week forecasting')
plt.xlabel('Sample Index')
plt.ylabel('Normalized Load Value')
plt.legend()
plt.show()