In [1]:
import pandas as pd
import numpy as np
import torch
import lightning as L
import optuna

# Pruning for Optuna
from lightning.pytorch.callbacks import Callback
from optuna.integration.pytorch_lightning import PyTorchLightningPruningCallback

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Folder holding the prepared data files. The trailing slash matters: file names
# are appended to this string directly when paths are built.
output_dir = "./OutputData/"

In [3]:
# Load the training table from the output folder.
df = pd.read_csv(f"{output_dir}train_data.csv")

In [4]:
# Preview the loaded frame (the rendered table elides the middle rows).
df

Unnamed: 0,time,consumption_MWh,consumption_lag2,trend,hour_sin,hour_cos,day_sin,day_cos,month_sin,month_cos
0,2018-01-01 02:00:00,24635.32,27412.81,2,7.071068e-01,7.071068e-01,7.818315e-01,0.62349,5.000000e-01,0.866025
1,2018-01-01 03:00:00,23872.12,26324.39,3,8.660254e-01,5.000000e-01,7.818315e-01,0.62349,5.000000e-01,0.866025
2,2018-01-01 04:00:00,23194.89,24635.32,4,9.659258e-01,2.588190e-01,7.818315e-01,0.62349,5.000000e-01,0.866025
3,2018-01-01 05:00:00,23071.96,23872.12,5,1.000000e+00,6.123234e-17,7.818315e-01,0.62349,5.000000e-01,0.866025
4,2018-01-01 06:00:00,23267.90,23194.89,6,9.659258e-01,-2.588190e-01,7.818315e-01,0.62349,5.000000e-01,0.866025
...,...,...,...,...,...,...,...,...,...,...
52577,2023-12-31 19:00:00,35090.93,34549.42,52579,-8.660254e-01,5.000000e-01,-2.449294e-16,1.00000,-2.449294e-16,1.000000
52578,2023-12-31 20:00:00,33310.94,36193.59,52580,-7.071068e-01,7.071068e-01,-2.449294e-16,1.00000,-2.449294e-16,1.000000
52579,2023-12-31 21:00:00,32083.96,35090.93,52581,-5.000000e-01,8.660254e-01,-2.449294e-16,1.00000,-2.449294e-16,1.000000
52580,2023-12-31 22:00:00,30469.49,33310.94,52582,-2.588190e-01,9.659258e-01,-2.449294e-16,1.00000,-2.449294e-16,1.000000


## Data prep: Getting input & output sequences

In [7]:
# Lagged consumption column as a NumPy array: the target history available at row T.
past_target = df["consumption_lag2"].values

In [8]:
# Everything except the timestamp and the two consumption columns, i.e. the
# trend and cyclical features at row T.
historic_covars = df.drop(columns=["time", "consumption_MWh", "consumption_lag2"]).values

In [9]:
# Same feature columns moved up by one row, so row T carries the features of T+1.
# The final row has no successor and becomes NaN.
future_covars = (
    df.drop(columns=["time", "consumption_MWh", "consumption_lag2"])
    .shift(periods=-1)
    .values
)

In [10]:
# Actual consumption moved up by one row, so row T carries the value observed at T+1.
# The final entry has no successor and becomes NaN.
future_target = df["consumption_MWh"].shift(periods=-1).values

In [11]:
# Values of the consumption_lag2 column, one entry per row T.
past_target

array([27412.81, 26324.39, 24635.32, ..., 35090.93, 33310.94, 32083.96])

In [12]:
# consumption_MWh shifted up by one row: entry T holds the value observed at T+1,
# which is the prediction target for row T. The last entry is NaN because no T+1 exists.
future_target

array([23872.12, 23194.89, 23071.96, ..., 30469.49, 30029.91,      nan])

In [13]:
# Trend and seasonality features at row T (every column of df except time,
# consumption_MWh and consumption_lag2).
historic_covars

array([[ 2.00000000e+00,  7.07106781e-01,  7.07106781e-01, ...,
         6.23489802e-01,  5.00000000e-01,  8.66025404e-01],
       [ 3.00000000e+00,  8.66025404e-01,  5.00000000e-01, ...,
         6.23489802e-01,  5.00000000e-01,  8.66025404e-01],
       [ 4.00000000e+00,  9.65925826e-01,  2.58819045e-01, ...,
         6.23489802e-01,  5.00000000e-01,  8.66025404e-01],
       ...,
       [ 5.25810000e+04, -5.00000000e-01,  8.66025404e-01, ...,
         1.00000000e+00, -2.44929360e-16,  1.00000000e+00],
       [ 5.25820000e+04, -2.58819045e-01,  9.65925826e-01, ...,
         1.00000000e+00, -2.44929360e-16,  1.00000000e+00],
       [ 5.25830000e+04, -2.44929360e-16,  1.00000000e+00, ...,
         1.00000000e+00, -2.44929360e-16,  1.00000000e+00]])

In [14]:
# Trend and seasonality features at T+1, stored in row T. The last row is all NaN
# because there is no row after the final one to shift in.
future_covars

array([[ 3.00000000e+00,  8.66025404e-01,  5.00000000e-01, ...,
         6.23489802e-01,  5.00000000e-01,  8.66025404e-01],
       [ 4.00000000e+00,  9.65925826e-01,  2.58819045e-01, ...,
         6.23489802e-01,  5.00000000e-01,  8.66025404e-01],
       [ 5.00000000e+00,  1.00000000e+00,  6.12323400e-17, ...,
         6.23489802e-01,  5.00000000e-01,  8.66025404e-01],
       ...,
       [ 5.25820000e+04, -2.58819045e-01,  9.65925826e-01, ...,
         1.00000000e+00, -2.44929360e-16,  1.00000000e+00],
       [ 5.25830000e+04, -2.44929360e-16,  1.00000000e+00, ...,
         1.00000000e+00, -2.44929360e-16,  1.00000000e+00],
       [            nan,             nan,             nan, ...,
                    nan,             nan,             nan]])

In [15]:
# Drop the final row of all four arrays: the one-step backward shift leaves the
# last future target (and the last future-covariate row) as NaN, so that row has
# nothing to predict.
#
# The NaN check makes this cell idempotent. Without it, every re-run would trim
# one more, perfectly valid, row from each array. The size check avoids indexing
# into an empty array.
if future_target.size and np.isnan(future_target[-1]):
    past_target = past_target[:-1]
    future_target = future_target[:-1]
    historic_covars = historic_covars[:-1, :]
    future_covars = future_covars[:-1, :]

In [18]:
# Sanity check: print the shape of each of the four prepared arrays.
print(f"Past target shape: {past_target.shape}")
print(f"Historic covariates shape: {historic_covars.shape}")
print(f"Future target shape: {future_target.shape}")
print(f"Future covariates shape: {future_covars.shape}")

Past target shape: (52581,)
Historic covariates shape: (52581, 7)
Future target shape: (52581,)
Future covariates shape: (52581, 7)


In [27]:
# Assemble two row-aligned frames. Row T of df_past holds the lagged target and
# the covariates at T; row T of df_future holds the target and the covariates
# at T+1. Both take their labels from df's columns from the third one onward;
# in df_future the first label is then renamed to consumption_MWh.
df_past = pd.DataFrame(
    np.concatenate((past_target[:, np.newaxis], historic_covars), axis=1),
    columns=df.columns[2:],
)
df_future = pd.DataFrame(
    np.concatenate((future_target[:, np.newaxis], future_covars), axis=1),
    columns=df.columns[2:],
).rename(columns={"consumption_lag2": "consumption_MWh"})

In [28]:
# Preview the frame of lagged target plus covariates at T.
df_past

Unnamed: 0,consumption_lag2,trend,hour_sin,hour_cos,day_sin,day_cos,month_sin,month_cos
0,27412.81,2.0,0.707107,7.071068e-01,7.818315e-01,0.62349,5.000000e-01,0.866025
1,26324.39,3.0,0.866025,5.000000e-01,7.818315e-01,0.62349,5.000000e-01,0.866025
2,24635.32,4.0,0.965926,2.588190e-01,7.818315e-01,0.62349,5.000000e-01,0.866025
3,23872.12,5.0,1.000000,6.123234e-17,7.818315e-01,0.62349,5.000000e-01,0.866025
4,23194.89,6.0,0.965926,-2.588190e-01,7.818315e-01,0.62349,5.000000e-01,0.866025
...,...,...,...,...,...,...,...,...
52576,32670.06,52578.0,-0.965926,2.588190e-01,-2.449294e-16,1.00000,-2.449294e-16,1.000000
52577,34549.42,52579.0,-0.866025,5.000000e-01,-2.449294e-16,1.00000,-2.449294e-16,1.000000
52578,36193.59,52580.0,-0.707107,7.071068e-01,-2.449294e-16,1.00000,-2.449294e-16,1.000000
52579,35090.93,52581.0,-0.500000,8.660254e-01,-2.449294e-16,1.00000,-2.449294e-16,1.000000


In [29]:
# Preview the frame of target plus covariates at T+1.
df_future

Unnamed: 0,consumption_MWh,trend,hour_sin,hour_cos,day_sin,day_cos,month_sin,month_cos
0,23872.12,3.0,8.660254e-01,5.000000e-01,7.818315e-01,0.62349,5.000000e-01,0.866025
1,23194.89,4.0,9.659258e-01,2.588190e-01,7.818315e-01,0.62349,5.000000e-01,0.866025
2,23071.96,5.0,1.000000e+00,6.123234e-17,7.818315e-01,0.62349,5.000000e-01,0.866025
3,23267.90,6.0,9.659258e-01,-2.588190e-01,7.818315e-01,0.62349,5.000000e-01,0.866025
4,23875.44,7.0,8.660254e-01,-5.000000e-01,7.818315e-01,0.62349,5.000000e-01,0.866025
...,...,...,...,...,...,...,...,...
52576,35090.93,52579.0,-8.660254e-01,5.000000e-01,-2.449294e-16,1.00000,-2.449294e-16,1.000000
52577,33310.94,52580.0,-7.071068e-01,7.071068e-01,-2.449294e-16,1.00000,-2.449294e-16,1.000000
52578,32083.96,52581.0,-5.000000e-01,8.660254e-01,-2.449294e-16,1.00000,-2.449294e-16,1.000000
52579,30469.49,52582.0,-2.588190e-01,9.659258e-01,-2.449294e-16,1.00000,-2.449294e-16,1.000000


In [5]:
# Number of usable timesteps. This reads df_future, so the cell must run after
# the cell that builds df_future.
n_steps = df_future.shape[0]

# Last 48 hours as input
input_length = 48

# Consumption lag 2, trend, 6 cyclical columns
input_dims = 8

# RNN predicts one by one
output_length = 1

# We have to predict from T+1 to T+32, though we are only interested in T+8 to 32.
horizon = 32

In [48]:
# First input window: the lagged target (first column of df_past) side by side
# with the T+1 covariates (every df_future column after the first), for the
# first input_length rows.
input_seq = np.concatenate(
    (
        df_past.iloc[:input_length, [0]].values,
        df_future.iloc[:input_length, 1:].values,
    ),
    axis=1,
)
input_seq.shape

(48, 8)

In [50]:
# Target paired with the first input window: the consumption_MWh entry of
# df_future in the window's last row (position input_length - 1).
output_seq = df_future["consumption_MWh"].iloc[input_length - 1]
output_seq

28899.26

This single-step setup only works for a forecast horizon of h = 1.

For h > 1, the model needs the future covariates at step h and its own prediction from step h - 1.

It is probably best to make the output sequence 36 steps long and to include the future targets along with the future covariates. (TODO: reconcile this length with `horizon = 32` set above.)

Then, at h = 1, the inputs are the past target and the future covariates from the input sequence.
At h > 1, the inputs are the prediction from step h - 1 and the future covariates from the output sequence.