Implementing a sequence to sequence timeseries forecasting model using a GRU with custom gru cells and layer cells with layer normalization

In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset, Subset
import matplotlib.pyplot as plt
from utils.fetch_data import fetch_timeseries_data
from utils.load_data import TimeSeriesDataset
from utils.early_stopping import EarlyStopping
from utils.fetch_data import create_splits

In [2]:
torch.manual_seed(1)

<torch._C.Generator at 0x10e642110>

Get the data:

In [3]:
df = fetch_timeseries_data()

# create the training, validation, and test splits:
rail_train, rail_valid, rail_test = create_splits(df, attr='rail', train_ran=['2016-01','2024-12'],val_ran=['2025-01','2025-04'],test_ran=['2025-05'])

In [4]:
seq_length = 56
forecasting_horizon = 14

def create_x_chunks(ds):
    return [ds[i:i+seq_length] for i in range(len(ds)-seq_length-forecasting_horizon+1)]

def create_y_chunks(ds):
    y_chunks = []
    for i in range(len(ds)-seq_length-forecasting_horizon+1):
        seq_chunk = [ds[i+j+1:i+j+1+forecasting_horizon] for j in range(seq_length)] # for each element in the sequence, get the next 14 values
        y_chunks.append(seq_chunk)
    return y_chunks

In [6]:
train_x_chunks, train_y_chunks = create_x_chunks(rail_train.values.tolist()), create_y_chunks(rail_train.values.tolist())
valid_x_chunks, valid_y_chunks = create_x_chunks(rail_valid.values.tolist()), create_y_chunks(rail_valid.values.tolist())
test_x_chunks, test_y_chunks = create_x_chunks(rail_test.values.tolist()), create_y_chunks(rail_test.values.tolist())

In [7]:
train_x_chunks, train_y_chunks = torch.tensor(train_x_chunks), torch.tensor(train_y_chunks)
valid_x_chunks, valid_y_chunks = torch.tensor(valid_x_chunks), torch.tensor(valid_y_chunks)
test_x_chunks, test_y_chunks = torch.tensor(test_x_chunks), torch.tensor(test_y_chunks)

In [8]:
train_x_chunks.shape, train_y_chunks.shape

(torch.Size([3219, 56]), torch.Size([3219, 56, 14]))

In [9]:
class TsDataset(Dataset):
    def __init__(self, x_chunks, y_chunks):
        super().__init__()
        self.x_chunks = x_chunks.unsqueeze(2)
        self.y_chunks = y_chunks
    def __len__(self):
        return self.x_chunks.shape[0]
    def __getitem__(self, index):
        x,y = self.x_chunks[index,:,:], self.y_chunks[index,:,:]
        return x,y

In [10]:
train_ds = TsDataset(train_x_chunks, train_y_chunks)
val_ds = TsDataset(valid_x_chunks, valid_y_chunks)
test_ds = TsDataset(test_x_chunks, test_y_chunks)

In [11]:
train_dl = DataLoader(train_ds, batch_size=128, shuffle=True)
val_dl = DataLoader(val_ds, batch_size=128)
test_dl = DataLoader(test_ds, batch_size=1024)

In [12]:
torch.manual_seed(1)

<torch._C.Generator at 0x10e642110>

In [39]:
class CustomGRUCell(nn.Module):
    def __init__(self, input_size, hidden_size, device):
        super().__init__()
        self.input = nn.Linear(input_size, 3*hidden_size, device=device)
        self.hidden = nn.Linear(hidden_size, 3*hidden_size, device=device)
        self.hidden_size = hidden_size

        #Â one layer norm per gate.. best practice
        self.ln_r = nn.LayerNorm(hidden_size, device=device)
        self.ln_z = nn.LayerNorm(hidden_size, device=device)
        self.ln_n = nn.LayerNorm(hidden_size, device=device)
    def forward(self, x, prev_h):
        computed_inputs = self.input(x)
        computed_hiddens = self.hidden(prev_h)
        
        # gates.....
        pre_r = computed_inputs[:,:self.hidden_size]+computed_hiddens[:,:self.hidden_size]
        rgate = torch.sigmoid(self.ln_r(pre_r))

        pre_z = computed_inputs[:,self.hidden_size:2*self.hidden_size]+computed_hiddens[:,self.hidden_size:2*self.hidden_size]
        zgate = torch.sigmoid(self.ln_z(pre_z))

        pre_n = computed_inputs[:,2*self.hidden_size:]+(rgate*computed_hiddens[:,2*self.hidden_size:])
        ngate = torch.tanh(self.ln_n(pre_n))
        h = (1-zgate)*ngate + zgate*prev_h
        return h

In [40]:
class CustomGRULayer(nn.Module):
    def __init__(self, input_size, hidden_size, device):
        super().__init__()
        self.gru_cell = CustomGRUCell(input_size, hidden_size,device=device)
        self.hidden_size = hidden_size
        self.device = device
    def forward(self, input_):
        input_ = input_.to(device)
        batch, seq_len, _ = input_.shape
        state = []
        prev_h = torch.zeros((batch,self.hidden_size),device=device)

        for t in range(seq_len):
            xt = input_[:,t,:]
            prev_h = self.gru_cell(xt, prev_h)
            state.append(prev_h)
        out = torch.stack(state, dim=1)
        return out, prev_h

In [41]:
torch.manual_seed(1)

<torch._C.Generator at 0x10e642110>

In [42]:
class Seq2SeqUnivar(nn.Module):
    def __init__(self, input_size, hidden_size1, hidden_size2,device):
        super().__init__()
        self.gru = CustomGRULayer(input_size, hidden_size1, device=device)
        self.gru2 = CustomGRULayer(hidden_size1, hidden_size2, device=device)
        self.linear = nn.Linear(hidden_size2, 64)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(64,32)
        self.linear3 = nn.Linear(32,14)
        self.ln = nn.LayerNorm(128)
        self.ln2 = nn.LayerNorm(64)
        self.ln3 = nn.LayerNorm(32)
        self.ln
    def forward(self, input_):
        out, prev_h = self.gru(input_)
        out, prev_h = self.gru2(out)
        out = self.ln(out)
        out = self.relu(self.ln2(self.linear(out)))
        out = self.relu(self.ln3(self.linear2(out)))
        return self.linear3(out)

In [43]:
device = torch.device("mps" if torch.mps.is_available() else "cpu")

In [44]:
model = Seq2SeqUnivar(1, 512, 128, device=device).to(device)

In [45]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.002)
criterion = nn.HuberLoss(reduction='sum')
early_stopper = EarlyStopping(patience=50, checkpoint_path='seq2seq_gru.pt', restore_best_weights=True, verbose=True)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min',patience=10, factor=0.9)

In [46]:
n_epochs = 1000

train_loss = [0] * n_epochs
val_loss = [0] * n_epochs

for epoch in range(n_epochs):
    model.train()
    # iterate through the training data
    for x_batch, y_batch in train_dl:
        x_batch, y_batch = x_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        out = model(x_batch)
        # adding l1 norm
        norm = sum(p.abs().sum() for p in model.parameters())
        loss = criterion(out, y_batch) + 1e-3*norm
        loss.backward()
        #torch.nn.utils.clip_grad_norm_(model.parameters(), 2.0)
        optimizer.step()
        train_loss[epoch]+=loss.item()
    train_loss[epoch] /= len(train_dl.dataset)

    model.eval()
    with torch.no_grad():
        for x_batch,y_batch in val_dl:
            x_batch,y_batch = x_batch.to(device),y_batch.to(device)
            out = model(x_batch)
            loss = criterion(out, y_batch)
            val_loss[epoch] += loss.item()
        val_loss[epoch] /= len(val_dl.dataset)

        scheduler.step(val_loss[epoch])
        print(f'Epoch: {epoch+1}| Train loss: {train_loss[epoch]:.4f}| Val loss: {val_loss[epoch]:.4f}')
        early_stopper(val_loss[epoch], model, optimizer, epoch)
        if early_stopper.should_stop:
            print(f"Stopping at epoch: {epoch+1}")
            break

Epoch: 1| Train loss: 31.4188| Val loss: 6.7556
Metric improved to 6.7556. Checkpoint saved at epoch 0
Epoch: 2| Train loss: 9.7295| Val loss: 3.5199
Metric improved to 3.5199. Checkpoint saved at epoch 1
Epoch: 3| Train loss: 7.6194| Val loss: 3.5669
No improvement for 1 epoch(s)
Epoch: 4| Train loss: 7.3285| Val loss: 3.2332
Metric improved to 3.2332. Checkpoint saved at epoch 3
Epoch: 5| Train loss: 6.7279| Val loss: 3.3579
No improvement for 1 epoch(s)
Epoch: 6| Train loss: 6.0862| Val loss: 3.2787
No improvement for 2 epoch(s)
Epoch: 7| Train loss: 5.5389| Val loss: 3.3955
No improvement for 3 epoch(s)
Epoch: 8| Train loss: 4.4670| Val loss: 3.0381
Metric improved to 3.0381. Checkpoint saved at epoch 7
Epoch: 9| Train loss: 3.6274| Val loss: 1.9190
Metric improved to 1.9190. Checkpoint saved at epoch 8
Epoch: 10| Train loss: 3.1422| Val loss: 1.9722
No improvement for 1 epoch(s)
Epoch: 11| Train loss: 2.9502| Val loss: 1.7964
Metric improved to 1.7964. Checkpoint saved at epoch 10

In [47]:
l1_loss = nn.L1Loss(reduction='mean')
loss_ = []

with torch.no_grad():
    model.eval()
    for x_batch, y_batch in test_dl:
        x_batch = x_batch.to(device)
        y_batch = y_batch.to(device)
        out = model(x_batch)
        loss = l1_loss(out, y_batch)
        loss_.append(loss.item())

In [48]:
(sum(loss_)/len(loss_)) * 1e6

29856.85132443905

GRU has the best performance overall for this task