**WaveNet**

Implementing a simple version of the WaveNet model, which uses convolutional layers with the dilation rate doubled at every layer. The general idea behind this model is that the lower layers learn short-term patterns and the higher layers learn long-term patterns. In short: stack 1D convolutional layers while doubling the dilation rate at each layer -> doubling the dilation makes the receptive field grow exponentially with depth, so we get a much bigger receptive field without increasing the number of layers.

keras implementation:
# Reference Keras model: causal Conv1D layers with dilation rates
# 1, 2, 4, 8 repeated twice, followed by a 1x1 convolution with 14 filters.
wavenet_model = tf.keras.Sequential()
wavenet_model.add(tf.keras.layers.Input(shape=[None, 5]))
for rate in (1, 2, 4, 8) * 2:
    wavenet_model.add(tf.keras.layers.Conv1D(
        filters=32, kernel_size=2, padding="causal", activation="relu",
        dilation_rate=rate))
# Output layer: added once, after all the dilated layers.
wavenet_model.add(tf.keras.layers.Conv1D(filters=14, kernel_size=1))

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
class CausalConv1d(nn.Module):
    """1D convolution that is zero-padded on the left only.

    The wrapped ``nn.Conv1d`` is built with ``padding=0``; ``forward`` pads the
    start of the time axis so no output step sees inputs from later steps.

    Args:
        in_channels: Number of input channels.
        out_channels: Number of output channels.
        kernel_size: Kernel size of the convolution (used as an integer).
        dilation: Dilation of the convolution.
        **kwargs: Forwarded unchanged to ``nn.Conv1d``.
    """

    def __init__(self, in_channels, out_channels, kernel_size, dilation=1, **kwargs):
        super().__init__()
        # Kept on the module because forward() derives the padding from them.
        self.kernel_size = kernel_size
        self.dilation = dilation
        # No built-in padding: the left padding is applied by hand in forward().
        self.conv = nn.Conv1d(
            in_channels, out_channels, kernel_size,
            dilation=dilation, padding=0, **kwargs
        )

    def forward(self, x):
        """Left-pad ``x`` along its last axis and apply the convolution."""
        # An unpadded dilated convolution shortens the sequence by
        # (kernel_size - 1) * dilation steps (for the default stride of 1);
        # adding exactly that many zeros at the start restores the length.
        left_context = self.dilation * (self.kernel_size - 1)
        # (left, right) padding for the last axis; F.pad fills with zeros by default.
        padded = F.pad(x, (left_context, 0))
        return self.conv(padded)

In [10]:
class WaveNetStack(nn.Module):
    """Stack of dilated causal convolutions followed by a 1x1 output convolution.

    Each entry in ``dilations`` adds one block of
    ``CausalConv1d -> BatchNorm1d -> ReLU``. The defaults reproduce the
    original fixed architecture, so ``WaveNetStack()`` builds the same model.

    Args:
        in_channels: Channels of the input signal.
        hidden_channels: Output channels of every dilated block.
        out_channels: Channels produced by the final 1x1 convolution.
        kernel_size: Kernel size of every dilated convolution.
        dilations: Dilation of each block, in order from first to last.
    """

    def __init__(self, in_channels=1, hidden_channels=32, out_channels=14,
                 kernel_size=2, dilations=(1, 2, 4, 8) * 2):
        super().__init__()

        blocks = []
        channels = in_channels  # input width of the next block to be built
        for d in dilations:
            blocks.append(
                nn.Sequential(
                    CausalConv1d(
                        in_channels=channels,
                        out_channels=hidden_channels,
                        kernel_size=kernel_size,
                        dilation=d,
                    ),
                    nn.BatchNorm1d(hidden_channels),
                    nn.ReLU(),
                )
            )
            # Every block after the first consumes the previous block's output.
            channels = hidden_channels

        self.layers = nn.ModuleList(blocks)
        # A kernel-size-1 convolution acts like a dense/linear layer applied
        # independently at every time step. `channels` is still `in_channels`
        # when `dilations` is empty, so the module stays well-formed.
        self.out_layer = nn.Conv1d(
            in_channels=channels, out_channels=out_channels, kernel_size=1
        )

    def forward(self, x):
        """Run ``x`` through every dilated block, then the output convolution."""
        for layer in self.layers:
            x = layer(x)
        return self.out_layer(x)

Get the data:

In [13]:
import matplotlib.pyplot as plt
from utils.fetch_data import fetch_timeseries_data
from utils.early_stopping import EarlyStopping
from utils.fetch_data import create_splits
from torch.utils.data import DataLoader, Dataset, Subset
import numpy as np
import pandas as pd

In [14]:
torch.manual_seed(1)

<torch._C.Generator at 0x109ab6070>

In [15]:
df = fetch_timeseries_data()
df.head()

Unnamed: 0_level_0,day_type,bus,rail
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2001-01-01,U,297192,126455
2001-01-02,W,780827,501952
2001-01-03,W,824923,536432
2001-01-04,W,870021,550011
2001-01-05,W,890426,557917


In [16]:
df.shape

(9009, 3)

In [17]:
# Split the 'rail' column into train / validation / test parts.
# NOTE(review): the ranges look like year-month bounds; confirm their exact
# meaning (inclusive ends, open-ended test range) against utils.fetch_data.
rail_train, rail_valid, rail_test = create_splits(
    df,
    attr='rail',
    train_ran=['2014-01', '2023-12'],
    val_ran=['2024-01', '2024-12'],
    test_ran=['2025-01'],
)

In [18]:
seq_length = 56            # number of time steps in every input window
forecasting_horizon = 14   # number of future steps predicted at every time step


def _window_count(ds):
    """Count the start positions that leave room for a window plus a full horizon."""
    return len(ds) - seq_length - forecasting_horizon + 1


def create_x_chunks(ds):
    """Return all sliding input windows of ``seq_length`` consecutive values."""
    return [
        ds[start:start + seq_length]
        for start in range(_window_count(ds))
    ]


def create_y_chunks(ds):
    """Return the targets matching each window from ``create_x_chunks``.

    For the window starting at ``start``, step ``k`` of the window gets the
    ``forecasting_horizon`` values that follow it, i.e.
    ``ds[start + 1 + k : start + 1 + k + forecasting_horizon]``.
    """
    return [
        [
            ds[first + step:first + step + forecasting_horizon]
            for step in range(seq_length)
        ]
        # Targets begin one step after the window's first element.
        for first in range(1, _window_count(ds) + 1)
    ]

In [20]:
# Window every split into input chunks (x) and per-step target chunks (y).
train_x_chunks = create_x_chunks(rail_train.values.tolist())
train_y_chunks = create_y_chunks(rail_train.values.tolist())

valid_x_chunks = create_x_chunks(rail_valid.values.tolist())
valid_y_chunks = create_y_chunks(rail_valid.values.tolist())

test_x_chunks = create_x_chunks(rail_test.values.tolist())
test_y_chunks = create_y_chunks(rail_test.values.tolist())

In [21]:
# Convert the nested Python lists into tensors, rebinding the same names.
train_x_chunks = torch.tensor(train_x_chunks)
train_y_chunks = torch.tensor(train_y_chunks)

valid_x_chunks = torch.tensor(valid_x_chunks)
valid_y_chunks = torch.tensor(valid_y_chunks)

test_x_chunks = torch.tensor(test_x_chunks)
test_y_chunks = torch.tensor(test_y_chunks)

In [22]:
train_x_chunks.shape, train_y_chunks.shape

(torch.Size([3583, 56]), torch.Size([3583, 56, 14]))

Build the dataset using chunks:

In [23]:
class TsDataset(Dataset):
    """Dataset that pairs each input chunk with its target chunk.

    ``x_chunks`` gets a new axis inserted at position 2 (a 2-D tensor of
    windows becomes 3-D with a trailing size-1 axis); ``y_chunks`` is stored
    as given. Items are indexed along the first axis of both tensors.
    """

    def __init__(self, x_chunks, y_chunks):
        super().__init__()
        self.y_chunks = y_chunks
        # Add the extra axis once here so __getitem__ can slice directly.
        self.x_chunks = x_chunks.unsqueeze(2)

    def __len__(self):
        """Number of samples, taken from the first axis of the inputs."""
        return len(self.x_chunks)

    def __getitem__(self, index):
        """Return the ``(inputs, targets)`` pair stored at ``index``."""
        inputs = self.x_chunks[index, :, :]
        targets = self.y_chunks[index, :, :]
        return inputs, targets

In [24]:
# Wrap each split's (inputs, targets) tensors in a TsDataset, in
# train / validation / test order.
train_ds, valid_ds, test_ds = (
    TsDataset(inputs, targets)
    for inputs, targets in (
        (train_x_chunks, train_y_chunks),
        (valid_x_chunks, valid_y_chunks),
        (test_x_chunks, test_y_chunks),
    )
)

In [25]:
# Only the training loader shuffles; the validation and test loaders use
# DataLoader's default (no shuffling).
train_dl = DataLoader(train_ds, batch_size=128, shuffle=True)
valid_dl = DataLoader(valid_ds, batch_size=128)
test_dl = DataLoader(test_ds, batch_size=1024)

Model and training:

In [49]:
# Use the Apple-silicon MPS backend when it is usable, otherwise the CPU.
# torch.backends.mps.is_available() is the long-standing availability check
# and also works on PyTorch versions that lack torch.mps.is_available().
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

In [50]:
torch.manual_seed(1)

<torch._C.Generator at 0x109ab6070>

In [57]:
model = WaveNetStack().to(device)

In [58]:
from utils.early_stopping import EarlyStopping

In [59]:
# Optimizer over all model parameters.
optimizer = torch.optim.AdamW(model.parameters(), lr=0.002)

# Huber loss summed (not averaged) over every element of the batch.
criterion = nn.HuberLoss(reduction="sum")

# Project helper for early stopping, configured with a patience of 50 and
# the 'wavenet.pt' checkpoint path.
early_stopper = EarlyStopping(
    patience=50,
    checkpoint_path='wavenet.pt',
    restore_best_weights=True,
    verbose=True,
)

# Multiply the learning rate by 0.2 once the monitored value has stopped
# decreasing for more than 5 scheduler steps.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    patience=5,
    factor=0.2,
)

Training Loop:

In [60]:
n_epochs = 1000

# Per-epoch losses, pre-filled with zeros. If training stops early, the
# entries after the last completed epoch stay at 0.
train_loss = [0]*n_epochs
val_loss = [0]*n_epochs


for epoch in range(n_epochs):
    # ---- training pass ----
    model.train()
    for x_batch, y_batch in train_dl:
        x_batch, y_batch = x_batch.to(device), y_batch.to(device)
        # Swap the last two axes so the model receives (batch, channels, time).
        x_batch = x_batch.permute(0,2,1)
        optimizer.zero_grad()
        out = model(x_batch)
        # Swap back so the prediction lines up with y_batch.
        out = out.permute(0, 2, 1)
        loss = criterion(out, y_batch)
        loss.backward()
        optimizer.step()
        train_loss[epoch]+=loss.item()
    # Accumulated batch losses divided by the number of training samples.
    train_loss[epoch] /= len(train_dl.dataset)

    # ---- validation pass (no gradients) ----
    model.eval()
    with torch.no_grad():
        for x_batch,y_batch in valid_dl:
            x_batch,y_batch = x_batch.to(device),y_batch.to(device)
            x_batch = x_batch.permute(0,2,1)
            out = model(x_batch)
            out = out.permute(0,2,1)
            loss = criterion(out, y_batch)
            val_loss[epoch] += loss.item()
        val_loss[epoch] /= len(valid_dl.dataset)

        # The validation loss drives both the LR schedule and early stopping.
        scheduler.step(val_loss[epoch])
        print(f'Epoch: {epoch+1}| Train loss: {train_loss[epoch]:.4f}| Val loss: {val_loss[epoch]:.4f}')
        early_stopper(val_loss[epoch], model, optimizer, epoch)
        if early_stopper.should_stop:
            print(f"Stopping at epoch: {epoch+1}")
            break


Epoch: 1| Train loss: 41.7934| Val loss: 10.5909
Metric improved to 10.5909. Checkpoint saved at epoch 0
Epoch: 2| Train loss: 6.0512| Val loss: 4.5821
Metric improved to 4.5821. Checkpoint saved at epoch 1
Epoch: 3| Train loss: 4.1449| Val loss: 3.0476
Metric improved to 3.0476. Checkpoint saved at epoch 2
Epoch: 4| Train loss: 3.5616| Val loss: 2.3631
Metric improved to 2.3631. Checkpoint saved at epoch 3
Epoch: 5| Train loss: 3.1561| Val loss: 2.1539
Metric improved to 2.1539. Checkpoint saved at epoch 4
Epoch: 6| Train loss: 3.0345| Val loss: 2.1233
Metric improved to 2.1233. Checkpoint saved at epoch 5
Epoch: 7| Train loss: 2.7860| Val loss: 1.9633
Metric improved to 1.9633. Checkpoint saved at epoch 6
Epoch: 8| Train loss: 2.6475| Val loss: 1.7952
Metric improved to 1.7952. Checkpoint saved at epoch 7
Epoch: 9| Train loss: 2.5493| Val loss: 1.7672
Metric improved to 1.7672. Checkpoint saved at epoch 8
Epoch: 10| Train loss: 2.5360| Val loss: 1.6556
Metric improved to 1.6556. Chec

In [61]:
# Mean absolute error, averaged over all elements of a batch.
l1_loss = nn.L1Loss(reduction='mean')
loss_ = []  # one mean-absolute-error value per test batch

model.eval()
with torch.no_grad():
    for x_batch, y_batch in test_dl:
        # Move to the device and swap the last two axes of the inputs before
        # the forward pass.
        x_batch = x_batch.to(device).permute(0,2,1)
        y_batch = y_batch.to(device)
        # Swap the output axes back so it lines up with y_batch.
        out = model(x_batch).permute(0,2,1)
        loss = l1_loss(out, y_batch)
        loss_.append(loss.item())

In [62]:
(sum(loss_)/len(loss_)) * 1e6

44534.552842378616