# Transformer - Encoder, Decoder layer

## 0. imports

In [1]:
%load_ext jupyter_black

In [2]:
import sys

sys.path.append("..")

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [4]:
from src.dataset import ETTDataModule
from src.model import DataEmbedding
from src.model import Attention

## 1. Preliminary setup

In [6]:
# Keyword arguments forwarded verbatim to ETTDataModule.
dm_params = dict(
    data_path="../data/ETT-small/ETTh1.csv",
    task="M",
    freq="h",
    target="OT",
    seq_len=96,
    label_len=48,
    pred_len=96,
    use_scaler=True,
    use_time_enc=True,
    batch_size=32,
)


# Data module that later provides the training dataloader.
dm = ETTDataModule(**dm_params)

In [7]:
# Keyword arguments forwarded verbatim to DataEmbedding.
emb_params = dict(
    c_in=7,
    d_model=512,
    embed_type="time_features",
    freq="h",
    dropout=0.1,
)

# Embedding module applied to the raw batch before the encoder.
embedding = DataEmbedding(**emb_params)

In [8]:
# Keyword arguments forwarded verbatim to Attention.
attn_params = dict(
    d_model=512,
    n_heads=8,
    d_keys=None,
    d_values=None,
    scale=None,
    attention_dropout=0.1,
    output_attention=True,
)

# Stand-alone attention module used by the line-by-line walkthrough.
attn_layer = Attention(**attn_params)

In [9]:
# Take the first batch from the training loader as the sample input.
train_dataloader = dm.train_dataloader()
for batch in train_dataloader:
    break

In [22]:
# Embed the past window together with its time features.
past_values = batch["past_values"]
past_time_features = batch["past_time_features"]
x = embedding(x=past_values, x_features=past_time_features)

# Self-attention: the embedded sequence is used as queries, keys and values.
attn_result = attn_layer(queries=x, keys=x, values=x)
new_x, attn = attn_result

## 2. Encoder Layer

### 2.1 line by line

In [11]:
# Hyper-parameters for the line-by-line encoder layer below.
d_model, d_ff = 512, 2048
dropout = 0.1
activation = "gelu"

In [15]:
# Fall back to 4 * d_model when no feed-forward width is given.
d_ff = d_ff or 4 * d_model

# Position-wise feed-forward part, written as two kernel-size-1 convolutions,
# plus the two layer norms used after each residual connection.
conv1 = nn.Conv1d(in_channels=d_model, out_channels=d_ff, kernel_size=1)
conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_model, kernel_size=1)
norm1 = nn.LayerNorm(d_model)
norm2 = nn.LayerNorm(d_model)

# `dropout` and `activation` are rebound here from their config values (a
# float and a string) to callables. Guard both conversions so that re-running
# this cell neither raises (nn.Dropout given a module instead of a float) nor
# silently turns a "relu" choice into GELU (a function never equals "relu").
if not isinstance(dropout, nn.Module):
    dropout = nn.Dropout(dropout)
if isinstance(activation, str):
    activation = F.relu if activation == "relu" else F.gelu

In [23]:
# Residual connection around the attention output, then the first layer norm.
x = norm1(x + dropout(new_x))

# Position-wise feed-forward: swap dim 1 with the last dim for Conv1d, then
# swap back before the second dropout.
y = conv1(x.transpose(1, -1))
y = dropout(activation(y))
y = conv2(y).transpose(1, -1)
y = dropout(y)

# Second residual connection and layer norm.
out = norm2(x + y)

### 2.2 EncoderLayer class

In [72]:
class EncoderLayer(nn.Module):
    """Encoder layer: self-attention followed by a position-wise feed-forward part.

    Each of the two sub-layers is wrapped in a residual connection and
    followed by ``LayerNorm``. The feed-forward part is built from two
    kernel-size-1 ``Conv1d`` modules.

    Args:
        attention: Callable invoked as ``attention(queries=x, keys=x, values=x)``
            that returns a ``(new_x, attn)`` pair.
        d_model: Channel size used by the convolutions and layer norms.
        d_ff: Hidden width of the feed-forward part. A falsy value
            (``None`` or ``0``) selects ``4 * d_model``.
        dropout: Probability passed to the single shared ``nn.Dropout``.
        activation: ``"relu"`` selects ``F.relu``; any other value selects
            ``F.gelu``.
    """

    def __init__(
        self,
        attention: nn.Module,
        d_model: int,
        d_ff: int = None,
        dropout: float = 0.1,
        activation: str = "relu",
    ):
        super().__init__()

        hidden_dim = d_ff if d_ff else 4 * d_model

        self.attention = attention
        self.conv1 = nn.Conv1d(
            in_channels=d_model, out_channels=hidden_dim, kernel_size=1
        )
        self.conv2 = nn.Conv1d(
            in_channels=hidden_dim, out_channels=d_model, kernel_size=1
        )
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

        if activation == "relu":
            self.activation = F.relu
        else:
            self.activation = F.gelu

    def forward(self, x: torch.Tensor):
        """Run the layer on ``x`` and return ``(output, attn)``.

        ``attn`` is the second value returned by the attention module, passed
        through unchanged.
        """
        # Self-attention sub-layer with residual connection and norm.
        attended, attn = self.attention(queries=x, keys=x, values=x)
        residual = self.norm1(x + self.dropout(attended))

        # Feed-forward sub-layer: swap dim 1 with the last dim for Conv1d,
        # then swap back before the final dropout.
        hidden = self.conv1(residual.transpose(1, -1))
        hidden = self.dropout(self.activation(hidden))
        hidden = self.conv2(hidden).transpose(1, -1)
        hidden = self.dropout(hidden)

        return self.norm2(residual + hidden), attn

In [43]:
# Keyword arguments for one EncoderLayer.
# NOTE: the dict holds a single Attention instance, so every layer built by
# unpacking this same dict receives that same module object.
enc_layer_params = dict(
    attention=Attention(**attn_params),
    d_model=512,
    d_ff=2048,
    dropout=0.1,
    activation="gelu",
)

enc_layer = EncoderLayer(**enc_layer_params)

In [45]:
# Display the module tree of the encoder layer.
enc_layer

EncoderLayer(
  (attention): Attention(
    (query_projection): Linear(in_features=512, out_features=512, bias=True)
    (key_projection): Linear(in_features=512, out_features=512, bias=True)
    (value_projection): Linear(in_features=512, out_features=512, bias=True)
    (out_projection): Linear(in_features=512, out_features=512, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (conv1): Conv1d(512, 2048, kernel_size=(1,), stride=(1,))
  (conv2): Conv1d(2048, 512, kernel_size=(1,), stride=(1,))
  (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  (dropout): Dropout(p=0.1, inplace=False)
)

In [49]:
# Embed a fresh copy of the batch and pass it through the single encoder layer.
past_values = batch["past_values"]
past_time_features = batch["past_time_features"]
x = embedding(x=past_values, x_features=past_time_features)
out, attn = enc_layer(x)

In [50]:
# Display the shape of the encoder layer output.
out.shape

torch.Size([32, 96, 512])

## 3. Encoder block

### 3.1 line by line

In [52]:
num_layers = 2
norm_layer = None

# `enc_layer_params["attention"]` is one Attention instance. Unpacking the
# dict as-is for every layer would hand the same module to all of them and
# tie their attention weights together, so each layer gets a fresh Attention.
encoder_layers = nn.ModuleList(
    [
        EncoderLayer(**{**enc_layer_params, "attention": Attention(**attn_params)})
        for _ in range(num_layers)
    ]
)

In [55]:
# Embed the batch again so the stack starts from un-encoded input.
past_values = batch["past_values"]
past_time_features = batch["past_time_features"]
x = embedding(x=past_values, x_features=past_time_features)

# Feed the output of each layer into the next, keeping every layer's attn.
attns = []
for enc_layer in encoder_layers:
    x, attn = enc_layer(x)
    attns += [attn]

In [59]:
# Apply the final normalisation only when one was configured.
x = x if norm_layer is None else norm_layer(x)

### 3.2 Encoder class

In [67]:
class Encoder(nn.Module):
    """Sequential stack of encoder layers with an optional final norm.

    Args:
        enc_layers: Modules applied in order; each is called as ``layer(x)``
            and must return an ``(x, attn)`` pair.
        norm_layer: Optional module applied to the output of the last layer.
    """

    def __init__(self, enc_layers: list[nn.Module], norm_layer: nn.Module = None):
        super().__init__()

        self.enc_layers = nn.ModuleList(enc_layers)
        self.norm_layer = norm_layer

    def forward(self, x: torch.Tensor):
        """Return ``(output, attns)``, where ``attns`` has one entry per layer."""
        attns = []
        hidden = x
        for layer in self.enc_layers:
            hidden, layer_attn = layer(hidden)
            attns.append(layer_attn)

        if self.norm_layer is None:
            return hidden, attns
        return self.norm_layer(hidden), attns

In [68]:
d_model = 512
num_enc_layers: int = 2

# `enc_layer_params["attention"]` is one Attention instance. Unpacking the
# dict as-is for every layer would hand the same module to all of them and
# tie their attention weights together, so each layer gets a fresh Attention.
encoder = Encoder(
    enc_layers=[
        EncoderLayer(**{**enc_layer_params, "attention": Attention(**attn_params)})
        for _ in range(num_enc_layers)
    ],
    norm_layer=nn.LayerNorm(d_model),
)

In [69]:
# Embed the batch and run it through the full encoder stack.
past_values = batch["past_values"]
past_time_features = batch["past_time_features"]
x = embedding(x=past_values, x_features=past_time_features)
out, attns = encoder(x)

In [70]:
# Display the shape of the encoder output.
out.shape

torch.Size([32, 96, 512])

In [71]:
# Display how many attn entries the encoder returned.
len(attns)

2