In [3]:
import torch
import torch.nn as nn
import copy

Encoder and decoder stacks

- The original paper stacks 6 identical encoder layers, so the `clone` function below is used to create N independent (deep) copies of a single layer.

In [1]:
def clone(module, N):
  """Build a ModuleList holding N independent deep copies of `module`.

  Args:
    module: the module to replicate; it is deep-copied, so the copies do not
      share parameters with each other or with the original.
    N: how many copies to create.

  Returns:
    An nn.ModuleList of length N.
  """
  replicas = nn.ModuleList()
  for _ in range(N):
    replicas.append(copy.deepcopy(module))
  return replicas

In [None]:
class Encoder(nn.Module):
  """Stack of N copies of a layer, followed by a final LayerNorm."""

  def __init__(self, layer, N):
    """Replicate `layer` N times and create the closing normalization.

    Args:
      layer: module to replicate; must expose a `size` attribute, which is
        passed to LayerNorm as its feature count.
      N: number of copies of `layer` in the stack.
    """
    super().__init__()
    self.layers = clone(layer, N)
    # LayerNorm is looked up when an Encoder is instantiated, so the cell
    # defining it must have been run before the first Encoder is built.
    self.norm = LayerNorm(layer.size)

  def forward(self, x, mask):
    """Feed `x` (together with `mask`) through every layer, then normalize."""
    hidden = x
    for sublayer in self.layers:
      hidden = sublayer(hidden, mask)
    return self.norm(hidden)

In [4]:
class LayerNorm(nn.Module):
  """Normalize over the last dimension, then apply a learnable scale and shift.

  Parameters:
    a_2: learnable scale, initialized to ones, shape (features,).
    b_2: learnable shift, initialized to zeros, shape (features,).
  """

  def __init__(self, features, eps = 1e-6):
    """
    Args:
      features: length of the scale and shift vectors.
      eps: small constant added to the standard deviation before dividing.
    """
    super().__init__()
    self.a_2 = nn.Parameter(torch.ones(features))
    self.b_2 = nn.Parameter(torch.zeros(features))
    self.eps = eps

  def forward(self, x):
    """Return a_2 * (x - mean) / (std + eps) + b_2, with statistics over dim -1."""
    mu = x.mean(dim=-1, keepdim=True)
    sigma = x.std(dim=-1, keepdim=True)
    centered = x - mu
    # Same evaluation order as the formula: scale, then divide, then shift.
    scaled = self.a_2 * centered
    return scaled / (sigma + self.eps) + self.b_2


The normalization formula:

- `(x - mean)`: subtracts the mean (taken over the last dimension) from the input, centering it around zero.
- `(std + self.eps)`: adds the small epsilon value to the standard deviation for numerical stability, guarding against division by zero.
- `(x - mean) / (std + self.eps)`: divides the zero-centered data by the adjusted standard deviation, so that the result has approximately unit variance.
- `self.a_2 * ... + self.b_2`: scales and shifts the normalized data using the learnable parameters `a_2` (initialized to ones) and `b_2` (initialized to zeros).
