In [5]:
import torch
from torch import nn
from torch.nn import functional as F
from torch.autograd import Variable

import numpy as np
from copy import deepcopy

import sys
sys.path.append("..")

from typing import Union, List, Tuple

from transformer.modules import clone_module


> The encoder is composed of a stack of N = 6 identical layers. Each layer has two sub-layers. The first is a multi-head self-attention mechanism, and the second is a simple, position-wise fully connected feed-forward network. We employ a residual connection around each of the two sub-layers, followed by layer normalization. That is, the output of each sub-layer is `LayerNorm(x + Sublayer(x))`, where `Sublayer(x)` is the function implemented by the sub-layer itself. To facilitate these residual connections, all sub-layers in the model, as well as the embedding layers, produce outputs of dimension d_model = 512.

We use `clone_module` (imported from `transformer.modules`) to create the identical layers.  


In [6]:
class Encoder(nn.Module):
    """Stack of ``N`` copies of one layer, followed by a final layer norm.

    The input is passed through every layer in order (each layer receives
    the same mask) and the result of the last layer is normalized once more.
    """

    def __init__(self, layer, N):
        """
        :param layer: Prototype layer to replicate. It must expose a ``size``
            attribute, which is used as the feature size of the final norm.
        :param N: Number of copies requested from ``clone_module``.
        """
        super(Encoder, self).__init__()
        # NOTE(review): clone_module lives in transformer.modules and is not
        # visible here; it presumably returns an iterable container holding
        # N independent copies of `layer` -- confirm.
        self.layers = clone_module(layer, N)
        # nn.Module has no built-in `size` attribute, so the layer class must
        # store one itself (EncoderLayer in this notebook does).
        #
        # Open question kept from the original notes: the paper excerpt only
        # says that the output of each sub-layer is normalized and does not
        # mention a norm after the whole stack. The Sublayer in this notebook
        # normalizes the *input* of each sub-layer rather than its output, so
        # the last layer's output is left un-normalized; this final norm
        # appears to make up for that.
        self.norm = LayerNorm(layer.size)

    def forward(self, x, mask):
        """Pass the input ``x`` (and ``mask``) through each layer in turn.

        :param x: Input handed to the first layer.
        :param mask: Encoder mask; forwarded unchanged to every layer.
        :return: Output of the last layer after the final layer norm.
        """
        for layer in self.layers:
            # Every encoder layer takes the encoder mask as well.
            x = layer(x, mask)
        return self.norm(x)

In [7]:
## https://github.com/CyberZHG/torch-layer-normalization/blob/89f405b60f53f85da6f03fe685c190ef394ce50c/torch_layer_normalization/layer_normalization.py#L8

class LayerNorm(nn.Module):
    """Layer normalization over the last dimension of the input.

    See: [Layer Normalization](https://arxiv.org/pdf/1607.06450.pdf)

    Every vector along the last axis is shifted by its mean and divided by
    ``std + epsilon``, then optionally scaled by ``gamma`` and offset by
    ``beta``.
    """

    def __init__(self,
                 in_features: Union[int, List[int], Tuple[int, ...]],
                 gamma: bool = True,
                 beta: bool = True,
                 epsilon: float = 1e-6):
        """Layer normalization layer

        :param in_features: The shape of the input tensor or the
            last dimension of the input tensor. Only the last entry is used.
        :param gamma: Add a learnable scale parameter if it is True.
        :param beta: Add a learnable offset parameter if it is True.
        :param epsilon: Small constant added to the standard deviation so the
            division stays finite when the deviation is zero.
        """
        super(LayerNorm, self).__init__()
        if isinstance(in_features, int):
            in_features = (in_features,)
        else:
            in_features = (in_features[-1],)
        self.in_features = torch.Size(in_features)
        self.epsilon = epsilon
        # Wrap in nn.Parameter so that scale/offset are trained, are listed by
        # parameters()/state_dict() and follow the module across devices.
        if gamma:
            self.gamma = nn.Parameter(torch.ones(*in_features))
        else:
            self.register_parameter('gamma', None)
        if beta:
            self.beta = nn.Parameter(torch.zeros(*in_features))
        else:
            self.register_parameter('beta', None)

    def forward(self, x):
        """Normalize ``x`` along its last dimension.

        :param x: Tensor whose last dimension should match ``in_features``
            whenever ``gamma`` or ``beta`` is enabled (they broadcast over it).
        :return: A new tensor with the same shape as ``x``.
        """
        mean = x.mean(dim=-1, keepdim=True)
        # torch.std defaults to the Bessel-corrected estimate, so a last
        # dimension of length 1 yields NaN here.
        std = x.std(dim=-1, keepdim=True)
        y = (x - mean) / (std + self.epsilon)
        # Out-of-place ops keep the autograd graph free of in-place edits.
        if self.gamma is not None:
            y = y * self.gamma
        if self.beta is not None:
            y = y + self.beta
        return y

    def extra_repr(self):
        """Return the configuration summary shown in the module's repr."""
        return f'in_features={self.in_features}, ' + \
               f'gamma={self.gamma is not None}, ' + \
               f'beta={self.beta is not None}, ' + \
               f'epsilon={self.epsilon}'

In [8]:
class Sublayer(nn.Module):
    """Residual connection around a sub-layer, with layer norm and dropout.

    Computes ``x + dropout(sublayer(norm(x)))``.
    """

    def __init__(self, in_features: int, dropout_prob: float):
        """
        :param in_features: Feature size passed to ``LayerNorm``.
        :param dropout_prob: Drop probability passed to ``nn.Dropout``.
        """
        super(Sublayer, self).__init__()
        self.norm = LayerNorm(in_features)
        self.dropout = nn.Dropout(p=dropout_prob)

    def forward(self, x: torch.Tensor, sublayer: nn.Module):
        """
        Apply residual connection to any sublayer with the same size.

        Note: the paper excerpt in this notebook describes the sub-layer
        output as ``LayerNorm(x + Sublayer(x))``, i.e. the norm comes last.
        Here the norm is applied to the sub-layer *input* instead; the
        original note attributes this to Sasha Rush: "for code simplicity
        the norm is applied first as opposed to last."

        :param x: Input tensor; also the residual that is added back.
        :param sublayer: Callable applied to the normalized input. Any
            callable works (``EncoderLayer`` passes a lambda); its output
            must be addable to ``x``.
        :return: ``x + dropout(sublayer(norm(x)))``.
        """
        return x + self.dropout(sublayer(self.norm(x)))

In [9]:
class EncoderLayer(nn.Module):
    """One encoder layer: self-attention, then feed-forward.

    Each of the two steps is wrapped in its own ``Sublayer``.
    """

    def __init__(self, size, self_attn, feed_forward, dropout_prob):
        """
        :param size: Feature size; stored as ``self.size`` and passed to the
            ``Sublayer`` wrappers.
        :param self_attn: Attention module, called as
            ``self_attn(query, key, value, mask)``.
        :param feed_forward: Module applied in the second sub-layer.
        :param dropout_prob: Drop probability passed to the ``Sublayer``
            wrappers.
        """
        super().__init__()
        self.size = size
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        # Two wrappers: index 0 for attention, index 1 for feed-forward.
        self.sublayer = clone_module(Sublayer(size, dropout_prob), 2)

    def forward(self, x, mask):
        """Run self-attention and then the feed-forward step on ``x``.

        :param x: Layer input.
        :param mask: Mask passed through to ``self_attn``.
        :return: Output of the second (feed-forward) sub-layer wrapper.
        """
        def attend(inp):
            # Self-attention: query, key and value are the same tensor.
            return self.self_attn(inp, inp, inp, mask)

        attended = self.sublayer[0](x, attend)
        return self.sublayer[1](attended, self.feed_forward)

<h2 align="center">Encoder Network</h2>
<div align="center">
    <img src="images/encoder.png" alt="Encoder Network" />
</div>
<div width="75%">
    <p align="left">
    We now have the skeleton of the encoder and its layers. What we still don't have:
    </p>
    <ul align="left">
        <li>Decoder</li>
        <li>Multi-head attention and self-attention</li>
        <li>Positionwise feedforward</li>
        <li>Positional encoding, embeddings</li>
    </ul>
</div>