In [None]:
!pip install -r requirements.tx
!pip install -q torchdata==0.3.0 torchtext==0.12 spacy==3.2 altair GPUtil
!python -m spacy download de_core_news_sm
!python -m spacy download en_core_web_sm

In [None]:
import os
from os.path import exists
import torch
import torch.nn as nn
from torch.nn.functional import log_softmax, pad
import math
import copy
import time
from torch.optim.lr_scheduler import LambdaLR
import pandas as pd
import altair as alt
from torchtext.data.functional import to_map_style_dataset
from torch.utils.data import DataLoader
from torchtext.vocab import build_vocab_from_iterator
import torchtext.datasets as datasets
import spacy
import GPUtil
import warnings
from torch.utils.data.distributed import DistributedSampler
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP

# Suppress all Python warnings for the rest of the session.
warnings.filterwarnings("ignore")
# Set to False to skip notebook execution (e.g. for debugging)
RUN_EXAMPLES = True

In [None]:

# Some convenience helper functions used throughout the notebook


def is_interactive_notebook():
    """Report whether this code is executing as the top-level ``__main__`` module."""
    running_as_main = __name__ == "__main__"
    return running_as_main


def show_example(fn, args=()):
    """Run ``fn(*args)`` and return its result when examples are enabled.

    The call happens only when this module runs as ``__main__`` and the
    module-level ``RUN_EXAMPLES`` flag is truthy; otherwise nothing is run.

    Args:
        fn: callable to invoke.
        args: positional arguments unpacked into ``fn``. Defaults to an
            immutable empty tuple instead of a shared mutable list.

    Returns:
        Whatever ``fn`` returns, or ``None`` when the example is skipped.
    """
    if __name__ == "__main__" and RUN_EXAMPLES:
        return fn(*args)
    return None


def execute_example(fn, args=()):
    """Run ``fn(*args)`` for its side effects when examples are enabled.

    The call happens only when this module runs as ``__main__`` and the
    module-level ``RUN_EXAMPLES`` flag is truthy. The result of ``fn`` is
    discarded; this function always returns ``None``.

    Args:
        fn: callable to invoke.
        args: positional arguments unpacked into ``fn``. Defaults to an
            immutable empty tuple instead of a shared mutable list.
    """
    if __name__ == "__main__" and RUN_EXAMPLES:
        fn(*args)


class DummyOptimizer(torch.optim.Optimizer):
    """Optimizer stand-in whose ``step`` and ``zero_grad`` do nothing."""

    def __init__(self):
        # The base-class initializer is not invoked; only ``param_groups``
        # is populated, with a single group whose learning rate is 0.
        self.param_groups = [dict(lr=0)]

    def step(self):
        """Perform no update."""
        return None

    def zero_grad(self, set_to_none=False):
        """Perform no gradient reset; ``set_to_none`` is accepted and ignored."""
        return None


class DummyScheduler:
    """Scheduler stand-in whose ``step`` does nothing."""

    def step(self):
        """Perform no learning-rate update."""
        return None

In [None]:
class EncoderDecoder(nn.Module):
    """
    Generic encoder-decoder wrapper that wires together an encoder, a
    decoder, the source/target embedding callables, and a generator.
    """

    def __init__(self, encoder, decoder, src_embed, tgt_embed, generator):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed  # applied to src before the encoder
        self.tgt_embed = tgt_embed  # applied to tgt before the decoder
        self.generator = generator  # stored here; not called by this class

    def forward(self, src, tgt, src_mask, tgt_mask):
        "Encode the masked source, then decode the masked target against it."
        memory = self.encode(src, src_mask)
        return self.decode(memory, src_mask, tgt, tgt_mask)

    def encode(self, src, src_mask):
        "Embed ``src`` and pass it, with ``src_mask``, through the encoder."
        embedded_src = self.src_embed(src)
        return self.encoder(embedded_src, src_mask)

    def decode(self, memory, src_mask, tgt, tgt_mask):
        "Embed ``tgt`` and pass it, with ``memory`` and both masks, through the decoder."
        embedded_tgt = self.tgt_embed(tgt)
        return self.decoder(embedded_tgt, memory, src_mask, tgt_mask)

In [None]:
class Generator(nn.Module):
    "Linear projection to vocabulary size followed by log-softmax."

    def __init__(self, d_model, vocab):
        super().__init__()
        self.proj = nn.Linear(d_model, vocab)

    def forward(self, x):
        "Return log-probabilities over the last dimension of the projection."
        logits = self.proj(x)
        return log_softmax(logits, dim=-1)

Encoder:

-Purpose: The encoder takes the input sequence and processes it, capturing its important features.
-Structure: It consists of multiple identical layers stacked on top of each other.
-Layer: Each layer within the encoder has two main parts:
--Self-Attention Mechanism: This part helps the model focus on different parts of the input sequence while processing each token.
--Feed-Forward Network: After self-attention, each token's representation is further refined using a simple neural network.
-Residual Connections: Each layer has a "shortcut" connection called a residual connection. It helps in better information flow through the network by adding the input of the layer to its output.
-Layer Normalization: After each sub-layer (self-attention and feed-forward network), the output is normalized to stabilize training and improve performance.
-Dropout: Dropout is applied to the output of each sub-layer to prevent overfitting.
-Implementation: The encoder is implemented as a stack of these layers, each followed by normalization and dropout.

In [None]:
#The encoder is composed of a stack of N=6 identical layers.
def clones(module, N):
    "Return an nn.ModuleList holding N independent deep copies of module."
    replicas = nn.ModuleList()
    for _ in range(N):
        replicas.append(copy.deepcopy(module))
    return replicas

class Encoder(nn.Module):
    "Stack of N cloned layers followed by a final LayerNorm."

    def __init__(self, layer, N):
        super().__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)

    def forward(self, x, mask):
        "Feed x (with mask) through every layer in order, then normalize."
        hidden = x
        for block in self.layers:
            hidden = block(hidden, mask)
        return self.norm(hidden)

In [None]:
#We employ a residual connection around each of the two sub-layers, followed by layer normalization.

class LayerNorm(nn.Module):
    "Layer normalization over the last dimension with learnable gain and bias."

    def __init__(self, features, eps=1e-6):
        super().__init__()
        self.a_2 = nn.Parameter(torch.ones(features))  # gain, starts at 1
        self.b_2 = nn.Parameter(torch.zeros(features))  # bias, starts at 0
        self.eps = eps

    def forward(self, x):
        "Center x by its last-dim mean and divide by (last-dim std + eps)."
        mu = x.mean(-1, keepdim=True)
        sigma = x.std(-1, keepdim=True)
        # Multiply by the gain before dividing, matching the original
        # left-to-right evaluation order.
        scaled = self.a_2 * (x - mu)
        return scaled / (sigma + self.eps) + self.b_2

In [None]:
class SublayerConnection(nn.Module):
    """
    Residual wrapper around a sublayer: the input is normalized, passed to
    the sublayer, dropped out, and added back to the un-normalized input.
    The norm is applied before the sublayer rather than after it.
    """

    def __init__(self, size, dropout):
        super().__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        "Return x + dropout(sublayer(norm(x))); sublayer must preserve x's size."
        normed = self.norm(x)
        residual = self.dropout(sublayer(normed))
        return x + residual

In [None]:
class EncoderLayer(nn.Module):
    "One encoder layer: self-attention, then feed forward, each in a SublayerConnection."

    def __init__(self, size, self_attn, feed_forward, dropout):
        super().__init__()
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 2)
        self.size = size

    def forward(self, x, mask):
        "Apply masked self-attention, then the feed-forward sublayer."

        def self_attend(h):
            # Query, key and value are all the same normalized input.
            return self.self_attn(h, h, h, mask)

        attended = self.sublayer[0](x, self_attend)
        return self.sublayer[1](attended, self.feed_forward)

Decoder:

-Purpose: The decoder takes the processed information from the encoder and generates the output sequence.
-Structure: Similar to the encoder, it consists of multiple identical layers stacked on top of each other.
-Layer: Each layer within the decoder also has multiple sub-layers:
--Self-Attention Mechanism: This helps the decoder focus on different parts of the output sequence while generating each token.
--Source-Target Attention Mechanism: This sub-layer allows the decoder to consider the information from the input sequence (memory) while generating the output sequence.
--Feed-Forward Network: Similar to the encoder, this part refines the representation of each token.
-Residual Connections: Just like the encoder, each layer in the decoder has residual connections for better information flow.
-Masking: The self-attention mechanism in the decoder is modified to prevent positions from attending to subsequent positions. This ensures that during training, predictions for a token only depend on the tokens before it, preventing information leakage from future tokens.
-Implementation: The decoder is also implemented as a stack of these layers, with each layer followed by normalization and dropout.


In [None]:
class Decoder(nn.Module):
    "Stack of N cloned decoder layers followed by a final LayerNorm."

    def __init__(self, layer, N):
        super().__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)

    def forward(self, x, memory, src_mask, tgt_mask):
        "Feed x through every layer with memory and both masks, then normalize."
        hidden = x
        for block in self.layers:
            hidden = block(hidden, memory, src_mask, tgt_mask)
        return self.norm(hidden)

In [None]:
class DecoderLayer(nn.Module):
    "One decoder layer: self-attention, source attention, then feed forward."

    def __init__(self, size, self_attn, src_attn, feed_forward, dropout):
        super().__init__()
        self.size = size
        self.self_attn = self_attn
        self.src_attn = src_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 3)

    def forward(self, x, memory, src_mask, tgt_mask):
        "Apply the three sublayers in order, each inside its residual wrapper."

        def self_attend(h):
            # Target positions attend to each other under tgt_mask.
            return self.self_attn(h, h, h, tgt_mask)

        def attend_to_source(h):
            # Queries come from the decoder; keys and values are the memory.
            return self.src_attn(h, memory, memory, src_mask)

        out = self.sublayer[0](x, self_attend)
        out = self.sublayer[1](out, attend_to_source)
        return self.sublayer[2](out, self.feed_forward)

In [None]:
def subsequent_mask(size):
    "Return a (1, size, size) bool mask that is True on and below the diagonal."
    # 1s strictly above the diagonal mark the positions to be hidden.
    hidden = torch.triu(torch.ones((1, size, size)), diagonal=1).type(torch.uint8)
    return hidden == 0

In [None]:
def example_mask():
    """Build an Altair heat map of ``subsequent_mask(20)``.

    One single-row DataFrame is created per (row, column) cell of the mask
    and the rows are concatenated into the chart's data.

    Returns:
        An interactive Altair chart with "Window" on the x axis, "Masking"
        on the y axis, and the mask value as the color.
    """
    size = 20
    # The mask is the same for every cell, so build it once instead of once
    # per (x, y) pair.
    mask = subsequent_mask(size)[0]
    LS_data = pd.concat(
        [
            pd.DataFrame(
                {
                    "Subsequent Mask": mask[x, y].flatten(),
                    "Window": y,
                    "Masking": x,
                }
            )
            for y in range(size)
            for x in range(size)
        ]
    )

    return (
        alt.Chart(LS_data)
        .mark_rect()
        .properties(height=250, width=250)
        .encode(
            alt.X("Window:O"),
            alt.Y("Masking:O"),
            alt.Color("Subsequent Mask:Q", scale=alt.Scale(scheme="viridis")),
        )
        .interactive()
    )


show_example(example_mask)

Attention Mechanism:

-The attention function described maps a query and a set of key-value pairs to an output.
-Each of these (query, keys, values, and output) are vectors.
-The output is computed as a weighted sum of the values, where the weights are determined by a compatibility function (often referred to as a scoring function) of the query with the corresponding key.

Scaled Dot-Product Attention:

-This is the specific attention mechanism used in Transformers.
-It involves queries, keys, and values, each of a certain dimension.
-The attention scores are computed by taking the dot product of the query with all keys and dividing each by $\sqrt{d_k}$, the square root of the key dimension.
-Then, softmax is applied to obtain the weights on the values: $\mathrm{Attention}(Q, K, V) = \mathrm{softmax}\left(\frac{QK^T}{\sqrt{d_k}}\right)V$.
-This mechanism allows the model to focus on different parts of the input sequence differently, based on the similarity between the query and keys.

Additive Attention vs. Dot-Product Attention:

-Two common types of attention are discussed: additive and dot-product.
-Additive attention uses a feed-forward network with a single hidden layer to compute compatibility.
-Dot-product attention, on the other hand, is faster and more space-efficient, as it utilizes matrix multiplication.
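-While both have similar theoretical complexity, dot-product attention is preferred in practice because it can be implemented with highly optimized matrix-multiplication code. For large key dimensions the dot products grow large in magnitude, which is why they are scaled by $1/\sqrt{d_k}$.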

Multi-Head Attention:

-Multi-head attention allows the model to jointly attend to information from different representation subspaces.
-With multiple attention heads, the model can capture different aspects of the input simultaneously.
-Each attention head operates on a transformed version of the input (query, key, value) and produces its own output.
-These outputs are then concatenated and linearly transformed to produce the final output of the multi-head attention layer.

Implementation:

-The MultiHeadedAttention class is provided as an implementation of multi-head attention in PyTorch.
-It takes in the number of heads (h) and the model dimension (d_model).
-The model dimension is divided equally among the heads.
-Linear projections are applied to the input query, key, and value to project them into the required dimensions for each head.
-Attention is then applied independently for each head, and the outputs are concatenated and linearly transformed to produce the final output.


In [None]:
def attention(query, key, value, mask=None, dropout=None):
    "Scaled dot-product attention; returns (weighted values, attention weights)."
    scale = math.sqrt(query.size(-1))
    key_t = key.transpose(-2, -1)
    scores = torch.matmul(query, key_t) / scale
    if mask is not None:
        # Positions where mask == 0 get a large negative score so that
        # softmax assigns them (near-)zero weight.
        scores = scores.masked_fill(mask == 0, -1e9)
    weights = scores.softmax(dim=-1)
    if dropout is not None:
        weights = dropout(weights)
    attended = torch.matmul(weights, value)
    return attended, weights

In [None]:
class MultiHeadedAttention(nn.Module):
    "Multi-head attention built from four d_model x d_model linear layers."

    def __init__(self, h, d_model, dropout=0.1):
        "Store the head count h and per-head size d_model // h."
        super().__init__()
        assert d_model % h == 0
        # Value heads use the same width as key heads (d_v == d_k).
        self.d_k = d_model // h
        self.h = h
        # Three input projections (query, key, value) plus the output one.
        self.linears = clones(nn.Linear(d_model, d_model), 4)
        self.attn = None  # attention weights from the latest forward call
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, query, key, value, mask=None):
        "Project, split into heads, attend, merge heads, and project again."
        if mask is not None:
            # Insert a head axis so one mask broadcasts over all h heads.
            mask = mask.unsqueeze(1)
        nbatches = query.size(0)

        def split_heads(projection, tensor):
            # (batch, seq, d_model) -> (batch, h, seq, d_k)
            projected = projection(tensor)
            return projected.view(nbatches, -1, self.h, self.d_k).transpose(1, 2)

        # 1) Per-input linear projection followed by the head split.
        query = split_heads(self.linears[0], query)
        key = split_heads(self.linears[1], key)
        value = split_heads(self.linears[2], value)

        # 2) Attention over all heads at once; keep the weights on self.attn.
        x, self.attn = attention(
            query, key, value, mask=mask, dropout=self.dropout
        )

        # 3) Merge heads back to (batch, seq, h * d_k).
        merged = x.transpose(1, 2).contiguous()
        merged = merged.view(nbatches, -1, self.h * self.d_k)
        del query, key, value
        return self.linears[-1](merged)

The applications of attention in the Transformer model are crucial for enabling it to effectively capture dependencies between input and output sequences. Here's a breakdown of how attention is utilized in different parts of the model:

-Encoder-Decoder Attention:

This attention mechanism allows the decoder to focus on relevant parts of the input sequence when generating each token of the output sequence.
Queries are derived from the previous decoder layer, while keys and values are obtained from the output of the encoder.
By enabling every position in the decoder to attend over all positions in the input sequence, this mechanism facilitates the alignment of input and output sequences, which is essential for tasks like machine translation.
This mimics traditional encoder-decoder attention mechanisms found in sequence-to-sequence models.

-Self-Attention in the Encoder:

Self-attention layers in the encoder enable each position in the encoder to attend to all positions in the previous layer of the encoder.
This allows the model to capture dependencies between different parts of the input sequence, irrespective of their relative positions.
By allowing every position in the encoder to attend to every other position, the model can learn representations that effectively capture the relationships between different tokens in the input sequence.

-Self-Attention in the Decoder:

Similar to the encoder, self-attention layers in the decoder allow each position in the decoder to attend to all positions in the decoder up to and including that position.
However, to maintain the auto-regressive property and prevent information leakage from future tokens, leftward information flow in the decoder must be restricted.
This is achieved by masking out (setting to $-\infty$) all values in the input of the softmax corresponding to illegal connections, ensuring that each position attends only to itself and the positions preceding it in the sequence.

Feed Forward Network

-The position-wise feed-forward network (FFN) consists of two linear transformations with a ReLU activation function applied in between.
-It is applied independently to each position in the input sequence.
-The purpose of this network is to learn complex, non-linear transformations of the input representations at each position.

Equation:

Mathematically, the FFN can be represented as:
$$\mathrm{FFN}(x) = \mathrm{ReLU}(xW_1 + b_1)W_2 + b_2$$
where:
$x$ is the input vector at each position.
$W_1$ and $W_2$ are weight matrices for the linear transformations.
$b_1$ and $b_2$ are bias vectors.
ReLU denotes the rectified linear unit activation function.

Architecture:

The input and output dimensionality of the FFN ($d_{\text{model}}$) is typically set to 512.
The inner-layer dimensionality ($d_{ff}$) is set to 2048, making the FFN wider than the input and output layers.
Dropout is applied with a specified dropout rate to prevent overfitting during training.

Implementation:

-In code, the FFN is implemented as a PyTorch module (PositionwiseFeedForward) with two linear layers (self.w_1 and self.w_2) and a dropout layer.
-The ReLU activation function is applied after the first linear transformation, followed by dropout.
-The output of the second linear transformation is the final output of the FFN.
-In summary, the position-wise feed-forward network in the Transformer model acts as a non-linear transformation layer, enhancing the model's ability to capture complex relationships within the input sequence. It is applied independently to each position, providing flexibility and expressive power to the model's representations.

In [None]:
class PositionwiseFeedForward(nn.Module):
    "Two-layer feed-forward network: linear, ReLU, dropout, linear."

    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        "Return w_2(dropout(relu(w_1(x))))."
        hidden = self.w_1(x).relu()
        hidden = self.dropout(hidden)
        return self.w_2(hidden)

Embeddings:

-Learned embeddings are used to convert input tokens and output tokens into fixed-size vectors of dimension $d_{\text{model}}$.
-The Embeddings class implements this functionality in the Transformer model.
-It utilizes an embedding layer (self.lut) initialized with random weights and learns the embeddings during training.
-The size of the embedding layer is determined by the vocabulary size (vocab) and the model dimension (d_model).
-During the forward pass, the input tokens are passed through the embedding layer, resulting in vectors of dimension $d_{\text{model}}$.
-Additionally, the embeddings are scaled by a factor of $\sqrt{d_{\text{model}}}$.
-This scaling helps stabilize the gradients during training.

Softmax:

-Softmax function is used to convert the decoder output into predicted probabilities over the vocabulary for the next token.
-This is a standard practice in sequence transduction models.
-The output of the model is passed through a linear transformation followed by the softmax function to obtain token probabilities.

Parameter Sharing:

-In the paper, the weights of the two embedding layers and the pre-softmax linear transformation are shared.
-This means that the same weight matrix is used for both embedding layers and the linear transformation layer.
-Parameter sharing helps reduce the number of parameters in the model and encourages better generalization.
-Note that make_model in this notebook constructs separate Embeddings and Generator modules and does not tie their weights.

Implementation:

-The Embeddings class inherits from nn.Module in PyTorch and implements the forward method.
-During initialization, it creates an embedding layer (self.lut) with dimensions specified by the vocabulary size and the model dimension.
-During the forward pass, input tokens are passed through the embedding layer and scaled by $\sqrt{d_{\text{model}}}$.

In summary, embeddings are essential in converting discrete tokens into continuous vectors that the model can process, while softmax converts model outputs into token probabilities. Parameter sharing between embedding layers and the pre-softmax linear transformation reduces the model's parameter count and improves training efficiency.

In [None]:
class Embeddings(nn.Module):
    "Token embedding lookup whose output is scaled by sqrt(d_model)."

    def __init__(self, d_model, vocab):
        super().__init__()
        self.lut = nn.Embedding(vocab, d_model)  # lookup table
        self.d_model = d_model

    def forward(self, x):
        "Look up the indices in x and multiply the vectors by sqrt(d_model)."
        vectors = self.lut(x)
        return vectors * math.sqrt(self.d_model)

Positional Encoding

Purpose:

-Since Transformers lack recurrence and convolution, they need a way to understand the order of tokens in a sequence.
-Positional encodings are added to the input embeddings to provide information about the relative or absolute position of tokens in a sequence.

Method:

-The positional encodings are vectors of the same dimension ($d_{\text{model}}$) as the embeddings, allowing them to be added together.
-Sine and cosine functions of different frequencies are used to generate the positional encodings.
-Each dimension of the positional encoding corresponds to a sinusoid with a different wavelength, forming a geometric progression from $2\pi$ to $10000 \cdot 2\pi$.
-The positional encoding at position $pos$ and dimension index $i$ is calculated as $PE_{(pos, 2i)} = \sin\left(pos / 10000^{2i/d_{\text{model}}}\right)$ and $PE_{(pos, 2i+1)} = \cos\left(pos / 10000^{2i/d_{\text{model}}}\right)$.

Implementation:

-The PositionalEncoding class implements the positional encoding function.
-It takes parameters for the model dimension, dropout rate, and maximum sequence length.
-Positional encodings are pre-computed once and stored as a buffer.
-During the forward pass, positional encodings are added to the input embeddings and passed through a dropout layer.

Visualization:

-An example visualization is provided to demonstrate how the positional encoding varies across different positions and dimensions.
-It shows the sinusoidal waveforms generated for a specific range of dimensions, plotted against the positions in the sequence.

Comparison with Learned Positional Embeddings:

-The paper mentions that learned positional embeddings were also experimented with and produced similar results.
-However, sinusoidal positional encodings were chosen because they may allow the model to generalize better to longer sequence lengths.

In [None]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to the input, then apply dropout.

    Args:
        d_model: size of the last dimension of the inputs. Odd values are
            supported: the final sine column then has no cosine partner.
        dropout: dropout probability applied after the encodings are added.
        max_len: number of positions for which encodings are precomputed.
    """

    def __init__(self, d_model, dropout, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model)
        )
        angles = position * div_term
        # Even columns take the sine of each angle.
        pe[:, 0::2] = torch.sin(angles)
        # Odd columns take the cosine. When d_model is odd there is one fewer
        # odd column than even column, so trim the angles to fit; for even
        # d_model the slice keeps every column.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        pe = pe.unsqueeze(0)
        # Registered as a buffer: stored on the module but not a parameter.
        self.register_buffer("pe", pe)

    def forward(self, x):
        "Add the encodings for the first x.size(1) positions and apply dropout."
        x = x + self.pe[:, : x.size(1)].requires_grad_(False)
        return self.dropout(x)

def example_positional():
    "Chart the positional encoding of dimensions 4-7 over 100 positions."
    n_positions = 100
    encoder = PositionalEncoding(20, 0)
    # Encoding an all-zero input leaves just the positional signal.
    y = encoder.forward(torch.zeros(1, n_positions, 20))

    frames = []
    for dim in [4, 5, 6, 7]:
        frames.append(
            pd.DataFrame(
                {
                    "embedding": y[0, :, dim],
                    "dimension": dim,
                    "position": list(range(n_positions)),
                }
            )
        )
    data = pd.concat(frames)

    chart = alt.Chart(data).mark_line().properties(width=800)
    chart = chart.encode(x="position", y="embedding", color="dimension:N")
    return chart.interactive()


show_example(example_positional)

In [None]:
def make_model(
    src_vocab, tgt_vocab, N=6, d_model=512, d_ff=2048, h=8, dropout=0.1
):
    """Helper: Construct a model from hyperparameters.

    Args:
        src_vocab: source vocabulary size passed to the source ``Embeddings``.
        tgt_vocab: target vocabulary size passed to the target ``Embeddings``
            and to the ``Generator``.
        N: number of layers in both the encoder and the decoder stacks.
        d_model: model width used by every sub-module.
        d_ff: inner width of the position-wise feed-forward network.
        h: number of attention heads.
        dropout: dropout probability used by attention, feed-forward,
            positional encoding and the residual sublayer connections.

    Returns:
        An ``EncoderDecoder`` whose parameters with more than one dimension
        have been re-initialized with Xavier/Glorot uniform.
    """
    c = copy.deepcopy
    # Pass ``dropout`` through so attention follows the requested rate
    # instead of always using MultiHeadedAttention's own default of 0.1.
    attn = MultiHeadedAttention(h, d_model, dropout)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)
    # Every use gets its own deep copy so no weights are shared between
    # layers or between the encoder and the decoder.
    model = EncoderDecoder(
        Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
        Decoder(DecoderLayer(d_model, c(attn), c(attn), c(ff), dropout), N),
        nn.Sequential(Embeddings(d_model, src_vocab), c(position)),
        nn.Sequential(Embeddings(d_model, tgt_vocab), c(position)),
        Generator(d_model, tgt_vocab),
    )

    # This was important from their code.
    # Initialize parameters with Glorot / fan_avg.
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    return model

Batches and Masking:

-The Batch class is used to hold a batch of data during training. It contains the source (src) and target (tgt) sentences.
-Masking is applied to hide padding and future words in the target sequence.
-The make_std_mask function creates a mask to hide padding and future words in the target sequence.

In [None]:
class Batch:
    """Container for one batch of source/target tensors and their masks."""

    def __init__(self, src, tgt=None, pad=2):  # 2 = <blank>
        self.src = src
        # True wherever the source token is not padding.
        self.src_mask = (src != pad).unsqueeze(-2)
        if tgt is None:
            return
        # Drop the last column for the decoder input and the first column
        # for the expected output, so the two are shifted by one position.
        self.tgt = tgt[:, :-1]
        self.tgt_y = tgt[:, 1:]
        self.tgt_mask = self.make_std_mask(self.tgt, pad)
        # Count of non-padding entries in the expected output.
        self.ntokens = (self.tgt_y != pad).data.sum()

    @staticmethod
    def make_std_mask(tgt, pad):
        "Combine the padding mask of tgt with the subsequent-position mask."
        not_padding = (tgt != pad).unsqueeze(-2)
        no_future = subsequent_mask(tgt.size(-1)).type_as(not_padding.data)
        return not_padding & no_future

In [None]:
class TrainState:
    """Track number of steps, examples, and tokens processed

    All counters are declared at class level with a default of 0.
    """

    step: int = 0  # Steps in the current epoch
    accum_step: int = 0  # Number of gradient accumulation steps
    samples: int = 0  # total # of examples used
    tokens: int = 0  # total # of tokens processed

def run_epoch(
    data_iter,
    model,
    loss_compute,
    optimizer,
    scheduler,
    mode="train",
    accum_iter=1,
    train_state=None,
):
    """Run one pass over ``data_iter``, optionally updating the model.

    Args:
        data_iter: iterable of batches exposing ``src``, ``tgt``,
            ``src_mask``, ``tgt_mask``, ``tgt_y`` and ``ntokens``.
        model: object whose ``forward(src, tgt, src_mask, tgt_mask)`` is
            called on every batch.
        loss_compute: callable ``(out, tgt_y, ntokens) -> (loss, loss_node)``.
            ``loss`` is summed for reporting; ``loss_node.backward()`` is
            called in the training modes.
        optimizer: stepped and zeroed on accumulation boundaries in the
            training modes; its first param group's ``"lr"`` is logged.
        scheduler: stepped once per batch in the training modes.
        mode: ``"train"`` or ``"train+log"`` enable backward passes, updates
            and progress logging; any other value only accumulates the loss.
        accum_iter: the optimizer steps on batches whose index is a multiple
            of this value.
        train_state: counters to update in place. When ``None`` a fresh
            ``TrainState`` is created for this call, so counters no longer
            leak between calls through a shared default instance.

    Returns:
        ``(total_loss / total_tokens, train_state)``.
    """
    if train_state is None:
        train_state = TrainState()
    training = mode == "train" or mode == "train+log"

    start = time.time()
    total_tokens = 0
    total_loss = 0
    tokens = 0
    n_accum = 0
    for i, batch in enumerate(data_iter):
        out = model.forward(
            batch.src, batch.tgt, batch.src_mask, batch.tgt_mask
        )
        loss, loss_node = loss_compute(out, batch.tgt_y, batch.ntokens)
        # loss_node = loss_node / accum_iter
        if training:
            loss_node.backward()
            train_state.step += 1
            train_state.samples += batch.src.shape[0]
            train_state.tokens += batch.ntokens
            # NOTE(review): this steps on batch 0 and then every accum_iter
            # batches, so the first update sees a single batch - confirm
            # this cadence is intended before using accum_iter > 1.
            if i % accum_iter == 0:
                optimizer.step()
                optimizer.zero_grad(set_to_none=True)
                n_accum += 1
                train_state.accum_step += 1
            scheduler.step()

        total_loss += loss
        total_tokens += batch.ntokens
        tokens += batch.ntokens
        if i % 40 == 1 and training:
            lr = optimizer.param_groups[0]["lr"]
            elapsed = time.time() - start
            print(
                (
                    "Epoch Step: %6d | Accumulation Step: %3d | Loss: %6.2f "
                    + "| Tokens / Sec: %7.1f | Learning Rate: %6.1e"
                )
                % (i, n_accum, loss / batch.ntokens, tokens / elapsed, lr)
            )
            # Restart the throughput window after each log line.
            start = time.time()
            tokens = 0
        del loss
        del loss_node
    return total_loss / total_tokens, train_state

Training Data and Batching:

The model is trained on the WMT 2014 English-German dataset, consisting of about 4.5 million sentence pairs.
Sentences are encoded using byte-pair encoding (BPE), with a shared vocabulary of about 37,000 tokens.
English-French training uses the larger WMT 2014 English-French dataset, consisting of 36 million sentences, with a vocabulary of 32,000 word-piece tokens.
Batches are created by grouping together sentence pairs with approximately 25,000 source tokens and 25,000 target tokens.

Hardware and Schedule:

Training is performed on a single machine with 8 NVIDIA P100 GPUs.
For the base models, each training step takes about 0.4 seconds, and training is conducted for a total of 100,000 steps or 12 hours.
Big models have a step time of 1.0 second and are trained for 300,000 steps or 3.5 days.

Optimization Hyperparameters:

-The Adam optimizer is utilized with specific hyperparameters:
-Beta 1 (β1) is set to 0.9.
-Beta 2 (β2) is set to 0.98.
-Epsilon (ϵ) is set to 1e-9.
-The learning rate (lrate) is varied during training according to a formula that involves the model size (d_model), the current training step (step_num), and the number of warm-up steps (warmup_steps): $lrate = d_{\text{model}}^{-0.5} \cdot \min\left(step\_num^{-0.5},\ step\_num \cdot warmup\_steps^{-1.5}\right)$.
-The learning rate is initially increased linearly for the first warmup_steps training steps and then decreased proportionally to the inverse square root of the step number.
-The warm-up steps are set to 4000.

Example of Learning Rate Schedule:

-An example function (rate) is provided to calculate the learning rate based on the specified parameters.
-The example_learning_schedule function demonstrates how the learning rate varies over 20,000 training steps for different combinations of model sizes and warm-up steps.
-The learning rate curves for different model sizes and warm-up steps are plotted to visualize their behavior during training.

In [None]:
def rate(step, model_size, factor, warmup):
    """
    Learning-rate multiplier for a given step. Step 0 is treated as step 1
    so that LambdaLR's initial call does not raise zero to a negative power.
    """
    effective_step = 1 if step == 0 else step
    decay = effective_step ** (-0.5)
    ramp = effective_step * warmup ** (-1.5)
    return factor * (model_size ** (-0.5) * min(decay, ramp))

In [None]:
def example_learning_schedule():
    """Chart the ``rate`` schedule for three (model_size, factor, warmup) settings.

    For each setting a throwaway Adam optimizer and ``LambdaLR`` scheduler
    are stepped 20000 times and the learning rate in effect at every step
    is recorded.

    Returns:
        An interactive Altair line chart of learning rate against step, one
        line per ``model_size:warmup`` pair.
    """
    opts = [
        [512, 1, 4000],  # example 1
        [512, 1, 8000],  # example 2
        [256, 1, 4000],  # example 3
    ]
    n_steps = 20000

    dummy_model = torch.nn.Linear(1, 1)
    learning_rates = []

    # we have 3 examples in opts list.
    for example in opts:
        optimizer = torch.optim.Adam(
            dummy_model.parameters(), lr=1, betas=(0.9, 0.98), eps=1e-9
        )
        # Bind ``example`` as a default argument so the lambda keeps this
        # iteration's values instead of looking the name up when called.
        lr_scheduler = LambdaLR(
            optimizer=optimizer,
            lr_lambda=lambda step, example=example: rate(step, *example),
        )
        tmp = []
        # take 20K dummy training steps, save the learning rate at each step
        for step in range(n_steps):
            tmp.append(optimizer.param_groups[0]["lr"])
            optimizer.step()
            lr_scheduler.step()
        learning_rates.append(tmp)

    learning_rates = torch.tensor(learning_rates)

    # Enable altair to handle more than 5000 rows
    alt.data_transformers.disable_max_rows()

    # Derive the series labels from ``opts`` so they cannot drift apart.
    labels = ["%d:%d" % (model_size, warmup) for model_size, _, warmup in opts]

    opts_data = pd.concat(
        [
            pd.DataFrame(
                {
                    "Learning Rate": learning_rates[warmup_idx, :],
                    "model_size:warmup": labels[warmup_idx],
                    "step": range(n_steps),
                }
            )
            for warmup_idx in range(len(opts))
        ]
    )

    return (
        alt.Chart(opts_data)
        .mark_line()
        .properties(width=600)
        .encode(x="step", y="Learning Rate", color="model_size:warmup:N")
        .interactive()
    )


# Routed through show_example, like the other examples, so RUN_EXAMPLES = False
# also skips this 60K-step demo.
show_example(example_learning_schedule)

Regularization - Label Smoothing:

-Label smoothing is applied during training with a smoothing value (ε_ls) set to 0.1.
-Label smoothing helps improve accuracy and BLEU score but may hurt perplexity.
-The LabelSmoothing class implements label smoothing using the Kullback-Leibler divergence loss.
-Instead of using a one-hot target distribution, label smoothing creates a distribution with confidence in the correct word and distributes the rest of the smoothing mass throughout the vocabulary.
-An example of label smoothing is provided to visualize how the mass is distributed to words based on confidence.
-Label smoothing penalizes the model if it becomes overly confident about a given choice.

Visualization of Penalization:

-A function is defined to calculate the loss and visualize how it changes based on different confidence levels.
-The penalization visualization demonstrates how the loss increases as the model becomes more confident about its predictions.

In [None]:
class LabelSmoothing(nn.Module):
    "KL-divergence loss against a label-smoothed target distribution."

    def __init__(self, size, padding_idx, smoothing=0.0):
        super().__init__()
        self.criterion = nn.KLDivLoss(reduction="sum")
        self.padding_idx = padding_idx
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.size = size
        self.true_dist = None  # target distribution of the latest forward call

    def forward(self, x, target):
        "Build the smoothed targets for x and return the summed KL divergence."
        assert x.size(1) == self.size
        # Start with the smoothing mass spread over size - 2 classes.
        smoothed = torch.full_like(x.data, self.smoothing / (self.size - 2))
        # Put the confidence mass on each row's target class.
        target_column = target.data.unsqueeze(1)
        smoothed.scatter_(1, target_column, self.confidence)
        # The padding class never receives probability mass.
        smoothed[:, self.padding_idx] = 0
        # Rows whose target is padding are zeroed entirely.
        padded_rows = torch.nonzero(target.data == self.padding_idx)
        if padded_rows.dim() > 0:
            smoothed.index_fill_(0, padded_rows.squeeze(), 0.0)
        self.true_dist = smoothed
        return self.criterion(x, smoothed.clone().detach())

In [None]:
def example_label_smoothing():
    "Chart the smoothed target distribution for a five-row toy batch."
    n = 5
    crit = LabelSmoothing(n, 0, 0.4)
    # Every row holds the same predicted distribution.
    predict = torch.FloatTensor([[0, 0.2, 0.7, 0.1, 0]] * n)
    crit(x=predict.log(), target=torch.LongTensor([2, 1, 0, 3, 3]))

    frames = []
    for y in range(n):
        for x in range(n):
            frames.append(
                pd.DataFrame(
                    {
                        "target distribution": crit.true_dist[x, y].flatten(),
                        "columns": y,
                        "rows": x,
                    }
                )
            )
    LS_data = pd.concat(frames)

    color = alt.Color(
        "target distribution:Q", scale=alt.Scale(scheme="viridis")
    )
    chart = alt.Chart(LS_data).mark_rect(color="Blue", opacity=1)
    chart = chart.properties(height=200, width=200)
    chart = chart.encode(
        alt.X("columns:O", title=None),
        alt.Y("rows:O", title=None),
        color,
    )
    return chart.interactive()


show_example(example_label_smoothing)

In [None]:
def loss(x, crit):
    "Return crit's value for a prediction weighting class 1 by x / (x + 3)."
    denom = x + 3
    # Class 0 gets zero mass; classes 2-4 share the remainder equally.
    probs = [0, x / denom] + [1 / denom] * 3
    predict = torch.FloatTensor([probs])
    target = torch.LongTensor([1])
    return crit(predict.log(), target).data


def penalization_visualization():
    "Chart the label-smoothing loss as the prediction's weight on the target grows."
    crit = LabelSmoothing(5, 0, 0.1)
    losses = [loss(x, crit) for x in range(1, 100)]
    steps = list(range(99))
    loss_data = pd.DataFrame({"Loss": losses, "Steps": steps}).astype("float")

    chart = alt.Chart(loss_data).mark_line().properties(width=350)
    chart = chart.encode(
        x="Steps",
        y="Loss",
    )
    return chart.interactive()


show_example(penalization_visualization)