In [1]:
# Author: Roi Yehoshua
# Date: January 2024
# MIT License

# Based on the PyTorch implementation from https://nlp.seas.harvard.edu/annotated-transformer/

In [2]:
!pip install spacy portalocker --quiet
!pip install torch==2.3.0 torchvision torchaudio --quiet
!pip install torchtext==0.18.0 --quiet
!pip install torchdata==0.9.0 --quiet
!pip install torchtune==0.5.0 --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m779.2/779.2 MB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.6/410.6 MB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.1/14.1 MB[0m [31m98.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m88.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m52.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m731.7/731.7 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m121.6/121.6 MB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.5/56.5 MB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
import sys
# NOTE(review): hard-coded, Python-3.11-specific site-packages directory. Presumably added so the
# packages installed in the previous cell are importable on the original runtime — confirm before
# running this notebook in a different environment.
sys.path.append('/usr/local/lib/python3.11/dist-packages')

In [4]:


import torch
import torch.nn as nn
import torch.optim as optim
import math
import spacy
import os

from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.datasets import Multi30k
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm



In [5]:
torch.manual_seed(42)  # For reproducibility
# Pick the GPU when one is available, otherwise fall back to the CPU.
# `device` is a notebook-level global that later cells read.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

### Multi-Head Attention

$$
\begin{aligned}
    \text{MultiHead}(Q, K, V) &= \text{Concat}(\text{head}_1, \ldots, \text{head}_h)W^O \\
    \text{head}_i &= \text{Attention}(QW_i^Q, KW_i^K, VW_i^V) \\
    \text{Attention}(Q, K, V) &= \text{softmax}\left(\frac{QK^T}{\sqrt{d_k}}\right)V
\end{aligned}
$$

In [6]:
class MultiHeadAttention(nn.Module):
    """Multi-head attention: project Q/K/V, attend independently per head, merge, project back."""

    def __init__(self, d_model, num_heads):
        super().__init__()

        # The embedding dimension is sliced evenly between the heads, so it has to divide cleanly.
        assert d_model % num_heads == 0, 'd_model must be divisible by num_heads'

        self.d_model = d_model           # Full width of the model
        self.num_heads = num_heads       # How many heads attend in parallel
        self.d_k = d_model // num_heads  # Width of a single head (d_v is taken equal to d_k)

        # Learned input projections for queries, keys and values.
        # The creation order (q, k, v, o) is kept so seeded initialisation stays reproducible.
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)

        # Output projection applied to the concatenated heads
        self.W_o = nn.Linear(d_model, d_model)

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        """Return softmax(Q K^T / sqrt(d_k)) V, with positions where `mask == 0` suppressed."""
        scale = math.sqrt(self.d_k)
        logits = torch.matmul(Q, K.transpose(-2, -1)) / scale

        # Masked positions receive a large negative score so that softmax drives them to ~0.
        if mask is not None:
            logits = logits.masked_fill(mask == 0, -1e9)

        weights = logits.softmax(dim=-1)
        return torch.matmul(weights, V)

    def split_heads(self, x):
        """Reshape [batch_size, seq_length, d_model] -> [batch_size, num_heads, seq_length, d_k]."""
        n_batch, n_tokens, _ = x.size()
        per_head = x.view(n_batch, n_tokens, self.num_heads, self.d_k)
        return per_head.transpose(1, 2)

    def combine_heads(self, x):
        """Inverse of split_heads: [batch_size, num_heads, seq_length, d_k] -> [batch_size, seq_length, d_model]."""
        n_batch, _, n_tokens, _ = x.size()
        # transpose() yields a non-contiguous tensor, so make it contiguous before view().
        merged = x.transpose(1, 2).contiguous()
        return merged.view(n_batch, n_tokens, self.d_model)

    def forward(self, Q, K, V, mask=None):
        """Apply multi-head attention to the given queries, keys and values."""
        # Project each input and split the result into heads: [batch_size, heads, seq_len, d_k]
        queries = self.split_heads(self.W_q(Q))
        keys = self.split_heads(self.W_k(K))
        values = self.split_heads(self.W_v(V))

        # Attend within every head, then merge the heads and apply the output projection.
        context = self.scaled_dot_product_attention(queries, keys, values, mask)
        return self.W_o(self.combine_heads(context))

### Feed-Forward NN

$$
    \text{FFN}(x) = \max(0, xW_1 + b_1)W_2 + b_2
$$

In [7]:
class PositionwiseFeedForward(nn.Module):
    """Two-layer feed-forward network (FFN) applied independently at every sequence position."""

    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)  # expand: d_model -> d_ff
        self.linear2 = nn.Linear(d_ff, d_model)  # contract: d_ff -> d_model
        self.dropout = nn.Dropout(dropout)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Compute linear2(dropout(relu(linear1(x))))."""
        hidden = self.dropout(self.relu(self.linear1(x)))
        return self.linear2(hidden)

### Positional Encoding

$$
\begin{aligned}
    \text{PE}(pos, 2i) &= \sin(pos/10000^{2i/d_{\text{model}}}) \\
    \text{PE}(pos, 2i + 1) &= \cos(pos/10000^{2i/d_{\text{model}}})
\end{aligned}
$$

In [8]:
class PositionalEncoding(nn.Module):
    """
    Implements the positional encoding module using sinusoidal functions of different frequencies
    for each dimension of the encoding.

    Parameters:
    - d_model (int): Width of the encoding (even or odd).
    - max_seq_length (int): Number of positions for which encodings are precomputed.
    """
    def __init__(self, d_model, max_seq_length):
        super().__init__()

        # Create a positional encoding (PE) matrix with dimensions [max_seq_length, d_model].
        # This matrix will contain the positional encodings for all possible positions up to max_seq_length.
        pe = torch.zeros(max_seq_length, d_model)

        # Generate a tensor of positions (0 to max_seq_length - 1) and reshape it to [max_seq_length, 1].
        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)

        # One frequency per even dimension index: 10000^(-2i / d_model), computed in log space.
        # The resulting wavelengths form a geometric progression from 2π to 10000 * 2π.
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))

        # Angle for every (position, frequency) pair: [max_seq_length, ceil(d_model / 2)].
        angles = position * div_term

        # Even columns hold the sine of the angle.
        pe[:, 0::2] = torch.sin(angles)

        # Odd columns hold the cosine. When d_model is odd there is one odd column fewer than
        # even columns, so the last frequency is dropped to keep the shapes aligned.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Register 'pe' as a buffer within the module. Unlike parameters, buffers are not updated during training.
        # The unsqueeze(0) adds a batch dimension for easier broadcasting with input tensors.
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Add the positional encoding to `x` of shape [batch_size, seq_length, d_model]."""
        # 'pe' is sliced to the first seq_length positions and broadcast over the batch dimension.
        x = x + self.pe[:, :x.size(1)]
        return x

### Encoder Layer

In [9]:
class EncoderLayer(nn.Module):
    """One encoder layer: a multi-head self-attention sublayer followed by a feed-forward sublayer.
       Each sublayer's output goes through dropout, is added to its input (residual connection),
       and the sum is layer-normalized.
    """
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.layer_norm1 = nn.LayerNorm(d_model)
        self.layer_norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        """Run `x` through self-attention (restricted by `mask`) and the feed-forward network."""
        # Sublayer 1: every position attends over the sequence itself (queries = keys = values = x).
        attended = self.dropout(self.self_attn(x, x, x, mask))
        x = self.layer_norm1(x + attended)

        # Sublayer 2: position-wise feed-forward network.
        transformed = self.dropout(self.feed_forward(x))
        return self.layer_norm2(x + transformed)

### Decoder Layer

In [10]:
class DecoderLayer(nn.Module):
    """One decoder layer: masked multi-head self-attention, cross-attention over the encoder output,
       and a feed-forward sublayer. Each sublayer's output goes through dropout, is added to its
       input (residual connection), and the sum is layer-normalized.
    """
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.layer_norm1 = nn.LayerNorm(d_model)
        self.layer_norm2 = nn.LayerNorm(d_model)
        self.layer_norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_output, src_mask, tgt_mask):
        """Run the target representation `x` through the three decoder sublayers."""
        # Sublayer 1: self-attention over the target sequence, restricted by tgt_mask.
        own_context = self.dropout(self.self_attn(x, x, x, tgt_mask))
        x = self.layer_norm1(x + own_context)

        # Sublayer 2: queries come from the decoder, keys/values from the encoder output.
        source_context = self.dropout(self.cross_attn(x, enc_output, enc_output, src_mask))
        x = self.layer_norm2(x + source_context)

        # Sublayer 3: position-wise feed-forward network.
        transformed = self.dropout(self.feed_forward(x))
        return self.layer_norm3(x + transformed)

### The Full Model

In [11]:
class Transformer(nn.Module):
    """
    Implements the Transformer model for sequence-to-sequence tasks such as machine translation.
    The Transformer model, as described in "Attention is All You Need" by Vaswani et al., consists of an encoder and
    decoder architecture that uses self-attention mechanisms to process input sequences and generate output sequences.

    Parameters:
    - src_vocab_size (int): Size of the source vocabulary.
    - tgt_vocab_size (int): Size of the target vocabulary.
    - d_model (int): Dimension of the model embeddings and hidden states.
    - N (int): Number of layers in both the encoder and decoder stacks.
    - n_heads (int): Number of attention heads in each multi-head attention mechanism.
    - d_ff (int): Dimension of the feed-forward network within each layer.
    - max_seq_length (int): Maximum length of input sequences, used for positional encoding.
    - dropout (float): Dropout rate applied to embeddings and sub-layers.
    - pad_idx (int): Index of the padding token in the source and target vocabularies.

    Attributes:
    - src_embedding (torch.nn.Embedding): Embedding layer for source sequences.
    - tgt_embedding (torch.nn.Embedding): Embedding layer for target sequences.
    - positional_encoding (PositionalEncoding): Adds positional information to embeddings.
    - encoder (torch.nn.ModuleList): Stack of N encoder layers.
    - decoder (torch.nn.ModuleList): Stack of N decoder layers.
    - out (torch.nn.Linear): Linear layer that projects decoder output to target vocabulary size.
    - dropout (torch.nn.Dropout): Dropout layer applied after embedding and positional encoding.
    - pad_idx (int): Padding index used when building the attention masks.

    Methods:
    - init_weights: Initializes model parameters using Glorot uniform initialization.
    - create_source_mask: Creates a mask for padding tokens in the source sequence to ignore them in attention computations.
    - create_target_mask: Creates combined padding and future token masks for the target sequence.
    - encode: Processes the source sequence through the encoder stack and generates memory states.
    - decode: Processes the target sequence through the decoder stack using memory states from the encoder and applicable masks.
    - forward: Defines the forward pass of the model using the encode and decode methods.
    """
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, N, n_heads, d_ff, max_seq_length, dropout, pad_idx):
        super().__init__()

        # Embedding layers for source and target
        self.src_embedding = nn.Embedding(src_vocab_size, d_model)
        self.tgt_embedding = nn.Embedding(tgt_vocab_size, d_model)

        # Positional encoding (shared by the encoder and the decoder)
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length)

        # Encoder and Decoder stacks.
        # The head count comes from the `n_heads` argument rather than from a notebook-level global,
        # so the class works no matter what the caller names its variables.
        self.encoder = nn.ModuleList([EncoderLayer(d_model, n_heads, d_ff, dropout) for _ in range(N)])
        self.decoder = nn.ModuleList([DecoderLayer(d_model, n_heads, d_ff, dropout) for _ in range(N)])

        # Output linear layer
        self.out = nn.Linear(d_model, tgt_vocab_size)

        self.dropout = nn.Dropout(dropout)

        # Initialization
        self.init_weights()
        self.pad_idx = pad_idx

    def init_weights(self):
        """Initialize parameters with Glorot / fan_avg"""
        # Only tensors with more than one dimension (weight matrices, embeddings) are re-initialized;
        # one-dimensional parameters such as biases keep their default initialization.
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

    def create_source_mask(self, src):
        """Create a mask for padding tokens in the source.

        Returns a boolean tensor of shape [batch_size, 1, 1, src_len] that is True for non-padding tokens.
        """
        # unsqueeze(1) adds a dimension for the heads of the multi-head attention
        # unsqueeze(2) adds a dimension that broadcasts over the query positions,
        # so padded source tokens are masked out for all heads and all queries.
        src_mask = (src != self.pad_idx).unsqueeze(1).unsqueeze(2)  # [batch_size, 1, 1, src_len]
        return src_mask

    def create_target_mask(self, tgt):
        """Create masks for both padding tokens and future tokens.

        Returns a boolean tensor of shape [batch_size, 1, tgt_len, tgt_len].
        """
        # Target padding mask: [batch_size, 1, tgt_len, 1].
        # unsqueeze(1) adds a dimension for the heads; unsqueeze(3) broadcasts along the last axis,
        # so the whole row belonging to a padded target position is masked.
        tgt_pad_mask = (tgt != self.pad_idx).unsqueeze(1).unsqueeze(3)

        # Target no-peek mask: lower-triangular, so position i may only attend to positions <= i.
        # It is created on the same device as `tgt` instead of relying on a global `device`.
        tgt_len = tgt.size(1)
        tgt_nopeek_mask = torch.tril(torch.ones(tgt_len, tgt_len, device=tgt.device)).bool()

        # Combine masks
        tgt_mask = tgt_pad_mask & tgt_nopeek_mask  # [batch_size, 1, tgt_len, tgt_len]
        return tgt_mask

    def encode(self, src):
        """Encodes the source sequence using the Transformer encoder stack.

        Returns the encoder output together with the source padding mask, so that the same mask
        can be passed on to `decode`.
        """
        src_mask = self.create_source_mask(src)
        src = self.dropout(self.positional_encoding(self.src_embedding(src)))

        # Pass through each layer in the encoder
        for layer in self.encoder:
            src = layer(src, src_mask)
        return src, src_mask

    def decode(self, tgt, memory, src_mask):
        """Decodes the target sequence using the Transformer decoder stack, given the memory from the encoder.

        Returns the output of the final linear layer, whose last dimension is the target vocabulary size.
        """
        tgt_mask = self.create_target_mask(tgt)
        tgt = self.dropout(self.positional_encoding(self.tgt_embedding(tgt)))

        # Pass through each layer in the decoder
        for layer in self.decoder:
            tgt = layer(tgt, memory, src_mask, tgt_mask)

        # Output layer
        output = self.out(tgt)
        return output

    def forward(self, src, tgt):
        """Encode `src`, then decode `tgt` against the encoder memory and return the decoder output."""
        # 1. Encode the source sequence
        memory, src_mask = self.encode(src)

        # 2. Decode the target sequence using the encoder's memory
        output = self.decode(tgt, memory, src_mask)

        return output

In [12]:
# Define the hyperparameters of the model
# (small synthetic configuration for the random-data sanity check below; the Multi30k section
# later rebinds these same notebook-level names with the real vocabulary sizes)
src_vocab_size = 5000  # Size of source vocabulary
tgt_vocab_size = 5000  # Size of target vocabulary
d_model = 512          # Embedding dimension
N = 6                  # Number of encoder and decoder layers
num_heads = 8          # Number of attention heads
d_ff = 2048            # Dimension of feed forward networks
max_seq_length = 100   # Maximum sequence length
dropout = 0.1          # Dropout rate
pad_idx = 0            # Index of the padding token

model = Transformer(src_vocab_size, tgt_vocab_size, d_model, N, num_heads, d_ff, max_seq_length, dropout, pad_idx)

# Move the model to the appropriate device (GPU or CPU)
model = model.to(device)

### Testing on Random Data

In [13]:
# Generate random sample data
# Re-seed so the sampled batch does not depend on how much randomness earlier cells consumed.
torch.manual_seed(42)

# Token ids are drawn from [1, vocab_size), so the padding index 0 never appears in these batches.
src_data = torch.randint(1, src_vocab_size, (64, max_seq_length)).to(device)  # (batch_size, seq_length)
tgt_data = torch.randint(1, tgt_vocab_size, (64, max_seq_length)).to(device)  # (batch_size, seq_length)

#### Inference

In [14]:
# Generate the next token using the first token in the first target tensor
# eval() switches off dropout so the check is deterministic.
model.eval()

# Encode only the first source sequence (batch of size 1).
memory, src_mask = model.encode(src_data[:1, :])
# Decode from a one-token target prefix; the output holds a single position of vocabulary scores.
output = model.decode(tgt_data[:1, :1], memory, src_mask)
# Pick the highest-scoring vocabulary index.
y = output.view(-1, tgt_vocab_size).argmax(-1)
y

tensor([990], device='cuda:0')

If your code is correct, you should get tensor([990]).

#### Training

In [15]:
# Train the model for 10 epochs
# Each "epoch" here is a single gradient step on the same random batch (src_data, tgt_data).
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
optimizer = optim.Adam(model.parameters(), lr=0.0005, betas=(0.9, 0.98), eps=1e-9)
grad_clip = 1
n_epochs = 10

# Switch back to training mode (the inference cell above called model.eval()).
model.train()

for epoch in range(n_epochs):
    optimizer.zero_grad()

    # Forward pass
    # Teacher forcing: the decoder input is the target without its last token ...
    output = model(src_data, tgt_data[:, :-1])

    # tgt_data is of shape [batch_size, tgt_len]
    # output is of shape [batch_size, tgt_len, tgt_vocab_size]
    output = output.contiguous().view(-1, tgt_vocab_size)
    # ... and the expected output is the target without its first token.
    tgt = tgt_data[:, 1:].contiguous().view(-1)
    loss = criterion(output, tgt)

    loss.backward()
    # Clip the global gradient norm to grad_clip before the optimizer step.
    nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
    optimizer.step()
    print(f'Epoch: {epoch + 1}, Loss: {loss.item()}')

Epoch: 1, Loss: 8.605189323425293
Epoch: 2, Loss: 8.501507759094238
Epoch: 3, Loss: 8.37141227722168
Epoch: 4, Loss: 8.296963691711426
Epoch: 5, Loss: 8.23848819732666
Epoch: 6, Loss: 8.192156791687012
Epoch: 7, Loss: 8.16485595703125
Epoch: 8, Loss: 8.142219543457031
Epoch: 9, Loss: 8.13027286529541
Epoch: 10, Loss: 8.12206745147705


You should see the loss decreasing from around 8.6 to 8.1.

### Machine Translation Example

We now consider a real-world example using the Multi30k German-English translation task. This task is much smaller than the WMT task considered in the paper (only 30K sentence pairs compared to 4.5M pairs in the WMT-14 English-German dataset), but it illustrates the whole system. <br>
It is recommended to run this example on Google Colab, or on a machine with a strong GPU.

#### Define Tokenizers

In [27]:
# Load spacy models for tokenization
def _load_spacy_model(name):
    """Return the spaCy pipeline `name`, downloading it first when it is not installed.

    Raises:
    - subprocess.CalledProcessError: if the download command exits with a non-zero status.
    - OSError: if the model still cannot be loaded after a successful download.
    """
    import subprocess
    import sys

    try:
        return spacy.load(name)
    except OSError:
        # Download with the interpreter running this kernel (a bare `python` on PATH may be a
        # different environment), and fail loudly instead of ignoring the exit status.
        subprocess.run([sys.executable, '-m', 'spacy', 'download', name], check=True)
        return spacy.load(name)

spacy_de = _load_spacy_model('de_core_news_sm')
spacy_en = _load_spacy_model('en_core_web_sm')

def tokenize_de(text):
    """Split `text` into a list of token strings using the tokenizer of the `spacy_de` pipeline."""
    pieces = []
    for tok in spacy_de.tokenizer(text):
        pieces.append(tok.text)
    return pieces

def tokenize_en(text):
    """Split `text` into a list of token strings using the tokenizer of the `spacy_en` pipeline."""
    pieces = []
    for tok in spacy_en.tokenizer(text):
        pieces.append(tok.text)
    return pieces

def yield_tokens(data_iter, tokenizer, language):
    """Lazily yield `tokenizer(sample[language])` for every sample in `data_iter`.

    `language` is the index (or key) used to pick one side of each sample.
    """
    texts = (sample[language] for sample in data_iter)
    yield from map(tokenizer, texts)

# Notebook-level tokenizers used by the vocabulary, data-processing and translation cells.
# NOTE(review): torchtext's get_tokenizer is documented to return a callable argument unchanged,
# so these are presumably just tokenize_de / tokenize_en — confirm.
tokenizer_de = get_tokenizer(tokenize_de)
tokenizer_en = get_tokenizer(tokenize_en)

#### Build Vocabularies

In [28]:
# Load the three Multi30k splits; only the training split is used to build the vocabularies.
train_data, valid_data, test_data = Multi30k(split=('train', 'valid', 'test'))
# Element 0 of each sample is tokenized as German (source), element 1 as English (target).
# Both vocabularies get the same four special tokens, listed in the same order.
vocab_src = build_vocab_from_iterator(yield_tokens(train_data, tokenizer_de, 0),
                                      specials=['<unk>', '<pad>', '<bos>', '<eos>'])
vocab_tgt = build_vocab_from_iterator(yield_tokens(train_data, tokenizer_en, 1),
                                      specials=['<unk>', '<pad>', '<bos>', '<eos>'])

# Tokens that are not in the vocabulary are mapped to the index of '<unk>'.
vocab_src.set_default_index(vocab_src['<unk>'])
vocab_tgt.set_default_index(vocab_tgt['<unk>'])

#### Create the Transformer

In [74]:
# Define the hyperparameters of the model
src_vocab_size = len(vocab_src)  # Size of source vocabulary
tgt_vocab_size = len(vocab_tgt)  # Size of target vocabulary
d_model = 512  # Embedding dimension
N = 6     # Number of encoder and decoder layers
num_heads = 8  # Number of attention heads
d_ff = 2048    # Dimension of feed forward networks
max_seq_length = 5000 # Maximum sequence length
dropout = 0.1  # Dropout rate

# The model uses a single pad_idx for both the source and the target masks, so the two
# vocabularies must agree on the index of '<pad>'. Check it instead of silently assuming it.
pad_idx = vocab_tgt['<pad>']
assert vocab_src['<pad>'] == pad_idx, 'source and target vocabularies must share the <pad> index'

# Initialize the Transformer model
model = Transformer(src_vocab_size, tgt_vocab_size, d_model, N, num_heads, d_ff, max_seq_length, dropout, pad_idx)

# Move the model to the appropriate device (GPU or CPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Hyperparameters for the training process
batch_size = 128
grad_clip = 1
optimizer = optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

# Initialize the loss function with CrossEntropyLoss, ignoring the padding index
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

#### Data Processing

In [75]:
def data_process(raw_data_iter):
    """Convert raw (source text, target text) pairs into a list of (source ids, target ids) tensors.

    The source side is tokenized with `tokenizer_de` and indexed with `vocab_src`; the target side
    uses `tokenizer_en` and `vocab_tgt`. Both tensors have dtype torch.long.
    """
    def encode(text, tokenizer, vocab):
        # Tokenize the text and look every token up in the given vocabulary.
        ids = [vocab[token] for token in tokenizer(text)]
        return torch.tensor(ids, dtype=torch.long)

    return [(encode(raw_src, tokenizer_de, vocab_src), encode(raw_tgt, tokenizer_en, vocab_tgt))
            for raw_src, raw_tgt in raw_data_iter]

# Reload the raw splits and replace the training and validation iterators with lists of
# (source tensor, target tensor) pairs.
train_data, valid_data, test_data = Multi30k(split=('train', 'valid', 'test'))
train_data = data_process(train_data)
valid_data = data_process(valid_data)
# The test split is deliberately left unprocessed:
#test_data = data_process(test_data)
# The test set of Multi30k is corrupted
# See https://discuss.pytorch.org/t/unicodedecodeerror-when-running-test-iterator/192818/3

In [76]:
def generate_batch(data_batch):
    """Processes a batch of source-target pairs by adding start-of-sequence (BOS) and end-of-sequence (EOS) tokens
    to each sequence and padding all sequences to the same length.

    The special-token indices are read from the notebook-level `vocab_src` and `vocab_tgt`.

    Parameters:
    - data_batch (Iterable[Tuple[Tensor, Tensor]]): A batch of source-target pairs, where each element is a tuple
      containing the source sequence tensor and the target sequence tensor.

    Returns:
    - Tuple[Tensor, Tensor]: The padded source batch and the padded target batch, both batch-first.
    """
    # Build the special-token tensors once per batch instead of once per sentence.
    src_bos = torch.tensor([vocab_src['<bos>']])
    src_eos = torch.tensor([vocab_src['<eos>']])
    tgt_bos = torch.tensor([vocab_tgt['<bos>']])
    tgt_eos = torch.tensor([vocab_tgt['<eos>']])

    src_batch, tgt_batch = [], []

    # Prepend the BOS token and append the EOS token to every sequence of the batch
    for src_item, tgt_item in data_batch:
        src_batch.append(torch.cat([src_bos, src_item, src_eos], dim=0))
        tgt_batch.append(torch.cat([tgt_bos, tgt_item, tgt_eos], dim=0))

    # Pad the sequences so that all of them have the same length.
    # 'batch_first=True' indicates that the batch dimension should come first in the resulting tensor.
    src_batch = pad_sequence(src_batch, padding_value=vocab_src['<pad>'], batch_first=True)
    tgt_batch = pad_sequence(tgt_batch, padding_value=vocab_tgt['<pad>'], batch_first=True)
    return src_batch, tgt_batch

# DataLoader for the training data, using the generate_batch function as the collate_fn.
# This allows custom processing of each batch (adding BOS/EOS tokens and padding) before being fed into the model.
train_iterator = DataLoader(train_data, batch_size=batch_size, shuffle=True, collate_fn=generate_batch)

# Similarly, DataLoader for the validation data
# NOTE(review): shuffle=True is also set for the validation loader. Shuffling changes which sentences
# share a batch, and evaluate() averages per-batch losses, so the reported validation loss presumably
# varies slightly between runs — consider shuffle=False; confirm before changing.
valid_iterator = DataLoader(valid_data, batch_size=batch_size, shuffle=True, collate_fn=generate_batch)

In [77]:
def train(model, iterator, optimizer, criterion, grad_clip):
    """
    Trains the model for one epoch over the given dataset.
    This function iterates over the provided data iterator, performing the forward and backward passes for each batch.
    It employs teacher forcing by feeding the shifted target sequence (excluding the last token) as input to the decoder.

    Parameters:
    - model (torch.nn.Module): The model to be trained. Batches are moved to the device of its parameters.
    - iterator (Iterable): A sized iterable that yields (src, tgt) batches of token indices.
    - optimizer (torch.optim.Optimizer): The optimizer to use for updating the model parameters.
    - criterion (Callable): The loss function used to compute the difference between the model's predictions and the actual targets.
    - grad_clip (float): The maximum norm of the gradients for gradient clipping.

    Returns:
    - float: The average loss for the epoch, computed as the total loss over all batches divided by the number of batches in the iterator.
    """
    # Set the model to training mode, which enables dropout.
    model.train()

    # Run on whichever device the model lives on instead of depending on a notebook-level global.
    device = next(model.parameters()).device

    epoch_loss = 0

    for src, tgt in iterator:
        src, tgt = src.to(device), tgt.to(device)
        optimizer.zero_grad()

        # Forward pass through the model.
        # The decoder input (tgt[:, :-1]) excludes the last token, implementing teacher forcing.
        output = model(src, tgt[:, :-1])

        # Flatten the output to [batch_size * tgt_len, vocab_size]. The vocabulary size is read from
        # the output itself (as evaluate() does) rather than from the global tgt_vocab_size.
        output = output.contiguous().view(-1, output.shape[-1])

        # The target tensor is reshaped to a 1D tensor, excluding the first token (BOS) from each sequence.
        tgt = tgt[:, 1:].contiguous().view(-1)

        # Compute loss, perform backpropagation, clip the gradient norm and update the parameters
        loss = criterion(output, tgt)
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
        optimizer.step()
        epoch_loss += loss.item()

    # Compute average loss per batch for the current epoch
    return epoch_loss / len(iterator)

In [78]:
def evaluate(model, iterator, criterion):
    """
    Evaluates the model's performance on a given dataset.
    This function is similar to the training loop, but without the backward pass and parameter updates.

    Parameters:
    - model (torch.nn.Module): The model to evaluate. Batches are moved to the device of its parameters.
    - iterator (Iterable): A sized iterable that yields (src, tgt) batches of token indices.
    - criterion (Callable): The loss function.

    Returns:
    - float: The average loss per batch.
    """
    # Evaluation mode disables dropout. The model is left in evaluation mode afterwards.
    model.eval()

    # Run on whichever device the model lives on instead of depending on a notebook-level global.
    device = next(model.parameters()).device

    epoch_loss = 0

    # No gradients are needed for evaluation.
    with torch.no_grad():
        for src, tgt in iterator:
            src, tgt = src.to(device), tgt.to(device)

            # Same teacher-forcing shift as in training: feed tgt[:, :-1], predict tgt[:, 1:].
            output = model(src, tgt[:, :-1])
            output_dim = output.shape[-1]
            output = output.contiguous().view(-1, output_dim)
            tgt = tgt[:, 1:].contiguous().view(-1)
            loss = criterion(output, tgt)
            epoch_loss += loss.item()

    return epoch_loss / len(iterator)

#### Training the Model

In [79]:
def epoch_time(start_time, end_time):
    """Split the interval between two timestamps (in seconds) into whole minutes and whole seconds.

    Returns a tuple (minutes, seconds); fractions of a second are truncated.
    """
    elapsed = end_time - start_time
    whole_minutes = int(elapsed / 60)
    leftover_seconds = int(elapsed - 60 * whole_minutes)
    return whole_minutes, leftover_seconds

In [80]:
import time
n_epochs = 20

# One pass over the training set followed by one pass over the validation set per epoch.
for epoch in range(n_epochs):
    epoch_start_time = time.time()
    train_loss = train(model, train_iterator, optimizer, criterion, grad_clip)
    val_loss = evaluate(model, valid_iterator, criterion)

    # The reported time covers both the training pass and the validation pass.
    epoch_mins, epoch_secs = epoch_time(epoch_start_time, time.time())

    print(f'\nEpoch: {epoch + 1}')
    print(f'\nTime: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f}')
    print(f'\tVal Loss: {val_loss:.3f}')


Epoch: 1

Time: 1m 25s
	Train Loss: 5.701
	Val Loss: 5.028

Epoch: 2

Time: 1m 27s
	Train Loss: 4.883
	Val Loss: 4.778

Epoch: 3

Time: 1m 27s
	Train Loss: 4.683
	Val Loss: 4.569

Epoch: 4

Time: 1m 27s
	Train Loss: 4.388
	Val Loss: 4.249

Epoch: 5

Time: 1m 27s
	Train Loss: 4.101
	Val Loss: 4.015

Epoch: 6

Time: 1m 27s
	Train Loss: 3.896
	Val Loss: 3.877

Epoch: 7

Time: 1m 26s
	Train Loss: 3.762
	Val Loss: 3.791

Epoch: 8

Time: 1m 27s
	Train Loss: 3.649
	Val Loss: 3.699

Epoch: 9

Time: 1m 26s
	Train Loss: 3.543
	Val Loss: 3.621

Epoch: 10

Time: 1m 26s
	Train Loss: 3.465
	Val Loss: 3.575

Epoch: 11

Time: 1m 27s
	Train Loss: 3.393
	Val Loss: 3.522

Epoch: 12

Time: 1m 26s
	Train Loss: 3.329
	Val Loss: 3.476

Epoch: 13

Time: 1m 27s
	Train Loss: 3.259
	Val Loss: 3.419

Epoch: 14

Time: 1m 26s
	Train Loss: 3.190
	Val Loss: 3.378

Epoch: 15

Time: 1m 27s
	Train Loss: 3.129
	Val Loss: 3.328

Epoch: 16

Time: 1m 27s
	Train Loss: 3.072
	Val Loss: 3.284

Epoch: 17

Time: 1m 27s
	Train L

The train loss should decrease from around 5.7 to 2.8 after 20 epochs.

#### Translating a Sample Sentence

In [81]:
def translate_sentence_greedy(model, sentence, vocab_src, vocab_tgt, max_length=50):
    """
    Translates a given source sentence into the target language using a trained Transformer model.
    The function preprocesses the input sentence by tokenizing and converting it to tensor format, then uses the model's
    encode and decode methods to generate the translated sentence. The translation process is performed token by token
    using greedy decoding, selecting the most likely next token at each step until an <eos> token is produced or the
    maximum length is reached.

    The source sentence is tokenized with the notebook-level `tokenizer_de`.

    Parameters:
    - model (torch.nn.Module): The trained Transformer model; it must provide `encode` and `decode`.
    - sentence (str): The source sentence to translate.
    - vocab_src: The source vocabulary, indexed by token (`vocab_src[token]`) to obtain token indices.
    - vocab_tgt: The target vocabulary. It is indexed by token for '<bos>' and '<eos>', and must provide a method
      `lookup_token` to convert token indices back to the string representation.
    - max_length (int, optional): The maximum number of tokens to generate. The decoding process will
      stop when this length is reached if an <eos> token has not yet been generated.

    Returns:
    - str: The translated sentence as space-joined target tokens (an empty string if nothing was generated).
    """
    model.eval()

    # Build the tensors on the device the model lives on instead of relying on a notebook-level global.
    device = next(model.parameters()).device

    # Wrap the tokenized sentence in <bos>/<eos>, as done for the training batches.
    src_tokens = ['<bos>'] + tokenizer_de(sentence) + ['<eos>']
    src_indices = [vocab_src[t] for t in src_tokens]
    src_tensor = torch.tensor(src_indices, dtype=torch.long, device=device).unsqueeze(0)

    bos_idx, eos_idx = vocab_tgt['<bos>'], vocab_tgt['<eos>']
    tgt_indices = [bos_idx]

    with torch.no_grad():
        # The source is encoded once; the memory is reused at every decoding step.
        memory, src_mask = model.encode(src_tensor)

        for _ in range(max_length):
            # Re-decode the whole prefix and keep only the prediction for its last position.
            tgt_tensor = torch.tensor(tgt_indices, dtype=torch.long, device=device).unsqueeze(0)
            out = model.decode(tgt_tensor, memory, src_mask)
            next_token = out[:, -1, :].argmax(-1).item()
            tgt_indices.append(next_token)
            if next_token == eos_idx:
                break

    # Drop the leading <bos> and, if present, the trailing <eos>.
    tokens = [vocab_tgt.lookup_token(i) for i in tgt_indices[1:]]
    # `tokens` is empty when max_length <= 0, so guard the index access.
    if tokens and tokens[-1] == '<eos>':
        tokens = tokens[:-1]

    translated_sentence = ' '.join(tokens)

    return translated_sentence

In [82]:
def translate_sentence_beam_search(model, sentence, vocab_src, vocab_tgt, beam_width=5, max_length=50):
    """
    Translates a given source sentence into the target language using a trained Transformer model and beam search.

    The source sentence is tokenized with the notebook-level `tokenizer_de`, wrapped in '<bos>'/'<eos>', converted to
    a tensor on the notebook-level `device`, and encoded once with the model's encode method. Decoding then keeps up
    to `beam_width` partial translations (beams). At every step each unfinished beam is extended with its most likely
    next tokens, each candidate is scored by the sum of its token log-probabilities, and only the `beam_width`
    highest-scoring candidates survive. Beams that already end with <eos> are carried over unchanged. Decoding stops
    when every surviving beam ends with <eos> or after `max_length` steps.

    Note: scores are plain sums of log-probabilities without length normalization. Since every added token can only
    lower a beam's score, finished short hypotheses tend to be favored over longer ones.

    Parameters:
    - model (torch.nn.Module): The trained Transformer model. It must provide `encode(src)` returning
      `(memory, src_mask)` and `decode(tgt, memory, src_mask)` returning per-position scores over the target vocabulary.
    - sentence (str): The source sentence to translate.
    - vocab_src: The source vocabulary mapping tokens to indices via `vocab_src[token]`. It should include the special
      tokens '<bos>' (beginning of sentence) and '<eos>' (end of sentence).
    - vocab_tgt: The target vocabulary. It must support `vocab_tgt[token]` for '<bos>' and '<eos>' and provide a
      method `lookup_token` to convert token indices back to their string representation.
    - beam_width (int, optional): The number of hypotheses kept after each decoding step. If it is larger than the
      target vocabulary size, each beam is expanded with every vocabulary entry instead.
    - max_length (int, optional): The maximum number of decoding steps. The decoding process stops when this number
      of steps is reached even if the best beam has not produced an <eos> token yet.

    Returns:
    - str: The highest-scoring translation, with tokens joined by single spaces and without '<bos>'/'<eos>'.

    Raises:
    - ValueError: If `beam_width` is smaller than 1.
    """
    ### WRITE YOUR CODE HERE

    if beam_width < 1:
        raise ValueError(f'beam_width must be at least 1, got {beam_width}')

    model.eval()

    # Look up the special target tokens once instead of on every loop iteration
    bos_idx = vocab_tgt['<bos>']
    eos_idx = vocab_tgt['<eos>']

    # NOTE(review): how unknown source tokens are mapped depends on how vocab_src was built (e.g. a default
    # index); this function does not handle them explicitly.
    src_tokens = ['<bos>'] + tokenizer_de(sentence) + ['<eos>']
    src_indices = [vocab_src[token] for token in src_tokens]
    src_tensor = torch.LongTensor(src_indices).unsqueeze(0).to(device)

    # Each beam is a tuple (tokens, score), where score is the accumulated log-probability
    beams = [([bos_idx], 0.0)]

    with torch.no_grad():
        # The source is encoded only once; every decoding step reuses the same memory
        memory, src_mask = model.encode(src_tensor)

        for _ in range(max_length):
            candidates = []
            for tokens, score in beams:
                # Finished hypotheses are kept as they are and compete with the new candidates
                if tokens[-1] == eos_idx:
                    candidates.append((tokens, score))
                    continue

                tgt_tensor = torch.LongTensor(tokens).unsqueeze(0).to(device)
                output = model.decode(tgt_tensor, memory, src_mask)

                # Only the last position is needed to predict the next token
                log_probs = torch.log_softmax(output[:, -1, :], dim=-1).squeeze(0)

                # topk raises if k exceeds the vocabulary size, so clamp it
                k = min(beam_width, log_probs.size(-1))
                topk_log_probs, topk_indices = torch.topk(log_probs, k)

                # Convert to Python lists once rather than calling .item() for every candidate
                for log_prob, token in zip(topk_log_probs.tolist(), topk_indices.tolist()):
                    candidates.append((tokens + [token], score + log_prob))

            # Keep only the best beam_width hypotheses (highest accumulated log-probability first)
            beams = sorted(candidates, key=lambda beam: beam[1], reverse=True)[:beam_width]

            if all(tokens[-1] == eos_idx for tokens, _ in beams):
                break

    # Drop the leading <bos> and, if present, the trailing <eos> of the best hypothesis
    best_sequence = beams[0][0][1:]
    if best_sequence and best_sequence[-1] == eos_idx:
        best_sequence = best_sequence[:-1]

    return ' '.join(vocab_tgt.lookup_token(idx) for idx in best_sequence)

In [83]:
# German test sentences; the trailing comment on each line is its English reference translation
src_sentence1 = "Ein kleiner Junge spielt draußen mit einem Ball."  # A little boy playing outside with a ball.
src_sentence2 = "Zwei Männer stehen auf einem Dach und arbeiten."  # Two men are standing on a roof and working.
src_sentence3 = "Ein Hund rennt über eine Wiese."  # A dog is running across a meadow.
src_sentence4 = "Eine Frau hält einen Regenschirm in der Hand."  # A woman is holding an umbrella in her hand.
src_sentence5 = "Ein Musiker spielt Gitarre auf der Straße."  # A musician is playing guitar on the street.

# Translate all five sentences with greedy decoding, in order
(
    translated_sentence1,
    translated_sentence2,
    translated_sentence3,
    translated_sentence4,
    translated_sentence5,
) = [
    translate_sentence_greedy(model, src, vocab_src, vocab_tgt)
    for src in (src_sentence1, src_sentence2, src_sentence3, src_sentence4, src_sentence5)
]

# Print one line per translation
print(
    *(
        f'Translated sentence: {translation}'
        for translation in (
            translated_sentence1,
            translated_sentence2,
            translated_sentence3,
            translated_sentence4,
            translated_sentence5,
        )
    ),
    sep='\n',
)

Translated sentence: A young boy is playing with a toy .
Translated sentence: Two men are standing on a bench in front of a building .
Translated sentence: A dog runs through the grass .
Translated sentence: A woman is sitting on a bench in front of a store .
Translated sentence: A person is playing with a yellow umbrella .


In [84]:
# German test sentences; the trailing comment on each line is its English reference translation
src_sentence1 = "Ein kleiner Junge spielt draußen mit einem Ball."  # A little boy playing outside with a ball.
src_sentence2 = "Zwei Männer stehen auf einem Dach und arbeiten."  # Two men are standing on a roof and working.
src_sentence3 = "Ein Hund rennt über eine Wiese."  # A dog is running across a meadow.
src_sentence4 = "Eine Frau hält einen Regenschirm in der Hand."  # A woman is holding an umbrella in her hand.
src_sentence5 = "Ein Musiker spielt Gitarre auf der Straße."  # A musician is playing guitar on the street.

# Translate all five sentences with beam search (default beam width and maximum length), in order
(
    translated_sentence1,
    translated_sentence2,
    translated_sentence3,
    translated_sentence4,
    translated_sentence5,
) = [
    translate_sentence_beam_search(model, src, vocab_src, vocab_tgt)
    for src in (src_sentence1, src_sentence2, src_sentence3, src_sentence4, src_sentence5)
]

# Print one line per translation
print(
    *(
        f'Translated sentence: {translation}'
        for translation in (
            translated_sentence1,
            translated_sentence2,
            translated_sentence3,
            translated_sentence4,
            translated_sentence5,
        )
    ),
    sep='\n',
)

Translated sentence: A young boy is playing in a pool .
Translated sentence: Two men are sitting at a table .
Translated sentence: A dog runs through the grass .
Translated sentence: A woman is sitting at a table .
Translated sentence: A person is sitting on a bench .


You should get a translation similar to the reference after 20 epochs of training.