<a href="https://colab.research.google.com/github/AryaJeet1364/PyTorch_Projects/blob/main/TransformersfromScratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Attention is all you need - from scratch

## Overview

Complete implementation of the Transformer architecture from the original paper by Vaswani et al. (2017), built from scratch using PyTorch for German-English translation.

## Key Features

✅ Multi-head self-attention mechanism

✅ Positional encoding

✅ Encoder-decoder architecture

✅ Label smoothing and learning rate scheduling

✅ Training on Multi30k dataset

## Implementation Details

Paper: Attention Is All You Need (Vaswani et al., 2017)

Framework: PyTorch

Dataset: Multi30k German-English translation

Key Components: Custom attention, positional encoding, transformer blocks

## Acknowledgments

Implementation developed with guidance from the original paper, supplementary blogs, and AI assistance for debugging and optimization.

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data
import math
import copy

In [None]:
# Report up front whether PyTorch can see a CUDA device; the device name is
# only queried when one is actually available.
print(
    f"GPU is available: {torch.cuda.get_device_name(0)}"
    if torch.cuda.is_available()
    else "GPU is NOT available"
)

GPU is available: Tesla T4


In [None]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    The inputs are projected with three learned linear maps, split into
    ``num_heads`` slices of width ``d_k = d_model // num_heads``, attended
    per head, merged back to ``d_model`` and passed through an output
    projection.

    Args:
        d_model: Width of the input and output features; must be divisible
            by ``num_heads``.
        num_heads: Number of attention heads.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        assert d_model % num_heads == 0

        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads

        # Query / key / value projections followed by the output projection.
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        """Return ``softmax(Q K^T / sqrt(d_k)) V``.

        Positions where ``mask == 0`` receive a score of ``-1e9`` before the
        softmax, which drives their weight to (numerically) zero.
        """
        scores = Q @ K.transpose(-2, -1)
        scores = scores / math.sqrt(self.d_k)

        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)

        weights = scores.softmax(dim=-1)
        return weights @ V

    def _split_heads(self, projected, batch_size):
        """(batch, length, d_model) -> (batch, num_heads, length, d_k)."""
        length = projected.size(1)
        return projected.view(batch_size, length, self.num_heads, self.d_k).transpose(1, 2)

    def forward(self, query, key, value, mask=None):
        """Attend from ``query`` positions over ``key``/``value`` positions.

        Args:
            query: (batch_size, query_len, d_model)
            key: (batch_size, key_len, d_model)
            value: (batch_size, value_len, d_model)
            mask: Optional tensor broadcastable to the per-head score matrix;
                zeros mark positions that must not be attended to.

        Returns:
            Tensor of shape (batch_size, query_len, d_model).
        """
        batch_size, query_len = query.size(0), query.size(1)

        # Project, then carve the feature axis into heads.
        heads_q = self._split_heads(self.W_q(query), batch_size)
        heads_k = self._split_heads(self.W_k(key), batch_size)
        heads_v = self._split_heads(self.W_v(value), batch_size)

        per_head = self.scaled_dot_product_attention(heads_q, heads_k, heads_v, mask)

        # Bring the head axis back next to d_k and flatten the two into d_model.
        merged = per_head.transpose(1, 2).reshape(batch_size, query_len, self.d_model)
        return self.W_o(merged)

In [None]:
# This feed-forward network is applied independently to each position in the sequence
# It has two linear layers with a ReLU activation in between

class FeedForwardNetwork(nn.Module):
    """Position-wise feed-forward sub-network.

    Pipeline: Linear(d_model -> d_ff), ReLU, Dropout, Linear(d_ff -> d_model),
    Dropout. The same weights are applied to every position independently.

    Args:
        d_model: Input and output feature width.
        d_ff: Hidden width between the two linear layers.
        dropout: Probability used by both dropout stages.
    """

    def __init__(self, d_model, d_ff, dropout=0.1):
        super(FeedForwardNetwork, self).__init__()
        stages = [
            nn.Linear(d_model, d_ff),   # expand
            nn.ReLU(),
            nn.Dropout(dropout),        # regularise the hidden activations
            nn.Linear(d_ff, d_model),   # project back to the model width
            nn.Dropout(dropout),
        ]
        self.model = nn.Sequential(*stages)

    def forward(self, x):
        """Run ``x`` (..., d_model) through the stack; the shape is preserved."""
        return self.model(x)

In [None]:
# Positional encoding gives the model information about the position of each token in the sequence.
# Since the Transformer does not use recurrence or convolution, this is crucial for capturing order.

#   PE(pos, 2i)   = sin(pos / (10000^(2i / d_model)))
#   PE(pos, 2i+1) = cos(pos / (10000^(2i / d_model)))

class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding added to token embeddings.

        PE(pos, 2i)   = sin(pos / 10000^(2i / d_model))
        PE(pos, 2i+1) = cos(pos / 10000^(2i / d_model))

    The table is precomputed once for ``max_seq_length`` positions and stored
    as a non-trainable buffer named ``pe`` (so it is saved/loaded with the
    module and follows ``.to(device)``).

    Args:
        d_model: Embedding width. Both even and odd widths are supported; for
            an odd width the last column is a sine term without a cosine
            partner.
        max_seq_length: Number of positions to precompute.
    """

    def __init__(self, d_model, max_seq_length=5000):
        super(PositionalEncoding, self).__init__()

        # Table of shape (max_seq_length, d_model) that will hold the encodings.
        pe = torch.zeros(max_seq_length, d_model)

        # Position indices [0, 1, ..., max_seq_length - 1] as a column vector.
        position = torch.arange(0, max_seq_length).unsqueeze(1)

        # One inverse frequency per even column: 10000^(-2i / d_model),
        # computed in log space. Length is ceil(d_model / 2).
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
        )

        # Shared sin/cos argument, shape (max_seq_length, ceil(d_model / 2)).
        angles = position * div_term

        # Even columns (0, 2, 4, ...) take the sine of every frequency.
        pe[:, 0::2] = torch.sin(angles)

        # Odd columns (1, 3, 5, ...) take the cosine. There are only
        # d_model // 2 of them, so the surplus frequency that exists when
        # d_model is odd is dropped instead of causing a shape mismatch.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Leading batch axis -> (1, max_seq_length, d_model) so the table
        # broadcasts over (batch_size, seq_len, d_model) inputs.
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Add the encodings for the first ``x.size(1)`` positions to ``x``.

        Args:
            x: Embeddings of shape (batch_size, seq_len, d_model) with
                ``seq_len <= max_seq_length``.

        Returns:
            Tensor with the same shape as ``x``.
        """
        return x + self.pe[:, :x.size(1)]

In [None]:
class EncoderLayer(nn.Module):
    """One post-norm encoder block: self-attention, then a position-wise FFN.

    Each sub-layer output is passed through dropout, added to its input
    (residual connection) and layer-normalised.

    Args:
        d_model: Feature width of the block's input and output.
        n_heads: Number of attention heads.
        d_ff: Hidden width of the feed-forward sub-network.
        dropout: Dropout probability for the residual branches and for the
            feed-forward sub-network.
    """

    def __init__(self, d_model, n_heads, d_ff, dropout=0.1):
        super(EncoderLayer, self).__init__()

        # Multi-head self-attention over the input sequence.
        self.self_attn = MultiHeadAttention(d_model, n_heads)

        # Layer normalisation applied after the attention residual.
        self.norm1 = nn.LayerNorm(d_model)

        # Position-wise feed-forward network. The layer's dropout rate is
        # forwarded explicitly: without it the FFN silently keeps its own
        # default (0.1), so e.g. dropout=0.0 would not disable dropout here.
        self.feed_forward = FeedForwardNetwork(d_model, d_ff, dropout)

        # Layer normalisation applied after the feed-forward residual.
        self.norm2 = nn.LayerNorm(d_model)

        # Dropout applied to each sub-layer output before the residual add.
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Encode ``x``.

        Args:
            x: Input of shape (batch_size, seq_len, d_model).
            mask: Optional attention mask forwarded to the self-attention
                (zeros mark positions to ignore, e.g. padding).

        Returns:
            Tensor of shape (batch_size, seq_len, d_model).
        """
        # ---- Self-attention block: Q = K = V = x ----
        attn_output = self.dropout(self.self_attn(x, x, x, mask))
        x = self.norm1(x + attn_output)

        # ---- Feed-forward block ----
        # NOTE(review): FeedForwardNetwork already ends with its own Dropout,
        # so this branch is dropped out twice — confirm that is intended.
        ff_output = self.dropout(self.feed_forward(x))
        x = self.norm2(x + ff_output)

        return x


In [None]:
class DecoderLayer(nn.Module):
    """One post-norm decoder block.

    Sub-layers, in order: masked self-attention, cross-attention over the
    encoder output, position-wise feed-forward network. Each sub-layer output
    goes through dropout, a residual add and a LayerNorm.

    Args:
        d_model: Feature width of the block's input and output.
        n_heads: Number of attention heads in both attention sub-layers.
        d_ff: Hidden width of the feed-forward sub-network.
        dropout: Dropout probability for the residual branches and for the
            feed-forward sub-network.
    """

    def __init__(self, d_model, n_heads, d_ff, dropout=0.1):
        super().__init__()

        # Self-attention over the decoder input (the caller supplies the
        # causal mask through tgt_mask).
        self.self_attn = MultiHeadAttention(d_model, n_heads)

        # Cross-attention: queries come from the decoder, keys and values
        # from the encoder output.
        self.cross_attn = MultiHeadAttention(d_model, n_heads)

        # Position-wise feed-forward network. The layer's dropout rate is
        # forwarded explicitly: without it the FFN silently keeps its own
        # default (0.1), so e.g. dropout=0.0 would not disable dropout here.
        self.feed_forward = FeedForwardNetwork(d_model, d_ff, dropout)

        # One LayerNorm per sub-layer, applied after the residual add.
        self.norm1 = nn.LayerNorm(d_model)  # self-attention
        self.norm2 = nn.LayerNorm(d_model)  # cross-attention
        self.norm3 = nn.LayerNorm(d_model)  # feed-forward

        # Dropout applied to each sub-layer output before the residual add.
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_output, src_mask=None, tgt_mask=None):
        """
        Args:
            x: Decoder input (batch_size, tgt_seq_len, d_model)
            enc_output: Encoder output (batch_size, src_seq_len, d_model)
            src_mask: Optional mask applied in cross-attention (usually the
                source padding mask)
            tgt_mask: Optional mask applied in self-attention (usually the
                combined causal + target padding mask)
        Returns:
            x: Output of decoder layer (batch_size, tgt_seq_len, d_model)
        """
        # ----- Step 1: masked self-attention + Add & Norm -----
        self_attn_output = self.self_attn(x, x, x, tgt_mask)
        x = self.norm1(x + self.dropout(self_attn_output))

        # ----- Step 2: cross-attention (Q = decoder state, K = V = encoder output) -----
        cross_attn_output = self.cross_attn(x, enc_output, enc_output, src_mask)
        x = self.norm2(x + self.dropout(cross_attn_output))

        # ----- Step 3: feed-forward + Add & Norm -----
        # NOTE(review): FeedForwardNetwork already ends with its own Dropout,
        # so this branch is dropped out twice — confirm that is intended.
        ff_output = self.feed_forward(x)
        x = self.norm3(x + self.dropout(ff_output))

        return x


In [None]:
class Encoder(nn.Module):
    """Transformer encoder: scaled token embeddings + positional encoding,
    followed by a stack of ``EncoderLayer`` blocks.

    Args:
        vocab_size: Number of entries in the source vocabulary.
        d_model: Width of the embeddings and of every layer.
        num_layers: How many encoder layers to stack.
        num_heads: Attention heads per layer.
        d_ff: Hidden width of each layer's feed-forward network.
        dropout: Dropout probability (embedding output and every layer).
        max_seq_length: Number of positions the positional table covers.
    """

    def __init__(self,
                 vocab_size,
                 d_model,
                 num_layers=6,
                 num_heads=8,
                 d_ff=2048,
                 dropout=0.1,
                 max_seq_length=5000):
        super(Encoder, self).__init__()

        # Learnable lookup table: token id -> d_model vector.
        self.embeddings = nn.Embedding(vocab_size, d_model)

        # Embeddings are multiplied by sqrt(d_model) before positions are added.
        self.scale = math.sqrt(d_model)

        # Sinusoidal position information.
        self.pe = PositionalEncoding(d_model, max_seq_length)

        # Dropout on the sum of embeddings and positional encodings.
        self.dropout = nn.Dropout(dropout)

        # The layer stack, built one block at a time.
        blocks = []
        for _ in range(num_layers):
            blocks.append(EncoderLayer(d_model, num_heads, d_ff, dropout))
        self.encoder_layers = nn.ModuleList(blocks)

    def forward(self, x, mask=None):
        """
        Args:
            x: Token indices of shape (batch_size, seq_len).
            mask: Optional attention mask handed to every layer.
        Returns:
            Encoded sequence of shape (batch_size, seq_len, d_model).
        """
        # (batch_size, seq_len) -> (batch_size, seq_len, d_model)
        hidden = self.embeddings(x) * self.scale
        hidden = self.pe(hidden)
        hidden = self.dropout(hidden)

        for block in self.encoder_layers:
            hidden = block(hidden, mask)

        return hidden


In [None]:
class Decoder(nn.Module):
    """Transformer decoder: scaled token embeddings + positional encoding,
    followed by a stack of ``DecoderLayer`` blocks that also attend to the
    encoder output.

    Args:
        vocab_size: Number of entries in the target vocabulary.
        d_model: Width of the embeddings and of every layer.
        num_layers: How many decoder layers to stack.
        num_heads: Attention heads per layer.
        d_ff: Hidden width of each layer's feed-forward network.
        dropout: Dropout probability (embedding output and every layer).
        max_seq_length: Number of positions the positional table covers.
    """

    def __init__(self,
                 vocab_size,
                 d_model,
                 num_layers=6,
                 num_heads=8,
                 d_ff=2048,
                 dropout=0.1,
                 max_seq_length=5000):
        super(Decoder, self).__init__()

        # Learnable lookup table: target token id -> d_model vector.
        self.embeddings = nn.Embedding(vocab_size, d_model)

        # Embeddings are multiplied by sqrt(d_model) before positions are added.
        self.scale = math.sqrt(d_model)

        # Sinusoidal position information.
        self.pe = PositionalEncoding(d_model, max_seq_length)

        # Dropout on the sum of embeddings and positional encodings.
        self.dropout = nn.Dropout(dropout)

        # The layer stack, built one block at a time.
        blocks = []
        for _ in range(num_layers):
            blocks.append(DecoderLayer(d_model, num_heads, d_ff, dropout))
        self.decoder_layers = nn.ModuleList(blocks)

    def forward(self, x, encoder_output, src_mask=None, tgt_mask=None):
        """
        Args:
            x: Target token indices of shape (batch_size, target_seq_len).
            encoder_output: Encoder states (batch_size, source_seq_len, d_model).
            src_mask: Optional mask passed to every layer's cross-attention.
            tgt_mask: Optional mask passed to every layer's self-attention.
        Returns:
            Decoder states of shape (batch_size, target_seq_len, d_model).
        """
        # (batch_size, target_seq_len) -> (batch_size, target_seq_len, d_model)
        hidden = self.embeddings(x) * self.scale
        hidden = self.pe(hidden)
        hidden = self.dropout(hidden)

        for block in self.decoder_layers:
            hidden = block(hidden, encoder_output, src_mask, tgt_mask)

        return hidden

In [None]:
# Attention-mask helpers: padding mask, causal (future) mask, and their combination.

def create_padding_mask(seq, pad_idx=2):
    """
    Build a mask that hides padding positions.
    Args:
        seq: Token-index tensor of shape (batch_size, seq_len)
        pad_idx: Index of the padding token (default 2)
    Returns:
        Float tensor of shape (batch_size, 1, 1, seq_len) holding
        1.0 where seq != pad_idx and 0.0 where seq == pad_idx
    """
    n_rows, n_cols = seq.shape

    # True/False per token, converted to 1.0/0.0.
    keep = seq.ne(pad_idx).to(torch.float32)

    # Two singleton axes are inserted between batch and sequence for broadcasting.
    return keep.reshape(n_rows, 1, 1, n_cols)


def create_future_mask(size, device):
    """
    Build a causal mask that blocks attention to later positions.
    Args:
        size: Sequence length (number of rows and columns of the mask)
        device: Device on which the mask is allocated
    Returns:
        Tensor of shape (1, 1, size, size): entry [.., i, j] is 1 when
        j <= i (allowed) and 0 when j > i (future position)
    """
    # Ones on and below the diagonal, zeros above it.
    lower_triangle = torch.ones(size, size, device=device).tril()

    # Prepend batch and head axes.
    return lower_triangle[None, None, :, :]


def create_masks(src, tgt, pad_idx=2):
    """
    Create source and target masks for Transformer training.
    Args:
        src: Source input sequence (batch_size, src_len)
        tgt: Target input sequence (batch_size, tgt_len)
        pad_idx: Index of the padding token in both sequences. Defaults to 2,
            the value that was previously hard-wired.
    Returns:
        src_mask: Padding mask for source input (batch_size, 1, 1, src_len)
        tgt_mask: Combined padding and future mask for target input
            (batch_size, 1, tgt_len, tgt_len)
    """
    # 1. Padding masks for both sequences, using the caller's pad index.
    src_padding_mask = create_padding_mask(src, pad_idx)
    tgt_padding_mask = create_padding_mask(tgt, pad_idx)

    # 2. Causal mask for the target. It is built on the target's device
    #    because it is multiplied with the target padding mask right below.
    tgt_future_mask = create_future_mask(tgt.size(1), tgt.device)

    # 3. A target position is visible only if it is both non-padding and not
    #    in the future: (batch, 1, 1, tgt_len) * (1, 1, tgt_len, tgt_len).
    tgt_mask = tgt_padding_mask * tgt_future_mask

    return src_padding_mask, tgt_mask

In [None]:
class Transformer(nn.Module):
    """Encoder-decoder Transformer producing logits over the target vocabulary.

    Args:
        src_vocab_size: Vocabulary size of the source language/input.
        tgt_vocab_size: Vocabulary size of the target language/output.
        d_model: Embedding and model dimension.
        num_layers: Number of encoder layers and of decoder layers.
        num_heads: Number of attention heads in every attention sub-layer.
        d_ff: Hidden width of the feed-forward sub-networks.
        dropout: Dropout rate.
        max_seq_length: Maximum sequence length for positional encoding.
    """

    def __init__(self,
                 src_vocab_size,
                 tgt_vocab_size,
                 d_model,
                 num_layers=6,
                 num_heads=8,
                 d_ff=2048,
                 dropout=0.1,
                 max_seq_length=5000):
        super().__init__()

        # Source-side stack.
        self.encoder = Encoder(
            src_vocab_size,
            d_model,
            num_layers,
            num_heads,
            d_ff,
            dropout,
            max_seq_length
        )

        # Target-side stack.
        self.decoder = Decoder(
            tgt_vocab_size,
            d_model,
            num_layers,
            num_heads,
            d_ff,
            dropout,
            max_seq_length
        )

        # Projects decoder states to one logit per target-vocabulary entry.
        self.final_layer = nn.Linear(d_model, tgt_vocab_size)

    def forward(self, src, tgt, src_mask=None, tgt_mask=None):
        """
        Forward pass of the Transformer model
        Args:
            src: Input source sequence tensor of shape (batch_size, src_seq_len)
            tgt: Input target sequence tensor of shape (batch_size, tgt_seq_len)
            src_mask: Optional mask for source sequence. When omitted, the
                default padding mask from ``create_masks`` is used.
            tgt_mask: Optional mask for target sequence. When omitted, the
                default padding + causal mask from ``create_masks`` is used.
        Returns:
            output: Raw logits for each target token (batch_size, tgt_seq_len, tgt_vocab_size)
        """
        # 1. Fill in only the masks the caller did not provide. A mask that
        #    was passed explicitly is kept even when the other one is missing.
        if src_mask is None or tgt_mask is None:
            default_src_mask, default_tgt_mask = create_masks(src, tgt)
            if src_mask is None:
                src_mask = default_src_mask
            if tgt_mask is None:
                tgt_mask = default_tgt_mask

        # 2. Encode the source: (batch_size, src_seq_len, d_model)
        encoder_output = self.encoder(src, src_mask)

        # 3. Decode against the encoder output: (batch_size, tgt_seq_len, d_model)
        decoder_output = self.decoder(tgt, encoder_output, src_mask, tgt_mask)

        # 4. Project to vocabulary logits. No softmax is applied here; the
        #    loss used with this model (e.g. nn.CrossEntropyLoss) takes raw logits.
        output = self.final_layer(decoder_output)

        return output

In [None]:
class TransformerLRScheduler():
    """Warm-up / inverse-square-root learning-rate schedule.

        lrate(step) = d_model^(-0.5) * min(step^(-0.5), step * warmup_steps^(-1.5))

    The rate grows linearly for ``warmup_steps`` steps and then decays
    proportionally to ``step^(-0.5)``.

    The rate for step 1 is written into the optimizer as soon as the scheduler
    is constructed. A loop that calls ``optimizer.step()`` before
    ``scheduler.step()`` therefore never performs an update with the
    learning rate the optimizer was constructed with.
    """

    def __init__(self, optimizer, d_model, warmup_steps):
        """
        Args:
            optimizer: Optimizer whose ``param_groups[i]['lr']`` is updated
            d_model: Model embedding size (used for scaling); must be > 0
            warmup_steps: Number of linear warm-up steps; 0 disables warm-up

        Raises:
            ValueError: If ``d_model <= 0`` or ``warmup_steps < 0``.
        """
        if d_model <= 0:
            raise ValueError(f"d_model must be positive, got {d_model}")
        if warmup_steps < 0:
            raise ValueError(f"warmup_steps must be non-negative, got {warmup_steps}")

        self.optimizer = optimizer
        self.d_model = d_model
        self.warmup_steps = warmup_steps
        self.step_num = 0  # Number of completed step() calls

        # Seed the optimizer with the step-1 rate (step_num itself stays 0).
        self._apply(self._rate(1))

    def _rate(self, step_num):
        """Return the scheduled learning rate for a 1-based step number."""
        decay = step_num ** -0.5
        if self.warmup_steps == 0:
            # No warm-up phase: pure inverse-square-root decay. This also
            # avoids evaluating 0 ** -1.5, which raises ZeroDivisionError.
            return (self.d_model ** -0.5) * decay
        warmup = step_num * (self.warmup_steps ** -1.5)
        return (self.d_model ** -0.5) * min(decay, warmup)

    def _apply(self, lrate):
        """Write ``lrate`` into every parameter group of the optimizer."""
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = lrate

    def step(self):
        """
        Advance the internal step counter by one and update the optimizer.

        Returns:
            The learning rate that was just written to the optimizer, i.e.
            the scheduled rate for the new value of ``step_num``.
        """
        self.step_num += 1
        lrate = self._rate(self.step_num)
        self._apply(lrate)
        return lrate

In [None]:
class LabelSmoothing(nn.Module):
    """Cross-entropy against a label-smoothed target distribution.

    The true class receives probability ``1 - smoothing``; the remaining
    ``smoothing`` mass is spread evenly over all other classes.
    """

    def __init__(self, smoothing=0.1, ignore_index=None):
        """
        Args:
            smoothing: Float, amount of smoothing to apply (typically 0.1)
            ignore_index: Optional target value (e.g. the padding index) whose
                positions are excluded from the loss. ``None`` (default)
                averages over every position.
        """
        super().__init__()
        self.smoothing = smoothing
        self.confidence = 1.0 - smoothing  # Probability for the correct class
        self.ignore_index = ignore_index

    def forward(self, logits, target):
        """
        Calculate the label-smoothed loss

        Args:
            logits: Model outputs before softmax (batch_size, vocab_size)
            target: True class indices (batch_size)

        Returns:
            loss: Scalar smoothed cross-entropy, averaged over the positions
                that are not ignored (0 when every position is ignored)
        """
        vocab_size = logits.size(-1)
        log_probs = torch.log_softmax(logits, dim=-1)

        with torch.no_grad():
            if self.ignore_index is None:
                keep = None
                safe_target = target
            else:
                keep = target != self.ignore_index
                # Ignored rows are dropped below; point them at class 0 so an
                # out-of-range ignore value (e.g. -100) cannot break scatter_.
                safe_target = target.masked_fill(~keep, 0)

            # Smoothing mass per non-true class; max(..., 1) guards the
            # degenerate single-class case against division by zero.
            off_value = self.smoothing / max(vocab_size - 1, 1)
            true_dist = torch.full_like(logits, off_value)
            # The true class gets the confidence (1 - smoothing).
            true_dist.scatter_(1, safe_target.unsqueeze(1), self.confidence)

        # Cross-entropy per position between the smoothed targets and the predictions.
        per_token = torch.sum(-true_dist * log_probs, dim=-1)

        if keep is None:
            return torch.mean(per_token)

        keep = keep.to(per_token.dtype)
        # clamp keeps the result finite when every position is ignored.
        return torch.sum(per_token * keep) / keep.sum().clamp(min=1.0)


In [None]:
# Choose the compute device once: CUDA when PyTorch can see a GPU, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

Using device: cuda


In [None]:
# Install spaCy and the two tokenizer pipelines that create_dataloaders() loads
# by name: German (de_core_news_sm) for the source side and English
# (en_core_web_sm) for the target side.
# NOTE: the installer output suggests restarting the kernel/runtime afterwards
# so the freshly installed packages can be loaded.
!pip install spacy
!python -m spacy download de_core_news_sm
!python -m spacy download en_core_web_sm

Collecting de-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.8.0/de_core_news_sm-3.8.0-py3-none-any.whl (14.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.6/14.6 MB[0m [31m82.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: de-core-news-sm
Successfully installed de-core-news-sm-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m

In [None]:
def train_transformer(model, train_dataloader, criterion, optimizer, scheduler, num_epochs, device='cuda'):
    """
    Teacher-forced training loop for the Transformer.

    For every batch the target is split into a decoder input (all tokens but
    the last) and a prediction target (all tokens but the first). After each
    epoch a checkpoint named ``checkpoint_epoch_<n>.pt`` is written to the
    current working directory.

    Args:
        model: Transformer model
        train_dataloader: Iterable of dicts with 'src' and 'tgt' index tensors
        criterion: Loss called as criterion(flat_logits, flat_targets)
        optimizer: Optimizer
        scheduler: Learning rate scheduler, stepped once per batch
        num_epochs: Number of training epochs
        device: Device the model and every batch are moved to

    Returns:
        List with the mean batch loss of each epoch.
    """
    model = model.to(device)
    model.train()

    all_losses = []

    for epoch in range(num_epochs):
        print(f"Epoch {epoch + 1}/{num_epochs}")
        running_loss = 0

        for batch_idx, batch in enumerate(train_dataloader):
            src, tgt = batch['src'].to(device), batch['tgt'].to(device)

            # Shifted views of the target: the decoder sees tokens [0, T-1)
            # and has to predict tokens [1, T).
            decoder_input, expected = tgt[:, :-1], tgt[:, 1:]

            src_mask, tgt_mask = create_masks(src, decoder_input)

            optimizer.zero_grad()
            logits = model(src, decoder_input, src_mask, tgt_mask)

            # Flatten to (batch * positions, vocab) and (batch * positions,)
            # so the criterion scores every position independently.
            loss = criterion(
                logits.reshape(-1, logits.size(-1)),
                expected.reshape(-1),
            )
            loss.backward()

            # Cap the global gradient norm at 1.0 before the update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            optimizer.step()
            scheduler.step()

            running_loss += loss.item()

            # Progress line every 100 batches.
            if batch_idx % 100 == 0:
                print(f"Batch {batch_idx}, Loss: {loss.item():.4f}")

        avg_epoch_loss = running_loss / len(train_dataloader)
        all_losses.append(avg_epoch_loss)
        print(f"Epoch {epoch + 1} Loss: {avg_epoch_loss:.4f}")

        # Persist model and optimizer state at the end of every epoch.
        checkpoint = {
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': avg_epoch_loss,
        }
        torch.save(checkpoint, f'checkpoint_epoch_{epoch+1}.pt')

    return all_losses

In [None]:
import os
import torch
import spacy
import urllib.request
import zipfile
from torch.utils.data import Dataset, DataLoader

def download_multi30k():
    """Download and unpack the Multi30k task-1 text files into ``data/``.

    Each file is skipped when its unpacked copy already exists. Archives are
    decompressed with the standard library (no external ``gunzip`` needed).
    A file only appears under its final name once it has been fully
    extracted, so an interrupted run is not mistaken for a finished download.

    Raises:
        OSError: If a download or the extraction fails (``urllib`` errors
            derive from ``OSError``).
    """
    # Local imports keep this cell self-contained.
    import gzip
    import shutil

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs('data', exist_ok=True)

    base_url = "https://raw.githubusercontent.com/multi30k/dataset/master/data/task1/raw/"
    # local file name -> archive name on the server
    files = {
        "train.de": "train.de.gz",
        "train.en": "train.en.gz",
        "val.de": "val.de.gz",
        "val.en": "val.en.gz",
        "test.de": "test_2016_flickr.de.gz",
        "test.en": "test_2016_flickr.en.gz"
    }

    for local_name, remote_name in files.items():
        filepath = f'data/{local_name}'
        if os.path.exists(filepath):
            continue

        archive_path = filepath + '.gz'
        partial_path = filepath + '.part'

        urllib.request.urlretrieve(base_url + remote_name, archive_path)

        # Stream-decompress into a temporary name, then rename into place.
        with gzip.open(archive_path, 'rb') as compressed, open(partial_path, 'wb') as extracted:
            shutil.copyfileobj(compressed, extracted)
        os.replace(partial_path, filepath)

        # Like `gunzip`, drop the archive once it has been unpacked.
        os.remove(archive_path)

def load_data(filename):
    """Read a UTF-8 text file and return its lines, each stripped of
    leading and trailing whitespace."""
    with open(filename, 'r', encoding='utf-8') as handle:
        return list(map(str.strip, handle))

def create_dataset():
    """Fetch the corpus if necessary and return the training and validation
    sentence lists as ``((train_de, train_en), (val_de, val_en))``."""
    # Make sure the files exist locally before reading them.
    download_multi30k()

    train_pair = (load_data('data/train.de'), load_data('data/train.en'))
    val_pair = (load_data('data/val.de'), load_data('data/val.en'))

    return train_pair, val_pair

class TranslationDataset(Dataset):
    """Parallel-sentence dataset that yields index tensors.

    Each item is a dict with 'src' and 'tgt' tensors. A sentence is tokenised
    with its tokenizer (every returned token must expose ``.text``), mapped
    through its vocabulary with ``<unk>`` as the fallback, and wrapped in
    ``<s>`` ... ``</s>``.
    """

    def __init__(self, src_texts, tgt_texts, src_vocab, tgt_vocab, src_tokenizer, tgt_tokenizer):
        self.src_texts, self.tgt_texts = src_texts, tgt_texts
        self.src_vocab, self.tgt_vocab = src_vocab, tgt_vocab
        self.src_tokenizer, self.tgt_tokenizer = src_tokenizer, tgt_tokenizer

    def __len__(self):
        # The source side defines the number of sentence pairs.
        return len(self.src_texts)

    @staticmethod
    def _encode(text, tokenizer, vocab):
        """Turn one sentence into ``[<s>, ids..., </s>]``."""
        unknown = vocab["<unk>"]
        ids = [vocab["<s>"]]
        ids.extend(vocab.get(tok.text, unknown) for tok in tokenizer(text))
        ids.append(vocab["</s>"])
        return ids

    def __getitem__(self, idx):
        src_text, tgt_text = self.src_texts[idx], self.tgt_texts[idx]

        src_indices = self._encode(src_text, self.src_tokenizer, self.src_vocab)
        tgt_indices = self._encode(tgt_text, self.tgt_tokenizer, self.tgt_vocab)

        return {
            'src': torch.tensor(src_indices),
            'tgt': torch.tensor(tgt_indices)
        }

def build_vocab_from_texts(texts, tokenizer, min_freq=2):
    """Build a token -> index vocabulary from raw texts.

    Indices 0-3 are reserved for ``<s>``, ``</s>``, ``<blank>`` and ``<unk>``.
    Every other token seen at least ``min_freq`` times gets the next free
    index, in order of first appearance.

    A corpus token that happens to equal one of the reserved symbols keeps the
    reserved index; it is never re-assigned, so the index space stays dense
    (``0 .. len(vocab) - 1``).

    Args:
        texts: Iterable of strings.
        tokenizer: Callable returning an iterable of tokens with a ``.text``
            attribute.
        min_freq: Minimum number of occurrences for a token to be kept.

    Returns:
        dict mapping token string to integer index.
    """
    from collections import Counter

    # Counter keeps first-seen order, which makes the index assignment deterministic.
    counter = Counter(tok.text for text in texts for tok in tokenizer(text))

    vocab = {"<s>": 0, "</s>": 1, "<blank>": 2, "<unk>": 3}
    for word, freq in counter.items():
        # Skip reserved symbols so they cannot be overwritten.
        if freq >= min_freq and word not in vocab:
            vocab[word] = len(vocab)
    return vocab

def collate_batch(batch):
    """Stack a list of {'src', 'tgt'} items into two right-padded tensors.

    Sequences are padded with index 2 up to the longest sequence of the
    batch; the batch dimension comes first.
    """
    pad = torch.nn.utils.rnn.pad_sequence
    return {
        field: pad([item[field] for item in batch], batch_first=True, padding_value=2)
        for field in ('src', 'tgt')
    }

def create_dataloaders(batch_size=32):
    """Build the training and validation DataLoaders plus both vocabularies.

    Loads the German and English spaCy pipelines, reads the corpus, builds
    one vocabulary per language from the training split only, and wraps both
    splits in DataLoaders that pad with ``collate_batch``. Only the training
    loader shuffles.

    Returns:
        (train_dataloader, val_dataloader, vocab_src, vocab_tgt)
    """
    # Tokenizers: German for the source side, English for the target side.
    spacy_de = spacy.load("de_core_news_sm")
    spacy_en = spacy.load("en_core_web_sm")

    (train_de, train_en), (val_de, val_en) = create_dataset()

    # Vocabularies come from the training sentences only.
    vocab_src = build_vocab_from_texts(train_de, spacy_de)
    vocab_tgt = build_vocab_from_texts(train_en, spacy_en)

    def make_loader(src_texts, tgt_texts, shuffle):
        """Wrap one split in a dataset and a padded-batch loader."""
        split = TranslationDataset(
            src_texts, tgt_texts,
            vocab_src, vocab_tgt,
            spacy_de, spacy_en
        )
        return DataLoader(
            split,
            batch_size=batch_size,
            shuffle=shuffle,
            collate_fn=collate_batch
        )

    train_dataloader = make_loader(train_de, train_en, True)
    val_dataloader = make_loader(val_de, val_en, False)

    return train_dataloader, val_dataloader, vocab_src, vocab_tgt

In [None]:
# Main execution
if __name__ == "__main__":
    # Single source of truth for the model width: the LR schedule scales with
    # d_model, so the model and the scheduler must agree on it.
    D_MODEL = 512

    # Load data. create_dataloaders() never returns None; a missing spaCy
    # pipeline or a failed download surfaces as an OSError instead, so that
    # is what has to be handled to show the installation hint.
    try:
        train_dataloader, val_dataloader, vocab_src, vocab_tgt = create_dataloaders()
    except OSError as err:
        raise SystemExit(
            f"Failed to load data. Please install spacy models first. ({err})"
        ) from err

    print(f"Source vocab size: {len(vocab_src)}")
    print(f"Target vocab size: {len(vocab_tgt)}")

    # Initialize model
    model = Transformer(
        src_vocab_size=len(vocab_src),
        tgt_vocab_size=len(vocab_tgt),
        d_model=D_MODEL,
        num_layers=6,
        num_heads=8,
        d_ff=2048,
        dropout=0.1
    ).to(device)

    # Cross-entropy that skips padding positions (index 2 is <blank>).
    criterion = nn.CrossEntropyLoss(ignore_index=2)

    # You can also use your LabelSmoothing if preferred:
    # criterion = LabelSmoothing(smoothing=0.1).to(device)

    # The lr given here is only a starting value: the scheduler overwrites
    # the optimizer's learning rate on every scheduler.step().
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

    # Use the custom TransformerLRScheduler
    scheduler = TransformerLRScheduler(optimizer, d_model=D_MODEL, warmup_steps=4000)

    print("Starting training...")
    losses = train_transformer(
        model=model,
        train_dataloader=train_dataloader,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        num_epochs=10,
        device=device
    )

    print("Training completed!")
    print(f"Final losses: {losses}")

Source vocab size: 8014
Target vocab size: 6191
Starting training...
Epoch 1/10
Batch 0, Loss: 9.0173
Batch 100, Loss: 6.3632
Batch 200, Loss: 5.0167
Batch 300, Loss: 4.4785
Batch 400, Loss: 4.1779
Batch 500, Loss: 4.0542
Batch 600, Loss: 3.7182
Batch 700, Loss: 3.7322
Batch 800, Loss: 3.4975
Batch 900, Loss: 2.9767
Epoch 1 Loss: 4.3983
Epoch 2/10
Batch 0, Loss: 3.1839
Batch 100, Loss: 3.1761
Batch 200, Loss: 2.9956
Batch 300, Loss: 2.8564
Batch 400, Loss: 2.6560
Batch 500, Loss: 2.8358
Batch 600, Loss: 2.6247
Batch 700, Loss: 2.1188
Batch 800, Loss: 2.8202
Batch 900, Loss: 2.4986
Epoch 2 Loss: 2.8344
Epoch 3/10
Batch 0, Loss: 2.2864
Batch 100, Loss: 2.3400
Batch 200, Loss: 2.8043
Batch 300, Loss: 2.1772
Batch 400, Loss: 2.2822
Batch 500, Loss: 2.2163
Batch 600, Loss: 2.3084
Batch 700, Loss: 2.4858
Batch 800, Loss: 2.5319
Batch 900, Loss: 2.4284
Epoch 3 Loss: 2.3629
Epoch 4/10
Batch 0, Loss: 1.7197
Batch 100, Loss: 2.0489
Batch 200, Loss: 1.9925
Batch 300, Loss: 2.1243
Batch 400, Loss: