In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import math

In [None]:
# --- Step 1: Define Key Modules ---

class PositionalEncoding(nn.Module):
    """
    Module that adds positional information to the input embeddings.

    Transformers process all positions of a sequence in parallel, so they
    have no inherent sense of word order. Positional encoding supplies
    that ordering information.
    """
    # NOTE(review): this cell contains only the class header and docstring.
    # The `__init__` in the next cell starts at column 0, so as written it
    # would be defined as a module-level function rather than a method of
    # this class. Presumably the cells were meant to be a single class
    # definition -- TODO confirm and merge them so Restart & Run All works.

In [None]:
def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        
        # Create a matrix of shape (max_len, d_model)
        pe = torch.zeros(max_len, d_model)
        # Create a tensor of shape (max_len, 1) representing positions
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)

        # Calculate the denominator for the sine and cosine functions
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))

        # Apply sine to even indices and cosine to odd indices
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)

        # Add a batch dimension to the positional encoding matrix
        pe = pe.unsqueeze(0).transpose(0, 1)