In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import math

In [None]:
# --- Step 1: Define Key Modules ---

class PositionalEncoding(nn.Module):
    """
    Adds positional information to the input embeddings.

    Transformers process every position of a sequence in parallel, so they
    have no inherent sense of token order. Adding a position-dependent
    encoding to each embedding supplies that ordering information.
    """
    # NOTE(review): in this notebook the `__init__` and `forward` definitions
    # live in the following cells and start at column 0, so as written they
    # are module-level functions rather than methods of this class — confirm
    # whether those cells are meant to be part of this class body.

In [None]:
def __init__(self, d_model, max_len=5000):
        """
        Precompute the sinusoidal positional-encoding table and register it
        as the non-trainable buffer ``pe``.

        Args:
            d_model: Size of the embedding (last) dimension. Both even and
                odd values are supported; with an odd value the last sine
                column simply has no cosine partner.
            max_len: Number of positions to precompute (rows of the table).

        After construction ``self.pe`` has shape (max_len, 1, d_model).
        """
        # The explicit two-argument form is kept on purpose: zero-argument
        # super() only works when the def is lexically inside the class body.
        super(PositionalEncoding, self).__init__()

        # Column vector of positions 0 .. max_len - 1, shape (max_len, 1)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)

        # One frequency per sin/cos pair: exp(-ln(10000) * 2i / d_model),
        # shape (ceil(d_model / 2),)
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
        )

        # Broadcast product, shape (max_len, ceil(d_model / 2))
        angles = position * div_term

        # Sine goes to the even columns, cosine to the odd columns
        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd column than even columns, so
        # trim the cosine operand to match; this is a no-op for even d_model.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Insert a singleton batch axis: (max_len, d_model) -> (max_len, 1, d_model)
        pe = pe.unsqueeze(1)

        # Register as a buffer so it is stored on the module but is not a parameter
        self.register_buffer('pe', pe)

In [None]:
def forward(self, x):
        """
        Return ``x`` with the stored positional encoding added to it.

        x: input tensor of shape (seq_len, batch_size, d_model)
        """
        # Only the first seq_len rows of the table are needed; the slice is
        # added to x via broadcasting.
        seq_len = x.size(0)
        return x + self.pe[:seq_len, :]

In [None]:
class MultiHeadAttention(nn.Module):
    """
    The core mechanism of the Transformer. It allows the model to weigh
    the importance of different words in a sequence when encoding a single word.
    """
    def __init__(self, d_model, n_heads):
        super(MultiHeadAttention, self).__init__()
        assert d_model % n_heads == 0

        self.d_model = d_model
        self.n_heads = n_heads
        self.d_k = d_model // n_heads # Dimension of each head
        
        # Linear layers for Query, Key, Value
        self.query_linear = nn.Linear(d_model, d_model)
        self.key_linear = nn.Linear(d_model, d_model)
        self.value_linear = nn.Linear(d_model, d_model)