In [3]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
from datasets import load_dataset

ds = load_dataset("bentrevett/multi30k")

KeyboardInterrupt: 

In [5]:
from pathlib import Path
import os

hf_cache = Path.home() / ".cache" / "huggingface" / "hub"
print("Hugging Face cache location:", hf_cache)

# List top-level folders (e.g., datasets you've downloaded)
for item in hf_cache.iterdir():
    print("-", item.name)

Hugging Face cache location: C:\Users\Debojyoti Das\.cache\huggingface\hub
- .locks
- datasets--bentrevett--multi30k
- models--gpt2
- models--microsoft--deberta-v3-small
- version.txt


In [6]:
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
from datasets import load_dataset
from collections import Counter
import re
import os



# --- 1. Data Loading and Preparation ---
user_home = os.path.expanduser("~")
base_path = os.path.join(user_home, ".cache", "huggingface", "hub", "datasets--bentrevett--multi30k", "snapshots", "4589883f3d09d4ef6361784e03f0ead219836469")

data_files = {
    "train" : os.path.join(base_path, "train.jsonl"),
    "validation" : os.path.join(base_path, "val.jsonl"),
    "test" : os.path.join(base_path, "test.jsonl")
}

raw_datasets = load_dataset("json", data_files=data_files)


# --- 2. Tokenization ---
def tokenize_de(text):
    """ Tokenizes German text by converting to lowercase and splitting by whitespace and punctuation. """
    text = re.sub(r'[^a-zA-ZäöüÄÖÜß\s]', '', text) # Keep German characters and letters
    return [tok.lower() for tok in text.split()]

def tokenize_en(text):
    """ Tokenizes English text by converting to lowercase and splitting by whitespace and punctuation. """
    text = re.sub(r'[^a-zA-Z\s]', '', text) # Keep English characters and letters
    return [tok.lower() for tok in text.split()]


# --- 3. Vocabulary Creation ---

class Vocabulary:
    def __init__(self, freq_threshold):
        # Initialize special tokens and mappings.
        # <PAD>: Padding token for sequences of different lengths.
        # <SOS>: Start of Sentence token.
        # <EOS>: End of Sentence token.
        # <UNK>: Unknown token for words not in our vocabulary.
        self.itos = {0: "<PAD>", 1: "<SOS>", 2: "<EOS>", 3: "<UNK>"}
        self.stoi = {v: k for k, v in self.itos.items()}
        self.freq_threshold = freq_threshold
        
    def __len__(self):
        return len(self.itos)
    
    def build_vocabulary(self, sentence_list, tokenizer):
        """ Builds vocabulary from a list of sentences. """
        frequencies = Counter()
        idx = 4
        for sentence in sentence_list:
            for word in tokenizer(sentence):
                frequencies[word] += 1
        
        # Add words to the vocabulary only if their frequency is above the threshold.
        # This helps in filtering out rare words that may not be useful for training.
        for word, freq in frequencies.items():
            if freq >= self.freq_threshold:
                self.stoi[word] = idx
                self.itos[idx] = word
                idx += 1
                
                
# --- 4. Custom Pytorch Dataset ---
# This class will be used by the DataLoader to fetch batches of data at a time.

class Multi30kDataset(Dataset):
    def __init__(self, dataset_split, src_vocab, trg_vocab, src_tokenizer, trg_tokenizer):
        self.dataset_split = dataset_split
        self.src_vocab = src_vocab
        self.trg_vocab = trg_vocab
        self.src_tokenizer = src_tokenizer
        self.trg_tokenizer = trg_tokenizer

    def __len__(self):
        return len(self.dataset_split)

    def __getitem__(self, index):
        # Fetch a single source-target pair.
        src_sentence = self.dataset_split[index]['en']
        trg_sentence = self.dataset_split[index]['de']

        # Tokenize and numericalize the source sentence.
        src_tokens = self.src_tokenizer(src_sentence)
        src_numerical = [self.src_vocab.stoi["<SOS>"]]
        src_numerical.extend([self.src_vocab.stoi.get(token, self.src_vocab.stoi["<UNK>"]) for token in src_tokens])
        src_numerical.append(self.src_vocab.stoi["<EOS>"])

        # Tokenize and numericalize the target sentence.
        trg_tokens = self.trg_tokenizer(trg_sentence)
        trg_numerical = [self.trg_vocab.stoi["<SOS>"]]
        trg_numerical.extend([self.trg_vocab.stoi.get(token, self.trg_vocab.stoi["<UNK>"]) for token in trg_tokens])
        trg_numerical.append(self.trg_vocab.stoi["<EOS>"])

        # Return as PyTorch tensors.
        return torch.tensor(src_numerical), torch.tensor(trg_numerical)
    

# --- 5. Collate Function for Padding ---
class PadCollate:
    def __init__(self, pad_idx):
        self.pad_idx = pad_idx

    def __call__(self, batch):
        # Separate the source and target sequences from the batch.
        source_seqs = [item[0] for item in batch]
        target_seqs = [item[1] for item in batch]

        # Pad the sequences. `pad_sequence` stacks a list of tensors along a new dimension.
        # `batch_first=False` is crucial for Transformer models in PyTorch that expect
        # the sequence length to be the first dimension: (Seq_Len, Batch_Size).
        padded_sources = pad_sequence(source_seqs, batch_first=False, padding_value=self.pad_idx)
        padded_targets = pad_sequence(target_seqs, batch_first=False, padding_value=self.pad_idx)

        return padded_sources, padded_targets

# --- 6. Main Data Setup Function ---
# This function orchestrates the entire process.

def get_dataloaders(batch_size, freq_threshold=2):
    # Load the raw datasets.
    train_data = raw_datasets['train']
    eng_sentences = [item['en'] for item in train_data]
    ger_sentences = [item['de'] for item in train_data]
    
    # Build vocabularies
    src_vocab = Vocabulary(freq_threshold)
    trg_vocab = Vocabulary(freq_threshold)
    src_vocab.build_vocabulary(eng_sentences, tokenize_en)
    trg_vocab.build_vocabulary(ger_sentences, tokenize_de)
    
    # Instantiate the custom Dataset for each split.
    train_dataset = Multi30kDataset(raw_datasets['train'], src_vocab, trg_vocab, tokenize_en, tokenize_de)
    val_dataset = Multi30kDataset(raw_datasets['validation'], src_vocab, trg_vocab, tokenize_en, tokenize_de)
    test_dataset = Multi30kDataset(raw_datasets['test'], src_vocab, trg_vocab, tokenize_en, tokenize_de)

    # Get the padding index for our collate function.
    pad_idx = src_vocab.stoi["<PAD>"]

    # Create DataLoader objects.
    # The DataLoader will automatically use our custom Dataset to get items and our
    # PadCollate class to form batches of padded tensors.
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=PadCollate(pad_idx=pad_idx))
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=PadCollate(pad_idx=pad_idx))
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=PadCollate(pad_idx=pad_idx))

    return train_loader, val_loader, test_loader, src_vocab, trg_vocab

In [7]:
# Add this to the end of your step1_data_prep.py file to test it
if __name__ == "__main__":
    # Define hyperparameters
    BATCH_SIZE = 4
    FREQ_THRESHOLD = 2

    # Get the DataLoaders and vocabularies
    train_loader, _, _, src_vocab, trg_vocab = get_dataloaders(
        batch_size=BATCH_SIZE,
        freq_threshold=FREQ_THRESHOLD
    )

    print(f"Source (English) vocabulary size: {len(src_vocab)}")
    print(f"Target (German) vocabulary size: {len(trg_vocab)}")

    # Fetch one batch from the training loader
    print("\n--- Demonstrating one batch of data ---")
    src_batch, trg_batch = next(iter(train_loader))

    print(f"Shape of the source batch tensor: {src_batch.shape}")
    print(f"Shape of the target batch tensor: {trg_batch.shape}")
    print("Note: Shape is (Sequence_Length, Batch_Size)")

    # Select the first example from the batch to inspect
    src_example = src_batch[:, 0]
    trg_example = trg_batch[:, 0]

    print("\n--- Inspecting the first example in the batch ---")
    print("Source (English) numericalized tensor:")
    print(src_example)
    print("\nTarget (German) numericalized tensor:")
    print(trg_example)

    # Convert the numericalized tensors back to text to verify
    src_text = " ".join([src_vocab.itos[idx.item()] for idx in src_example])
    trg_text = " ".join([trg_vocab.itos[idx.item()] for idx in trg_example])

    print("\n--- Converting the first example back to text ---")
    print(f"Source Text: {src_text}")
    print(f"Target Text: {trg_text}")

Source (English) vocabulary size: 5919
Target (German) vocabulary size: 7810

--- Demonstrating one batch of data ---
Shape of the source batch tensor: torch.Size([15, 4])
Shape of the target batch tensor: torch.Size([14, 4])
Note: Shape is (Sequence_Length, Batch_Size)

--- Inspecting the first example in the batch ---
Source (English) numericalized tensor:
tensor([  1,  19, 240,  72, 370,  33,  15,  19, 557,   2,   0,   0,   0,   0,
          0])

Target (German) numericalized tensor:
tensor([  1,  46, 122, 352,  31,  11,  28, 531,   2,   0,   0,   0,   0,   0])

--- Converting the first example back to text ---
Source Text: <SOS> a group of dogs standing in a river <EOS> <PAD> <PAD> <PAD> <PAD> <PAD>
Target Text: <SOS> eine gruppe hunde steht in einem fluss <EOS> <PAD> <PAD> <PAD> <PAD> <PAD>


In [None]:
import torch
import torch.nn as nn

class MultiHeadAttention(nn.Module):
    def __init__(self, embed_size, heads):
        """
        Initializes the Multi-Head Attention layer. This is the core component
        that allows the model to weigh the importance of different words in the
        input sequence.

        Args:
            embed_size (int): The dimensionality of the input and output embeddings (d_model).
            heads (int): The number of attention heads (h). The embedding will be split among these heads.
        """
        super(MultiHeadAttention, self).__init__()
        self.embed_size = embed_size
        self.heads = heads
        # The dimension of each head's key, query, and value vectors.
        self.head_dim = embed_size // heads

        # This assertion ensures that the embedding size can be evenly split into the number of heads.
        # For example, if embed_size=512 and heads=8, head_dim will be 64.
        assert (
            self.head_dim * heads == embed_size
        ), "Embedding size needs to be divisible by heads"

        # We create one large linear layer for each of queries, keys, and values.
        # This is more efficient than creating 'h' separate small linear layers.
        # These layers will project the input embeddings into the Q, K, V spaces.
        self.values = nn.Linear(self.embed_size, self.embed_size, bias=False)
        self.keys = nn.Linear(self.embed_size, self.embed_size, bias=False)
        self.queries = nn.Linear(self.embed_size, self.embed_size, bias=False)
        
        # This is the final linear layer (W^O in the paper) that combines the outputs of all attention heads.
        self.fc_out = nn.Linear(self.embed_size, self.embed_size)

    def forward(self, values, keys, queries, mask):
        # In Encoder self-attention, the values, keys, and queries are all the same input that is passed to the attention layer.



        # Get the batch size, which is the second dimension of the input tensors.
        N = queries.shape[1]

        # Get the sequence lengths for values, keys, and queries from the first dimension.
        value_len, key_len, query_len = values.shape[0], keys.shape[0], queries.shape[0]

        # Pass the inputs through their respective linear layers. The shape remains unchanged.
        # Shape: (seq_len, N, embed_size) -> (seq_len, N, embed_size)
        values = self.values(values)
        keys = self.keys(keys)
        queries = self.queries(queries)

        # Reshape the Q, K, V tensors to split the `embed_size` dimension into `heads` and `head_dim`.
        # The reshape operation splits the last dimension.
        # Shape: (seq_len, N, embed_size) -> (seq_len, N, heads, head_dim)
        values = values.reshape(value_len, N, self.heads, self.head_dim)
        keys = keys.reshape(key_len, N, self.heads, self.head_dim)
        queries = queries.reshape(query_len, N, self.heads, self.head_dim)

        # For batch matrix multiplication in PyTorch, the batch dimensions (N and heads) must come first.
        # We use permute to reorder the dimensions.
        # Original order: (0: seq_len, 1: N, 2: heads, 3: head_dim) -> (N, heads, seq_len, head_dim).
        queries = queries.permute(1, 2, 0, 3)
        keys = keys.permute(1, 2, 0, 3)
        values = values.permute(1, 2, 0, 3)

        # Compute the dot product of queries and keys to get the attention "energy" or raw scores.
        # Q shape: (N, heads, query_len, head_dim)
        # K.transpose shape: (N, heads, head_dim, key_len)
        # energy shape: (N, heads, query_len, key_len)
        energy = torch.matmul(queries, keys.transpose(-2, -1))

        # Apply the mask if it is provided. The mask is used to hide padding or future tokens. Where the mask is 0, we set the energy to a very small number.
        if mask is not None:
            energy = energy.masked_fill(mask == 0, float("-1e20"))

        # Apply softmax along the last dimension (key_len) to get attention weights.
        # This normalizes the scores into a probability distribution. Also apply the scaling factor by dividing by sqrt(d_model).
        # attention shape: (N, heads, query_len, key_len)
        attention = torch.softmax(energy / (self.embed_size ** (1 / 2)), dim=-1)


        # Multiply the attention weights with the value vectors. Attention shape: (N, heads, query_len, key_len). Values shape: (N, heads, value_len, head_dim)
        # out shape: (N, heads, query_len, head_dim)
        out = torch.matmul(attention, values)


        # First, permute to bring seq_len back after the batch dimension, preparing for concatenation.
        # Shape: (N, heads, query_len, head_dim) -> (N, query_len, heads, head_dim)
        out = out.permute(0, 2, 1, 3).contiguous()

        # Reshape to combine the heads back into a single `embed_size` dimension. This is the "concatenation" step.
        # Shape: (N, query_len, heads, head_dim) -> (N, query_len, embed_size)
        out = out.reshape(N, query_len, self.heads * self.head_dim)

        # Transpose the result back to the (seq_len, N, embed_size) format, which is our convention.
        # Shape: (N, query_len, embed_size) -> (query_len, N, embed_size)
        out = out.transpose(0, 1)

        # Pass the concatenated output through the final linear layer (W^O).
        out = self.fc_out(out)
        
        return out


class TransformerBlock(nn.Module):
    def __init__(self, embed_size, heads, dropout, forward_expansion):
        """
        Initializes a standard Transformer block, the repeating unit of the encoder.
        It contains a multi-head attention layer and a feed-forward network.

        Args:
            embed_size (int): The dimensionality of the embeddings (d_model).
            heads (int): The number of attention heads.
            dropout (float): The dropout rate for regularization.
            forward_expansion (int): The expansion factor for the FFN's inner dimension.
        """
        super(TransformerBlock, self).__init__()
        
        # The first sub-layer: Multi-Head Attention
        self.attention = MultiHeadAttention(embed_size, heads)
        
        # Two Layer Normalization modules. One for each sub-layer.
        self.norm1 = nn.LayerNorm(embed_size)
        self.norm2 = nn.LayerNorm(embed_size)
        
        # The second sub-layer: Position-wise Feed-Forward Network
        self.feed_forward = nn.Sequential(
            nn.Linear(embed_size, forward_expansion * embed_size),
            nn.ReLU(),
            nn.Linear(forward_expansion * embed_size, embed_size)
        )
        
        # A dropout layer to be applied after each sub-layer's output before adding to the residual.
        self.dropout = nn.Dropout(dropout)

    def forward(self, value, key, query, mask):
        """
        The forward pass for the Transformer block.
        """
        # --- First Sub-layer: Multi-Head Attention followed by Add & Norm ---
        
        attention_out = self.attention(value, key, query, mask)
        
        # Apply the residual connection (Add) and dropout.
        # The 'query' input serves as the residual connection from before the attention layer.
        add_attention = query + self.dropout(attention_out)
        

        norm_attention_out = self.norm1(add_attention)
        
        # --- Second Sub-layer: Feed-Forward Network followed by Add & Norm ---
        
        forward_out = self.feed_forward(norm_attention_out)
        
        add_forward = norm_attention_out + self.dropout(forward_out)
        
        norm_forward_out = self.norm2(add_forward)
        
        return norm_forward_out


class Encoder(nn.Module):
    def __init__(
        self,
        src_vocab_size,
        embed_size,
        num_layers,
        heads,
        device,
        forward_expansion,
        dropout,
        max_length,
    ):
        """
        Initializes the Encoder, which is a stack of TransformerBlocks.
        """
        super(Encoder, self).__init__()
        self.embed_size = embed_size
        self.device = device


        # A lookup table for word embeddings.
        self.word_embedding = nn.Embedding(src_vocab_size, embed_size)
        # A lookup table for positional embeddings (learnable version).
        self.positional_embedding = nn.Embedding(max_length, embed_size)


        self.layers = nn.ModuleList(
            [
                TransformerBlock(
                    embed_size,
                    heads,
                    dropout=dropout,
                    forward_expansion=forward_expansion,
                )
                for _ in range(num_layers)
            ]
        )
        
        # A dropout layer for regularization after adding embeddings.
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        """
        The forward pass for the Encoder.

        Args:
            x (torch.Tensor): Input tensor of token indices. Shape: (seq_len, N).
            mask (torch.Tensor): Source padding mask.
        """
        # Get sequence length and batch size from the input tensor shape.
        seq_length, N = x.shape
        
        # Create a tensor of position indices: [0, 1, 2, ..., seq_len-1].
        # .expand() repeats it for each item in the batch.
        # .transpose() changes shape from (N, seq_len) to (seq_len, N) to match x.
        positions = torch.arange(0, seq_length).expand(N, seq_length).transpose(0, 1).to(self.device)
        
        # --- Apply Embeddings and Positional Encoding ---
        # The input x needs to be (N, seq_len) for nn.Embedding, so we transpose it.
        word_embed = self.word_embedding(x.transpose(0, 1))
        pos_embed = self.positional_embedding(positions.transpose(0, 1))
        
        # Add word and positional embeddings, apply dropout, and transpose back to our (seq_len, N, embed_size) convention.
        out = self.dropout(word_embed + pos_embed).transpose(0, 1)
        
        # --- Pass through Transformer Blocks ---
        # Sequentially pass the output through each TransformerBlock.
        # For self-attention in the encoder, value, key, and query are all the same.
        for layer in self.layers:
            out = layer(out, out, out, mask)
            
        return out


class DecoderBlock(nn.Module):
    def __init__(self, embed_size, heads, forward_expansion, dropout, device):
        """
        Initializes a single Decoder block. It has three sub-layers:
        masked self-attention, cross-attention, and a feed-forward network.
        """
        super(DecoderBlock, self).__init__()
        self.norm = nn.LayerNorm(embed_size)
        # This is the masked self-attention layer for the target sequence.
        self.attention = MultiHeadAttention(embed_size, heads)
        
        # The second and third sub-layers (cross-attention + FFN) are encapsulated
        # in a standard TransformerBlock for code reuse.
        self.transformer_block = TransformerBlock(
            embed_size, heads, dropout, forward_expansion
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, value, key, src_mask, trg_mask):
        """
        The forward pass for the Decoder block.
        """
        # --- 1. Masked Multi-Head Self-Attention + Add & Norm ---
        # The first sub-layer is self-attention on the target sequence `x`.
        # We pass `trg_mask` to prevent it from attending to future tokens.
        attention = self.attention(x, x, x, trg_mask)
        # Apply residual connection and dropout.
        query = x + self.dropout(attention)
        # Apply layer normalization.
        query = self.norm(query)
        
        # --- 2. Cross-Attention + FFN (handled by TransformerBlock) ---
        # The output of the first sub-layer (`query`) is used as the query for cross-attention.
        # The `value` and `key` come from the encoder's output.
        # The `src_mask` is used here to hide padding in the source sequence.
        out = self.transformer_block(value, key, query, src_mask)
        
        return out


class Decoder(nn.Module):
    def __init__(
        self,
        trg_vocab_size,
        embed_size,
        num_layers,
        heads,
        forward_expansion,
        dropout,
        device,
        max_length,
    ):
        """
        Initializes the Decoder, a stack of DecoderBlocks, culminating in a final linear layer.
        """
        super(Decoder, self).__init__()
        self.device = device
        self.word_embedding = nn.Embedding(trg_vocab_size, embed_size)
        self.positional_embedding = nn.Embedding(max_length, embed_size)

        # A stack of N DecoderBlocks.
        self.layers = nn.ModuleList(
            [
                DecoderBlock(embed_size, heads, forward_expansion, dropout, device)
                for _ in range(num_layers)
            ]
        )
        # The final linear layer that projects the decoder output to the vocabulary size to get logits.
        self.fc_out = nn.Linear(embed_size, trg_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_out, src_mask, trg_mask):
        """
        The forward pass for the Decoder.

        Args:
            x (torch.Tensor): Target token indices. Shape: (trg_len, N).
            enc_out (torch.Tensor): Output from the encoder. Shape: (src_len, N, embed_size).
            src_mask, trg_mask: Masks for source and target sequences.
        """
        # Get sequence length and batch size.
        seq_length, N = x.shape
        # Create position indices.
        positions = torch.arange(0, seq_length).expand(N, seq_length).transpose(0, 1).to(self.device)
        
        # --- Apply Embeddings and Positional Encoding ---
        # Transpose to (N, trg_len) for embedding, then add and transpose back to (trg_len, N, embed_size).
        word_embed = self.word_embedding(x.transpose(0, 1))
        pos_embed = self.positional_embedding(positions.transpose(0, 1))
        x = self.dropout(word_embed + pos_embed).transpose(0, 1)
        
        # --- Pass through Decoder Blocks ---
        # Sequentially pass through each DecoderBlock.
        for layer in self.layers:
            # `enc_out` is passed as the key and value for cross-attention in each block.
            x = layer(x, enc_out, enc_out, src_mask, trg_mask)
            
        # --- Final Output Layer ---
        # Pass the final output through the linear layer to get logits.
        out = self.fc_out(x)
        
        return out

        
class Transformer(nn.Module):
    def __init__(
        self,
        src_vocab_size,
        trg_vocab_size,
        src_pad_idx,
        trg_pad_idx,
        embed_size=512,
        num_layers=6,
        forward_expansion=4,
        heads=8,
        dropout=0.1,
        device="cpu",
        max_length=100,
    ):
        """
        Initializes the complete Transformer model by assembling the Encoder and Decoder.
        """
        super(Transformer, self).__init__()

        # Instantiate the Encoder.
        self.encoder = Encoder(
            src_vocab_size, embed_size, num_layers, heads, device, forward_expansion, dropout, max_length
        )

        # Instantiate the Decoder.
        self.decoder = Decoder(
            trg_vocab_size, embed_size, num_layers, heads, forward_expansion, dropout, device, max_length
        )

        # Store padding indices and the device, which are needed for mask creation.
        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx
        self.device = device

    def make_src_mask(self, src):
        """
        Creates a mask for the source sequence to hide padding tokens.
        Input `src` has shape (src_len, N).
        """
        # Transpose src to (N, src_len) to easily compare with the padding index.
        src = src.transpose(0, 1)
        # Create a boolean mask where `False` indicates a padding token.
        # Then add dimensions to make it broadcastable with the attention energy matrix (N, heads, query_len, key_len).
        src_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)
        # Final shape: (N, 1, 1, src_len)
        return src_mask.to(self.device)

    def make_trg_mask(self, trg):
        """
        Creates a mask for the target sequence, combining a padding mask and a causal mask.
        Input `trg` has shape (trg_len, N).
        """
        # Transpose trg to (N, trg_len) to get batch size and sequence length.
        trg = trg.transpose(0, 1)
        N, trg_len = trg.shape
        
        # 1. Causal (Look-ahead) Mask: A lower triangular matrix of ones.
        # Shape: (trg_len, trg_len)
        trg_look_ahead_mask = torch.tril(torch.ones((trg_len, trg_len))).to(self.device)
        
        # 2. Padding Mask for the target sequence.
        # Shape: (N, 1, trg_len, 1) after unsqueezing.
        trg_pad_mask = (trg != self.trg_pad_idx).unsqueeze(1).unsqueeze(2) # Swapped dimensions for correct broadcasting
        
        # 3. Combine the masks using a bitwise AND.
        # The final mask will be 1 only if a token is not padding AND is not in the future.
        # Broadcasting handles the dimension mismatch.
        trg_mask = trg_pad_mask & trg_look_ahead_mask.bool()
        # Final shape: (N, 1, trg_len, trg_len)
        return trg_mask.to(self.device)

    def forward(self, src, trg):
        """
        The forward pass for the entire Transformer model.

        Args:
            src (torch.Tensor): Source sequence. Shape: (src_len, N).
            trg (torch.Tensor): Target sequence. Shape: (trg_len, N).
        """
        # Create the necessary masks for the source and target sequences.
        src_mask = self.make_src_mask(src)
        trg_mask = self.make_trg_mask(trg)
        
        # Pass the source sequence through the encoder to get the contextualized memory.
        enc_src = self.encoder(src, src_mask)
        
        # Pass the encoder's output and the target sequence through the decoder to get the final logits.
        out = self.decoder(trg, enc_src, src_mask, trg_mask)
        
        return out

In [14]:
# train.py

import torch
import torch.nn as nn
import torch.optim as optim

# --- 1. Setup and Hyperparameters ---

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Training hyperparameters
NUM_EPOCHS = 30
LEARNING_RATE = 3e-4 # 0.0003
BATCH_SIZE = 32

# Model hyperparameters
SRC_VOCAB_SIZE = 0 # Will be set after loading data
TRG_VOCAB_SIZE = 0 # Will be set after loading data
EMBED_SIZE = 512
NUM_LAYERS = 3 # A smaller number for faster training on a single machine
HEADS = 8
FORWARD_EXPANSION = 4
DROPOUT = 0.1
MAX_LENGTH = 100
SRC_PAD_IDX = 0 # Will be set after loading data
TRG_PAD_IDX = 0 # Will be set after loading data

# --- 2. Load Data ---
print("Loading data and building vocabularies...")
train_loader, _, _, src_vocab, trg_vocab = get_dataloaders(
    batch_size=BATCH_SIZE
)

# Update vocab sizes and padding indices from the loaded data
SRC_VOCAB_SIZE = len(src_vocab)
TRG_VOCAB_SIZE = len(trg_vocab)
SRC_PAD_IDX = src_vocab.stoi["<PAD>"]
TRG_PAD_IDX = trg_vocab.stoi["<PAD>"]
print("Data loading complete.")
print(f"Source vocabulary size: {SRC_VOCAB_SIZE}")
print(f"Target vocabulary size: {TRG_VOCAB_SIZE}")

# --- 3. Initialize Model, Optimizer, and Loss Function ---

# Initialize the model
model = Transformer(
    src_vocab_size=SRC_VOCAB_SIZE,
    trg_vocab_size=TRG_VOCAB_SIZE,
    src_pad_idx=SRC_PAD_IDX,
    trg_pad_idx=TRG_PAD_IDX,
    embed_size=EMBED_SIZE,
    num_layers=NUM_LAYERS,
    forward_expansion=FORWARD_EXPANSION,
    heads=HEADS,
    dropout=DROPOUT,
    device=device,
    max_length=MAX_LENGTH,
).to(device)

# Initialize the optimizer
# We pass the model's parameters to the optimizer so it knows what to update.
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

# Initialize the loss function
# `ignore_index` is very important. It tells the loss function to ignore the loss
# calculation for tokens that are padding.
criterion = nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX)

# --- 4. The Training Loop ---

print("\n--- Starting Training ---")
for epoch in range(NUM_EPOCHS):
    print(f"[Epoch {epoch+1} / {NUM_EPOCHS}]")
    
    # Set the model to training mode
    model.train()
    
    # We will use this to track the average loss for the epoch
    total_loss = 0
    num_batches = 0
    
    # Iterate over batches from the DataLoader
    for batch_idx, batch in enumerate(train_loader):
        # Move data to the same device as the model
        src_data, trg_data = batch
        src = src_data.to(device)
        trg = trg_data.to(device)
        
        # --- Forward Pass ---
        # The target sequence needs to be prepared for teacher forcing.
        # The input to the decoder should be all tokens except the last one.
        # The ground truth for the loss function should be all tokens except the first one (<SOS>).
        trg_input = trg[:-1, :] # Shape: (trg_len - 1, N)
        
        # The model's forward pass
        output = model(src, trg_input) # Shape: (trg_len - 1, N, trg_vocab_size)
        
        # --- Loss Calculation ---
        # Reshape the output and target for the CrossEntropyLoss function.
        # It expects the output to be (N * (trg_len - 1), trg_vocab_size)
        # and the target to be (N * (trg_len - 1)).
        output_for_loss = output.reshape(-1, output.shape[2])
        target_for_loss = trg[1:, :].reshape(-1)
        
        # Clear previous gradients
        optimizer.zero_grad()
        
        # Calculate the loss
        loss = criterion(output_for_loss, target_for_loss)
        total_loss += loss.item()
        num_batches += 1
        
        # --- Backward Pass and Optimization ---
        # Compute gradients
        loss.backward()
        
        # Clip gradients to prevent them from exploding
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
        
        # Update model parameters
        optimizer.step()
        
        # Optional: Print progress
        if (batch_idx + 1) % 100 == 0:
            print(f"  Batch {batch_idx+1}/{len(train_loader)} | Current Loss: {loss.item():.4f}")

    # Calculate and print average loss for the epoch
    avg_loss = total_loss / num_batches
    print(f"End of Epoch {epoch+1} | Average Loss: {avg_loss:.4f}\n")

print("--- Training Complete ---")

# Optional: Save the trained model
# torch.save(model.state_dict(), "my_transformer_model.pth")
# print("Model saved to my_transformer_model.pth")

Using device: cuda
Loading data and building vocabularies...
Data loading complete.
Source vocabulary size: 5919
Target vocabulary size: 7810

--- Starting Training ---
[Epoch 1 / 30]
  Batch 100/907 | Current Loss: 4.7173
  Batch 200/907 | Current Loss: 4.1020
  Batch 300/907 | Current Loss: 3.4979
  Batch 400/907 | Current Loss: 3.4882
  Batch 500/907 | Current Loss: 3.1177
  Batch 600/907 | Current Loss: 2.8146
  Batch 700/907 | Current Loss: 2.9691
  Batch 800/907 | Current Loss: 2.7814
  Batch 900/907 | Current Loss: 2.9507
End of Epoch 1 | Average Loss: 3.4740

[Epoch 2 / 30]
  Batch 100/907 | Current Loss: 2.3982
  Batch 200/907 | Current Loss: 2.2926
  Batch 300/907 | Current Loss: 2.2116
  Batch 400/907 | Current Loss: 2.1200
  Batch 500/907 | Current Loss: 1.9316
  Batch 600/907 | Current Loss: 2.1181
  Batch 700/907 | Current Loss: 1.7589
  Batch 800/907 | Current Loss: 1.8677
  Batch 900/907 | Current Loss: 1.9866
End of Epoch 2 | Average Loss: 2.1632

[Epoch 3 / 30]
  Batc

In [18]:
# Translate an English sentence using the trained Transformer model

def translate_sentence(sentence, src_vocab, trg_vocab, model, device, max_length=50):
    # Tokenize and numericalize the input sentence
    tokens = ["<SOS>"] + [tok.lower() for tok in sentence.split()] + ["<EOS>"]
    src_indices = [src_vocab.stoi.get(tok, src_vocab.stoi["<UNK>"]) for tok in tokens]
    src_tensor = torch.tensor(src_indices).unsqueeze(1).to(device)  # Shape: (seq_len, 1)

    # Encode source sentence
    with torch.no_grad():
        src_mask = model.make_src_mask(src_tensor)
        enc_src = model.encoder(src_tensor, src_mask)

        # Prepare initial target input (<SOS>)
        trg_indices = [trg_vocab.stoi["<SOS>"]]
        for _ in range(max_length):
            trg_tensor = torch.tensor(trg_indices).unsqueeze(1).to(device)  # Shape: (cur_len, 1)
            trg_mask = model.make_trg_mask(trg_tensor)
            output = model.decoder(trg_tensor, enc_src, src_mask, trg_mask)
            pred_token = output.argmax(2)[-1, 0].item()
            trg_indices.append(pred_token)
            if pred_token == trg_vocab.stoi["<EOS>"]:
                break

    # Convert indices to words
    translated_tokens = [trg_vocab.itos[idx] for idx in trg_indices]
    return " ".join(translated_tokens[1:-1])  # Remove <SOS> and <EOS> for display

# Example usage
test_en_sentence = "This is a test ."
translation = translate_sentence(test_en_sentence, src_vocab, trg_vocab, model, device)
print(f"English: {test_en_sentence}")
print(f"German translation: {translation}")

English: This is a test .
German translation: dies ist ein <UNK>


### Let's work on a model that can identify nouns and replace the token UNK with an appropriate noun in the output.
