In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

import spacy
from collections import Counter
import json
import os

In [2]:
# Load the German and English spaCy pipelines. Only their `.tokenizer`
# attribute is used by tokenize_de / tokenize_en defined below.
spacy_de = spacy.load("de_core_news_sm")
spacy_en = spacy.load("en_core_web_sm")

# Reached only when both spacy.load calls returned without raising.
print("spaCy language models loaded successfully.")

def tokenize_de(text):
    """
    Tokenizes German text from a string into a list of strings (tokens).

    The tokens are returned in their original order; this function does NOT
    reverse the sequence.
    :param text: The German sentence to tokenize.
    :return: A list with the text of each token produced by the spaCy German tokenizer.
    """
    return [tok.text for tok in spacy_de.tokenizer(text)]

def tokenize_en(text):
    """
    Splits an English string into tokens with the spaCy English tokenizer.
    :param text: The English sentence to tokenize.
    :return: A list with the text of each token, in sentence order.
    """
    doc = spacy_en.tokenizer(text)
    return [token.text for token in doc]


# Directory holding the Multi30k `<split>`.jsonl files.
# Set the MULTI30K_PATH environment variable to point at your own copy; the
# fallback is the notebook author's local Hugging Face cache snapshot, kept so
# existing runs behave exactly as before.
DATASET_PATH = os.environ.get(
    "MULTI30K_PATH",
    r"C:\Users\Debojyoti Das\.cache\huggingface\hub\datasets--bentrevett--multi30k\snapshots\4589883f3d09d4ef6361784e03f0ead219836469",
)

def load_data(path, file_prefix):
    """
    Loads data from a .jsonl file and returns a list of (source, target) tuples.

    Each non-blank line must be a JSON object with a 'de' and an 'en' key.
    Blank or whitespace-only lines are skipped instead of being passed to the
    JSON parser, where they would raise.

    :param path: Directory that contains the split files.
    :param file_prefix: Split name without extension, e.g. 'train', 'val' or 'test'.
    :return: A list of (entry['de'], entry['en']) tuples in file order.
    :raises FileNotFoundError: If `<path>/<file_prefix>.jsonl` does not exist.
    :raises json.JSONDecodeError: If a non-blank line is not valid JSON.
    :raises KeyError: If a record lacks the 'de' or 'en' key.
    """
    filepath = os.path.join(path, f"{file_prefix}.jsonl")
    data = []
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            # Tolerate empty separator/trailing lines in hand-edited files.
            if not line.strip():
                continue
            entry = json.loads(line)
            data.append((entry['de'], entry['en']))
    # Return only after the file handle has been closed.
    return data


# Load the training split once; later cells reuse `train_data`.
train_data = load_data(DATASET_PATH, "train")

# Display a sample to verify
print(f"Loaded {len(train_data)} training examples.")
print("Sample training pair:")
print(f"  German (source): {train_data[0][0]}")
print(f"  English (target): {train_data[0][1]}")


# Tokenize the sample
de_tokens = tokenize_de(train_data[0][0])
en_tokens = tokenize_en(train_data[0][1])
print("\nTokenized sample:")
# tokenize_de returns tokens in their original order, so the label must not
# claim that they are reversed.
print(f"  German tokens: {de_tokens}")
print(f"  English tokens: {en_tokens}")

spaCy language models loaded successfully.
Loaded 29000 training examples.
Sample training pair:
  German (source): Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.
  English (target): Two young, White males are outside near many bushes.

Tokenized sample:
  German tokens (reversed): ['Zwei', 'junge', 'weiße', 'Männer', 'sind', 'im', 'Freien', 'in', 'der', 'Nähe', 'vieler', 'Büsche', '.']
  English tokens: ['Two', 'young', ',', 'White', 'males', 'are', 'outside', 'near', 'many', 'bushes', '.']


In [3]:
class Vocabulary:
    """
    Two-way mapping between tokens and integer indices.

    Indices 0-3 are reserved for the special tokens "<pad>", "<sos>", "<eos>"
    and "<unk>"; corpus tokens are appended after them by build_vocabulary.
    """

    def __init__(self, min_freq=2):
        """
        Initializes the Vocabulary.
        :param min_freq: The minimum frequency a token must have to be included in the vocabulary.
        """
        # Initialize the vocabulary with special tokens
        # itos : Index to string mapping
        # stoi : String to index mapping
        self.itos = {0: "<pad>", 1: "<sos>", 2: "<eos>", 3: "<unk>"}
        self.stoi = {v: k for k, v in self.itos.items()}
        self.min_freq = min_freq

    def __len__(self):
        """
        Returns the size of the vocabulary (special tokens included).
        """
        return len(self.itos)

    def build_vocabulary(self, sentence_list, tokenizer):
        """
        Builds (or extends) the vocabulary from a list of sentences.

        Tokens are counted over `sentence_list`; every token seen at least
        `min_freq` times and not already present is appended at the next free
        index. Existing entries, including the special tokens, are never
        reassigned, so calling this method more than once is safe.

        :param sentence_list: A List of sentences(Strings) to build the vocabulary from.
        :param tokenizer: The tokenizer function to use (e.g., tokenize_de or tokenize_en).
        """
        frequencies = Counter()
        for sentence in sentence_list:
            frequencies.update(tokenizer(sentence))

        # Continue after the highest index in use instead of a hard-coded 4,
        # so a second call cannot overwrite earlier entries.
        idx = len(self.itos)
        for word, freq in frequencies.items():
            # Skip rare words, and words that are already mapped (e.g. a literal
            # "<unk>" in the corpus must not steal the special token's index).
            if freq < self.min_freq or word in self.stoi:
                continue
            self.stoi[word] = idx
            self.itos[idx] = word
            idx += 1

    def numericalize(self, text, tokenizer):
        """
        Converts a text to the vocabulary indices of its tokens.
        :param text: The string to tokenize and convert.
        :param tokenizer: The tokenizer function to use (e.g., tokenize_de or tokenize_en).
        :return: A list of indices, one per token; unknown tokens map to the "<unk>" index.
        """
        unk_idx = self.stoi["<unk>"]
        return [self.stoi.get(token, unk_idx) for token in tokenizer(text)]

In [4]:
# Split the (German, English) pairs into two parallel sentence lists.
source_sentences = [german for german, _ in train_data]
target_sentences = [english for _, english in train_data]

# One vocabulary per language, both dropping tokens seen fewer than twice.
source_vocab, target_vocab = Vocabulary(min_freq=2), Vocabulary(min_freq=2)
source_vocab.build_vocabulary(source_sentences, tokenize_de)
target_vocab.build_vocabulary(target_sentences, tokenize_en)

print("\nVocabularies built successfully.")
print(f"Source (German) Vocabulary Size: {len(source_vocab)}")
print(f"Target (English) Vocabulary Size: {len(target_vocab)}")

# --- Test the vocabulary ---
sample_german_sentence, sample_english_sentence = train_data[5]

print(f"\nOriginal German: {sample_german_sentence}")
numericalized_german = source_vocab.numericalize(sample_german_sentence, tokenize_de)
print(f"Numericalized German: {numericalized_german}")

print(f"\nOriginal English: {sample_english_sentence}")
numericalized_english = target_vocab.numericalize(sample_english_sentence, tokenize_en)
print(f"Numericalized English: {numericalized_english}")


Vocabularies built successfully.
Source (German) Vocabulary Size: 8014
Target (English) Vocabulary Size: 6191

Original German: Ein Mann in grün hält eine Gitarre, während der andere Mann sein Hemd ansieht.
Numericalized German: [22, 29, 11, 46, 47, 48, 49, 50, 51, 12, 52, 29, 53, 32, 54, 16]

Original English: A man in green holds a guitar while the other man observes his shirt.
Numericalized English: [25, 32, 17, 46, 47, 21, 48, 49, 42, 50, 32, 51, 52, 34, 14]


In [5]:
class Multi30kDataset(Dataset):
    """Map-style dataset that yields numericalized (source, target) tensor pairs."""

    def __init__(self, data_path, file_prefix, source_vocab, target_vocab, source_tokenizer, target_tokenizer):
        """
        Initializes the custom dataset.
        :param data_path: Path to the dataset directory.
        :param file_prefix: 'train', 'val', or 'test'.
        :param source_vocab: The built source Vocabulary object.
        :param target_vocab: The built target Vocabulary object.
        :param source_tokenizer: The tokenizer for the source language.
        :param target_tokenizer: The tokenizer for the target language.
        """
        # Load the raw text data
        self.data = load_data(data_path, file_prefix)
        self.source_vocab = source_vocab
        self.target_vocab = target_vocab
        self.source_tokenizer = source_tokenizer
        self.target_tokenizer = target_tokenizer

    def __len__(self):
        """Returns the total number of examples in the dataset."""
        return len(self.data)

    def __getitem__(self, index):
        """
        Retrieves one data example, numericalizes it, and adds special tokens.

        Only the target is wrapped in <sos> ... <eos>; the source is returned
        as the bare token indices.

        :param index: The index of the data example to retrieve.
        :return: A tuple of (numericalized_source, numericalized_target), both
                 1-D tensors of dtype torch.long.
        """
        source_text, target_text = self.data[index]

        numericalized_source = self.source_vocab.numericalize(source_text, self.source_tokenizer)
        numericalized_target = self.target_vocab.numericalize(target_text, self.target_tokenizer)

        sos_token_idx = self.target_vocab.stoi["<sos>"]
        eos_token_idx = self.target_vocab.stoi["<eos>"]
        processed_target = [sos_token_idx] + numericalized_target + [eos_token_idx]

        # Pin the dtype: torch.tensor([]) would otherwise infer float32 for an
        # empty sentence, which an embedding lookup cannot consume.
        return (
            torch.tensor(numericalized_source, dtype=torch.long),
            torch.tensor(processed_target, dtype=torch.long),
        )

# Build the three splits with the same vocabularies and tokenizers.
train_dataset, val_dataset, test_dataset = (
    Multi30kDataset(DATASET_PATH, split, source_vocab, target_vocab, tokenize_de, tokenize_en)
    for split in ("train", "val", "test")
)

# Verify one item from the training dataset
source_tensor, target_tensor = train_dataset[0]
print("--- Verifying a single Dataset item ---")
print(f"Source Tensor: {source_tensor}")
print(f"Target Tensor (with <sos> and <eos>): {target_tensor}")
print(f"Shape of source tensor: {source_tensor.shape}")
print(f"Shape of target tensor: {target_tensor.shape}")

--- Verifying a single Dataset item ---
Source Tensor: tensor([ 4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16])
Target Tensor (with <sos> and <eos>): tensor([ 1,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,  2])
Shape of source tensor: torch.Size([13])
Shape of target tensor: torch.Size([13])


In [6]:
class PadCollate:
    """Collate callable that pads variable-length (source, target) tensor pairs."""

    def __init__(self, pad_idx):
        """
        Stores the padding value.
        :param pad_idx: The integer index of the <pad> token.
        """
        self.pad_idx = pad_idx

    def _pad(self, sequences):
        """Pads a list of 1-D tensors to a common length, sequence dimension first."""
        # batch_first=False yields [sequence_length, batch_size], the default
        # layout expected by PyTorch RNNs.
        return torch.nn.utils.rnn.pad_sequence(
            sequences, batch_first=False, padding_value=self.pad_idx
        )

    def __call__(self, batch):
        """
        Called by the DataLoader to turn a list of examples into one batch.
        :param batch: A list of (source_tensor, target_tensor) tuples.
        :return: A tuple of (padded_sources, padded_targets), each padded to the
                 longest sequence of its own side of the batch.
        """
        sources, targets = [], []
        for item in batch:
            sources.append(item[0])
            targets.append(item[1])
        return self._pad(sources), self._pad(targets)



BATCH_SIZE = 32
# Index used for padding; later cells also read this module-level name.
pad_idx = source_vocab.stoi["<pad>"]

collate_fn = PadCollate(pad_idx=pad_idx)

# Create the DataLoaders (only the training loader shuffles)
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(dataset=val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

print("\n--- Verifying a single DataLoader batch ---")

# Pull one batch; later verification cells reuse source_batch / target_batch.
source_batch, target_batch = next(iter(train_loader))

print(f"Shape of source batch tensor: {source_batch.shape}")
print(f"Shape of target batch tensor: {target_batch.shape}")
print("\nSource batch tensor (first 5 tokens of first 3 sentences):")
print(source_batch[:5, :3])


--- Verifying a single DataLoader batch ---
Shape of source batch tensor: torch.Size([22, 32])
Shape of target batch tensor: torch.Size([28, 32])

Source batch tensor (first 5 tokens of first 3 sentences):
tensor([[  17,   22,   22],
        [ 363,  232, 2244],
        [ 266,  233, 1157],
        [  34,   50,  212],
        [  35,   12,  240]])


In [7]:
class Encoder(nn.Module):
    """Bidirectional GRU encoder producing per-token annotations and an initial decoder state."""

    def __init__(self, input_dim, emb_dim, enc_hidden_dim, dec_hidden_dim, dropout, pad_idx=0):
        """
        Initializes the Encoder module.
        :param input_dim: The size of the source vocabulary.
        :param emb_dim: The dimensionality of the word embeddings.
        :param enc_hidden_dim: The dimensionality of the encoder's hidden state (for each direction).
        :param dec_hidden_dim: The dimensionality of the decoder's hidden state.
        :param dropout: The dropout probability.
        :param pad_idx: Index of the <pad> token, passed to nn.Embedding as padding_idx.
                        Defaults to 0, the index Vocabulary reserves for "<pad>", so the
                        class no longer depends on a notebook-level global.
        """
        super().__init__()

        self.embedding = nn.Embedding(input_dim, emb_dim, padding_idx=pad_idx)

        # Takes embeddings as input and outputs the hidden states for each time
        # step; enc_hidden_dim is the hidden size of each direction.
        self.rnn = nn.GRU(emb_dim, enc_hidden_dim, bidirectional=True)

        # Maps the concatenated final forward/backward states to the decoder's
        # hidden state size.
        self.fc = nn.Linear(enc_hidden_dim * 2, dec_hidden_dim)

        # Dropout layer applied to the embeddings.
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        """
        Defines the forward pass of the encoder.
        :param src: Source sentence tensor. Shape: [src_len, batch_size]
        :return:
            - outputs: The concatenated top-layer hidden states from each time step.
                       These are the annotations. Shape: [src_len, batch_size, enc_hidden_dim * 2]
            - hidden: The initial decoder hidden state, derived from the encoder's final states.
                      Shape: [batch_size, dec_hidden_dim]
        """
        # embedded shape: [src_len, batch_size, emb_dim]
        embedded = self.dropout(self.embedding(src))

        # outputs: both directions concatenated for every time step.
        # hidden: final state of each direction, stacked on dim 0.
        outputs, hidden = self.rnn(embedded)

        # hidden[-2] is the last forward state, hidden[-1] the last backward state.
        hidden_cat = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)

        # Project to the decoder's hidden size and squash with tanh.
        hidden = torch.tanh(self.fc(hidden_cat))

        return outputs, hidden
    

# Model hyperparameters; later cells reuse these module-level names.
INPUT_DIM = len(source_vocab)
EMB_DIM = 256
ENC_HID_DIM = 512  # Hidden state dimensionality for each direction
DEC_HID_DIM = 512  # Hidden state dimensionality for the decoder
DROPOUT = 0.5  # Dropout probability

encoder = Encoder(INPUT_DIM, EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DROPOUT)

print("--- Verifying the Encoder module ---")
print(f"Encoder architecture:\n{encoder}\n")

# Run the sample batch from the DataLoader check through the encoder.
encoder_outputs, encoder_hidden = encoder(source_batch)

print(f"Shape of encoder outputs (annotations): {encoder_outputs.shape}")
print("This corresponds to [source_sequence_length, batch_size, encoder_hidden_dim * 2]\n")
print(f"Shape of final hidden state (for decoder input): {encoder_hidden.shape}")
print("This corresponds to [batch_size, decoder_hidden_dim]")

--- Verifying the Encoder module ---
Encoder architecture:
Encoder(
  (embedding): Embedding(8014, 256, padding_idx=0)
  (rnn): GRU(256, 512, bidirectional=True)
  (fc): Linear(in_features=1024, out_features=512, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

Shape of encoder outputs (annotations): torch.Size([22, 32, 1024])
This corresponds to [source_sequence_length, batch_size, encoder_hidden_dim * 2]

Shape of final hidden state (for decoder input): torch.Size([32, 512])
This corresponds to [batch_size, decoder_hidden_dim]


In [17]:
import torch.nn.functional as F

class Attention(nn.Module):
    """Additive (concatenation-based) attention over the encoder annotations."""

    def __init__(self, enc_hid_dim, dec_hid_dim):
        """
        Initializes the Attention module.
        :param enc_hid_dim: The dimensionality of the encoder's hidden state (per direction).
        :param dec_hid_dim: The dimensionality of the decoder's hidden state.
        """
        super().__init__()

        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim

        # The alignment model as described in the paper: e = V.T * tanh(W*s + U*h)
        # W*s and U*h are combined into a single linear layer applied to the
        # concatenation of the decoder hidden state and an encoder annotation.
        self.attn_in = nn.Linear((enc_hid_dim * 2) + dec_hid_dim, dec_hid_dim)

        self.v = nn.Linear(dec_hid_dim, 1, bias=False)

    def forward(self, decoder_hidden, encoder_outputs, mask=None):
        """
        Defines the forward pass for the attention mechanism.
        :param decoder_hidden: The previous hidden state from the decoder.
                               Shape: [batch_size, dec_hid_dim]
        :param encoder_outputs: The sequence of annotations from the encoder.
                                Shape: [src_len, batch_size, enc_hid_dim * 2]
        :param mask: Optional tensor of shape [batch_size, src_len]; positions where it is
                     0/False (e.g. <pad> tokens) receive zero attention weight. When None
                     (the default) every source position takes part, as before.
        :return:
            - attention_weights: A tensor of attention weights that sums to 1 over src_len.
                                 Shape: [batch_size, src_len]
        """
        src_len = encoder_outputs.shape[0]

        # Broadcast the decoder state over the source positions. expand creates
        # a view; torch.cat below materializes the data exactly once.
        # [batch_size, dec_hid_dim] -> [batch_size, src_len, dec_hid_dim]
        repeated_decoder_hidden = decoder_hidden.unsqueeze(1).expand(-1, src_len, -1)

        # [src_len, batch_size, enc_hid_dim * 2] -> [batch_size, src_len, enc_hid_dim * 2]
        encoder_outputs = encoder_outputs.permute(1, 0, 2)

        # concat_input shape: [batch_size, src_len, (enc_hid_dim * 2) + dec_hid_dim]
        concat_input = torch.cat((repeated_decoder_hidden, encoder_outputs), dim=2)

        # energy shape: [batch_size, src_len, dec_hid_dim]
        energy = torch.tanh(self.attn_in(concat_input))

        # attention shape: [batch_size, src_len, 1] -> [batch_size, src_len]
        attention = self.v(energy).squeeze(2)

        if mask is not None:
            # Use the most negative finite value rather than -inf so that a row
            # with every position masked yields a uniform distribution, not NaN.
            attention = attention.masked_fill(mask == 0, torch.finfo(attention.dtype).min)

        # Softmax over the source positions.
        return F.softmax(attention, dim=1)


# Same values as in the encoder cell; repeated so this cell runs on its own.
ENC_HID_DIM = 512
DEC_HID_DIM = 512

attention_module = Attention(enc_hid_dim=ENC_HID_DIM, dec_hid_dim=DEC_HID_DIM)

print("--- Verifying the Attention module ---")
print(f"Attention module architecture:\n{attention_module}\n")

# Reuse the tensors from the Encoder verification step:
# encoder_outputs: [src_len, batch_size, enc_hid_dim * 2]
# encoder_hidden:  [batch_size, dec_hid_dim] (stands in for the first decoder state s_0)
attention_weights = attention_module(encoder_hidden, encoder_outputs)

print(f"Shape of output attention weights: {attention_weights.shape}")
print("This corresponds to [batch_size, source_sequence_length]\n")

# Each row of weights should add up to 1 after the softmax.
sum_of_weights = attention_weights.sum(dim=1)
print(f"Sum of weights for the first 5 examples in the batch:\n{sum_of_weights[:5]}")
print("\nEach sum should be very close to 1.0, confirming softmax is working correctly.")
print("\nEach sum should be very close to 1.0, confirming softmax is working correctly.")

--- Verifying the Attention module ---
Attention module architecture:
Attention(
  (attn_in): Linear(in_features=1536, out_features=512, bias=True)
  (v): Linear(in_features=512, out_features=1, bias=False)
)

Shape of output attention weights: torch.Size([32, 22])
This corresponds to [batch_size, source_sequence_length]

Sum of weights for the first 5 examples in the batch:
tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000], grad_fn=<SliceBackward0>)

Each sum should be very close to 1.0, confirming softmax is working correctly.


In [None]:
class Decoder(nn.Module):
    """Single-step GRU decoder that attends over the encoder annotations."""

    def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout, attention, pad_idx=0):
        """
        Initializes the Decoder module.
        :param output_dim: The size of the target vocabulary.
        :param emb_dim: The dimensionality of the word embeddings.
        :param enc_hid_dim: The dimensionality of the encoder's hidden state (per direction).
        :param dec_hid_dim: The dimensionality of the decoder's hidden state.
        :param dropout: The dropout probability.
        :param attention: A module called as attention(hidden, encoder_outputs) that
                          returns weights of shape [batch_size, src_len].
        :param pad_idx: Index of the <pad> token, passed to nn.Embedding as padding_idx.
                        Defaults to 0, the index Vocabulary reserves for "<pad>", so the
                        class no longer depends on a notebook-level global.
        """
        super().__init__()

        self.output_dim = output_dim
        self.attention = attention

        # 1. Embedding Layer for target vocabulary
        self.embedding = nn.Embedding(output_dim, emb_dim, padding_idx=pad_idx)

        # 2. GRU Layer
        # Its input at each step is the previous word's embedding concatenated
        # with the context vector: emb_dim + (enc_hid_dim * 2).
        self.rnn = nn.GRU(emb_dim + (enc_hid_dim * 2), dec_hid_dim)

        # 3. Fully Connected Layer (Readout layer)
        # Consumes the GRU output, the context vector and the input embedding.
        self.fc_out = nn.Linear(dec_hid_dim + (enc_hid_dim * 2) + emb_dim, output_dim)

        # 4. Dropout Layer
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, encoder_outputs):
        """
        Defines the forward pass for a SINGLE time step of the decoder.
        :param input: The current input token (previous word). Shape: [batch_size]
        :param hidden: The previous hidden state from the decoder. Shape: [batch_size, dec_hid_dim]
        :param encoder_outputs: The sequence of annotations from the encoder.
                                Shape: [src_len, batch_size, enc_hid_dim * 2]
        :return:
            - prediction: Raw logits for the next word. Shape: [batch_size, output_dim]
            - hidden: The new decoder hidden state. Shape: [batch_size, dec_hid_dim]
            - attention: The attention weights for this step. Shape: [batch_size, src_len]
        """
        # Add the sequence dimension: [batch_size] -> [1, batch_size]
        input = input.unsqueeze(0)

        # 1. Embed and apply dropout. embedded shape: [1, batch_size, emb_dim]
        embedded = self.dropout(self.embedding(input))

        # 2. Attention weights from the *previous* decoder state and all annotations.
        # attention_weights shape: [batch_size, src_len]
        attention_weights = self.attention(hidden, encoder_outputs)

        # 3. Context vector = attention-weighted sum of the annotations.
        # [batch_size, 1, src_len] x [batch_size, src_len, enc_hid_dim * 2]
        #   -> [batch_size, 1, enc_hid_dim * 2]
        attention_weights_unsqueezed = attention_weights.unsqueeze(1)
        encoder_outputs_permuted = encoder_outputs.permute(1, 0, 2)
        context_vector = torch.bmm(attention_weights_unsqueezed, encoder_outputs_permuted)

        # Back to sequence-first layout: [1, batch_size, enc_hid_dim * 2]
        context_vector = context_vector.permute(1, 0, 2)

        # 4. GRU input: [1, batch_size, emb_dim + enc_hid_dim * 2]
        rnn_input = torch.cat((embedded, context_vector), dim=2)

        # 5. One GRU step. The GRU expects hidden as [num_layers, batch_size, dec_hid_dim].
        output, hidden = self.rnn(rnn_input, hidden.unsqueeze(0))

        # Drop the size-1 sequence/layer dimension from every tensor.
        embedded = embedded.squeeze(0)
        output = output.squeeze(0)
        context_vector = context_vector.squeeze(0)
        hidden = hidden.squeeze(0)  # ready to be fed back in at the next time step

        # 6. Readout: logits from GRU output, context vector and input embedding.
        prediction_input = torch.cat((output, context_vector, embedded), dim=1)
        prediction = self.fc_out(prediction_input)

        return prediction, hidden, attention_weights


OUTPUT_DIM = len(target_vocab)
# EMB_DIM, ENC_HID_DIM, DEC_HID_DIM and DROPOUT come from the earlier cells.

# The decoder wraps the attention module built in the previous step.
decoder = Decoder(
    output_dim=OUTPUT_DIM,
    emb_dim=EMB_DIM,
    enc_hid_dim=ENC_HID_DIM,
    dec_hid_dim=DEC_HID_DIM,
    dropout=DROPOUT,
    attention=attention_module,
)

print("--- Verifying the Decoder module ---")
print(f"Decoder architecture:\n{decoder}\n")

# First row of the target batch ([trg_len, batch_size]) holds the <sos> tokens.
decoder_input_token = target_batch[0]

# encoder_outputs and encoder_hidden come from the Encoder verification step.
prediction, decoder_hidden, attention = decoder(decoder_input_token, encoder_hidden, encoder_outputs)

print(f"Shape of prediction (logits): {prediction.shape}")
print("This corresponds to [batch_size, target_vocabulary_size]\n")

print(f"Shape of new decoder hidden state: {decoder_hidden.shape}")
print("This corresponds to [batch_size, decoder_hidden_dim]\n")

print(f"Shape of attention weights for this step: {attention.shape}")
print("This corresponds to [batch_size, source_sequence_length]")

--- Verifying the Decoder module ---
Decoder architecture:
Decoder(
  (attention): Attention(
    (attn_in): Linear(in_features=1536, out_features=512, bias=True)
    (v): Linear(in_features=512, out_features=1, bias=False)
  )
  (embedding): Embedding(6191, 256, padding_idx=0)
  (rnn): GRU(1280, 512)
  (fc_out): Linear(in_features=1792, out_features=6191, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

Shape of prediction (logits): torch.Size([32, 6191])
This corresponds to [batch_size, target_vocabulary_size]

Shape of new decoder hidden state: torch.Size([32, 512])
This corresponds to [batch_size, decoder_hidden_dim]

Shape of attention weights for this step: torch.Size([32, 22])
This corresponds to [batch_size, source_sequence_length]


In [10]:
import random

class Seq2Seq(nn.Module):
    """Ties an encoder and a step-wise decoder together into one trainable model."""

    def __init__(self, encoder, decoder, device):
        """
        Stores the sub-modules and the target device.
        :param encoder: The instantiated Encoder module.
        :param decoder: The instantiated Decoder module.
        :param device: The device (e.g., 'cuda' or 'cpu') the output buffer is moved to.
        """
        super().__init__()
        self.encoder, self.decoder, self.device = encoder, decoder, device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """
        Encodes `src` and decodes one target position at a time.
        :param src: The source sentence tensor. Shape: [src_len, batch_size]
        :param trg: The target sentence tensor. Shape: [trg_len, batch_size]
        :param teacher_forcing_ratio: Probability, drawn once per step for the whole
                                      batch, of feeding the ground-truth token instead
                                      of the model's own prediction.
        :return:
            - outputs: Logits per target position. Shape: [trg_len, batch_size, output_dim].
                       Row 0 stays all zeros because decoding starts at position 1.
        """
        trg_len, batch_size = trg.shape[0], src.shape[1]

        # Annotations for attention plus the initial decoder state.
        encoder_outputs, hidden = self.encoder(src)

        # Buffer that collects the logits of every step.
        outputs = torch.zeros(trg_len, batch_size, self.decoder.output_dim).to(self.device)

        # Decoding starts from the <sos> row of the target.
        step_input = trg[0, :]

        for t in range(1, trg_len):
            # prediction: [batch_size, output_dim]; hidden: [batch_size, dec_hid_dim]
            prediction, hidden, _ = self.decoder(step_input, hidden, encoder_outputs)
            outputs[t] = prediction

            # One coin flip per step decides the next input for the whole batch.
            if random.random() < teacher_forcing_ratio:
                step_input = trg[t]
            else:
                step_input = prediction.argmax(1)

        return outputs

# --- Setup Device and Instantiate the Final Model ---

# Prefer the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Fresh components for the final model (separate from the verification instances above).
enc = Encoder(INPUT_DIM, EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DROPOUT)
attn = Attention(ENC_HID_DIM, DEC_HID_DIM)
dec = Decoder(OUTPUT_DIM, EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DROPOUT, attn)

# Assemble the Seq2Seq model and move its parameters to the device.
model = Seq2Seq(enc, dec, device).to(device)

# --- Verification ---
print("\n--- Verifying the final Seq2Seq model ---")
# The sample batch must live on the same device as the model.
source_batch_device, target_batch_device = source_batch.to(device), target_batch.to(device)

# A high teacher forcing ratio is used just for this shape check.
predictions = model(source_batch_device, target_batch_device, teacher_forcing_ratio=0.75)

print(f"Shape of the final predictions tensor: {predictions.shape}")
print("This corresponds to [target_sequence_length, batch_size, target_vocabulary_size]")
print("\nThe model is successfully assembled and produces output of the correct shape.")

Using device: cuda

--- Verifying the final Seq2Seq model ---
Shape of the final predictions tensor: torch.Size([28, 32, 6191])
This corresponds to [target_sequence_length, batch_size, target_vocabulary_size]

The model is successfully assembled and produces output of the correct shape.


In [11]:
import time
import math

# --- Helper Functions ---

def init_weights(m):
    """
    Initializes the learnable weights of the model.

    Every parameter whose name contains 'weight' is drawn from U(-0.08, 0.08);
    every parameter whose name contains 'bias' is set to 0. Embedding rows at
    `padding_idx` are then reset to zero, because the uniform draw would
    otherwise leave the <pad> token with a random, never-updated vector.

    :param m: A module in the model (suitable for `model.apply(init_weights)`).
    """
    for name, param in m.named_parameters():
        if 'weight' in name:
            nn.init.uniform_(param.data, -0.08, 0.08)
        elif 'bias' in name:
            nn.init.constant_(param.data, 0)

    # Restore the all-zero padding row of every embedding table under `m`.
    for module in m.modules():
        if isinstance(module, nn.Embedding) and module.padding_idx is not None:
            module.weight.data[module.padding_idx].zero_()
            
def epoch_time(start_time, end_time):
    """Splits the span between two timestamps (in seconds) into whole minutes and leftover whole seconds."""
    span = end_time - start_time
    minutes = int(span / 60)
    return minutes, int(span - (60 * minutes))

# Apply the weight initialization.
# init_weights already walks every parameter below the module it is given
# (named_parameters() recurses), so one call on the root model is enough.
# Routing it through model.apply() would re-run it for each submodule and
# re-initialize the same tensors once per ancestor.
init_weights(model)

# --- Define Optimizer and Loss Function ---

# Adam with its default hyperparameters
optimizer = optim.Adam(model.parameters())

# Cross-entropy over the target vocabulary; positions whose target is the
# <pad> index contribute nothing to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

print("--- Training Setup ---")
print(f"Optimizer: {optimizer}")
print("Loss Function: CrossEntropyLoss (ignoring padding)")





# --- 7.2 The Training and Evaluation Loops ---

def train(model, iterator, optimizer, criterion, clip):
    """
    Performs one epoch of training and returns the mean per-batch loss.

    :param model: Module called as ``model(src, trg)``; its output is indexed
        as [trg_len, batch_size, output_dim].
    :param iterator: Sized iterable yielding ``(src, trg)`` tensor pairs.
    :param optimizer: Optimizer stepping the model's parameters.
    :param criterion: Loss called with (logits [N, output_dim], targets [N]).
    :param clip: Maximum gradient norm passed to ``clip_grad_norm_``.
    :return: Sum of the batch losses divided by ``len(iterator)``.
    """
    model.train()  # Set the model to training mode (enables dropout)

    # Move batches to wherever the model's parameters live instead of relying
    # on a notebook-level `device` global that may be stale or undefined.
    device = next(model.parameters()).device
    epoch_loss = 0

    for src, trg in iterator:
        src, trg = src.to(device), trg.to(device)

        # 1. Zero the gradients from the previous iteration
        optimizer.zero_grad()

        # 2. Forward pass: get predictions
        output = model(src, trg)  # teacher_forcing_ratio left at the model's default

        # Drop the first time step (the <sos> position) and flatten time and
        # batch into one axis. reshape() is used instead of view() so that
        # non-contiguous inputs (e.g. transposed targets) also work.
        output_dim = output.shape[-1]
        output = output[1:].reshape(-1, output_dim)
        trg = trg[1:].reshape(-1)

        # 3. Calculate the loss
        loss = criterion(output, trg)

        # 4. Backward pass: compute gradients
        loss.backward()

        # 5. Clip gradients to prevent them from exploding
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        # 6. Update the weights
        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / len(iterator)

def evaluate(model, iterator, criterion):
    """
    Performs one epoch of evaluation and returns the mean per-batch loss.

    No gradients are computed and no parameters are updated.

    :param model: Module called as ``model(src, trg, teacher_forcing_ratio=0)``;
        its output is indexed as [trg_len, batch_size, output_dim].
    :param iterator: Sized iterable yielding ``(src, trg)`` tensor pairs.
    :param criterion: Loss called with (logits [N, output_dim], targets [N]).
    :return: Sum of the batch losses divided by ``len(iterator)``.
    """
    model.eval()  # Set the model to evaluation mode (disables dropout)

    # Move batches to wherever the model's parameters live instead of relying
    # on a notebook-level `device` global that may be stale or undefined.
    device = next(model.parameters()).device
    epoch_loss = 0

    with torch.no_grad():  # No need to calculate gradients during evaluation
        for src, trg in iterator:
            src, trg = src.to(device), trg.to(device)

            # Teacher forcing is turned off: the model is asked to feed back
            # its own predictions.
            output = model(src, trg, teacher_forcing_ratio=0)

            # Drop the first time step (the <sos> position) and flatten.
            # reshape() also handles non-contiguous tensors, unlike view().
            output_dim = output.shape[-1]
            output = output[1:].reshape(-1, output_dim)
            trg = trg[1:].reshape(-1)

            epoch_loss += criterion(output, trg).item()

    return epoch_loss / len(iterator)

--- Training Setup ---
Optimizer: Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    decoupled_weight_decay: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.001
    maximize: False
    weight_decay: 0
)
Loss Function: CrossEntropyLoss (ignoring padding)


In [12]:
# --- Main Training Loop ---

N_EPOCHS = 10
CLIP = 1
# Any finite validation loss beats +inf, so the first epoch always checkpoints.
best_valid_loss = math.inf

print("\n--- Starting Training ---")

for epoch in range(N_EPOCHS):
    # Time one full pass: training followed by validation.
    start_time = time.time()
    train_loss = train(model, train_loader, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, val_loader, criterion)
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Checkpoint only when the validation loss reaches a new minimum.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'nmt-model.pt')

    # Report the losses together with their perplexities (exp of the loss).
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')


--- Starting Training ---


KeyboardInterrupt: 

In [13]:
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns

def translate_sentence(sentence, source_vocab, target_vocab, model, device, max_len=50):
    """
    Translates a single source sentence into the target language using
    greedy (argmax) decoding. Puts the model into evaluation mode.

    :param sentence: The raw source sentence string.
    :param source_vocab: The source vocabulary object (provides ``numericalize``).
    :param target_vocab: The target vocabulary object (provides ``stoi`` / ``itos``).
    :param model: The trained Seq2Seq model (provides ``encoder`` / ``decoder``).
    :param device: The device to run on ('cpu' or 'cuda').
    :param max_len: The maximum number of decoding steps.
    :return:
        - translated_sentence_tokens: A list of translated token strings; it
          ends with '<eos>' when decoding stopped before ``max_len`` steps.
        - attention: The attention weights tensor. Shape: [trg_len, src_len]
    """
    model.eval()  # Set the model to evaluation mode

    # --- 1. Pre-process the source sentence ---

    # numericalize() is handed the tokenizer and does the tokenization itself,
    # so no separate tokenization pass is needed here.
    # NOTE(review): whether numericalize() adds <sos>/<eos> to the source is
    # not visible from this function - confirm it matches the training pipeline.
    src_indexes = source_vocab.numericalize(sentence, tokenize_de)

    # Build the tensor directly on the target device and add the batch
    # dimension (batch_size = 1). Shape: [src_len, 1]
    src_tensor = torch.tensor(src_indexes, dtype=torch.long, device=device).unsqueeze(1)

    # --- 2. Set up decoding state ---

    # Look the special-token indices up once instead of on every step.
    sos_idx = target_vocab.stoi['<sos>']
    eos_idx = target_vocab.stoi['<eos>']

    # The first input to the decoder is the <sos> token
    trg_indexes = [sos_idx]

    # One row of attention weights per decoding step
    attentions = torch.zeros(max_len, 1, len(src_indexes), device=device)

    # --- 3. Encoder pass and autoregressive decoding loop ---
    with torch.no_grad():
        encoder_outputs, hidden = model.encoder(src_tensor)

        for step in range(max_len):
            # Feed the most recent token (initially <sos>) back into the decoder
            trg_tensor = torch.tensor([trg_indexes[-1]], dtype=torch.long, device=device)

            output, hidden, attention = model.decoder(trg_tensor, hidden, encoder_outputs)
            attentions[step] = attention

            # Greedy choice: the highest-scoring token
            pred_token = output.argmax(1).item()
            trg_indexes.append(pred_token)

            # If the predicted token is <eos>, we're done
            if pred_token == eos_idx:
                break

    # --- 4. Post-process the output ---

    # Convert the output indices to tokens, skipping the initial <sos>
    translated_sentence_tokens = [target_vocab.itos[i] for i in trg_indexes[1:]]

    # Keep only the rows that were filled in and drop the batch dimension:
    # [trg_len, 1, src_len] -> [trg_len, src_len]
    return translated_sentence_tokens, attentions[:len(trg_indexes) - 1].squeeze(1)




# --- Load the model and test ---

# Instantiate a new model with the same architecture as the one that was saved
enc = Encoder(INPUT_DIM, EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DROPOUT)
attn = Attention(ENC_HID_DIM, DEC_HID_DIM)
dec = Decoder(OUTPUT_DIM, EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DROPOUT, attn)
model = Seq2Seq(enc, dec, device).to(device)

# Load the saved state dictionary.
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written on a GPU machine still loads on a CPU-only one.
model.load_state_dict(torch.load('nmt-model.pt', map_location=device))

print("Model loaded successfully from nmt-model.pt")

Model loaded successfully from nmt-model.pt


In [14]:
# Pick one pair from the validation set to translate
example_idx = 10

source_raw, target_raw = val_dataset.data[example_idx][0], val_dataset.data[example_idx][1]

print("--- Testing on a sample sentence ---")
print(f'Source Sentence      = {source_raw}')
print(f'Actual Translation   = {target_raw}')

# Run the model on the source sentence; the attention weights are kept for plotting
translation_tokens, attention = translate_sentence(source_raw, source_vocab, target_vocab, model, device)
predicted_translation = ' '.join(translation_tokens)

print(f'Predicted Translation = {predicted_translation}')


# --- Visualize the Attention ---

# Source-side labels: spaCy tokens of the raw sentence, in their original order
source_tokens_for_display = [token.text for token in spacy_de.tokenizer(source_raw)]

# Target-side labels: the predicted tokens without a trailing <eos> marker
if translation_tokens[-1] == '<eos>':
    translation_tokens_for_display = translation_tokens[:-1]
else:
    translation_tokens_for_display = translation_tokens


--- Testing on a sample sentence ---
Source Sentence      = Ein kleines Kind steht allein auf einem zerklüfteten Felsen.
Actual Translation   = A young child is standing alone on some jagged rocks.
Predicted Translation = A young child stands alone on a rock rock rock . <eos>
