In [1]:
import os
import torch
from torch.utils.data import Dataset, DataLoader
import hashlib
import requests
import zipfile
from collections import Counter
import matplotlib.pyplot as plt

In [2]:
class Vocab:
    """Two-way mapping between tokens and integer indices.

    Index 0 is always ``"<unk>"``, the reserved tokens come next, and the
    remaining indices are handed out to corpus tokens in order of descending
    frequency.
    """

    def __init__(self, tokens=None, min_freq=0, reserved_tokens=None):
        """Build the vocabulary.

        Args:
            tokens: Iterable of token sequences to count (defaults to empty).
            min_freq: Tokens seen fewer than this many times are left out.
            reserved_tokens: Special tokens placed right after ``"<unk>"``.
        """
        sequences = [] if tokens is None else tokens
        specials = [] if reserved_tokens is None else reserved_tokens

        # Tally every token across all sequences.
        frequencies = Counter()
        for sequence in sequences:
            frequencies.update(sequence)

        # Most frequent first; ties keep their first-seen order.
        self._token_freqs = sorted(
            frequencies.items(), key=lambda pair: pair[1], reverse=True
        )

        # Special tokens occupy the lowest indices.
        self.idx_to_token = ["<unk>"] + specials
        self.token_to_idx = {}
        for position, token in enumerate(self.idx_to_token):
            self.token_to_idx[token] = position

        # The list is sorted, so the first rare token ends the scan.
        for token, count in self._token_freqs:
            if count < min_freq:
                break
            if token in self.token_to_idx:
                continue
            self.token_to_idx[token] = len(self.idx_to_token)
            self.idx_to_token.append(token)

    def __len__(self):
        """Return the number of distinct indices, special tokens included."""
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        """Map a token, or a (nested) list/tuple of tokens, to indices.

        Unknown tokens map to ``self.unk``.
        """
        if isinstance(tokens, (list, tuple)):
            return [self.__getitem__(item) for item in tokens]
        return self.token_to_idx.get(tokens, self.unk)

    def to_tokens(self, indices):
        """Map an index, or a flat list/tuple of indices, back to tokens."""
        if isinstance(indices, (list, tuple)):
            return [self.idx_to_token[position] for position in indices]
        return self.idx_to_token[indices]

    @property
    def unk(self):
        """Index of the unknown-token placeholder."""
        return 0

In [3]:
class MTFraEng(Dataset):
    """English-French machine translation dataset.

    On construction the corpus is downloaded (if not cached under ``root``),
    normalised, tokenised at word level, padded/trimmed to ``num_steps`` and
    converted to index tensors. The first ``num_train`` examples are used for
    training and the following ``num_val`` examples for validation.
    """

    def __init__(
        self, batch_size=32, num_steps=10, num_train=512, num_val=128, root="./data"
    ):
        """Download, preprocess and tensorise the dataset.

        Args:
            batch_size: Batch size used by ``get_dataloader``.
            num_steps: Fixed sequence length after padding/trimming.
            num_train: Number of leading examples used for training.
            num_val: Number of examples after the training ones used for validation.
            root: Directory where the archive is cached and extracted.
        """
        self.batch_size = batch_size
        self.num_steps = num_steps
        self.num_train = num_train
        self.num_val = num_val
        self.root = root

        # Download and process data
        raw_text = self._download()
        text = self._preprocess(raw_text)
        src, tgt = self._tokenize(text, num_train + num_val)

        # Build arrays and vocabularies
        self.arrays, self.src_vocab, self.tgt_vocab = self._build_arrays(src, tgt)

    def _download(self):
        """Fetch (if needed), extract (if needed) and read the raw corpus text.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        os.makedirs(self.root, exist_ok=True)
        url = "http://d2l-data.s3-accelerate.amazonaws.com/fra-eng.zip"
        zip_path = os.path.join(self.root, "fra-eng.zip")

        # Download if not exists
        if not os.path.exists(zip_path):
            print(f"Downloading {url}...")
            response = requests.get(url, timeout=60)
            # Fail loudly instead of caching an error page as a corrupt zip.
            response.raise_for_status()
            with open(zip_path, "wb") as f:
                f.write(response.content)

        # Extract
        extract_path = os.path.join(self.root, "fra-eng")
        if not os.path.exists(extract_path):
            with zipfile.ZipFile(zip_path, "r") as zip_ref:
                zip_ref.extractall(self.root)

        # Read file
        with open(os.path.join(extract_path, "fra.txt"), encoding="utf-8") as f:
            return f.read()

    def _preprocess(self, text):
        """Normalise spaces, lowercase, and detach ``,.!?`` from preceding words.

        Args:
            text: Raw corpus text.

        Returns:
            The lowercased text with a space inserted before each of ``,.!?``
            that does not already follow a space.
        """
        # Replace non-breaking spaces with regular spaces, then lowercase once.
        # Lowercasing BEFORE the scan matters: it can change the string length
        # (e.g. "\u0130" becomes two code points), so the previous-character
        # lookup must index the same string that is being enumerated.
        text = text.replace("\u202f", " ").replace("\xa0", " ").lower()

        punctuation = set(",.!?")
        out = [
            " " + char
            if i > 0 and char in punctuation and text[i - 1] != " "
            else char
            for i, char in enumerate(text)
        ]
        return "".join(out)

    def _tokenize(self, text, max_examples=None):
        """Split preprocessed text into source and target token lists.

        Each line is expected to hold ``source<TAB>target``; lines that do not
        split into exactly two parts are skipped. ``<eos>`` is appended to
        every sequence.

        Args:
            text: Preprocessed corpus text, one pair per line.
            max_examples: If truthy, stop once this many lines have been read
                (skipped lines count towards the limit).

        Returns:
            A ``(src, tgt)`` pair of lists of token lists.
        """
        src, tgt = [], []
        for i, line in enumerate(text.split("\n")):
            if max_examples and i >= max_examples:
                break
            parts = line.split("\t")
            if len(parts) == 2:
                # Split on spaces and filter empty tokens
                src.append([t for t in f"{parts[0]} <eos>".split(" ") if t])
                tgt.append([t for t in f"{parts[1]} <eos>".split(" ") if t])
        return src, tgt

    def _build_arrays(self, src, tgt, src_vocab=None, tgt_vocab=None):
        """Convert tokenised pairs into index tensors.

        Args:
            src: List of source token lists.
            tgt: List of target token lists.
            src_vocab: Existing source vocabulary, or None to build one.
            tgt_vocab: Existing target vocabulary, or None to build one.

        Returns:
            ``((src_array, tgt_input, src_valid_len, tgt_label), src_vocab,
            tgt_vocab)``.
        """

        def _build_array(sentences, vocab, is_tgt=False):
            # Pad or trim sequences to num_steps
            def pad_or_trim(seq, t):
                if len(seq) > t:
                    return seq[:t]  # Trim (this may cut off the trailing <eos>)
                else:
                    return seq + ["<pad>"] * (t - len(seq))  # Pad

            sentences = [pad_or_trim(s, self.num_steps) for s in sentences]

            # Prepend <bos> to target sequences, making them num_steps + 1 long
            if is_tgt:
                sentences = [["<bos>"] + s for s in sentences]

            # Build vocabulary
            if vocab is None:
                vocab = Vocab(
                    sentences, min_freq=2, reserved_tokens=["<pad>", "<bos>", "<eos>"]
                )

            # Convert to indices
            array = torch.tensor([vocab[s] for s in sentences])

            # Calculate valid lengths (excluding padding)
            valid_len = (array != vocab["<pad>"]).type(torch.int32).sum(1)

            return array, vocab, valid_len

        # Build source arrays
        src_array, src_vocab, src_valid_len = _build_array(src, src_vocab)

        # Build target arrays
        tgt_array, tgt_vocab, _ = _build_array(tgt, tgt_vocab, is_tgt=True)

        # Return: (src, tgt_input, src_valid_len, tgt_label)
        # tgt_input: the target array without its last column (starts at <bos>)
        # tgt_label: the target array without its first column (drops <bos>)
        return (
            (src_array, tgt_array[:, :-1], src_valid_len, tgt_array[:, 1:]),
            src_vocab,
            tgt_vocab,
        )

    def get_dataloader(self, train=True):
        """Return a DataLoader over the training or the validation slice.

        Args:
            train: If True, serve the first ``num_train`` examples and shuffle;
                otherwise serve the next ``num_val`` examples in order.
        """
        src_array, tgt_array, src_valid_len, label = self.arrays

        if train:
            indices = slice(0, self.num_train)
        else:
            indices = slice(self.num_train, self.num_train + self.num_val)

        # Create TensorDataset
        dataset = torch.utils.data.TensorDataset(
            src_array[indices],
            tgt_array[indices],
            src_valid_len[indices],
            label[indices],
        )

        return DataLoader(dataset, batch_size=self.batch_size, shuffle=train)

    def build(self, src_sentences, tgt_sentences):
        """Build arrays for new sentence pairs using the existing vocabularies.

        Args:
            src_sentences: Iterable of raw source sentences.
            tgt_sentences: Iterable of raw target sentences, paired by position.

        Returns:
            The ``(src, tgt_input, src_valid_len, tgt_label)`` tuple of tensors.
        """
        raw_text = "\n".join(
            [src + "\t" + tgt for src, tgt in zip(src_sentences, tgt_sentences)]
        )
        text = self._preprocess(raw_text)
        src, tgt = self._tokenize(text)
        arrays, _, _ = self._build_arrays(src, tgt, self.src_vocab, self.tgt_vocab)
        return arrays


In [4]:
# Build the dataset with a batch size of 3 and 9-step sequences, then display
# the dataset object together with the source and target vocabulary sizes.
data = MTFraEng(batch_size=3, num_steps=9, num_train=512, num_val=128)

data, len(data.src_vocab), len(data.tgt_vocab)

(<__main__.MTFraEng at 0x7881f12de170>, 195, 213)

In [70]:
# Pull a single batch from the training loader and print its four tensors:
# source indices, decoder input, source valid lengths and labels.
train_loader = data.get_dataloader(train=True)
src, tgt, src_valid_len, label = next(iter(train_loader))
print("source:", src.type(torch.int32))
print("decoder input:", tgt.type(torch.int32))
print("source len excluding pad:", src_valid_len.type(torch.int32))
print("label:", label.type(torch.int32))

source: tensor([[  5,  96,   4,   3,   1,   1,   1,   1,   1,   1],
        [  7,   0,   4,   3,   1,   1,   1,   1,   1,   1],
        [ 10, 124,   4,   3,   1,   1,   1,   1,   1,   1],
        [ 38,  12,   6,   3,   1,   1,   1,   1,   1,   1],
        [  0,  82,   4,   3,   1,   1,   1,   1,   1,   1],
        [ 29,  57,   4,   3,   1,   1,   1,   1,   1,   1],
        [  7, 100,   6,   3,   1,   1,   1,   1,   1,   1],
        [ 38,  10,   4,   3,   1,   1,   1,   1,   1,   1],
        [  0,  48,  11,   3,   1,   1,   1,   1,   1,   1],
        [  9,  73,   4,   3,   1,   1,   1,   1,   1,   1],
        [  5,  52,  75,   4,   3,   1,   1,   1,   1,   1],
        [  7,  58,   4,   3,   1,   1,   1,   1,   1,   1],
        [  5,   0,   4,   3,   1,   1,   1,   1,   1,   1],
        [  5,  96,   4,   3,   1,   1,   1,   1,   1,   1],
        [ 15,  27,   4,   3,   1,   1,   1,   1,   1,   1],
        [ 50,  51,   6,   3,   1,   1,   1,   1,   1,   1],
        [165,   9,   6,   3,   1

Unlike character-level tokenization, for machine translation we
prefer word-level tokenization here (state-of-the-art models use more complex tokenization techniques).


For languages that don't have clear spaces between words, such as Japanese, we use subword tokenization (BPE, WordPiece, SentencePiece).


In machine translation, we have pairs like:

"I love cats" 3 tokens

"J'adore les chats" 3 tokens

but also 

English: "How are you doing today?" (5 tokens)

French: "Comment allez-vous aujourd'hui?" (3 tokens)

But neural networks work with fixed-size tensors: you can't have a batch where one example has 3 tokens and another has 5.

The solution is Padding and Truncation:

Let's say we will use `num_steps = 5`:

Example 1 (too short):
```
Original: ["hi", "<eos>"]  (2 tokens)
Padded:   ["hi", "<eos>", "<pad>", "<pad>", "<pad>"]  (5 tokens)
```
Example 2 (perfect fit):
```
Original: ["how", "are", "you", "?", "<eos>"]  (5 tokens)
No change: ["how", "are", "you", "?", "<eos>"]
```
Example 3 (too long):
```
Original: ["what", "are", "you", "doing", "today", "?", "<eos>"]  (7 tokens)
Truncated: ["what", "are", "you", "doing", "today"]  (5 tokens)
```
Now all sequences are exactly 5 tokens → can stack into a batch tensor!


We also need to track the valid length to ignore the padding

---

In general seq2seq problems like machine translation,
inputs and outputs are of varying lengths that are unaligned.
The standard approach to handle this sort of data is to design an
encoder-decoder architecture,

consisting of :

1. An encoder that takes a variable-length sequence as input
2. A decoder that acts as a conditional language model, taking in the encoded input and the leftward context of the target sequence and predicting the subsequent token in the target sequence


In [6]:
from torch import nn

In [8]:
class Encoder(nn.Module):
    """Abstract base class for the encoder half of an encoder-decoder model."""

    def __init__(self):
        super().__init__()

    def forward(self, X, *args):
        """Encode ``X``; concrete subclasses must override this method."""
        raise NotImplementedError()


class Decoder(nn.Module):
    """Abstract base class for the decoder half of an encoder-decoder model.

    Subclasses implement ``init_state`` to turn the encoder outputs into the
    decoder's initial state, and ``forward`` to run the decoder itself.
    """

    def __init__(self):
        super().__init__()

    def init_state(self, enc_outputs, *args):
        """Build the initial decoder state from the encoder outputs.

        Raises:
            NotImplementedError: Always; subclasses must override.
        """
        raise NotImplementedError

    # The hook was originally declared as ``init_states`` without ``self``,
    # which did not match the ``init_state`` name that callers use. The old
    # name is kept as an alias so existing references keep working.
    init_states = init_state

    def forward(self, X, state):
        """Decode ``X`` given ``state``.

        Raises:
            NotImplementedError: Always; subclasses must override.
        """
        raise NotImplementedError


class EncoderDecoder(nn.Module):
    """Glue module that chains an encoder and a decoder."""

    def __init__(self, encoder, decoder):
        """Store the two sub-modules.

        Args:
            encoder: Module called as ``encoder(enc_X, *args)``.
            decoder: Module exposing ``init_state(enc_outputs, *args)`` and
                returning an ``(outputs, state)`` pair when called.
        """
        super().__init__()
        self.encoder, self.decoder = encoder, decoder

    def forward(self, enc_X, dec_X, *args):
        """Encode ``enc_X``, seed the decoder with the result, decode ``dec_X``.

        Returns:
            The decoder outputs only; the decoder's final state is dropped.
        """
        encoded = self.encoder(enc_X, *args)
        initial_state = self.decoder.init_state(encoded, *args)
        predictions, _final_state = self.decoder(dec_X, initial_state)
        return predictions

In this section, we will demonstrate the application of an encoder–decoder architecture, where both the encoder and decoder are implemented as RNNs, to the task of machine translation 


Here, the encoder RNN will take a variable-length sequence as input and transform it into a fixed-shape hidden state. Later, in Section 11, we will introduce attention mechanisms, which allow us to access encoded inputs without having to compress the entire input into a single fixed-length representation.


Then to generate the output sequence, one token at a time, the decoder model, consisting of a separate RNN, will predict each successive target token given both the input sequence and the preceding tokens in the output.


## Teacher Forcing
In a standard sequence generation process (like translation)
, the decoder produces its output one token at a time.
The input for the next time step is the token that the decoder predicted at the current time step.

However, during training,
if the decoder makes a mistake early in the sequence, 
that mistake is fed back as the input for the next step,
potentially compounding errors and confusing the training process. 
The model might never recover from a single bad prediction, leading to unstable learning.

The most common approach is sometimes called *teacher forcing*.

Teacher Forcing circumvents this instability by feeding the decoder the correct token from the target sequence at every time step, regardless of what the decoder actually predicted.

The "Teacher" (the ground truth target sequence) forces the network to follow the correct path, ensuring that the model's recurrent states are always updated based on accurate context.

More concretely, the special beginning-of-sequence token and the original target sequence, excluding the final token, 
are concatenated as input to the decoder, 
while the decoder output (labels for training) is the original target sequence,
shifted by one token: 

$$
\text{"<bos>", "Ils", "regardent", "."}\Rightarrow\text{"Ils", "regardent", ".", "<eos>"}
$$

![](../imgs/seq2seq.png)

Another approach is to feed the predicted token from the previous time step as the current input.
While this is how the model is used in the real world, it is often too difficult to train with, as errors quickly compound. Teacher Forcing provides a smooth, stable gradient path during training, helping the decoder learn the correct local transitions.



## Encoder

The Encoder reads the entire source sequence 
(e.g., an English sentence) and converts it into a single, fixed-length vector known as the **Context Vector** (or latent state).

The encoder typically uses a unidirectional or bidirectional RNN 
(e.g., an LSTM or GRU). 
At each time step $t$, the RNN processes the input token $x_t$ and updates its hidden state $\mathbf{h}_t$.

For an input sequence $\mathbf{X} = (x_1, x_2, \ldots, x_T)$:

1. Input: Each token $x_t$ is transformed into a feature vector $\mathbf{x}_t$ (embedding).

2. Recurrence: The hidden state $\mathbf{h}_t$ is computed based on the current input $\mathbf{x}_t$ and the previous state $\mathbf{h}_{t-1}$:$$\mathbf{h}_t = f(\mathbf{x}_t, \mathbf{h}_{t-1})$$

3. Context Vector: After processing the final token $x_T$,
the encoder's final hidden state, 
$\mathbf{h}_T$, serves as the Context Vector. 
This vector is intended to summarize all the information of the entire source sequence.

In [17]:
class Seq2SeqEncoder(Encoder):
    """GRU encoder: token indices -> embeddings -> multi-layer GRU."""

    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers, **kwargs):
        """Create the embedding table and the GRU.

        Args:
            vocab_size: Number of rows in the embedding table.
            embed_size: Embedding dimension (GRU input size).
            num_hiddens: GRU hidden size.
            num_layers: Number of stacked GRU layers.
            **kwargs: Forwarded to the base class constructor.
        """
        super().__init__(**kwargs)
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.GRU(embed_size, num_hiddens, num_layers)

    def forward(self, X, *args):
        """Encode a batch of index sequences.

        Args:
            X: Index tensor laid out as (batch size, sequence length).
            *args: Accepted for interface compatibility and ignored.

        Returns:
            The ``(output, state)`` pair produced by the GRU; ``state`` is
            what the decoder uses as its context.
        """
        # The GRU is not batch-first, so move the time axis to the front:
        # (batch, steps, embed) -> (steps, batch, embed).
        time_major = self.embedding(X).permute(1, 0, 2)
        return self.rnn(time_major)

## Decoder
The **decoder** takes the context vector from the encoder and generates the output sequence one token at a time.
It operates **autoregressively**, meaning the prediction at the current step is influenced by the prediction made at the previous step.

The decoder is also an RNN.

1. Initial State: The decoder's initial hidden state is set directly to the 
**Context Vector** ($h_T$) produced by the encoder.
This is how the information from the source sequence is transferred.

2. Input: At each time step $t'$, the decoder takes the token generated at the previous time step $t' -1$.

The decoder's recurrence is similar to the encoder's, but it includes an output layer:
1. Recurrence: The decoder's hidden state $\mathbf{s}_{t'}$ is calculated using its input 
$y_{t'-1}$ and its previous state $\mathbf{s}_{t'-1}$:
$$\mathbf{s}_{t'} = g(y_{t'-1}, \mathbf{s}_{t'-1}, \mathbf{h}_T)$$
2. Output: The hidden state $\mathbf{s}_{t'}$ is used to predict the next token $\hat{y}_{t'}$ via a fully connected layer and a Softmax function:
$$\hat{y}_{t'} = \text{softmax}(\mathbf{s}_{t'} \mathbf{W}_{qs} + \mathbf{b}_q)$$


In [24]:
class Seq2SeqDecoder(Decoder):
    """GRU decoder that sees the encoder context at every time step."""

    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers, **kwargs):
        """Create the embedding table, the GRU and the output projection.

        Args:
            vocab_size: Target vocabulary size (embedding rows, output logits).
            embed_size: Embedding dimension.
            num_hiddens: GRU hidden size; also the width of the context vector.
            num_layers: Number of stacked GRU layers.
            **kwargs: Forwarded to the base class constructor.
        """
        super().__init__(**kwargs)
        self.embedding = nn.Embedding(vocab_size, embed_size)
        # Each step's input is the token embedding joined with the context.
        self.rnn = nn.GRU(embed_size + num_hiddens, num_hiddens, num_layers)
        self.dense = nn.LazyLinear(vocab_size)

    def init_state(self, enc_outputs, *args):
        """Return the encoder result unchanged as the initial decoder state.

        ``forward`` unpacks this value into ``(enc_output, hidden_state)``.
        """
        return enc_outputs

    def forward(self, Y, state):
        """Run the decoder over ``Y``.

        Args:
            Y: Index tensor laid out as (batch size, number of steps).
            state: Pair ``(enc_output, hidden_state)``; ``hidden_state`` is
                indexed by layer along its first axis.

        Returns:
            ``(logits, [enc_output, new_hidden_state])`` where ``logits`` is
            batch-first.
        """
        # (batch, steps) -> (steps, batch, embed)
        token_embs = self.embedding(Y.type(torch.long)).permute(1, 0, 2)

        enc_output, hidden_state = state

        # The top layer's hidden state acts as the context vector; copy it
        # along a new leading axis so every time step receives it.
        top_layer_state = hidden_state[-1]
        context = top_layer_state.unsqueeze(0).repeat(token_embs.shape[0], 1, 1)

        # Join embedding and context along the feature axis.
        rnn_inputs = torch.cat((token_embs, context), -1)

        rnn_outputs, hidden_state = self.rnn(rnn_inputs, hidden_state)

        # Project to vocabulary logits and swap back to batch-first.
        logits = self.dense(rnn_outputs).swapaxes(0, 1)

        return logits, [enc_output, hidden_state]

In [25]:
def check_shape(tensor, expected_shape):
    """Assert that ``tensor.shape`` equals ``expected_shape``.

    Stands in for ``d2l.check_shape``. Prints a confirmation line when the
    shapes agree.

    Raises:
        AssertionError: If the shapes differ.
    """
    shapes_match = tensor.shape == expected_shape
    assert shapes_match, (
        f"Shape mismatch: Got {tensor.shape}, Expected {expected_shape}"
    )
    print(f"Shape check successful for tensor with shape: {expected_shape}")

In [26]:
# --- Setup Parameters ---
# Small sizes are enough here: this cell only verifies tensor shapes.
vocab_size, embed_size, num_hiddens, num_layers = 10, 8, 16, 2
batch_size, num_steps = 4, 9

# --- Dummy Input Data ---
# Use Long tensor for inputs to embedding layer
X = torch.zeros((batch_size, num_steps), dtype=torch.long)

print("--- Testing Encoder Shapes ---")
encoder = Seq2SeqEncoder(vocab_size, embed_size, num_hiddens, num_layers)

# ENCODER FORWARD PASS
enc_outputs, enc_state = encoder(X)

# 1. Check Encoder Output Sequence (All hidden states)
# Shape: (sequence length, batch size, number of hidden units)
check_shape(enc_outputs, (num_steps, batch_size, num_hiddens))

# 2. Check Encoder Final State (Context Vector stack)
# Shape: (number of layers, batch size, number of hidden units)
check_shape(enc_state, (num_layers, batch_size, num_hiddens))


print("\n--- Testing Decoder Shapes ---")
decoder = Seq2SeqDecoder(vocab_size, embed_size, num_hiddens, num_layers)

# Initialize Decoder State: init_state hands back the encoder's
# (enc_outputs, enc_state) pair unchanged
initial_state = decoder.init_state(encoder(X))

# DECODER FORWARD PASS
# The same dummy X is reused as the decoder input; only shapes matter here
dec_outputs, state = decoder(X, initial_state)

# 3. Check Decoder Output Predictions
# Shape: (batch size, sequence length, vocabulary size)
check_shape(dec_outputs, (batch_size, num_steps, vocab_size))

# 4. Check Decoder Final State (The recurrent hidden state stack)
# state[1] is the (num_layers, batch_size, num_hiddens) state of the decoder RNN
check_shape(state[1], (num_layers, batch_size, num_hiddens))

print("\nAll shape checks passed successfully!")

--- Testing Encoder Shapes ---
Shape check successful for tensor with shape: (9, 4, 16)
Shape check successful for tensor with shape: (2, 4, 16)

--- Testing Decoder Shapes ---
Shape check successful for tensor with shape: (4, 9, 10)
Shape check successful for tensor with shape: (2, 4, 16)

All shape checks passed successfully!


In [29]:
def sequence_mask(X, valid_len, value=0):
    """Overwrite entries beyond each row's valid length, in place.

    Args:
        X: Tensor whose first two axes are (batch, steps). Modified in place.
        valid_len: Per-row count of leading steps to keep.
        value: Fill value written to every step at or past the valid length.

    Returns:
        The same tensor object ``X`` after masking.
    """
    num_steps = X.size(1)
    positions = torch.arange(num_steps, dtype=torch.float32, device=X.device)
    # Row i keeps step j exactly when j < valid_len[i].
    keep = positions.unsqueeze(0) < valid_len.unsqueeze(1)

    X[~keep] = value
    return X


class MaskedLoss(nn.CrossEntropyLoss):
    """Cross-entropy loss that zeroes the contribution of padded steps."""

    def forward(self, pred, label, valid_len):
        """Compute one masked loss value per sequence.

        Args:
            pred: Logits with the class axis last; it is moved to axis 1
                before being handed to the parent loss.
            label: Target indices, one per step.
            valid_len: Per-sequence count of steps that contribute.

        Returns:
            One value per sequence: the masked per-step losses averaged over
            axis 1 (padded steps count as zeros in that average).
        """
        # 1 for steps that count, 0 for padding.
        step_weights = sequence_mask(torch.ones_like(label), valid_len)
        # Per-step losses are needed, so disable the parent's reduction.
        self.reduction = "none"
        per_step_loss = super().forward(pred.permute(0, 2, 1), label)
        return (per_step_loss * step_weights).mean(dim=1)

In [53]:
class Seq2Seq(EncoderDecoder):
    """Encoder-decoder model with masked loss, greedy decoding and optimizer setup."""

    def __init__(self, encoder, decoder, tgt_pad, lr):
        """Store the sub-modules and training settings.

        Args:
            encoder: Encoder module.
            decoder: Decoder module.
            tgt_pad: Index of the padding token in the target vocabulary.
            lr: Learning rate used by ``configure_optimizers``.
        """
        super().__init__(encoder, decoder)
        self.tgt_pad = tgt_pad
        self.lr = lr
        self.loss = MaskedLoss()

    def forward(self, enc_X, dec_X, enc_valid_len, *args):
        """Run the encoder-decoder on a source/decoder-input pair.

        ``enc_valid_len`` is accepted but not forwarded; only ``enc_X``,
        ``dec_X`` and any extra ``*args`` reach the base class.
        """
        return super().forward(enc_X, dec_X, *args)

    def _step_loss(self, batch):
        """Compute the summed masked loss for one batch.

        Args:
            batch: ``(enc_X, dec_X, enc_valid_len, Y_labels)``.
        """
        enc_X, dec_X, enc_valid_len, Y_labels = batch

        # Valid target length = number of non-padding label tokens per row.
        dec_valid_len = (Y_labels != self.tgt_pad).sum(1)

        Y_hat = self(enc_X, dec_X, enc_valid_len)
        return self.loss(Y_hat, Y_labels, dec_valid_len).sum()

    def training_step(self, batch):
        """Return the training loss for ``batch``.

        Args:
            batch: ``(enc_X, dec_X, enc_valid_len, Y_labels)``.
        """
        return self._step_loss(batch)

    def validation_step(self, batch):
        """Return the validation loss for ``batch``.

        The fourth batch element holds the labels, not target lengths, so the
        valid lengths are derived from the labels exactly as in training.
        """
        return self._step_loss(batch)

    def predict_step(self, batch, num_steps, device, save_attention_weights=False):
        """Greedily decode ``num_steps`` tokens for every example in ``batch``.

        Args:
            batch: ``(src, tgt, src_valid_len, _)``; only the first column of
                ``tgt`` is used, as the first decoder input.
            num_steps: Number of tokens to generate per example.
            device: Device the batch tensors are moved to.
            save_attention_weights: If True, collect
                ``self.decoder.attention_weights`` after every step.

        Returns:
            ``(predictions, attention_weights)``; ``predictions`` holds the
            generated indices, without the initial input token.
        """
        batch = [a.to(device) for a in batch]
        src, tgt, src_valid_len, _ = batch

        # Encode the source and seed the decoder state.
        enc_all_outputs = self.encoder(src, src_valid_len)
        dec_state = self.decoder.init_state(enc_all_outputs, src_valid_len)

        # The first decoder input is the first target column, kept 2-D.
        outputs, attention_weights = [tgt[:, 0].unsqueeze(1)], []

        for _ in range(num_steps):
            # Feed back the previous output token (no teacher forcing here).
            Y, dec_state = self.decoder(outputs[-1], dec_state)

            # Greedy choice: highest-scoring index along the last axis.
            outputs.append(Y.argmax(2))

            if save_attention_weights:
                attention_weights.append(self.decoder.attention_weights)

        # Drop the initial input token and join the generated steps.
        return torch.cat(outputs[1:], 1), attention_weights

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters with ``self.lr``."""
        return torch.optim.Adam(self.parameters(), lr=self.lr)

In [54]:
from tqdm import tqdm


def train_seq2seq(model, data_iter, lr, num_epochs, device):
    """Train a sequence-to-sequence model, reporting progress with tqdm.

    Args:
        model: Model exposing ``configure_optimizers()`` and
            ``training_step(batch)``.
        data_iter: Sized iterable of batches (sequences of tensors).
        lr: Not read by this function; the optimizer comes from
            ``model.configure_optimizers()``.
        num_epochs: Number of passes over ``data_iter``.
        device: Device the model and each batch are moved to.
    """
    # Maximum gradient norm applied before every optimizer step.
    max_grad_norm = 1

    model.to(device)
    optimizer = model.configure_optimizers()

    print(f"Training on {device}")

    # Outer bar counts epochs; a transient inner bar counts batches.
    epoch_pbar = tqdm(range(1, num_epochs + 1), desc="Epochs")

    for epoch in epoch_pbar:
        model.train()
        loss_sum = 0.0

        batch_pbar = tqdm(
            enumerate(data_iter, 1),
            total=len(data_iter),
            desc=f"Epoch {epoch}",
            leave=False,
        )

        for step, batch in batch_pbar:
            on_device = [x.to(device) for x in batch]

            optimizer.zero_grad()
            loss = model.training_step(on_device)
            loss.backward()

            # Clip gradients before stepping.
            nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()

            # Show the running mean of the batch losses seen so far.
            loss_sum += loss.item()
            batch_pbar.set_postfix(train_loss=f"{loss_sum / step:.4f}")

        # Report the epoch's mean batch loss on the outer bar.
        epoch_mean = loss_sum / len(data_iter)
        epoch_pbar.set_postfix(avg_epoch_loss=f"{epoch_mean:.4f}")

    print("Training complete!")

In [68]:
# Hyperparameters for the translation model and its training run.
embed_size = 256
num_hiddens = 256
num_layers = 2
lr = 0.005
num_epochs = 30
batch_size = 32
num_steps = 10  # Corresponds to the sequence length used in MTFraEng
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 1. Initialize Data
# We must ensure the `tgt_pad` index is correctly captured.
data = MTFraEng(batch_size=batch_size, num_steps=num_steps)
train_loader = data.get_dataloader(train=True)
tgt_pad_idx = data.tgt_vocab["<pad>"]

# 2. Initialize Components
# The encoder is sized to the source vocabulary.
encoder = Seq2SeqEncoder(
    vocab_size=len(data.src_vocab),
    embed_size=embed_size,
    num_hiddens=num_hiddens,
    num_layers=num_layers,
)

# The decoder is sized to the target vocabulary.
decoder = Seq2SeqDecoder(
    vocab_size=len(data.tgt_vocab),
    embed_size=embed_size,
    num_hiddens=num_hiddens,
    num_layers=num_layers,
)

# 3. Initialize Model
model = Seq2Seq(encoder=encoder, decoder=decoder, tgt_pad=tgt_pad_idx, lr=lr)

# 4. Run Training
# NOTE: The training will take a few minutes depending on your device/data size.
# If using Jupyternotebook/Colab, consider wrapping the training call in a block.
train_seq2seq(model, train_loader, lr, num_epochs, device)

Training on cuda


Epochs: 100%|██████████| 30/30 [00:06<00:00,  4.33it/s, avg_epoch_loss=3.9016]

Training complete!





In [56]:
import math
import collections


def bleu(pred_seq, label_seq, k):
    """Compute a BLEU-style score for one prediction against one reference.

    Args:
        pred_seq: Predicted sentence, tokens separated by single spaces.
        label_seq: Reference sentence, tokens separated by single spaces.
        k: Longest n-gram order to score.

    Returns:
        The brevity penalty multiplied by ``p_n ** (0.5 ** n)`` for every
        ``n`` from 1 to ``min(k, len(prediction))``, where ``p_n`` is the
        clipped n-gram precision.
    """
    pred_tokens = pred_seq.split(" ")
    label_tokens = label_seq.split(" ")
    len_pred, len_label = len(pred_tokens), len(label_tokens)

    def ngram_counts(tokens, n):
        # Multiset of all contiguous n-grams, keyed by their space-joined text.
        return collections.Counter(
            " ".join(tokens[start : start + n])
            for start in range(len(tokens) - n + 1)
        )

    # Brevity penalty: below 1 only when the prediction is SHORTER than the
    # reference; exactly 1 when it is at least as long.
    score = math.exp(min(0, 1 - len_label / len_pred))

    for n in range(1, min(k, len_pred) + 1):
        # Counter intersection keeps, per n-gram, the smaller of the two
        # counts, so each reference n-gram can be matched at most once.
        overlap = ngram_counts(pred_tokens, n) & ngram_counts(label_tokens, n)
        num_matches = sum(overlap.values())

        # Clipped precision over all n-grams in the prediction.
        precision = num_matches / (len_pred - n + 1)

        # Higher orders get exponentially smaller exponents.
        score *= math.pow(precision, math.pow(0.5, n))
    return score

In [69]:
# The input sentences
engs = ["go .", "i lost .", "he's calm .", "i'm home ."]
fras = ["va !", "j'ai perdu .", "il est calme .", "je suis chez moi ."]


# 1. Build the batch: Converts the English sentences (src) and a dummy target (tgt)
#    into tensors using the vocabulary and padding/trimming.
#    The prediction only needs the encoder input (src).
preds, _ = model.predict_step(data.build(engs, fras), data.num_steps, "cuda")

# 2. Iterate and Print Results
for en, fr, p in zip(engs, fras, preds):
    translation = []
    predicted_indices = p.tolist()
    for token in data.tgt_vocab.to_tokens(predicted_indices):
        if token == "<eos>":
            break  # Stop at the End-of-Sequence token
        translation.append(token)

    pred_text = " ".join(translation)

    print(f"{en} => {pred_text}, bleu,{bleu(pred_text, fr, k=2):.3f}")

go . => va !, bleu,1.000
i lost . => j'ai perdu ., bleu,1.000
he's calm . => je vais <unk> ., bleu,0.000
i'm home . => je suis chez moi ., bleu,1.000
