# NMT

Conceptual Overview

In [None]:
# encoder = 'part1'
# decoder = 'part2'
# # Traditional seq2seq (with bottleneck)
# def translate_sentence(input_sentence):
#     # Entire sentence compressed into one vector
#     context_vector = encoder(input_sentence)
    
#     # Generate entire translation from just this vector
#     translation = decoder(context_vector)
#     return translation

# # With Bahdanau attention (2014)
# def translate_sentence_with_attention(input_sentence):
#     # Encode input but keep all hidden states
#     encoder_hidden_states = encoder(input_sentence)  # Shape: [input_length, hidden_size]
    
#     # Initialize the decoder state from the encoder (e.g. from its final hidden state)
#     decoder_state = init_decoder_state(encoder_hidden_states)
    
#     # For each output word
#     translation = []
#     for i in range(max_output_length):
#         # Compute "attention scores" - how relevant is each input word?
#         attention_scores = calculate_relevance(decoder_state, encoder_hidden_states)
        
#         # Create weighted sum of encoder states (attention context)
#         context_vector = weighted_sum(attention_scores, encoder_hidden_states)
        
#         # Predict next word using both previous decoder state AND context,
#         # and carry the updated decoder state into the next step
#         next_word, decoder_state = decoder_step(decoder_state, context_vector)
#         translation.append(next_word)
    
#     return translation

Use Case: the OPUS Books Dataset

In [1]:
# Cell 1: Import necessary libraries
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
from transformers import AutoTokenizer
import numpy as np
import random
import matplotlib.pyplot as plt
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Cell 2: Seed the Python, NumPy and PyTorch RNGs for reproducibility
SEED = 42
for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(SEED)
# Request deterministic cuDNN kernels
torch.backends.cudnn.deterministic = True

# Use the GPU when PyTorch reports one, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

Using device: cuda


In [3]:
# Cell 3: Load dataset and tokenizers
# English-French pairs from the Opus Books dataset; only the first 5000 training
# examples are requested to keep the demo small
dataset = load_dataset("opus_books", "en-fr", split="train[:5000]")
print(f"Dataset loaded with {len(dataset)} examples")

# Print the first three sentence pairs as a quick sanity check
for i in range(3):
    pair = dataset[i]['translation']
    print(f"Example {i+1}:")
    print(f"English: {pair['en']}")
    print(f"French: {pair['fr']}")
    print()

# Both tokenizers are loaded from the same Helsinki-NLP/opus-mt-en-fr checkpoint
en_tokenizer, fr_tokenizer = (
    AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-fr") for _ in range(2)
)

Dataset loaded with 5000 examples
Example 1:
English: The Wanderer
French: Le grand Meaulnes

Example 2:
English: Alain-Fournier
French: Alain-Fournier

Example 3:
English: First Part
French: PREMIÈRE PARTIE





In [4]:
# Cell 4: Prepare dataset
class TranslationDataset(Dataset):
    """Map-style dataset that yields fixed-length token ids for en/fr sentence pairs.

    Each item of ``dataset`` is read as ``item['translation']['en']`` and
    ``item['translation']['fr']``. Both sides are padded or truncated to
    ``max_len`` tokens by their respective tokenizer.
    """

    def __init__(self, dataset, en_tokenizer, fr_tokenizer, max_len=50):
        self.dataset = dataset
        self.en_tokenizer = en_tokenizer
        self.fr_tokenizer = fr_tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.dataset)

    def _encode(self, tokenizer, text):
        """Return a 1-D tensor of exactly ``max_len`` token ids for ``text``."""
        ids = tokenizer.encode(
            text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        # encode() returns a [1, max_len] batch; drop the batch dimension
        return ids.squeeze(0)

    def __getitem__(self, idx):
        pair = self.dataset[idx]['translation']
        en_text, fr_text = pair['en'], pair['fr']

        return {
            'input_ids': self._encode(self.en_tokenizer, en_text),
            'target_ids': self._encode(self.fr_tokenizer, fr_text),
            'en_text': en_text,
            'fr_text': fr_text,
        }

# Wrap the raw dataset and batch it (shuffled, 32 pairs per batch)
translation_dataset = TranslationDataset(
    dataset=dataset,
    en_tokenizer=en_tokenizer,
    fr_tokenizer=fr_tokenizer,
)
dataloader = DataLoader(dataset=translation_dataset, batch_size=32, shuffle=True)

# Pull a single batch to confirm the tensor shapes
batch = next(iter(dataloader))
print(f"Input shape: {batch['input_ids'].shape}")
print(f"Target shape: {batch['target_ids'].shape}")

Input shape: torch.Size([32, 50])
Target shape: torch.Size([32, 50])


In [5]:
# Cell 5: Define the Encoder
class Encoder(nn.Module):
    """Bidirectional GRU encoder.

    Returns the per-token outputs of the GRU together with a single initial
    decoder state built from the last layer's final forward and backward states.
    """

    def __init__(self, input_dim, emb_dim, hidden_dim, n_layers=1, dropout=0.5):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.GRU(
            input_size=emb_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            batch_first=True,
            bidirectional=True,
        )
        # Projects the concatenated forward/backward state down to hidden_dim
        self.fc = nn.Linear(hidden_dim * 2, hidden_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        """Encode ``src`` of shape [batch_size, src_len].

        Returns:
            outputs: [batch_size, src_len, hidden_dim * 2]
            hidden:  [batch_size, hidden_dim]
        """
        token_vectors = self.dropout(self.embedding(src))
        # token_vectors: [batch_size, src_len, emb_dim]

        outputs, final_states = self.rnn(token_vectors)
        # final_states: [n_layers * 2, batch_size, hidden_dim]

        # The last two entries are the top layer's forward and backward states
        forward_state, backward_state = final_states[-2], final_states[-1]
        merged = torch.cat([forward_state, backward_state], dim=1)
        # merged: [batch_size, hidden_dim * 2]

        return outputs, torch.tanh(self.fc(merged))

In [6]:
# Cell 6: Define the Attention Mechanism (Bahdanau attention)
class Attention(nn.Module):
    """Additive (Bahdanau-style) attention over the encoder outputs."""

    def __init__(self, hidden_dim):
        super().__init__()
        self.attn = nn.Linear((hidden_dim * 2) + hidden_dim, hidden_dim)
        self.v = nn.Linear(hidden_dim, 1, bias=False)

    def forward(self, hidden, encoder_outputs, mask=None):
        """Compute the attention context for one decoder step.

        Args:
            hidden: decoder state, [batch_size, hidden_dim].
            encoder_outputs: [batch_size, src_len, hidden_dim * 2].
            mask: optional [batch_size, src_len] tensor; positions equal to 0
                (or False) are excluded from attention, e.g. source padding.
                When omitted, every source position is attended to.

        Returns:
            context: [batch_size, hidden_dim * 2] weighted sum of encoder outputs.
            attention_weights: [batch_size, src_len], each row sums to 1.
        """
        src_len = encoder_outputs.shape[1]

        # Broadcast the decoder state across source positions; expand() creates
        # a view, so nothing is copied until the concatenation below
        hidden = hidden.unsqueeze(1).expand(-1, src_len, -1)
        # hidden: [batch_size, src_len, hidden_dim]

        # Calculate energy
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim=2)))
        # energy: [batch_size, src_len, hidden_dim]

        attention = self.v(energy).squeeze(2)
        # attention: [batch_size, src_len]

        if mask is not None:
            # Use the most negative finite value rather than -inf so that a fully
            # masked row yields uniform weights instead of NaN
            attention = attention.masked_fill(mask == 0, torch.finfo(attention.dtype).min)

        # Apply softmax to get attention weights summing to 1
        attention_weights = torch.softmax(attention, dim=1)
        # attention_weights: [batch_size, src_len]

        # Get weighted sum of encoder outputs
        context = torch.bmm(attention_weights.unsqueeze(1), encoder_outputs).squeeze(1)
        # context: [batch_size, hidden_dim * 2]

        return context, attention_weights

In [7]:
class Decoder(nn.Module):
    """Single-step GRU decoder that attends over the encoder outputs.

    The decoder carries one [batch_size, hidden_dim] state between steps. On each
    call that state is copied to all ``n_layers`` GRU layers, and only the top
    layer's new state is returned, so lower-layer states are not kept across steps.
    """

    def __init__(self, output_dim, emb_dim, hidden_dim, attention, n_layers=1, dropout=0.5):
        super().__init__()

        self.output_dim = output_dim
        self.n_layers = n_layers
        self.attention = attention

        # The attention context has the width of the bidirectional encoder outputs
        context_dim = hidden_dim * 2
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.GRU(context_dim + emb_dim, hidden_dim, n_layers, batch_first=True)
        self.fc_out = nn.Linear(context_dim + hidden_dim + emb_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, encoder_outputs):
        """Run one decoding step.

        Args:
            input: previous token ids, [batch_size] or [batch_size, 1].
            hidden: decoder state, [batch_size, hidden_dim].
            encoder_outputs: [batch_size, src_len, hidden_dim * 2].

        Returns:
            prediction: [batch_size, output_dim] unnormalised scores.
            hidden: [batch_size, hidden_dim] state of the top GRU layer.
            attn_weights: [batch_size, src_len].
        """
        # Give 1-D token ids a sequence dimension of length one
        if input.dim() == 1:
            input = input.unsqueeze(1)

        embedded = self.dropout(self.embedding(input))
        # embedded: [batch_size, 1, emb_dim]

        context, attn_weights = self.attention(hidden, encoder_outputs)
        # context: [batch_size, hidden_dim * 2]

        step_input = torch.cat((embedded, context.unsqueeze(1)), dim=2)
        # step_input: [batch_size, 1, (hidden_dim * 2) + emb_dim]

        # The GRU expects [n_layers, batch_size, hidden_dim]; every layer starts
        # from the same carried state
        layer_states = hidden.unsqueeze(0).repeat(self.n_layers, 1, 1)
        rnn_out, layer_states = self.rnn(step_input, layer_states)
        # rnn_out: [batch_size, 1, hidden_dim]

        # Predict from the GRU output, the attention context and the input embedding
        features = torch.cat((rnn_out.squeeze(1), context, embedded.squeeze(1)), dim=1)
        # features: [batch_size, (hidden_dim * 2) + hidden_dim + emb_dim]
        prediction = self.fc_out(features)

        return prediction, layer_states[-1], attn_weights

In [8]:
# Cell 8: Define the Seq2Seq model with Attention
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes step by step with teacher forcing."""

    def __init__(self, encoder, decoder, device):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """Decode ``trg_len - 1`` steps conditioned on ``src``.

        Args:
            src: [batch_size, src_len] source token ids.
            trg: [batch_size, trg_len] target token ids; column 0 seeds the decoder.
            teacher_forcing_ratio: probability, drawn once per step, of feeding the
                ground-truth token instead of the model's own prediction.

        Returns:
            outputs: [batch_size, trg_len, output_dim]; position 0 stays all zeros.
            attentions: [batch_size, trg_len, src_len]; position 0 stays all zeros.
        """
        batch_size, src_len = src.shape[0], src.shape[1]
        trg_len = trg.shape[1]

        # Buffers for the per-step predictions and attention weights
        outputs = torch.zeros(batch_size, trg_len, self.decoder.output_dim, device=self.device)
        attentions = torch.zeros(batch_size, trg_len, src_len, device=self.device)

        encoder_outputs, hidden = self.encoder(src)

        # The first target column is the decoder's first input
        decoder_input = trg[:, 0]

        for t in range(1, trg_len):
            step_scores, hidden, step_attention = self.decoder(decoder_input, hidden, encoder_outputs)

            outputs[:, t] = step_scores
            attentions[:, t] = step_attention

            # One random draw per step decides between ground truth and prediction
            if random.random() < teacher_forcing_ratio:
                decoder_input = trg[:, t]
            else:
                decoder_input = step_scores.argmax(1)

        return outputs, attentions

In [9]:
# Cell 9: Initialize the model
# Vocabulary sizes come from the tokenizers; the remaining values are fixed hyperparameters
INPUT_DIM = len(en_tokenizer.get_vocab())
OUTPUT_DIM = len(fr_tokenizer.get_vocab())
EMB_DIM, HIDDEN_DIM, N_LAYERS = 256, 512, 2
ENC_DROPOUT = DEC_DROPOUT = 0.5

# Build the encoder, the attention module, and the decoder that uses it
encoder = Encoder(INPUT_DIM, EMB_DIM, HIDDEN_DIM, n_layers=N_LAYERS, dropout=ENC_DROPOUT)
attention = Attention(hidden_dim=HIDDEN_DIM)
decoder = Decoder(OUTPUT_DIM, EMB_DIM, HIDDEN_DIM, attention, n_layers=N_LAYERS, dropout=DEC_DROPOUT)

# Assemble the full model and move it to the selected device
model = Seq2Seq(encoder=encoder, decoder=decoder, device=device).to(device)

# Weight initialisation helper
def init_weights(m):
    """Re-initialise every parameter reachable from module ``m`` in place.

    Parameters whose name contains 'weight' are drawn from N(0, 0.01^2);
    all other parameters (e.g. biases) are set to zero.
    """
    for name, param in m.named_parameters():
        if 'weight' not in name:
            nn.init.constant_(param.data, 0)
            continue
        nn.init.normal_(param.data, mean=0, std=0.01)
            
# Re-initialise all model parameters with the scheme defined in init_weights
model.apply(init_weights)

# Define optimizer and loss
optimizer = optim.Adam(model.parameters(), lr=0.001)
# Take the padding id from the target tokenizer instead of assuming it is 0.
# For the Helsinki-NLP/opus-mt-en-fr tokenizer loaded above, id 0 is expected to be
# the end-of-sentence token, so ignore_index=0 would drop </s> from the loss while
# still training on padding positions.
criterion = nn.CrossEntropyLoss(ignore_index=fr_tokenizer.pad_token_id)

print(f"The model has {sum(p.numel() for p in model.parameters() if p.requires_grad):,} trainable parameters")

The model has 149,913,722 trainable parameters




In [10]:
# Cell 10: Define training function
def train(model, dataloader, optimizer, criterion, clip=1):
    """Run one training epoch and return the mean per-batch loss.

    Args:
        model: called as ``model(src, trg)`` and expected to return
            ``(outputs, attentions)`` with outputs of shape
            [batch_size, trg_len, output_dim].
        dataloader: yields dict batches with 'input_ids' and 'target_ids' tensors.
        optimizer: optimizer over ``model.parameters()``.
        criterion: loss taking flattened scores and flattened target ids.
        clip: maximum gradient norm passed to ``clip_grad_norm_``.

    Batches are moved to the module-level ``device``.
    """
    model.train()
    running_loss = 0

    for batch in tqdm(dataloader):
        src, trg = (batch[key].to(device) for key in ('input_ids', 'target_ids'))

        optimizer.zero_grad()

        scores, _ = model(src, trg)
        # scores: [batch_size, trg_len, output_dim]
        vocab_size = scores.shape[-1]

        # Position 0 only seeds the decoder, so it is excluded from the loss;
        # everything else is flattened to one row per target token
        flat_scores = scores[:, 1:].reshape(-1, vocab_size)
        flat_targets = trg[:, 1:].reshape(-1)

        loss = criterion(flat_scores, flat_targets)
        loss.backward()

        # Cap the gradient norm before the update to guard against exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        running_loss += loss.item()

    return running_loss / len(dataloader)

In [11]:
# Cell 11: Define function to translate a sentence
def translate_sentence(sentence, src_tokenizer, trg_tokenizer, model, device, max_len=50):
    """Greedily decode a translation of ``sentence``.

    Args:
        sentence: source text.
        src_tokenizer: tokenizer used to encode ``sentence``.
        trg_tokenizer: tokenizer providing the start/EOS ids and ``decode``.
        model: object exposing ``encoder`` and ``decoder`` sub-modules.
        device: device on which the tensors are created.
        max_len: maximum number of decoding steps.

    Returns:
        A tuple ``(text, attentions)``: ``text`` is the decoded string (it includes
        the start token, as produced by ``trg_tokenizer.decode``) and
        ``attentions`` has one row of source attention weights per executed step.

    Raises:
        ValueError: if the target tokenizer has neither a BOS nor a pad token id.
    """
    model.eval()

    # Some tokenizers define no BOS token (bos_token_id is None), which previously
    # crashed in torch.tensor([None]); fall back to the pad id in that case.
    # NOTE(review): the training loop feeds trg[:, 0] as the first decoder input --
    # confirm the start token used here matches what the model saw in training.
    start_id = getattr(trg_tokenizer, 'bos_token_id', None)
    if start_id is None:
        start_id = getattr(trg_tokenizer, 'pad_token_id', None)
    if start_id is None:
        raise ValueError("trg_tokenizer defines neither bos_token_id nor pad_token_id")

    # Tokenize the source sentence
    tokens = src_tokenizer.encode(sentence, return_tensors='pt').to(device)

    trg_indexes = [start_id]
    attentions = torch.zeros(max_len, tokens.shape[1]).to(device)
    steps_run = 0  # tracked explicitly so max_len == 0 returns an empty attention matrix

    # Inference only: skip autograd bookkeeping to save memory and time
    with torch.no_grad():
        encoder_outputs, hidden = model.encoder(tokens)
        decoder_input = torch.tensor([start_id]).to(device)

        for step in range(max_len):
            # Decode one token at a time
            output, hidden, attention = model.decoder(decoder_input, hidden, encoder_outputs)

            attentions[step] = attention
            steps_run = step + 1

            # Greedy choice of the next token
            pred_token = output.argmax(1).item()

            # Stop at EOS without adding it to the output
            if pred_token == trg_tokenizer.eos_token_id:
                break

            trg_indexes.append(pred_token)
            decoder_input = torch.tensor([pred_token]).to(device)

    # Convert token ids back to text
    trg_tokens = trg_tokenizer.decode(trg_indexes)

    return trg_tokens, attentions[:steps_run]

In [None]:
# Cell 13: Train for a few epochs (reduced for demonstration)
N_EPOCHS = 3
best_loss = float('inf')

for epoch in range(N_EPOCHS):
    print(f"Epoch {epoch+1}/{N_EPOCHS}")

    train_loss = train(model, dataloader, optimizer, criterion)
    print(f"\tTrain Loss: {train_loss:.4f}")

    # Only checkpoint when this epoch improved on the best training loss so far
    if not train_loss < best_loss:
        continue

    best_loss = train_loss
    torch.save(model.state_dict(), 'nmt-model.pt')
    print(f"\tModel saved with loss: {best_loss:.4f}")