# Sequence-to-Sequence Model with LSTM and Attention

## Introduction

In this notebook, we'll build a **Sequence-to-Sequence (Seq2Seq)** model for English-French translation.

### Key Components:
1. **Encoder**: LSTM that processes the input sequence (English)
2. **Attention Mechanism**: Allows the decoder to focus on different parts of the input
3. **Decoder**: LSTM that generates the output sequence (French)

### Dataset:
- English-French sentence pairs from `data/eng_french.csv`

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from collections import Counter
import matplotlib.pyplot as plt
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Set random seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cpu


## Load and Prepare Data

In [2]:
# Load the dataset
df = pd.read_csv('data/eng_french.csv')
print(f"Dataset shape: {df.shape}")
print(f"\nFirst few rows:")
print(df.head())

# Take a subset for faster training (can be adjusted)
df = df.sample(n=min(50000, len(df)), random_state=42).reset_index(drop=True)
print(f"\nUsing {len(df)} sentence pairs")

Dataset shape: (175621, 2)

First few rows:
  English words/sentences French words/sentences
0                     Hi.                 Salut!
1                    Run!                Cours !
2                    Run!               Courez !
3                    Who?                  Qui ?
4                    Wow!             Ça alors !

Using 50000 sentence pairs


## Preprocessing

In [3]:
import re

# Special tokens
SOS_TOKEN = '<SOS>'
EOS_TOKEN = '<EOS>'
PAD_TOKEN = '<PAD>'
UNK_TOKEN = '<UNK>'

def preprocess_sentence(sentence):
    """Basic preprocessing: lowercase and clean punctuation"""
    sentence = sentence.lower().strip()
    # Add space before punctuation
    sentence = re.sub(r"([?.!,¿])", r" \1 ", sentence)
    sentence = re.sub(r'[" "]+', " ", sentence)
    sentence = sentence.strip()
    return sentence

# Preprocess sentences
df['eng_clean'] = df['English words/sentences'].apply(preprocess_sentence)
df['fr_clean'] = df['French words/sentences'].apply(preprocess_sentence)

print("Sample preprocessed data:")
print(df[['eng_clean', 'fr_clean']].head())

Sample preprocessed data:
                                       eng_clean  \
0                                  take a seat .   
1                          i wish tom was here .   
2                      how did the audition go ?   
3  i've no friend to talk to about my problems .   
4   i really like this skirt . can i try it on ?   

                                            fr_clean  
0                                     prends place !  
1                       j'aimerais que tom soit là .  
2                 comment s'est passée l'audition  ?  
3  je n'ai pas d'ami avec lequel je puisse m'entr...  
4  j'aime beaucoup cette jupe , puis-je l'essayer  ?  


## Build Vocabulary

In [4]:
class Vocabulary:
    def __init__(self, name):
        self.name = name
        self.word2index = {PAD_TOKEN: 0, SOS_TOKEN: 1, EOS_TOKEN: 2, UNK_TOKEN: 3}
        self.index2word = {0: PAD_TOKEN, 1: SOS_TOKEN, 2: EOS_TOKEN, 3: UNK_TOKEN}
        self.word_count = {}
        self.n_words = 4  # Count PAD, SOS, EOS, UNK
    
    def add_sentence(self, sentence):
        for word in sentence.split():
            self.add_word(word)
    
    def add_word(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.index2word[self.n_words] = word
            self.word_count[word] = 1
            self.n_words += 1
        else:
            self.word_count[word] += 1

# Build vocabularies
eng_vocab = Vocabulary('english')
fr_vocab = Vocabulary('french')

for _, row in df.iterrows():
    eng_vocab.add_sentence(row['eng_clean'])
    fr_vocab.add_sentence(row['fr_clean'])

print(f"English vocabulary size: {eng_vocab.n_words}")
print(f"French vocabulary size: {fr_vocab.n_words}")

English vocabulary size: 9581
French vocabulary size: 17309


## Dataset Class

In [5]:
class TranslationDataset(Dataset):
    def __init__(self, df, eng_vocab, fr_vocab, max_len=50):
        self.df = df
        self.eng_vocab = eng_vocab
        self.fr_vocab = fr_vocab
        self.max_len = max_len
    
    def __len__(self):
        return len(self.df)
    
    def sentence_to_indices(self, sentence, vocab, add_sos_eos=False):
        indices = []
        if add_sos_eos:
            indices.append(vocab.word2index[SOS_TOKEN])
        
        for word in sentence.split():
            if word in vocab.word2index:
                indices.append(vocab.word2index[word])
            else:
                indices.append(vocab.word2index[UNK_TOKEN])
        
        if add_sos_eos:
            indices.append(vocab.word2index[EOS_TOKEN])
        
        # Truncate if too long
        indices = indices[:self.max_len]
        
        # Pad if too short
        while len(indices) < self.max_len:
            indices.append(vocab.word2index[PAD_TOKEN])
        
        return indices
    
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        
        # Encoder input (English) - no SOS/EOS
        eng_indices = self.sentence_to_indices(row['eng_clean'], self.eng_vocab, add_sos_eos=False)
        
        # Decoder input (French) - with SOS/EOS
        fr_indices = self.sentence_to_indices(row['fr_clean'], self.fr_vocab, add_sos_eos=True)
        
        return torch.tensor(eng_indices, dtype=torch.long), torch.tensor(fr_indices, dtype=torch.long)

# Split data
train_size = int(0.8 * len(df))
train_df = df[:train_size]
val_df = df[train_size:]

# Create datasets
train_dataset = TranslationDataset(train_df, eng_vocab, fr_vocab)
val_dataset = TranslationDataset(val_df, eng_vocab, fr_vocab)

# Create dataloaders
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(val_dataset)}")
print(f"Batches per epoch: {len(train_loader)}")

Training samples: 40000
Validation samples: 10000
Batches per epoch: 625


## Attention Mechanism

### Bahdanau Attention (Additive Attention)

The attention mechanism computes a weighted sum of encoder hidden states:

```
score(h_t, h_s) = v^T tanh(W_1 h_t + W_2 h_s)
attention_weights = softmax(scores)
context = sum(attention_weights * encoder_outputs)
```

Where:
- h_t: current decoder hidden state
- h_s: encoder hidden state at position s
- v, W_1, W_2: learned parameters

In [6]:
class Attention(nn.Module):
    """
    Bahdanau (Additive) Attention mechanism.
    """
    def __init__(self, hidden_dim):
        super(Attention, self).__init__()
        self.hidden_dim = hidden_dim
        
        # W_1: transforms encoder outputs
        self.W1 = nn.Linear(hidden_dim, hidden_dim)
        # W_2: transforms decoder hidden state
        self.W2 = nn.Linear(hidden_dim, hidden_dim)
        # v: combines the transformed states
        self.V = nn.Linear(hidden_dim, 1)
    
    def forward(self, decoder_hidden, encoder_outputs):
        """
        Args:
            decoder_hidden: (batch_size, hidden_dim)
            encoder_outputs: (batch_size, seq_len, hidden_dim)
        
        Returns:
            context: (batch_size, hidden_dim)
            attention_weights: (batch_size, seq_len)
        """
        batch_size = encoder_outputs.size(0)
        seq_len = encoder_outputs.size(1)
        
        # Repeat decoder hidden state seq_len times
        # (batch_size, seq_len, hidden_dim)
        decoder_hidden = decoder_hidden.unsqueeze(1).repeat(1, seq_len, 1)
        
        # Calculate attention scores
        # energy shape: (batch_size, seq_len, hidden_dim)
        energy = torch.tanh(self.W1(encoder_outputs) + self.W2(decoder_hidden))
        
        # attention shape: (batch_size, seq_len)
        scores = self.V(energy).squeeze(2)
        
        # Get attention weights (batch_size, seq_len)
        attention_weights = torch.softmax(scores, dim=1)
        
        # Calculate context vector
        # (batch_size, hidden_dim)
        context = torch.bmm(attention_weights.unsqueeze(1), encoder_outputs).squeeze(1)
        
        return context, attention_weights

## Encoder

In [7]:
class Encoder(nn.Module):
    """
    LSTM Encoder for the input sequence.
    """
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers=1, dropout=0.5):
        super(Encoder, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        
        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        
        # LSTM layer
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0
        )
        
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x):
        """
        Args:
            x: (batch_size, seq_len)
        
        Returns:
            outputs: (batch_size, seq_len, hidden_dim)
            hidden: (num_layers, batch_size, hidden_dim)
            cell: (num_layers, batch_size, hidden_dim)
        """
        # Embedding: (batch_size, seq_len, embedding_dim)
        embedded = self.dropout(self.embedding(x))
        
        # LSTM outputs
        outputs, (hidden, cell) = self.lstm(embedded)
        
        return outputs, hidden, cell

## Decoder with Attention

In [8]:
class Decoder(nn.Module):
    """
    LSTM Decoder with Attention mechanism.
    """
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers=1, dropout=0.5):
        super(Decoder, self).__init__()
        self.vocab_size = vocab_size
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        
        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        
        # Attention mechanism
        self.attention = Attention(hidden_dim)
        
        # LSTM layer (input: embedding + context)
        self.lstm = nn.LSTM(
            embedding_dim + hidden_dim,  # Embedding + context vector
            hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0
        )
        
        # Output layer
        self.fc = nn.Linear(hidden_dim, vocab_size)
        
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x, hidden, cell, encoder_outputs):
        """
        Args:
            x: (batch_size, 1) - single time step
            hidden: (num_layers, batch_size, hidden_dim)
            cell: (num_layers, batch_size, hidden_dim)
            encoder_outputs: (batch_size, seq_len, hidden_dim)
        
        Returns:
            output: (batch_size, vocab_size)
            hidden: (num_layers, batch_size, hidden_dim)
            cell: (num_layers, batch_size, hidden_dim)
            attention_weights: (batch_size, seq_len)
        """
        # Embedding: (batch_size, 1, embedding_dim)
        embedded = self.dropout(self.embedding(x))
        
        # Calculate attention using the last layer's hidden state
        # hidden[-1]: (batch_size, hidden_dim)
        context, attention_weights = self.attention(hidden[-1], encoder_outputs)
        
        # Concatenate embedding and context: (batch_size, 1, embedding_dim + hidden_dim)
        lstm_input = torch.cat([embedded, context.unsqueeze(1)], dim=2)
        
        # LSTM forward
        output, (hidden, cell) = self.lstm(lstm_input, (hidden, cell))
        
        # Output: (batch_size, vocab_size)
        output = self.fc(output.squeeze(1))
        
        return output, hidden, cell, attention_weights

## Seq2Seq Model

In [9]:
class Seq2Seq(nn.Module):
    """
    Complete Sequence-to-Sequence model with attention.
    """
    def __init__(self, encoder, decoder, device):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
    
    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """
        Args:
            src: source sequence (batch_size, src_len)
            trg: target sequence (batch_size, trg_len)
            teacher_forcing_ratio: probability of using teacher forcing
        
        Returns:
            outputs: (batch_size, trg_len, vocab_size)
        """
        batch_size = src.size(0)
        trg_len = trg.size(1)
        trg_vocab_size = self.decoder.vocab_size
        
        # Tensor to store decoder outputs
        outputs = torch.zeros(batch_size, trg_len, trg_vocab_size).to(self.device)
        
        # Encode the source sequence
        encoder_outputs, hidden, cell = self.encoder(src)
        
        # First input to decoder is <SOS> token
        decoder_input = trg[:, 0].unsqueeze(1)  # (batch_size, 1)
        
        # Decode one token at a time
        for t in range(1, trg_len):
            # Forward through decoder
            output, hidden, cell, attention_weights = self.decoder(
                decoder_input, hidden, cell, encoder_outputs
            )
            
            # Store output
            outputs[:, t, :] = output
            
            # Teacher forcing: use ground truth as next input
            if np.random.random() < teacher_forcing_ratio:
                decoder_input = trg[:, t].unsqueeze(1)
            else:
                # Use model's own prediction
                decoder_input = output.argmax(1).unsqueeze(1)
        
        return outputs

# Model hyperparameters
EMBEDDING_DIM = 256
HIDDEN_DIM = 512
NUM_LAYERS = 2
DROPOUT = 0.5

# Initialize model
encoder = Encoder(
    vocab_size=eng_vocab.n_words,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    num_layers=NUM_LAYERS,
    dropout=DROPOUT
).to(device)

decoder = Decoder(
    vocab_size=fr_vocab.n_words,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    num_layers=NUM_LAYERS,
    dropout=DROPOUT
).to(device)

model = Seq2Seq(encoder, decoder, device).to(device)

print("Seq2Seq Model Architecture:")
print(f"\nEncoder parameters: {sum(p.numel() for p in encoder.parameters()):,}")
print(f"Decoder parameters: {sum(p.numel() for p in decoder.parameters()):,}")
print(f"Total parameters: {sum(p.numel() for p in model.parameters()):,}")

Seq2Seq Model Architecture:

Encoder parameters: 6,130,944
Decoder parameters: 18,563,230
Total parameters: 24,694,174


## Training Functions

In [10]:
def train_epoch(model, dataloader, criterion, optimizer, clip, teacher_forcing_ratio):
    """Train for one epoch"""
    model.train()
    epoch_loss = 0
    
    for src, trg in tqdm(dataloader, desc="Training"):
        src, trg = src.to(device), trg.to(device)
        
        optimizer.zero_grad()
        
        # Forward pass
        output = model(src, trg, teacher_forcing_ratio)
        
        # output: (batch_size, trg_len, vocab_size)
        # trg: (batch_size, trg_len)
        
        # Reshape for loss calculation (ignore <SOS> token)
        output = output[:, 1:, :].contiguous().view(-1, output.shape[-1])
        trg = trg[:, 1:].contiguous().view(-1)
        
        # Calculate loss
        loss = criterion(output, trg)
        
        # Backward pass
        loss.backward()
        
        # Clip gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        # Update weights
        optimizer.step()
        
        epoch_loss += loss.item()
    
    return epoch_loss / len(dataloader)

def evaluate(model, dataloader, criterion):
    """Evaluate the model"""
    model.eval()
    epoch_loss = 0
    
    with torch.no_grad():
        for src, trg in tqdm(dataloader, desc="Evaluating"):
            src, trg = src.to(device), trg.to(device)
            
            # Forward pass (no teacher forcing during evaluation)
            output = model(src, trg, teacher_forcing_ratio=0)
            
            # Reshape for loss calculation
            output = output[:, 1:, :].contiguous().view(-1, output.shape[-1])
            trg = trg[:, 1:].contiguous().view(-1)
            
            # Calculate loss
            loss = criterion(output, trg)
            
            epoch_loss += loss.item()
    
    return epoch_loss / len(dataloader)

def plot_history(train_losses, val_losses):
    """Plot training history"""
    plt.figure(figsize=(10, 6))
    plt.plot(train_losses, label='Train Loss', marker='o')
    plt.plot(val_losses, label='Validation Loss', marker='s')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Seq2Seq with Attention - Training Progress')
    plt.legend()
    plt.grid(True)
    plt.show()

## Training Loop

In [11]:
# Training hyperparameters
EPOCHS = 10
LEARNING_RATE = 0.001
CLIP = 1
TEACHER_FORCING_RATIO = 0.5

# Loss and optimizer
criterion = nn.CrossEntropyLoss(ignore_index=fr_vocab.word2index[PAD_TOKEN])
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

# Training history
train_losses = []
val_losses = []

print("Starting training...\n")

for epoch in range(EPOCHS):
    print(f"Epoch {epoch+1}/{EPOCHS}")
    
    # Train
    train_loss = train_epoch(model, train_loader, criterion, optimizer, CLIP, TEACHER_FORCING_RATIO)
    
    # Evaluate
    val_loss = evaluate(model, val_loader, criterion)
    
    # Save history
    train_losses.append(train_loss)
    val_losses.append(val_loss)
    
    print(f"Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}\n")

# Plot training history
plot_history(train_losses, val_losses)

Starting training...

Epoch 1/10


Training:   0%|          | 2/625 [00:49<4:19:10, 24.96s/it]


KeyboardInterrupt: 