In [1]:
from datasets import load_dataset
from Transformer import Transformer
import torchtext
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

In [2]:
# Fetch the "en-fr" configuration of OPUS Books and drop the unused 'id' column
dataset = load_dataset("Helsinki-NLP/opus_books", "en-fr").remove_columns(["id"])

In [3]:
from transformers import AutoTokenizer

# Pretrained tokenizer taken from the Helsinki-NLP en-fr translation checkpoint
tokenizer = AutoTokenizer.from_pretrained(
    "Helsinki-NLP/opus-mt-en-fr",
)



In [4]:
def tokenize_function(batch):
    """Tokenize a batch of English-French translation pairs.

    Args:
        batch: A batched mapping whose 'translation' entry is a list of
            dicts, each holding an 'en' and an 'fr' sentence.

    Returns:
        dict with three parallel lists:
            'input_ids'      - token ids of the English source sentences,
            'attention_mask' - attention masks for the source sentences,
            'labels'         - token ids of the French target sentences.
    """
    # Split the batch into parallel lists of source and target sentences
    source_sentences = [example['en'] for example in batch['translation']]  # English sentences
    target_sentences = [example['fr'] for example in batch['translation']]  # French sentences

    # Source side: default (source-language) tokenization, padded to a fixed length
    source = tokenizer(source_sentences, padding="max_length", truncation=True)
    # Target side: text_target= makes the tokenizer encode the text as a target,
    # so translation tokenizers with separate source/target models use the
    # target-language one instead of the source-language one.
    target = tokenizer(text_target=target_sentences, padding="max_length", truncation=True)

    return {
        'input_ids': source['input_ids'],       # Tokenized source sentences (English)
        'attention_mask': source['attention_mask'],  # Attention masks for source
        'labels': target['input_ids']           # Tokenized target sentences (French)
    }

# Tokenize the whole dataset, handing examples to tokenize_function in batches
tokenized_dataset = dataset.map(function=tokenize_function, batched=True)


In [5]:
# Return the three tokenized columns as PyTorch tensors
tokenized_dataset.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"],
)

# Shuffled mini-batches of 16 examples drawn from the 'train' split
train_loader = DataLoader(
    tokenized_dataset["train"],
    shuffle=True,
    batch_size=16,
)

In [6]:
# Model hyperparameters
# Source (English) and target (French) vocabulary sizes both come from the same tokenizer.
src_vocab_size = tgt_vocab_size = tokenizer.vocab_size
d_model = 512     # model dimensionality
num_heads = 8     # attention heads
num_layers = 6    # encoder and decoder layers
d_ff = 2048       # feed-forward layer dimensionality
dropout = 0.1     # dropout rate
max_len = 512     # maximum sequence length

In [7]:
# Instantiate the Transformer and place it on the CPU
model = Transformer(
    src_vocab_size,
    tgt_vocab_size,
    d_model,
    num_heads,
    d_ff,
    num_layers,
    dropout,
    max_len,
).to("cpu")

Below is a custom implementation of the learning rate scheduler based on the paper's formula, $lrate = d_{model}^{-0.5} \cdot \min\left(step^{-0.5},\ step \cdot warmup\_steps^{-1.5}\right)$:

In [9]:
class CustomLRScheduler(torch.optim.lr_scheduler.LambdaLR):
    """Warmup-then-decay learning rate schedule built on LambdaLR.

    The multiplicative factor at a given step is
    d_model ** -0.5 * min(step ** -0.5, step * warmup_steps ** -1.5).
    LambdaLR multiplies the optimizer's initial learning rate by this factor.

    Args:
        optimizer: Optimizer whose learning rate is scheduled.
        d_model: Model dimensionality used to scale the schedule.
        warmup_steps: Step at which the rising and decaying terms meet.
    """

    def __init__(self, optimizer, d_model, warmup_steps=4000):
        # These must exist before the parent constructor runs, because it
        # already evaluates lr_lambda once.
        self.d_model = d_model
        self.warmup_steps = warmup_steps
        super().__init__(optimizer, self.lr_lambda)

    def lr_lambda(self, step):
        """Return the learning rate factor for the given step count."""
        # Step 0 is treated as step 1 so the negative power is defined
        step = max(1, step)
        decay_term = step ** -0.5
        warmup_term = step * (self.warmup_steps ** -1.5)
        return (self.d_model ** -0.5) * min(decay_term, warmup_term)

In the original paper, the authors used the Adam optimizer with $\beta_1 = 0.9$, $\beta_2 = 0.98$, and $\epsilon = 10^{-9}$.

The `warmup_steps` parameter determines for how many steps the learning rate increases before it starts to decay. In the "Attention Is All You Need" paper, this was typically set to 4000. However, you can experiment with this value depending on your dataset and model size.

In [10]:
import torch.optim as optim

warmup_steps = 4000
# Loss function: padding positions in the target do not contribute to the loss
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
# Adam with the betas/eps given in the paper.
# The base lr must be 1.0, not 0: LambdaLR sets the learning rate to
# (initial lr) * lr_lambda(step), so a base lr of 0 would pin the effective
# learning rate at 0 for the whole run.
optimizer = torch.optim.Adam(model.parameters(), lr=1.0, betas=(0.9, 0.98), eps=1e-9)
# The scheduler overwrites the optimizer's lr with the schedule value on construction
scheduler = CustomLRScheduler(optimizer, d_model, warmup_steps)

In [12]:
# Training loop
num_epochs = 2
for epoch in range(num_epochs):
    model.train()  # Set the model to training mode
    epoch_loss = 0

    for batch in train_loader:
        src = batch['input_ids'].to('cpu')
        trg = batch['labels'].to('cpu')

        # Clear gradients accumulated by the previous batch
        optimizer.zero_grad()

        # Forward pass: the decoder input is the target without its last position
        output = model(src, trg[:, :-1])

        # Compare predictions with the target shifted left by one position.
        # Assumes output is [batch_size, seq_len, vocab_size]; it is flattened to
        # [batch_size * seq_len, vocab_size] to match the flattened target ids.
        loss = criterion(output.reshape(-1, output.shape[-1]), trg[:, 1:].reshape(-1))
        loss.backward()

        # Apply the gradients; without this call the weights are never updated
        optimizer.step()
        # Advance the learning rate schedule once per batch, after the optimizer update
        scheduler.step()

        epoch_loss += loss.item()

    # Mean per-batch loss over the epoch
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss/len(train_loader)}")

KeyboardInterrupt: 