In [7]:
from datasets import load_dataset
from transformers import BertTokenizer

# Load the OPUS100 dataset for English-Spanish
dataset = load_dataset("opus100", "en-es")

# Split the dataset into training and validation sets
train_data = dataset['train']
val_data = dataset['validation']

# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# Preprocess the dataset: tokenize both English and Spanish sentences in batches
def preprocess_function(examples):
    # Extract English and Spanish sentences from the 'translation' field
    inputs = [ex['en'] for ex in examples['translation']]  # List of English sentences
    targets = [ex['es'] for ex in examples['translation']]  # List of Spanish sentences

    # Tokenize the inputs (English sentences)
    model_inputs = tokenizer(inputs, max_length=128, padding='max_length', truncation=True)

    # Tokenize the targets (Spanish sentences)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=128, padding='max_length', truncation=True)

    # Add the tokenized Spanish sentences as labels
    model_inputs['labels'] = labels['input_ids']

    return model_inputs

# Apply preprocessing to the dataset with batched=True
tokenized_train_data = train_data.map(preprocess_function, batched=True)
tokenized_val_data = val_data.map(preprocess_function, batched=True)

split_dataset = tokenized_train_data.train_test_split(test_size=0.05)
tokenized_train_data = split_dataset['test']

In [8]:
import torch
import torch.nn as nn
import torch.optim as optim

# Define the Transformer model
class TransformerTranslator(nn.Module):
    def __init__(self, input_dim, output_dim, n_heads, ff_dim, n_layers, vocab_size):
        super(TransformerTranslator, self).__init__()
        self.embedding = nn.Embedding(vocab_size, input_dim)
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=input_dim, nhead=n_heads, dim_feedforward=ff_dim, batch_first=True)
        self.encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=n_layers)
        self.decoder_layer = nn.TransformerDecoderLayer(d_model=output_dim, nhead=n_heads, dim_feedforward=ff_dim, batch_first=True)
        self.decoder = nn.TransformerDecoder(self.decoder_layer, num_layers=n_layers)
        self.fc_out = nn.Linear(output_dim, vocab_size)

    def forward(self, src, tgt):
        src = self.embedding(src)
        tgt = self.embedding(tgt)
        memory = self.encoder(src)
        output = self.decoder(tgt, memory)
        return self.fc_out(output)

# Define model hyperparameters
input_dim = 512
output_dim = 512
n_heads = 8
ff_dim = 2048
n_layers = 6
vocab_size = tokenizer.vocab_size

# Instantiate the model
model = TransformerTranslator(input_dim, output_dim, n_heads, ff_dim, n_layers, vocab_size)

# Define optimizer and loss function
optimizer = optim.Adam(model.parameters(), lr=0.0001)
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

In [None]:
from torch.utils.data import DataLoader

# Convert the preprocessed dataset into a PyTorch DataLoader
train_dataset = tokenized_train_data.with_format("torch")
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Training loop
def train_model(model, dataloader, optimizer, criterion, num_epochs=10):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for batch in dataloader:
            src = batch['input_ids']  # English inputs
            tgt_input = batch['labels'][:, :-1]  # Spanish targets excluding the last token
            tgt_output = batch['labels'][:, 1:]  # Shifted Spanish targets for teacher forcing

            optimizer.zero_grad()

            # Forward pass
            output = model(src, tgt_input)
            output = output.permute(0, 2, 1)  # Rearrange for cross-entropy (batch, vocab_size, seq_len)
            loss = criterion(output, tgt_output)

            # Backpropagation and optimization
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {total_loss / len(dataloader)}')

# Train the model
train_model(model, train_dataloader, optimizer, criterion, num_epochs=10)

Epoch 1/10, Loss: 5.514295684170128
Epoch 2/10, Loss: 3.946640731887183


In [None]:
def evaluate(model, sentence, tokenizer, max_len=50):
    model.eval()

    # Tokenize the input sentence
    tokens = tokenizer(sentence, return_tensors='pt')['input_ids']

    # Initialize with start token
    tgt_tokens = torch.tensor([[tokenizer.cls_token_id]], dtype=torch.long)

    for _ in range(max_len):
        output = model(tokens, tgt_tokens)
        next_token = torch.argmax(output[:, -1, :], dim=1).unsqueeze(0)
        tgt_tokens = torch.cat((tgt_tokens, next_token), dim=1)

        if next_token.item() == tokenizer.sep_token_id:
            break

    # Decode the generated token sequence
    translated_sentence = tokenizer.decode(tgt_tokens.squeeze(), skip_special_tokens=True)
    return translated_sentence

# Test the model with a sentence
translated_sentence = evaluate(model, "Hello, how are you?", tokenizer)
print("Translated sentence:", translated_sentence)
