In [1]:
!pip install datasets transformers

from datasets import load_dataset
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import math

# Dataset Loading
dataset = load_dataset("Helsinki-NLP/opus_books", "de-en")
train_data = dataset['train']
print(f"Total number of samples: {len(train_data)}")

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (19

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/28.1k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/8.80M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/51467 [00:00<?, ? examples/s]

Total number of samples: 51467


In [2]:
def simple_tokenizer(text):
    return text.split()

def build_vocab(data_iter, lang):
    vocab = {'<pad>': 0, '<unk>': 1, '<sos>': 2, '<eos>': 3}
    for data in data_iter:
        for token in simple_tokenizer(data['translation'][lang].lower()):
            if token not in vocab:
                vocab[token] = len(vocab)
    return vocab

de_vocab = build_vocab(train_data, 'de')
en_vocab = build_vocab(train_data, 'en')

print(f"German vocabulary size: {len(de_vocab)}")
print(f"English vocabulary size: {len(en_vocab)}")

German vocabulary size: 102728
English vocabulary size: 73515


In [3]:
class TranslationDataset(Dataset):
    def __init__(self, data, de_vocab, en_vocab, max_len=100):
        self.data = data
        self.de_vocab = de_vocab
        self.en_vocab = en_vocab
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        de_text = self.data[idx]['translation']['de']
        en_text = self.data[idx]['translation']['en']

        de_tokens = simple_tokenizer(de_text.lower())
        en_tokens = simple_tokenizer(en_text.lower())

        de_ids = [self.de_vocab.get(token, self.de_vocab['<unk>']) for token in de_tokens]
        en_ids = [self.en_vocab.get(token, self.en_vocab['<unk>']) for token in en_tokens]

        de_ids = [self.de_vocab['<sos>']] + de_ids[:self.max_len-2] + [self.de_vocab['<eos>']]
        en_ids = [self.en_vocab['<sos>']] + en_ids[:self.max_len-2] + [self.en_vocab['<eos>']]

        de_ids += [self.de_vocab['<pad>']] * (self.max_len - len(de_ids))
        en_ids += [self.en_vocab['<pad>']] * (self.max_len - len(en_ids))

        return torch.tensor(de_ids), torch.tensor(en_ids)

# Prepare the Dataset for training
train_dataset = TranslationDataset(train_data, de_vocab, en_vocab)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

In [4]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return x

class Transformer(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, nhead, num_encoder_layers, num_decoder_layers, dim_feedforward, max_seq_length, dropout=0.1):
        super(Transformer, self).__init__()
        self.src_embedding = nn.Embedding(src_vocab_size, d_model)
        self.tgt_embedding = nn.Embedding(tgt_vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length)
        self.transformer = nn.Transformer(
            d_model=d_model,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True
        )
        self.fc = nn.Linear(d_model, tgt_vocab_size)

    def forward(self, src, tgt):
        src_embedded = self.positional_encoding(self.src_embedding(src))
        tgt_embedded = self.positional_encoding(self.tgt_embedding(tgt))
        tgt_mask = self.generate_square_subsequent_mask(tgt.size(1)).to(tgt.device)
        src_padding_mask = (src == 0).to(src.device)
        tgt_padding_mask = (tgt == 0).to(tgt.device)
        output = self.transformer(src_embedded, tgt_embedded, tgt_mask=tgt_mask,
                                  src_key_padding_mask=src_padding_mask,
                                  tgt_key_padding_mask=tgt_padding_mask)
        return self.fc(output)

    def generate_square_subsequent_mask(self, sz):
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

In [9]:
# Model Initialization
src_vocab_size = len(de_vocab)  # This depends on your dataset, so I'll leave it as is
tgt_vocab_size = len(en_vocab)  # This depends on your dataset, so I'll leave it as is
d_model = 128  # Reduced from 256
nhead = 4  # Reduced from 8
num_encoder_layers = 3  # Reduced from 6
num_decoder_layers = 3  # Reduced from 6
dim_feedforward = 512  # Reduced from 1024
max_seq_length = 64  # Reduced from 100

model = Transformer(src_vocab_size, tgt_vocab_size, d_model, nhead, num_encoder_layers, num_decoder_layers, dim_feedforward, max_seq_length)
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(model.parameters(), lr=0.0001)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

print(f"Using device: {device}")

Using device: cuda


In [10]:
num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_predictions = 0

    for src, tgt in train_loader:
        src, tgt = src.to(device), tgt.to(device)
        optimizer.zero_grad()
        tgt_input = tgt[:, :-1]
        tgt_output = tgt[:, 1:]
        output = model(src, tgt_input)
        output_flat = output.contiguous().view(-1, output.size(-1))
        tgt_output_flat = tgt_output.contiguous().view(-1)
        loss = criterion(output_flat, tgt_output_flat)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        _, predicted = torch.max(output_flat, dim=1)
        correct_predictions += (predicted == tgt_output_flat).sum().item()
        total_predictions += tgt_output_flat.size(0)

    epoch_loss = total_loss / len(train_loader)
    epoch_accuracy = correct_predictions / total_predictions
    print(f"Epoch: {epoch+1}, Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.4f}")

Epoch: 1, Loss: 7.8440, Accuracy: 0.0179
Epoch: 2, Loss: 6.7910, Accuracy: 0.0286
Epoch: 3, Loss: 6.4957, Accuracy: 0.0321
Epoch: 4, Loss: 6.3021, Accuracy: 0.0341
Epoch: 5, Loss: 6.1548, Accuracy: 0.0356
Epoch: 6, Loss: 6.0318, Accuracy: 0.0369
Epoch: 7, Loss: 5.9245, Accuracy: 0.0380
Epoch: 8, Loss: 5.8279, Accuracy: 0.0390
Epoch: 9, Loss: 5.7376, Accuracy: 0.0399
Epoch: 10, Loss: 5.6573, Accuracy: 0.0406


In [12]:
def translate(model, sentence, de_vocab, en_vocab, device, max_length=100):
    model.eval()
    tokens = sentence.lower().split()
    src_indices = [de_vocab.get(token, de_vocab['<unk>']) for token in tokens]
    src_indices = [de_vocab['<sos>']] + src_indices + [de_vocab['<eos>']]
    src_indices += [de_vocab['<pad>']] * (max_length - len(src_indices))
    src_tensor = torch.LongTensor(src_indices).unsqueeze(0).to(device)
    tgt_tensor = torch.LongTensor([[en_vocab['<sos>']]])

    for _ in range(max_length):
        tgt_tensor = tgt_tensor.to(device)
        output = model(src_tensor, tgt_tensor)
        next_word = output.argmax(2)[:, -1].item()
        tgt_tensor = torch.cat([tgt_tensor, torch.LongTensor([[next_word]]).to(device)], dim=1)
        if next_word == en_vocab['<eos>']:
            break

    en_vocab_inv = {v: k for k, v in en_vocab.items()}
    translated_tokens = [en_vocab_inv[idx.item()] for idx in tgt_tensor[0][1:]]
    return ' '.join(translated_tokens[:-1])

# Testing with Custom input
german_sentence = "Was ist das?"
english_translation = translate(model, german_sentence, de_vocab, en_vocab, device)
print(f"German: {german_sentence}")
print(f"English: {english_translation}")
print("---------------------------------")
german_sentence = "Wie geht es dir?"
english_translation = translate(model, german_sentence, de_vocab, en_vocab, device)
print(f"German: {german_sentence}")
print(f"English: {english_translation}")
print("---------------------------------")
german_sentence = "Ich mag Programmieren."
english_translation = translate(model, german_sentence, de_vocab, en_vocab, device)
print(f"German: {german_sentence}")
print(f"English: {english_translation}")

German: Was ist das?
English: what is what is it?
---------------------------------
German: Wie geht es dir?
English: how is it is the same
---------------------------------
German: Ich mag Programmieren.
English: i am a very much.


