In [4]:
from datasets import load_dataset
import numpy as np
import pandas as pd
import torch
import os
import sentencepiece as spm
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import math

In [5]:
# Load the "de-en" configuration of the "opus100" dataset.
dataset = load_dataset("opus100", "de-en")

# Work on a fixed-seed shuffle of the training split, keeping the first 10,000 examples.
train_data = dataset["train"].shuffle(seed=42).select(range(10000))

# Keys into each example's "translation" dict (read by is_valid): "en" is the source side, "de" the target side.
src_lang = "en"
tgt_lang = "de"

In [6]:
def is_valid(example):
    """Return True when a translation pair is usable for training.

    A pair is rejected when either side starts with "[{" or contains fewer
    than three whitespace-separated tokens. The two sides are looked up in
    ``example["translation"]`` under the module-level ``src_lang`` and
    ``tgt_lang`` keys.
    """
    pair = example["translation"]
    texts = (pair[src_lang], pair[tgt_lang])

    starts_with_markup = any(text.startswith("[{") for text in texts)
    too_short = any(len(text.split()) < 3 for text in texts)
    return not (starts_with_markup or too_short)

# Keep only the pairs accepted by is_valid; this rebinds train_data to the filtered dataset.
train_data = train_data.filter(is_valid)

In [7]:
# Flatten the nested "translation" dicts into two plain text columns, "en" and "de",
# and drop the original nested column.
df = train_data.to_pandas()
df = df.assign(
    en=df["translation"].apply(lambda pair: pair["en"]),
    de=df["translation"].apply(lambda pair: pair["de"]),
).drop(columns=["translation"])

In [8]:
# Write the tokenizer training corpus as plain UTF-8 text, one sentence per line.
# DataFrame.to_csv is avoided on purpose: it applies CSV quoting, so any sentence
# containing a comma, a double quote or a line break would be wrapped in (and have
# its quotes doubled with) '"' characters, and the tokenizer would be trained on
# text that differs from the raw strings encoded later.
for _lang in ("en", "de"):
    with open(f"train.{_lang}", "w", encoding="utf-8") as _corpus_file:
        for _sentence in df[_lang]:
            # Collapse runs of whitespace (including embedded newlines) so that
            # every sentence occupies exactly one line of the corpus file.
            _corpus_file.write(" ".join(_sentence.split()) + "\n")

In [9]:
# Train one BPE tokenizer on both corpus files, so source and target share a single
# 8000-piece vocabulary. model_prefix names the output; "bpe.model" is what the
# loading cell reads afterwards. The special-token ids fixed here (unk=0, bos=1,
# eos=2, pad=3) are the values later read back through sp.pad_id()/bos_id()/eos_id().
spm.SentencePieceTrainer.train(
    input="train.en,train.de",
    model_prefix="bpe",
    vocab_size=8000,
    model_type="bpe",
    bos_id=1,
    eos_id=2,
    pad_id=3,
    unk_id=0
)


sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: train.en
  input: train.de
  input_format: 
  model_prefix: bpe
  model_type: BPE
  vocab_size: 8000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: 3
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
  enable_differential_privacy: 0
  d

In [10]:
# Load the tokenizer file "bpe.model" (the trainer cell uses model_prefix="bpe").
sp = spm.SentencePieceProcessor()
sp.load("bpe.model")

# Cache the special-token ids and vocabulary size as module-level constants;
# the model, loss and decoding code read these names.
PAD_ID = sp.pad_id() 
BOS_ID = sp.bos_id()
EOS_ID = sp.eos_id()
VOCAB_SIZE = sp.get_piece_size()

In [11]:
# Encode every sentence to BPE ids and wrap it as [BOS] + ids + [EOS].
# src_ids[i] and tgt_ids[i] come from the same DataFrame row.
src_ids = [
    [sp.bos_id()] + sp.encode(text, out_type=int) + [sp.eos_id()]
    for text in df["en"]
]
tgt_ids = [
    [sp.bos_id()] + sp.encode(text, out_type=int) + [sp.eos_id()]
    for text in df["de"]
]

In [12]:
# Sanity check: show the first encoded source sentence, then decode it back to text.
# The [1:-1] slice drops the BOS and EOS ids that the encoding cell added.
print(src_ids[0])
print(sp.decode(src_ids[0][1:-1]))

[1, 61, 1991, 502, 1017, 2747, 2064, 2927, 31, 303, 2]
1 x Spider Straps System f...


In [13]:
MAX_LEN = 128


class TranslationDataset(Dataset):
    """Parallel corpus of token-id sequences, truncated/padded to a fixed length.

    Args:
        src: sequence of source token-id lists.
        tgt: sequence of target token-id lists, aligned index-by-index with ``src``.
        max_len: fixed output length. ``None`` (default) means "use the
            module-level ``MAX_LEN`` at access time", which is the original behaviour.
        pad_id: id used for right-padding. ``None`` (default) means "ask the
            module-level tokenizer ``sp`` at access time", which is the original behaviour.

    Raises:
        ValueError: if ``src`` and ``tgt`` do not contain the same number of examples.

    Each item is a ``(src_tensor, tgt_tensor)`` pair of 1-D integer tensors of
    length ``max_len``. Sequences longer than ``max_len`` are cut on the right,
    so a trailing end-of-sentence id is lost for those examples.
    """

    def __init__(self, src, tgt, max_len=None, pad_id=None):
        if len(src) != len(tgt):
            raise ValueError(
                "src and tgt must have the same number of examples "
                "(got {} and {})".format(len(src), len(tgt))
            )
        self.src = src
        self.tgt = tgt
        self.max_len = max_len
        self.pad_id = pad_id

    def __len__(self):
        return len(self.src)

    def _fit(self, ids):
        """Return ``ids`` truncated then right-padded to the fixed length, as a tensor."""
        # Resolve the defaults lazily so that re-binding MAX_LEN / sp in the
        # notebook keeps affecting existing datasets exactly as before.
        max_len = MAX_LEN if self.max_len is None else self.max_len
        pad_id = sp.pad_id() if self.pad_id is None else self.pad_id

        # Slicing copies, so the stored example is never mutated by the padding.
        clipped = list(ids[:max_len])
        return torch.tensor(clipped + [pad_id] * (max_len - len(clipped)))

    def __getitem__(self, idx):
        return self._fit(self.src[idx]), self._fit(self.tgt[idx])

In [14]:
# Wrap the encoded pairs in a Dataset and serve them as shuffled mini-batches of 32.
# Note: this rebinds the name `dataset`, which the loading cell used for the raw corpus.
dataset = TranslationDataset(src_ids, tgt_ids)
train_loader = DataLoader(dataset, batch_size=32, shuffle=True)

In [15]:
# Pull one batch to confirm the tensor shapes (the captured output shows [32, 128] for both).
s, t = next(iter(train_loader))
print(s.shape, t.shape) 

torch.Size([32, 128]) torch.Size([32, 128])


In [16]:
def create_pad_mask(seq, pad_id):
    """Build a boolean keep-mask for attention from a batch of token ids.

    Entries are True where ``seq`` differs from ``pad_id`` and False at padding.
    Two singleton axes are inserted after the first axis, so a ``(B, T)`` input
    yields a ``(B, 1, 1, T)`` mask.
    """
    is_real_token = seq.ne(pad_id)
    return is_real_token[:, None, None]

In [26]:
class MultiHeadAttention(nn.Module):
    """Scaled dot-product attention split over ``n_heads`` heads.

    Args:
        d_model: width of the input and output features; must be divisible by ``n_heads``.
        n_heads: number of attention heads.
        masked: when True, a lower-triangular (causal) mask is applied so that
            query position ``i`` only attends to key positions ``<= i``.
    """

    def __init__(self, d_model, n_heads, masked=False):
        super().__init__()
        assert d_model % n_heads == 0, "d_model must be divisible by n_heads"

        self.d_k = d_model // n_heads
        self.n_heads = n_heads
        self.masked = masked

        self.W_q = nn.Linear(d_model, d_model, bias=False)
        self.W_k = nn.Linear(d_model, d_model, bias=False)
        self.W_v = nn.Linear(d_model, d_model, bias=False)
        self.W_o = nn.Linear(d_model, d_model, bias=False)

    def forward(self, q, k, v, pad_mask=None):
        """Attend from ``q`` over ``k``/``v``.

        Args:
            q: ``(B, Tq, d_model)`` queries.
            k: ``(B, Tk, d_model)`` keys.
            v: ``(B, Tv, d_model)`` values.
            pad_mask: optional mask broadcastable to ``(B, n_heads, Tq, Tk)``;
                positions where it equals 0/False are excluded from attention.

        Returns:
            ``(B, Tq, d_model)`` tensor.
        """
        B, Tq, D = q.shape
        Tk = k.shape[1]
        Tv = v.shape[1]

        Q = self.W_q(q).view(B, Tq, self.n_heads, self.d_k).transpose(1, 2)
        K = self.W_k(k).view(B, Tk, self.n_heads, self.d_k).transpose(1, 2)
        V = self.W_v(v).view(B, Tv, self.n_heads, self.d_k).transpose(1, 2)

        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)

        # Use the most negative finite value rather than -inf: masked positions
        # still get zero weight after softmax, but a row in which every key is
        # masked yields finite (uniform) weights instead of NaN.
        fill_value = torch.finfo(scores.dtype).min

        if self.masked:
            # Built directly on the scores' device; True marks allowed positions.
            causal = torch.ones(Tq, Tk, dtype=torch.bool, device=scores.device).tril()
            scores = scores.masked_fill(~causal, fill_value)

        if pad_mask is not None:
            scores = scores.masked_fill(pad_mask == 0, fill_value)

        attn = torch.softmax(scores, dim=-1)
        out = torch.matmul(attn, V)

        # Merge the heads back into a single feature axis.
        out = out.transpose(1, 2).contiguous().view(B, Tq, D)
        return self.W_o(out)

In [27]:
class EncoderLayer(nn.Module):
    """One encoder block: self-attention then a feed-forward network.

    Each sub-layer is wrapped in a residual connection followed by LayerNorm
    (normalisation is applied after the addition).
    """

    def __init__(self, d_model, n_heads, d_ff):
        super().__init__()
        # Sub-modules are created in the same order as before so parameter
        # initialisation and state_dict ordering are unchanged.
        self.attn = MultiHeadAttention(d_model, n_heads)
        self.norm1 = nn.LayerNorm(d_model)
        expand = nn.Linear(d_model, d_ff)
        activation = nn.ReLU()
        project = nn.Linear(d_ff, d_model)
        self.ffn = nn.Sequential(expand, activation, project)
        self.norm2 = nn.LayerNorm(d_model)

    def forward(self, x, src_mask):
        """Run the block on ``x`` with ``src_mask`` passed to self-attention as its pad mask."""
        attended = self.attn(x, x, x, src_mask)
        x = self.norm1(x + attended)

        transformed = self.ffn(x)
        return self.norm2(x + transformed)

In [28]:
class DecoderLayer(nn.Module):
    """One decoder block: causal self-attention, cross-attention, feed-forward.

    Each of the three sub-layers is wrapped in a residual connection followed
    by LayerNorm (normalisation is applied after the addition).
    """

    def __init__(self, d_model, n_heads, d_ff):
        super().__init__()
        # Sub-modules are created in the same order as before so parameter
        # initialisation and state_dict ordering are unchanged.
        self.self_attn = MultiHeadAttention(d_model, n_heads, masked=True)
        self.cross_attn = MultiHeadAttention(d_model, n_heads)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        expand = nn.Linear(d_model, d_ff)
        activation = nn.ReLU()
        project = nn.Linear(d_ff, d_model)
        self.ffn = nn.Sequential(expand, activation, project)
        self.norm3 = nn.LayerNorm(d_model)

    def forward(self, x, enc_out, src_mask, tgt_mask):
        """Run the block.

        ``tgt_mask`` is handed to the causal self-attention over ``x``;
        ``src_mask`` is handed to the cross-attention, whose keys and values
        are ``enc_out``.
        """
        self_attended = self.self_attn(x, x, x, tgt_mask)
        x = self.norm1(x + self_attended)

        cross_attended = self.cross_attn(x, enc_out, enc_out, src_mask)
        x = self.norm2(x + cross_attended)

        transformed = self.ffn(x)
        return self.norm3(x + transformed)

In [29]:
class Transformer(nn.Module):
    """Encoder-decoder Transformer with a shared token embedding and learned positions.

    Args:
        vocab_size: number of token ids; also the width of the output logits.
        d_model: embedding / hidden width.
        n_heads: attention heads per layer.
        n_layers: number of encoder layers and, separately, of decoder layers.
        d_ff: hidden width of each feed-forward sub-layer.
        max_len: size of the learned position table, i.e. the longest
            sequence the model can embed.
        pad_id: token id treated as padding when building attention masks.
            ``None`` (default) falls back to the module-level ``PAD_ID`` at
            call time, which is the original behaviour.
    """

    def __init__(self, vocab_size, d_model=256, n_heads=4, n_layers=3, d_ff=512, max_len=512, pad_id=None):
        super().__init__()
        self.max_len = max_len
        self.pad_id = pad_id

        # The same embedding table is used for source and target tokens.
        self.tok_emb = nn.Embedding(vocab_size, d_model)
        self.pos_emb = nn.Embedding(max_len, d_model)

        self.encoders = nn.ModuleList([
            EncoderLayer(d_model, n_heads, d_ff) for _ in range(n_layers)
        ])
        self.decoders = nn.ModuleList([
            DecoderLayer(d_model, n_heads, d_ff) for _ in range(n_layers)
        ])

        self.fc_out = nn.Linear(d_model, vocab_size)

    def forward(self, src_ids, tgt_ids):
        """Return logits of shape ``(B, T, vocab_size)`` for ``tgt_ids`` given ``src_ids``.

        Args:
            src_ids: ``(B, S)`` integer tensor of source token ids.
            tgt_ids: ``(B, T)`` integer tensor of target token ids.

        Raises:
            ValueError: if ``S`` or ``T`` exceeds ``max_len``.
        """
        _, S = src_ids.shape
        _, T = tgt_ids.shape

        # Fail with a readable message instead of an out-of-range lookup in
        # the position table.
        if S > self.max_len or T > self.max_len:
            raise ValueError(
                "sequence length (src={}, tgt={}) exceeds max_len={}".format(S, T, self.max_len)
            )

        pad_id = PAD_ID if self.pad_id is None else self.pad_id
        src_mask = create_pad_mask(src_ids, pad_id)
        tgt_mask = create_pad_mask(tgt_ids, pad_id)

        # Create the position indices directly on the inputs' device.
        src_pos = torch.arange(S, device=src_ids.device)
        tgt_pos = torch.arange(T, device=tgt_ids.device)

        # (S, d_model) position embeddings broadcast over the batch axis.
        src = self.tok_emb(src_ids) + self.pos_emb(src_pos)
        tgt = self.tok_emb(tgt_ids) + self.pos_emb(tgt_pos)

        for enc in self.encoders:
            src = enc(src, src_mask)

        # Every decoder layer attends to the output of the last encoder layer.
        for dec in self.decoders:
            tgt = dec(tgt, src, src_mask, tgt_mask)

        return self.fc_out(tgt)

In [30]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cuda


In [49]:
# Instantiate the model and move it to `device`. The hyperparameters are spelled
# out explicitly; they equal the defaults in Transformer.__init__.
my_model = Transformer(
    vocab_size=VOCAB_SIZE,
    d_model=256,
    n_heads=4,
    n_layers=3,
    d_ff=512,
    max_len=512
).to(device)

In [50]:
# Read the effective sizes back from the constructed modules as a sanity check
# (d_k is d_model // n_heads, set in MultiHeadAttention.__init__).
print("d_model:", my_model.tok_emb.embedding_dim)
print("n_heads:", my_model.encoders[0].attn.n_heads)
print("d_k:", my_model.encoders[0].attn.d_k)

d_model: 256
n_heads: 4
d_k: 64


In [51]:
# Padding positions in the target are excluded from the loss.
loss_fn = nn.CrossEntropyLoss(ignore_index=PAD_ID)
optimizer = torch.optim.AdamW(my_model.parameters(), lr=3e-4)

EPOCHS = 20

for epoch in range(EPOCHS):
    my_model.train()
    total_loss = 0

    for src, tgt in train_loader:
        src, tgt = src.to(device), tgt.to(device)

        # Teacher forcing: the decoder reads tokens [0, T-1) and is scored
        # against the same sequence shifted left by one, tokens [1, T).
        tgt_in, tgt_out = tgt[:, :-1], tgt[:, 1:]

        optimizer.zero_grad()

        logits = my_model(src, tgt_in)
        # Flatten batch and time so every target position is one classification example.
        loss = loss_fn(logits.flatten(0, 1), tgt_out.flatten())

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean of the per-batch losses; perplexity is its exponential.
    avg_loss = total_loss / len(train_loader)
    print("Epoch # {}, Train Loss Value = {}, Perplexity = {}".format(
        epoch + 1, avg_loss, math.exp(avg_loss)
    ))

Epoch # 1, Train Loss Value = 7.188423224335886, Perplexity = 1324.0138859715635
Epoch # 2, Train Loss Value = 6.532391608446494, Perplexity = 687.0393773744576
Epoch # 3, Train Loss Value = 6.02761598930505, Perplexity = 414.72514072729433
Epoch # 4, Train Loss Value = 5.544588920257101, Perplexity = 255.8493821245312
Epoch # 5, Train Loss Value = 5.067313994484386, Perplexity = 158.74735789995077
Epoch # 6, Train Loss Value = 4.575680712630466, Perplexity = 97.09410983371083
Epoch # 7, Train Loss Value = 4.085420188319181, Perplexity = 59.46691980737294
Epoch # 8, Train Loss Value = 3.5909170373646235, Perplexity = 36.26731916829141
Epoch # 9, Train Loss Value = 3.096351748682073, Perplexity = 22.117115121396534
Epoch # 10, Train Loss Value = 2.612100699852253, Perplexity = 13.627648402865157
Epoch # 11, Train Loss Value = 2.1418086167039543, Perplexity = 8.514823763371739
Epoch # 12, Train Loss Value = 1.6997189613137664, Perplexity = 5.472409216897158
Epoch # 13, Train Loss Value =

In [52]:
def translate_my_model(sentence, max_len=50, beam_size=5):
    """Translate ``sentence`` with ``my_model`` using beam search.

    Args:
        sentence: source text; it is encoded with the module-level tokenizer ``sp``.
        max_len: maximum number of decoding steps (tokens generated after BOS).
        beam_size: number of hypotheses kept after every step.

    Returns:
        The decoded text of the highest-scoring hypothesis, where the score is
        the sum of token log-probabilities (no length normalisation).
    """
    my_model.eval()
    device = next(my_model.parameters()).device

    # Build the source as [BOS] + ids + [EOS], truncated and right-padded to MAX_LEN.
    src = [sp.bos_id()] + sp.encode(sentence, out_type=int) + [sp.eos_id()]
    src = src[:MAX_LEN]
    src += [PAD_ID] * (MAX_LEN - len(src))
    src = torch.tensor(src).unsqueeze(0).to(device)

    # Each beam is (token ids so far, cumulative log-probability).
    beams = [([sp.bos_id()], 0.0)]

    with torch.no_grad():
        for _ in range(max_len):
            # Once every hypothesis has emitted EOS, further steps would only
            # copy the beams unchanged, so stop early.
            if all(tokens[-1] == EOS_ID for tokens, _score in beams):
                break

            new_beams = []

            for tokens, score in beams:
                # Finished hypotheses are carried over without expansion.
                if tokens[-1] == EOS_ID:
                    new_beams.append((tokens, score))
                    continue

                tgt = torch.tensor(tokens).unsqueeze(0).to(device)
                logits = my_model(src, tgt)
                # Only the distribution for the next token (last position) is needed.
                log_probs = torch.log_softmax(logits[0, -1], dim=-1)

                # Clamp k so a beam wider than the vocabulary cannot make topk fail.
                top = torch.topk(log_probs, min(beam_size, log_probs.size(-1)))

                for token_id, token_log_prob in zip(top.indices.tolist(), top.values.tolist()):
                    new_beams.append((tokens + [token_id], score + token_log_prob))

            beams = sorted(new_beams, key=lambda x: x[1], reverse=True)[:beam_size]

    # Drop the leading BOS, and drop the final token only if it really is EOS:
    # a hypothesis cut off by max_len ends in a real token that must be kept.
    best_tokens = beams[0][0][1:]
    if best_tokens and best_tokens[-1] == EOS_ID:
        best_tokens = best_tokens[:-1]
    return sp.decode(best_tokens)

In [53]:
# NOTE(review): this import sits mid-notebook, whereas the other imports are in the first cell.
from transformers import MarianMTModel, MarianTokenizer

# Load a pretrained Marian checkpoint and its tokenizer to compare against my_model.
# The checkpoint name suggests an English->German model. The model is not moved to
# `device` here, so it stays wherever from_pretrained places it.
model_name = "Helsinki-NLP/opus-mt-en-de"
marian_tokenizer = MarianTokenizer.from_pretrained(model_name)
marian_model = MarianMTModel.from_pretrained(model_name)



In [54]:
def translate_marian(sentence):
    """Translate ``sentence`` with the pretrained Marian model.

    The text is tokenized (with truncation) into PyTorch tensors, decoded with
    6-beam search using a length penalty of 1.1 and early stopping, and the
    first returned sequence is converted back to text without special tokens.
    """
    encoded = marian_tokenizer(sentence, return_tensors="pt", truncation=True)
    generation_options = dict(num_beams=6, length_penalty=1.1, early_stopping=True)
    generated = marian_model.generate(**encoded, **generation_options)
    best_sequence = generated[0]
    return marian_tokenizer.decode(best_sequence, skip_special_tokens=True)

In [55]:
# Translate the same sentence with both models and print the results one after the other.
# Note: `s` was previously bound to a source batch tensor in the shape-check cell.
s = "This project demonstrates a transformer based translation model."

print("MY TRANSFORMER:")
print(translate_my_model(s))

print("\nMARIAN TRANSFORMER:")
print(translate_marian(s))

MY TRANSFORMER:
Die medizinernizund beseit ⁇ glicht ein S.

MARIAN TRANSFORMER:
Dieses Projekt zeigt ein transformatorbasiertes Übersetzungsmodell.
