In [3]:
import torch
import torchtext
import torch.optim as optim
import spacy
from torch.utils.tensorboard import SummaryWriter
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator
import torch.nn as nn
from torchtext.data import Field

In [4]:
!pip install tensorboard



In [5]:
!pip install torchtext==0.6.0 --no-deps



In [6]:
!pip install utils



In [7]:
pip install spacy

Note: you may need to restart the kernel to use updated packages.


In [8]:
!python -m spacy download de_core_news_sm
!python -m spacy download en_core_web_sm


Collecting de-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.8.0/de_core_news_sm-3.8.0-py3-none-any.whl (14.6 MB)
     ---------------------------------------- 0.0/14.6 MB ? eta -:--:--
     ---- ----------------------------------- 1.6/14.6 MB 12.0 MB/s eta 0:00:02
     --------- ------------------------------ 3.4/14.6 MB 10.1 MB/s eta 0:00:02
     ------------ --------------------------- 4.7/14.6 MB 8.7 MB/s eta 0:00:02
     ----------------- ---------------------- 6.6/14.6 MB 8.8 MB/s eta 0:00:01
     ----------------------- ---------------- 8.7/14.6 MB 8.7 MB/s eta 0:00:01
     --------------------------- ------------ 10.2/14.6 MB 8.6 MB/s eta 0:00:01
     ------------------------------ --------- 11.3/14.6 MB 8.1 MB/s eta 0:00:01
     ---------------------------------- ----- 12.6/14.6 MB 7.8 MB/s eta 0:00:01
     ------------------------------------- -- 13.9/14.6 MB 7.7 MB/s eta 0:00:01
     ---------------------------

In [9]:
import spacy

spacy_ger = spacy.load("de_core_news_sm")
spacy_eng = spacy.load("en_core_web_sm")

print("German and English Spacy models loaded successfully!")



German and English Spacy models loaded successfully!


In [10]:
import spacy

# Load German and English tokenizers from spaCy
spacy_de = spacy.load("de_core_news_sm")  # German tokenizer
spacy_en = spacy.load("en_core_web_sm")   # English tokenizer

# Define tokenization functions
def tokenize_ger(text):
    return [tok.text for tok in spacy_de.tokenizer(text)]

def tokenize_eng(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]

# Now define the fields
german = Field(tokenize=tokenize_ger, lower=True, init_token="<sos>", eos_token="<eos>")
english = Field(tokenize=tokenize_eng, lower=True, init_token="<sos>", eos_token="<eos>")

In [11]:
torchtext.__version__

'0.6.0'

In [12]:
import os
import urllib.request

os.makedirs(".data/multi30k", exist_ok=True)

base_url = "https://raw.githubusercontent.com/multi30k/dataset/master/data/task1/raw/"

files = {
    "train.de": "train.de.gz",
    "train.en": "train.en.gz",
    "val.de": "val.de.gz",
    "val.en": "val.en.gz",
    "test.de": "test_2016_flickr.de.gz",
    "test.en": "test_2016_flickr.en.gz"
}

# Download and unzip
for filename, gzipped in files.items():
    url = base_url + gzipped
    local_gz_path = f".data/multi30k/{gzipped}"
    local_txt_path = f".data/multi30k/{filename}"

    print(f"Downloading {url}...")
    urllib.request.urlretrieve(url, local_gz_path)

    # Unzip the .gz file
    import gzip
    with gzip.open(local_gz_path, 'rt', encoding='utf-8') as f_in:
        with open(local_txt_path, 'w', encoding='utf-8') as f_out:
            f_out.write(f_in.read())

    # Remove .gz file to save space
    os.remove(local_gz_path)

print("All files downloaded and extracted successfully.")


Downloading https://raw.githubusercontent.com/multi30k/dataset/master/data/task1/raw/train.de.gz...
Downloading https://raw.githubusercontent.com/multi30k/dataset/master/data/task1/raw/train.en.gz...
Downloading https://raw.githubusercontent.com/multi30k/dataset/master/data/task1/raw/val.de.gz...
Downloading https://raw.githubusercontent.com/multi30k/dataset/master/data/task1/raw/val.en.gz...
Downloading https://raw.githubusercontent.com/multi30k/dataset/master/data/task1/raw/test_2016_flickr.de.gz...
Downloading https://raw.githubusercontent.com/multi30k/dataset/master/data/task1/raw/test_2016_flickr.en.gz...
All files downloaded and extracted successfully.


In [13]:
import os

os.rename(".data/multi30k/test.de", ".data/multi30k/test2016.de")
os.rename(".data/multi30k/test.en", ".data/multi30k/test2016.en")

print(" Renamed test files successfully.")


 Renamed test files successfully.


In [14]:
train_data, valid_data, test_data = Multi30k.splits(
    exts=(".de", ".en"),
    fields=(german, english)
)


In [15]:
german.build_vocab(train_data,max_size = 10000, min_freq =2)
english.build_vocab(train_data, max_size = 10000, min_freq = 2)

In [16]:
class Transformer(nn.Module):
    def __init__(
            self,
            embedding_size,
            src_vocab_size,
            trg_vocab_size,
            src_pad_idx,
            num_heads,
            num_encoder_layers,
            num_decoder_layers,
            forward_expansion,
            dropout,
            max_len,
            device,
    ):
        super(Transformer, self).__init__()
        self.src_word_embedding = nn.Embedding(src_vocab_size, embedding_size)
        self.src_position_embedding = nn.Embedding(max_len, embedding_size)
        self.trg_word_embedding = nn.Embedding(trg_vocab_size, embedding_size)
        self.trg_position_embedding = nn.Embedding(max_len, embedding_size)
        self.device = device
        self.transformer = nn.Transformer(
            embedding_size,
            num_heads,
            num_encoder_layers,
            num_decoder_layers,
            forward_expansion,
            dropout,
        )
        self.fc_out = nn.Linear(embedding_size, trg_vocab_size)
        self.dropout = nn.Dropout(dropout)
        self.src_pad_idx = src_pad_idx
    def make_src_mask(self, src):
        # src shape: (src_len, N)
        src_mask = src.transpose(0, 1)  == self.src_pad_idx
        # (N, src_len)
        return src_mask
        
    def forward(self, src, trg):
        src_seq_length, N = src.shape
        trg_seq_length, N = trg.shape
        src_positions = (
            torch.arange(0, src_seq_length).unsqueeze(1).expand(src_seq_length,N).to(self.device)
        )
        trg_positions = (
            torch.arange(0, trg_seq_length).unsqueeze(1).expand(trg_seq_length,N).to(self.device)
        )
        embed_src = self.dropout(
            (self.src_word_embedding(src)+ self.src_position_embedding(src_positions))
        )
        embed_trg = self.dropout(
            self.trg_word_embedding(trg)+self.trg_position_embedding(trg_positions)
        )
        src_padding_mask = self.make_src_mask(src)
        trg_mask = self.transformer.generate_square_subsequent_mask(trg_seq_length).to(self.device)

        out = self.transformer(
            embed_src,
            embed_trg,
            src_key_padding_mask = src_padding_mask,
            tgt_mask = trg_mask
        )
        out = self.fc_out(out)
        return out


In [17]:
def translate_sentence(model, sentence, german, english, device, max_length=50):
    model.eval()

    # Tokenize using spaCy
    tokens = [tok.text.lower() for tok in spacy_ger(sentence)]
    tokens = ['<sos>'] + tokens + ['<eos>']

    # Convert tokens to indices
    src_indexes = [german.vocab.stoi[token] for token in tokens]
    src_tensor = torch.LongTensor(src_indexes).unsqueeze(1).to(device)

    with torch.no_grad():
        src_mask = model.make_src_mask(src_tensor)
        enc_src = model.transformer.encoder(
            model.dropout(model.src_word_embedding(src_tensor) +
                          model.src_position_embedding(torch.arange(0, src_tensor.shape[0]).unsqueeze(1).to(device))),
            src_key_padding_mask=src_mask
        )

    outputs = [english.vocab.stoi["<sos>"]]

    for _ in range(max_length):
        trg_tensor = torch.LongTensor(outputs).unsqueeze(1).to(device)

        trg_mask = model.transformer.generate_square_subsequent_mask(trg_tensor.shape[0]).to(device)

        with torch.no_grad():
            out = model.transformer.decoder(
                model.dropout(model.trg_word_embedding(trg_tensor) +
                              model.trg_position_embedding(torch.arange(0, trg_tensor.shape[0]).unsqueeze(1).to(device))),
                enc_src,
                tgt_mask=trg_mask,
                memory_key_padding_mask=src_mask
            )

            out = model.fc_out(out)
            best_guess = out.argmax(2)[-1, :].item()
            outputs.append(best_guess)

            if best_guess == english.vocab.stoi["<eos>"]:
                break

    translated_tokens = [english.vocab.itos[idx] for idx in outputs]
    return translated_tokens[1:]  # remove <sos>


In [18]:
device = torch.device("cuda" if torch.cuda.is_available()else "cpu")
print(device)
load_model= False
save_model = True

cuda


In [19]:
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"


In [20]:
# Training hyperparameters
num_epochs = 5
learning_rate = 3e-4
batch_size = 16

In [21]:
# Model hyperparameters
src_vocab_size = len(german.vocab)
trg_vocab_size = len(english.vocab)
embedding_size = 512
num_heads = 8
num_encoder_layers = 3
num_decoder_layers = 3
dropout = 0.10
max_len = 100
forward_expansion = 4
src_pad_idx = english.vocab.stoi["<pad>"]

# Tensorboard for nice plots 
writer = SummaryWriter("run/loss_plots")
step = 0
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_sizes=(batch_size,batch_size,batch_size),
    sort_within_batch = True,
    sort_key = lambda x: len(x.src),
    device = device,

)
model = Transformer(
    embedding_size,
    src_vocab_size,
    trg_vocab_size,
    src_pad_idx,
    num_heads,
    num_encoder_layers,
    num_decoder_layers,
    forward_expansion,
    dropout,
    max_len,
    device,
).to(device)



In [22]:
optimizer = optim.Adam(model.parameters(), lr = learning_rate)
pad_idx = english.vocab.stoi["<pad>"]
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)


if load_model:
    load_checkpoint(torch.load("my_checkpoint.pth.ptar"), model, optimizer)
sentence = "ein pferd geht unter einer brucke neben einmen boot." 

In [23]:
import torch

def save_checkpoint(checkpoint, filename="my_checkpoint.pth.tar"):
    print("=> Saving checkpoint")
    torch.save(checkpoint, filename)

def load_checkpoint(checkpoint_file, model, optimizer):
    print("=> Loading checkpoint")
    checkpoint = torch.load(checkpoint_file, map_location=torch.device('cpu'))
    model.load_state_dict(checkpoint["state_dict"])
    optimizer.load_state_dict(checkpoint["optimizer"])


In [24]:
for epoch in range(num_epochs):
    print(f"[Epoch {epoch}/{num_epochs}]")
    if save_model:
        checkpoint = {
            "state_dict" : model.state_dict(),
            "optimizer" : optimizer.state_dict(),
        }
        save_checkpoint(checkpoint)
    model.eval()
    translated_output = translate_sentence(

        model, sentence, german, english, device, max_length=100

        )
    print(f"Translated example sentence:\n{' '.join(translated_output)}")

    model.train()
    for batch_idx, batch in enumerate(train_iterator):
        inp_data = batch.src.to(device)
        target = batch.trg.to(device)
        # forward prop
        output = model(inp_data, target[:-1])
        output = output.reshape(-1, output.shape[2])
        target = target[1:].reshape(-1)
        optimizer.zero_grad()
        loss = criterion(output, target)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm = 1)
        optimizer.step()
        writer.add_scalar("Training loss", loss, global_step=step)
        step +=1


[Epoch 0/5]
=> Saving checkpoint
Translated example sentence:
dancer dancer dancer dancer dancer dancer dancer dancer dancer dancer dancer dancer dancer dancer dancer dancer dancer dancer dancer dancer dancer dancer dancer dancer driving dancer dancer dancer driving dancer dancer dancer dancer dancer dancer ways dancer dancer driving vegetable dancer dancer dancer dancer dancer dancer dancer dancer dancer dancer dancer dancer dancer dancer dancer dancer dancer dancer beside dancer dancer dancer beside dancer dancer dancer dancer dancer dancer dancer dancer dancer beside dancer dancer follows dancer dancer dancer hanging dancer dancer wavy dancer dancer dancer ways dancer dancer rig ways dancer dancer ways dancer dancer dancer follows beside dancer
[Epoch 1/5]
=> Saving checkpoint
Translated example sentence:
a horse walks under a <unk> next to a <unk> . <eos>
[Epoch 2/5]
=> Saving checkpoint
Translated example sentence:
a horse is walking under a <unk> next to a boat . <eos>
[Epoch 3/5

In [25]:
import torch
from torchtext.data.metrics import bleu_score

def bleu(data, model, german, english, device):
    """Calculates BLEU score for the given dataset"""
    targets = []
    outputs = []

    for example in data:
        src = vars(example)["src"]
        trg = vars(example)["trg"]

        # Convert src tokens back into a sentence
        src_sentence = " ".join(src)

        # Translate the sentence
        translated_tokens = translate_sentence(model, src_sentence, german, english, device)
        translated_tokens = translated_tokens[:-1]  # remove <eos>

        targets.append([trg])
        outputs.append(translated_tokens)

    return bleu_score(outputs, targets)


In [26]:
from torchtext.data.metrics import bleu_score


In [27]:
score = bleu(test_data, model, german, english, device)
print(f"Bleu score{score*100:.2f}")

Bleu score33.20
