In [None]:
# --- Core Imports ---
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F
import pandas as pd
import spacy
import random

# Prefer the first CUDA GPU when one is visible; otherwise fall back to the CPU.
# The explicit index is kept so the printed device reads "cuda:0" / "cpu:0".
device = torch.device(type='cuda' if torch.cuda.is_available() else 'cpu', index=0)
print(f"Using device: {device}")

# --- Data Downloading and Loading ---
print("Downloading and extracting data...")
# Fetch the two Multi30k archives with IPython shell magics. `-q` silences wget
# and `-O` fixes the local file name, so re-running the cell overwrites the
# previously downloaded archive instead of creating numbered copies.
!wget -q -O training.tar.gz "https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/training.tar.gz"
!wget -q -O validation.tar.gz "https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/validation.tar.gz"
# Unpack both archives into the current working directory.
# NOTE(review): presumably this yields train.de/train.en and val.de/val.en, the
# files opened right after this step — confirm against the archive contents.
!tar -xzf training.tar.gz
!tar -xzf validation.tar.gz
# NOTE(review): shell magics do not raise on failure, so this message is printed
# even if a download or extraction failed.
print("Data download complete.")

def create_translation_dataframe(german_filepath, english_filepath):
    """Load two line-aligned UTF-8 text files into a two-column DataFrame.

    Args:
        german_filepath: Path of the file holding one German sentence per line.
        english_filepath: Path of the file holding one English sentence per line.

    Returns:
        A ``pandas.DataFrame`` with columns ``'german'`` and ``'english'`` (in
        that order); every value is the corresponding line with surrounding
        whitespace stripped.
    """
    columns = {}
    # German is read first, then English, so the column order matches the arguments.
    for column_name, filepath in (('german', german_filepath), ('english', english_filepath)):
        with open(filepath, 'r', encoding='utf-8') as handle:
            columns[column_name] = [raw_line.strip() for raw_line in handle]
    return pd.DataFrame(columns)

# Load the training split first, then the validation split, each as a
# (german, english) DataFrame.
train_df, valid_df = (
    create_translation_dataframe(de_path, en_path)
    for de_path, en_path in (('train.de', 'train.en'), ('val.de', 'val.en'))
)


Using device: cuda:0
Downloading and extracting data...
Data download complete.


In [None]:
# ---Tokenization and Vocabulary Building ---
print("Setting up tokenizers and building vocabularies...")
# Install the small English and German spaCy pipelines through shell magics.
# The captured output of this cell warns that a kernel restart may be needed
# before freshly installed packages can be loaded.
!python -m spacy download en_core_web_sm -q
!python -m spacy download de_core_news_sm -q

# Load both pipelines once at module level; the tokenizer helpers defined next
# only use their `.tokenizer` component.
de_nlp = spacy.load('de_core_news_sm')
en_nlp = spacy.load('en_core_web_sm')

def de_tokenizer(text):
    """Split German ``text`` into a list of token strings using ``de_nlp``'s tokenizer."""
    pieces = []
    for token in de_nlp.tokenizer(text):
        pieces.append(token.text)
    return pieces

def en_tokenizer(text):
    """Split English ``text`` into a list of token strings using ``en_nlp``'s tokenizer."""
    pieces = []
    for token in en_nlp.tokenizer(text):
        pieces.append(token.text)
    return pieces

class Vocab:
    """Bidirectional token <-> integer id mapping built from raw sentences.

    Attributes:
        tokenizer: Callable turning a sentence into an iterable of token strings.
        word2index: Maps a token string to its integer id.
        index2word: Maps an integer id back to its token string.
        n_words: Number of entries currently in the vocabulary (also the next free id).
    """

    def __init__(self, tokenizer):
        """Create a vocabulary holding only the four special tokens (ids 0-3)."""
        self.tokenizer = tokenizer
        specials = ('<unk>', '<pad>', '<bos>', '<eos>')
        self.word2index = {token: index for index, token in enumerate(specials)}
        self.index2word = dict(enumerate(specials))
        self.n_words = len(specials)

    def build_vocab(self, sentence_list):
        """Tokenize every sentence and register each unseen token under the next free id."""
        for sentence in sentence_list:
            for word in self.tokenizer(sentence):
                if word in self.word2index:
                    continue
                new_index = self.n_words
                self.word2index[word] = new_index
                self.index2word[new_index] = word
                self.n_words = new_index + 1

# Both vocabularies are built from the training split only, so tokens that
# appear solely in validation data fall back to the unknown id at lookup time.
vocab_de = Vocab(de_tokenizer)
vocab_en = Vocab(en_tokenizer)

vocab_de.build_vocab(train_df['german'])
print(f"Built a German vocabulary with {vocab_de.n_words} words.")

vocab_en.build_vocab(train_df['english'])
print(f"Built an English vocabulary with {vocab_en.n_words} words.")

Setting up tokenizers and building vocabularies...
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m32.6 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.6/14.6 MB[0m [31m134.3 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel'

In [None]:
# ---Custom Dataset for DataLoader ---

class TranslationDataset(Dataset):
    """Pairs of source/target sentences served as tensors of token ids.

    Each item is ``(src_ids, tgt_ids)``: the source ids end with ``<eos>``; the
    target ids start with ``<bos>`` and end with ``<eos>``. Tokens missing from
    a vocabulary are mapped to id 0.
    """

    def __init__(self, src_sentences, tgt_sentences, vocab_de, vocab_en):
        """Store the raw sentences and the vocabularies used to encode them."""
        self.src_sentences = src_sentences
        self.tgt_sentences = tgt_sentences
        self.vocab_de = vocab_de
        self.vocab_en = vocab_en

    def __len__(self):
        """Number of sentence pairs (taken from the source side)."""
        return len(self.src_sentences)

    def __getitem__(self, idx):
        """Encode the ``idx``-th sentence pair into two 1-D ``torch.long`` tensors."""
        src_text, tgt_text = self.src_sentences[idx], self.tgt_sentences[idx]
        de_lookup = self.vocab_de.word2index
        en_lookup = self.vocab_en.word2index

        src_body = [de_lookup.get(token, 0) for token in self.vocab_de.tokenizer(src_text)]
        tgt_body = [en_lookup.get(token, 0) for token in self.vocab_en.tokenizer(tgt_text)]

        # Source: tokens + <eos>.  Target: <bos> + tokens + <eos>.
        src_ids = src_body + [de_lookup['<eos>']]
        tgt_ids = [en_lookup['<bos>'], *tgt_body, en_lookup['<eos>']]

        src_tensor = torch.tensor(src_ids, dtype=torch.long)
        tgt_tensor = torch.tensor(tgt_ids, dtype=torch.long)
        return src_tensor, tgt_tensor

# Padding id used for both source and target batches (and as the loss's
# ignore_index). It is read from the German vocabulary, which is also valid for
# the English side because Vocab.__init__ assigns '<pad>' the same id in every
# instance.
PAD_IDX = vocab_de.word2index['<pad>']
def collate_fn(batch):
    """Pad a list of ``(src, tgt)`` tensor pairs into two batch-first tensors.

    Args:
        batch: Iterable of 2-tuples ``(src_sample, tgt_sample)`` of 1-D id tensors.

    Returns:
        ``(src_batch, tgt_batch)``; each is right-padded with ``PAD_IDX`` to the
        longest sequence on its own side and laid out batch-first.
    """
    src_sequences = [src_sample for src_sample, _ in batch]
    tgt_sequences = [tgt_sample for _, tgt_sample in batch]
    padded_src = pad_sequence(src_sequences, batch_first=True, padding_value=PAD_IDX)
    padded_tgt = pad_sequence(tgt_sequences, batch_first=True, padding_value=PAD_IDX)
    return padded_src, padded_tgt

batch_size = 32

# Both splits are encoded with the vocabularies built from the training data.
train_dataset = TranslationDataset(
    src_sentences=train_df['german'].tolist(),
    tgt_sentences=train_df['english'].tolist(),
    vocab_de=vocab_de,
    vocab_en=vocab_en,
)
valid_dataset = TranslationDataset(
    src_sentences=valid_df['german'].tolist(),
    tgt_sentences=valid_df['english'].tolist(),
    vocab_de=vocab_de,
    vocab_en=vocab_en,
)

# Only the training loader shuffles; validation keeps the file order.
train_dataloader = DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn
)
valid_dataloader = DataLoader(
    valid_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn
)


In [None]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class Encoder(nn.Module):
    """Embeds source token ids and encodes them with a unidirectional GRU.

    Args:
        input_size: Size of the source vocabulary (number of embedding rows).
        embed_size: Dimension of each token embedding.
        hidden_size: Dimension of the GRU hidden state.
        dropout_p: Dropout probability applied to the embeddings.
    """

    def __init__(self, input_size, embed_size, hidden_size, dropout_p=0.1):
        super().__init__()
        # Attribute names and creation order are kept as-is: they fix the
        # state_dict keys and the order in which initial weights are drawn.
        self.e = nn.Embedding(num_embeddings=input_size, embedding_dim=embed_size)
        self.dropout = nn.Dropout(p=dropout_p)
        self.gru = nn.GRU(input_size=embed_size, hidden_size=hidden_size, batch_first=True)

    def forward(self, x, lengths):
        """Encode a padded, batch-first batch of token ids.

        Args:
            x: Batch-first tensor of token ids, padded on the right.
            lengths: 1-D tensor with the unpadded length of every row of ``x``.

        Returns:
            ``(outputs, hidden)``: the batch-first GRU outputs re-padded to the
            longest sequence in the batch, and the final GRU hidden state.
        """
        embedded = self.dropout(self.e(x))
        # Packing makes the GRU stop at each sequence's true length, so the
        # final hidden state is not polluted by padding positions.
        packed_input = pack_padded_sequence(
            embedded, lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        packed_outputs, hidden = self.gru(packed_input)
        outputs, _ = pad_packed_sequence(packed_outputs, batch_first=True)
        return outputs, hidden

class Decoder(nn.Module):
    """Single-step GRU decoder conditioned on a fixed context vector.

    At every step the embedded input token is concatenated with the context
    vector and fed to the GRU; the GRU output is projected onto the target
    vocabulary and returned as log-probabilities (suitable for ``nn.NLLLoss``).

    Args:
        output_size: Size of the target vocabulary.
        embed_size: Dimension of each token embedding.
        hidden_size: Dimension of the GRU hidden state; the context vector's
            last dimension must equal it, because the GRU input width is
            ``embed_size + hidden_size``.
        dropout_p: Dropout probability applied to the embeddings. Defaults to
            0.5, the value ``nn.Dropout()`` used implicitly before this
            parameter existed, so existing callers behave identically.
    """

    def __init__(self, output_size, embed_size, hidden_size, dropout_p=0.5):
        super().__init__()
        self.e = nn.Embedding(output_size, embed_size)
        # Configurable like Encoder's dropout instead of a hidden library default.
        self.dropout = nn.Dropout(dropout_p)
        # Decoder GRU input is the concatenation of the embedded word and the context vector
        self.gru = nn.GRU(embed_size + hidden_size, hidden_size, batch_first=True)
        self.lin = nn.Linear(hidden_size, output_size)
        self.lsoftmax = nn.LogSoftmax(dim=-1)

    def forward(self, x, context, prev_hidden):
        """Run one decoding step.

        Args:
            x: Input token ids, ``[batch_size, 1]``.
            context: Context vector, ``[batch_size, 1, hidden_size]``.
            prev_hidden: Previous GRU hidden state.

        Returns:
            ``(y, hidden)``: log-probabilities over the target vocabulary,
            ``[batch_size, 1, output_size]``, and the updated hidden state.
        """
        x = self.e(x)  # [batch_size, 1, embed_size]
        x = self.dropout(x)

        # Concatenate the embedded input and the context vector
        input_gru = torch.cat((x, context), dim=2)  # [batch_size, 1, embed_size + hidden_size]

        # Pass through GRU
        output, hidden = self.gru(input_gru, prev_hidden)

        # Project to vocabulary size and normalise to log-probabilities for NLLLoss
        y = self.lin(output)  # [batch_size, 1, output_size]
        y = self.lsoftmax(y)
        return y, hidden

In [None]:
# ---Training-evaluation ---

def train_one_epoch(encoder, decoder, opt_e, opt_d, loss_fn, dataloader):
    """Run one teacher-forced training pass over ``dataloader``.

    For every batch the encoder's final hidden state both initialises the
    decoder and serves as the context vector at each step. The decoder is
    always fed the ground-truth previous token. The per-step losses are summed,
    back-propagated once, and both optimizers are stepped.

    Returns:
        The mean over batches of (summed batch loss / number of decoding steps).
    """
    encoder.train()
    decoder.train()
    running_loss = 0

    for src_batch, tgt_batch in dataloader:
        src_batch = src_batch.to(device)
        tgt_batch = tgt_batch.to(device)

        # True (unpadded) length of every source sentence, needed for packing.
        src_lengths = (src_batch != PAD_IDX).sum(dim=1)

        opt_e.zero_grad()
        opt_d.zero_grad()

        _, encoder_hidden = encoder(src_batch, src_lengths)
        decoder_hidden = encoder_hidden
        # [1, batch, hidden] -> [batch, 1, hidden] so it can be concatenated
        # with the embedded decoder input along the feature dimension.
        context = encoder_hidden.permute(1, 0, 2)

        n_steps = tgt_batch.shape[1] - 1
        batch_loss = 0
        for step in range(1, n_steps + 1):
            # Teacher forcing: the input is the gold token from the previous position.
            decoder_input = tgt_batch[:, step - 1].unsqueeze(1)
            log_probs, decoder_hidden = decoder(decoder_input, context, decoder_hidden)
            batch_loss += loss_fn(log_probs.squeeze(1), tgt_batch[:, step])

        batch_loss.backward()
        opt_e.step()
        opt_d.step()

        running_loss += batch_loss.item() / n_steps

    return running_loss / len(dataloader)

def eval_one_epoch(encoder, decoder, loss_fn, dataloader):
    """Compute the teacher-forced loss over ``dataloader`` without updating weights.

    Mirrors the training pass (same context vector, same ground-truth decoder
    inputs) but runs both models in eval mode under ``torch.no_grad()``.

    Returns:
        The mean over batches of (summed batch loss / number of decoding steps).
    """
    encoder.eval()
    decoder.eval()
    running_loss = 0

    with torch.no_grad():
        for src_batch, tgt_batch in dataloader:
            src_batch, tgt_batch = src_batch.to(device), tgt_batch.to(device)

            # True (unpadded) length of every source sentence, needed for packing.
            src_lengths = (src_batch != PAD_IDX).sum(dim=1)

            _, encoder_hidden = encoder(src_batch, src_lengths)
            decoder_hidden = encoder_hidden
            context = encoder_hidden.permute(1, 0, 2)

            n_steps = tgt_batch.shape[1] - 1
            batch_loss = 0
            for step in range(1, n_steps + 1):
                # The decoder sees the gold token from the previous position.
                decoder_input = tgt_batch[:, step - 1].unsqueeze(1)
                log_probs, decoder_hidden = decoder(decoder_input, context, decoder_hidden)
                batch_loss += loss_fn(log_probs.squeeze(1), tgt_batch[:, step])

            running_loss += batch_loss.item() / n_steps

    return running_loss / len(dataloader)

In [None]:
# --- Model setup and training loop ---

embed_size = 300
hidden_size = 512

# The encoder is created before the decoder so weight initialisation order is unchanged.
encoder = Encoder(
    input_size=vocab_de.n_words, embed_size=embed_size, hidden_size=hidden_size
).to(device)
decoder = Decoder(
    output_size=vocab_en.n_words, embed_size=embed_size, hidden_size=hidden_size
).to(device)

# The decoder emits log-probabilities, hence NLLLoss; padded target positions are ignored.
loss_fn = nn.NLLLoss(ignore_index=PAD_IDX).to(device)

lr = 0.001
opt_e = optim.Adam(encoder.parameters(), lr=lr)
opt_d = optim.Adam(decoder.parameters(), lr=lr)

n_epochs = 10
print("\nStarting training...")
for e in range(n_epochs):
    train_loss = train_one_epoch(
        encoder=encoder,
        decoder=decoder,
        opt_e=opt_e,
        opt_d=opt_d,
        loss_fn=loss_fn,
        dataloader=train_dataloader,
    )
    eval_loss = eval_one_epoch(
        encoder=encoder, decoder=decoder, loss_fn=loss_fn, dataloader=valid_dataloader
    )
    print(f"Epoch {e+1}/{n_epochs}, Train Loss: {train_loss:.4f}, Eval Loss: {eval_loss:.4f}")


Starting training...
Epoch 1/10, Train Loss: 3.7504, Eval Loss: 3.2127
Epoch 2/10, Train Loss: 2.7633, Eval Loss: 2.8713
Epoch 3/10, Train Loss: 2.2208, Eval Loss: 2.7555
Epoch 4/10, Train Loss: 1.8398, Eval Loss: 2.7047
Epoch 5/10, Train Loss: 1.5814, Eval Loss: 2.6927
Epoch 6/10, Train Loss: 1.4270, Eval Loss: 2.7151
Epoch 7/10, Train Loss: 1.3114, Eval Loss: 2.7132
Epoch 8/10, Train Loss: 1.2064, Eval Loss: 2.7663
Epoch 9/10, Train Loss: 1.1309, Eval Loss: 2.7766
Epoch 10/10, Train Loss: 1.0579, Eval Loss: 2.8423


In [None]:
# --- Final Test (converts a sentence to token ids, decodes, and maps ids back to words) ---
def translate_sentence(german_sentence, encoder, decoder, vocab_de, vocab_en, device, max_length=50):
    """Greedily translate one German sentence into English.

    The sentence is tokenized with ``vocab_de.tokenizer``, mapped to ids
    (unknown tokens -> 0), terminated with ``<eos>`` and encoded. Decoding
    starts from ``<bos>`` and repeatedly picks the most probable next token
    until ``<eos>`` is predicted or ``max_length`` tokens have been produced.

    Args:
        german_sentence: Raw source sentence.
        encoder: Encoder module called as ``encoder(src_tensor, src_length)``.
        decoder: Decoder module called as ``decoder(input, context, hidden)``.
        vocab_de: Source vocabulary (``tokenizer``, ``word2index``).
        vocab_en: Target vocabulary (``word2index``, ``index2word``).
        device: Device on which the input tensors are created.
        max_length: Maximum number of tokens to generate.

    Returns:
        The generated tokens joined by single spaces. The terminating
        ``<eos>`` marker is a stop signal and is not part of the returned text.

    Side effects:
        Puts both ``encoder`` and ``decoder`` into eval mode.
    """
    encoder.eval()
    decoder.eval()

    eos_id = vocab_en.word2index['<eos>']
    translated_ids = []

    with torch.no_grad():
        src_tokens = vocab_de.tokenizer(german_sentence)
        src_ids = [vocab_de.word2index.get(token, 0) for token in src_tokens]
        src_ids.append(vocab_de.word2index['<eos>'])
        src_tensor = torch.tensor(src_ids, dtype=torch.long).unsqueeze(0).to(device)

        # Lengths stay on the CPU; the encoder uses them to pack the sequence.
        src_length = torch.tensor([len(src_ids)])

        _, encoder_hidden = encoder(src_tensor, src_length)
        # Same context layout as in training: [1, batch, hidden] -> [batch, 1, hidden].
        context = encoder_hidden.permute(1, 0, 2)

        decoder_hidden = encoder_hidden
        decoder_input = torch.tensor([[vocab_en.word2index['<bos>']]], device=device)

        for _ in range(max_length):
            yhats, decoder_hidden = decoder(decoder_input, context, decoder_hidden)
            predicted_id = yhats.squeeze(1).argmax(dim=1).item()

            # Stop *before* recording <eos> so the sentinel never leaks into the output.
            if predicted_id == eos_id:
                break

            translated_ids.append(predicted_id)
            decoder_input = torch.tensor([[predicted_id]], device=device)

    return " ".join(vocab_en.index2word.get(idx, '<unk>') for idx in translated_ids)

def run_translation_test(german_sentence, actual_english_sentence):
    """Translate ``german_sentence`` with the trained model and print a comparison.

    Uses the module-level ``encoder``, ``decoder``, ``vocab_de``, ``vocab_en``
    and ``device``. Prints the source sentence, the model's translation and the
    reference translation, one per line.
    """
    model_translation = translate_sentence(
        german_sentence, encoder, decoder, vocab_de, vocab_en, device
    )
    report_lines = (
        f"Original (de):         {german_sentence}",
        f"Model's Translation (en): {model_translation}",
        f"Actual Translation (en):  {actual_english_sentence}",
    )
    for report_line in report_lines:
        print(report_line)


# --- Test 1: Random Row from the Validation DataFrame ---
# Draws one (german, english) pair at random from the validation split and
# compares the model's translation with the reference. Failures are reported
# instead of raised so the static test below still runs.
print("\n\n--- Random Test from Validation Set ---")
try:
    random_sample = valid_df.sample(n=1)
    random_german_sentence = random_sample['german'].iloc[0]
    random_english_sentence = random_sample['english'].iloc[0]
    run_translation_test(random_german_sentence, random_english_sentence)
except Exception as e:
    print(f"Could not perform the random test: {e}")


# --- Test 2: Static, User-Provided Sentence ---
# A fixed, out-of-domain sentence (not an image caption) paired with its
# reference translation, to see how the model behaves far from its training data.
print("\n\n--- Static User-Provided Test ---")
static_german_sentence = (
    "Wir schlagen eine neue einfache Netzwerkarchitektur vor, den Transformer, der ausschließlich auf Aufmerksamkeitsmechanismen basiert und vollständig auf Wiederholungen und Faltungen verzichtet."
)
static_english_translation = (
    "We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely."
)
run_translation_test(
    german_sentence=static_german_sentence,
    actual_english_sentence=static_english_translation,
)



--- Random Test from Validation Set ---
Original (de):         Eine junge Frau und eine ältere Frau in traditionellen Saris spinnen Textilien, während drei weitere Personen in moderner Kleidung nur von der Taille abwärts auf dem Bild zu sehen sind.
Model's Translation (en): A young woman and a young woman are lined up from various , as they walk in a public place , as they walk by different colored , as they walk by different colored , - style outfits . <eos>
Actual Translation (en):  A young woman and older woman wear traditional saris as they spin textiles, three people are pictured at only the waists, and wear modern clothes.


--- Static User-Provided Test ---
Original (de):         Wir schlagen eine neue einfache Netzwerkarchitektur vor, den Transformer, der ausschließlich auf Aufmerksamkeitsmechanismen basiert und vollständig auf Wiederholungen und Faltungen verzichtet.
Model's Translation (en): Pitcher on a stage , a man in a spiked , and a spiked , working , and accordion for