**Step1. Run the demo and train a model on the original German-to-English training set.**

In [1]:
%matplotlib inline

*Data Sourcing and Processing*
============================

We will use
[Multi30k dataset from torchtext
library](https://pytorch.org/text/stable/datasets.html#multi30k) that
yields a pair of source-target raw sentences.

To access torchtext datasets, please install torchdata following
instructions at <https://github.com/pytorch/data>.


In [2]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.datasets import multi30k, Multi30k
from typing import Iterable, List

multi30k.URL["train"] = "https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/training.tar.gz"
multi30k.URL["valid"] = "https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/validation.tar.gz"

SRC_LANGUAGE = 'de'
TGT_LANGUAGE = 'en'

# Place-holders
token_transform = {}
vocab_transform = {}

In [3]:
!pip install portalocker>=2.0.0

In [4]:
import torchdata

In [5]:
import portalocker

In [6]:
# !python -m spacy download en_core_web_sm
# !python -m spacy download de_core_news_sm

In [7]:
token_transform[SRC_LANGUAGE] = get_tokenizer('spacy', language='de_core_news_sm')
token_transform[TGT_LANGUAGE] = get_tokenizer('spacy', language='en_core_web_sm')


# helper function to yield list of tokens
def yield_tokens(data_iter: Iterable, language: str) -> List[str]:
    language_index = {SRC_LANGUAGE: 0, TGT_LANGUAGE: 1}

    for data_sample in data_iter:
        yield token_transform[language](data_sample[language_index[language]])

# Define special symbols and indices
UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = 0, 1, 2, 3
# Make sure the tokens are in order of their indices to properly insert them in vocab
special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']

"""
Purpose: Defines special tokens used in the vocabulary for machine learning tasks with text data.
Special Tokens:
<unk>: "Unknown" token (represents words not in the vocabulary)
<pad>: Padding token (to make sequences the same length)
<bos>: "Beginning of Sequence"
<eos>: "End of Sequence"
"""

for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
    # Training data Iterator
    train_iter = Multi30k(split='train', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
    # Create torchtext's Vocab object
    vocab_transform[ln] = build_vocab_from_iterator(yield_tokens(train_iter, ln),
                                                    min_freq=1,
                                                    specials=special_symbols,
                                                    special_first=True)

# Set ``UNK_IDX`` as the default index. This index is returned when the token is not found.
# If not set, it throws ``RuntimeError`` when the queried token is not found in the Vocabulary.
for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
  vocab_transform[ln].set_default_index(UNK_IDX)

In [8]:
from torch import Tensor
import torch
import torch.nn as nn
from torch.nn import Transformer
import math
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# helper Module that adds positional encoding to the token embedding to introduce a notion of word order.
class PositionalEncoding(nn.Module):
    def __init__(self,
                 emb_size: int,
                 dropout: float,
                 maxlen: int = 5000):
        super(PositionalEncoding, self).__init__()
        den = torch.exp(- torch.arange(0, emb_size, 2)* math.log(10000) / emb_size)
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
        pos_embedding = torch.zeros((maxlen, emb_size))
        pos_embedding[:, 0::2] = torch.sin(pos * den)
        pos_embedding[:, 1::2] = torch.cos(pos * den)
        pos_embedding = pos_embedding.unsqueeze(-2)

        self.dropout = nn.Dropout(dropout)
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding: Tensor):
        return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])

# helper Module to convert tensor of input indices into corresponding tensor of token embeddings
class TokenEmbedding(nn.Module):
    def __init__(self, vocab_size: int, emb_size):
        super(TokenEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size

    def forward(self, tokens: Tensor):
        return self.embedding(tokens.long()) * math.sqrt(self.emb_size)

# Seq2Seq Network
class Seq2SeqTransformer(nn.Module):
    def __init__(self,
                 num_encoder_layers: int,
                 num_decoder_layers: int,
                 emb_size: int,
                 nhead: int,
                 src_vocab_size: int,
                 tgt_vocab_size: int,
                 dim_feedforward: int = 512,
                 dropout: float = 0.1):
        super(Seq2SeqTransformer, self).__init__()
        self.transformer = Transformer(d_model=emb_size,
                                       nhead=nhead,
                                       num_encoder_layers=num_encoder_layers,
                                       num_decoder_layers=num_decoder_layers,
                                       dim_feedforward=dim_feedforward,
                                       dropout=dropout)
        self.generator = nn.Linear(emb_size, tgt_vocab_size)
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
        self.positional_encoding = PositionalEncoding(
            emb_size, dropout=dropout)

    def forward(self,
                src: Tensor,
                trg: Tensor,
                src_mask: Tensor,
                tgt_mask: Tensor,
                src_padding_mask: Tensor,
                tgt_padding_mask: Tensor,
                memory_key_padding_mask: Tensor):
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(trg))
        outs = self.transformer(src_emb, tgt_emb, src_mask, tgt_mask, None,
                                src_padding_mask, tgt_padding_mask, memory_key_padding_mask)
        return self.generator(outs)

    def encode(self, src: Tensor, src_mask: Tensor):
        return self.transformer.encoder(self.positional_encoding(
                            self.src_tok_emb(src)), src_mask)

    def decode(self, tgt: Tensor, memory: Tensor, tgt_mask: Tensor):
        return self.transformer.decoder(self.positional_encoding(
                          self.tgt_tok_emb(tgt)), memory,
                          tgt_mask)

In [9]:
def generate_square_subsequent_mask(sz):
    mask = (torch.triu(torch.ones((sz, sz), device=DEVICE)) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask


def create_mask(src, tgt):
    src_seq_len = src.shape[0]
    tgt_seq_len = tgt.shape[0]

    tgt_mask = generate_square_subsequent_mask(tgt_seq_len)
    src_mask = torch.zeros((src_seq_len, src_seq_len),device=DEVICE).type(torch.bool)

    src_padding_mask = (src == PAD_IDX).transpose(0, 1)
    tgt_padding_mask = (tgt == PAD_IDX).transpose(0, 1)
    return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask

We now define the parameters of our model and instantiate the same.
Below, we also define our loss function which is the cross-entropy loss
and the optimizer used for training.


In [10]:
torch.manual_seed(0)

SRC_VOCAB_SIZE = len(vocab_transform[SRC_LANGUAGE])
TGT_VOCAB_SIZE = len(vocab_transform[TGT_LANGUAGE])
EMB_SIZE = 512
NHEAD = 8
FFN_HID_DIM = 512
BATCH_SIZE = 128
NUM_ENCODER_LAYERS = 3
NUM_DECODER_LAYERS = 3

transformer = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE,
                                 NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM)

for p in transformer.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

transformer = transformer.to(DEVICE)

loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)

optimizer = torch.optim.Adam(transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)



Collation
=========
we define our collate function that converts a
batch of raw strings into batch tensors that can be fed directly into
our model.


In [11]:
from torch.nn.utils.rnn import pad_sequence

# helper function to club together sequential operations
def sequential_transforms(*transforms):
    def func(txt_input):
        for transform in transforms:
            txt_input = transform(txt_input)
        return txt_input
    return func

# function to add BOS/EOS and create tensor for input sequence indices
def tensor_transform(token_ids: List[int]):
    return torch.cat((torch.tensor([BOS_IDX]),
                      torch.tensor(token_ids),
                      torch.tensor([EOS_IDX])))

# ``src`` and ``tgt`` language text transforms to convert raw strings into tensors indices
text_transform = {}
for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
    text_transform[ln] = sequential_transforms(token_transform[ln], #Tokenization
                                               vocab_transform[ln], #Numericalization
                                               tensor_transform) # Add BOS/EOS and create tensor


# function to collate data samples into batch tensors
def collate_fn(batch):
    src_batch, tgt_batch = [], []
    for src_sample, tgt_sample in batch:
        src_batch.append(text_transform[SRC_LANGUAGE](src_sample.rstrip("\n")))
        tgt_batch.append(text_transform[TGT_LANGUAGE](tgt_sample.rstrip("\n")))

    src_batch = pad_sequence(src_batch, padding_value=PAD_IDX)
    tgt_batch = pad_sequence(tgt_batch, padding_value=PAD_IDX)
    return src_batch, tgt_batch

We now define training and evaluation loop that will be called for each
epoch.


In [12]:
from torch.utils.data import DataLoader

def train_epoch(model, optimizer):
    model.train()
    losses = 0
    train_iter = Multi30k(split='train', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
    train_dataloader = DataLoader(train_iter, batch_size=BATCH_SIZE, collate_fn=collate_fn)

    for src, tgt in train_dataloader:
        src = src.to(DEVICE)
        tgt = tgt.to(DEVICE)

        tgt_input = tgt[:-1, :]

        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

        logits = model(src, tgt_input, src_mask, tgt_mask,src_padding_mask, tgt_padding_mask, src_padding_mask)

        optimizer.zero_grad()

        tgt_out = tgt[1:, :]
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
        loss.backward()

        optimizer.step()
        losses += loss.item()

    return losses / len(list(train_dataloader))


def evaluate(model):
    model.eval()
    losses = 0

    val_iter = Multi30k(split='valid', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
    val_dataloader = DataLoader(val_iter, batch_size=BATCH_SIZE, collate_fn=collate_fn)

    for src, tgt in val_dataloader:
        src = src.to(DEVICE)
        tgt = tgt.to(DEVICE)

        tgt_input = tgt[:-1, :]

        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

        logits = model(src, tgt_input, src_mask, tgt_mask,src_padding_mask, tgt_padding_mask, src_padding_mask)

        tgt_out = tgt[1:, :]
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
        losses += loss.item()

    return losses / len(list(val_dataloader))

In [13]:
from timeit import default_timer as timer
NUM_EPOCHS = 25

for epoch in range(1, NUM_EPOCHS+1):
    start_time = timer()
    train_loss = train_epoch(transformer, optimizer)
    end_time = timer()
    val_loss = evaluate(transformer)
    print((f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {val_loss:.3f}, "f"Epoch time = {(end_time - start_time):.3f}s"))


# function to generate output sequence using greedy algorithm
def greedy_decode(model, src, src_mask, max_len, start_symbol):
    src = src.to(DEVICE)
    src_mask = src_mask.to(DEVICE)

    memory = model.encode(src, src_mask)
    ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(DEVICE)
    for i in range(max_len-1):
        memory = memory.to(DEVICE)
        tgt_mask = (generate_square_subsequent_mask(ys.size(0))
                    .type(torch.bool)).to(DEVICE)
        out = model.decode(ys, memory, tgt_mask)
        out = out.transpose(0, 1)
        prob = model.generator(out[:, -1])
        _, next_word = torch.max(prob, dim=1)
        next_word = next_word.item()

        ys = torch.cat([ys,
                        torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0)
        if next_word == EOS_IDX:
            break
    return ys


# actual function to translate input sentence into target language
def translate(model: torch.nn.Module, src_sentence: str):
    model.eval()
    src = text_transform[SRC_LANGUAGE](src_sentence).view(-1, 1)
    num_tokens = src.shape[0]
    src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)
    tgt_tokens = greedy_decode(
        model,  src, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX).flatten()
    return " ".join(vocab_transform[TGT_LANGUAGE].lookup_tokens(list(tgt_tokens.cpu().numpy()))).replace("<bos>", "").replace("<eos>", "")



Epoch: 1, Train loss: 5.344, Val loss: 4.103, Epoch time = 114.758s
Epoch: 2, Train loss: 3.759, Val loss: 3.310, Epoch time = 113.627s
Epoch: 3, Train loss: 3.159, Val loss: 2.891, Epoch time = 114.837s
Epoch: 4, Train loss: 2.768, Val loss: 2.643, Epoch time = 115.061s
Epoch: 5, Train loss: 2.479, Val loss: 2.444, Epoch time = 115.090s
Epoch: 6, Train loss: 2.252, Val loss: 2.311, Epoch time = 116.286s
Epoch: 7, Train loss: 2.062, Val loss: 2.198, Epoch time = 115.066s
Epoch: 8, Train loss: 1.899, Val loss: 2.110, Epoch time = 114.630s
Epoch: 9, Train loss: 1.756, Val loss: 2.062, Epoch time = 115.050s
Epoch: 10, Train loss: 1.636, Val loss: 2.018, Epoch time = 115.461s
Epoch: 11, Train loss: 1.524, Val loss: 1.974, Epoch time = 115.160s
Epoch: 12, Train loss: 1.423, Val loss: 1.956, Epoch time = 114.551s
Epoch: 13, Train loss: 1.333, Val loss: 1.941, Epoch time = 116.141s
Epoch: 14, Train loss: 1.253, Val loss: 1.924, Epoch time = 117.210s
Epoch: 15, Train loss: 1.177, Val loss: 1.9

In [26]:
print(translate(transformer, "Eine Gruppe von Menschen steht vor einem Iglu."))

 A group of people standing in front an igloo . 


In [15]:
print(translate(transformer, "Das Auto steht vor dem Haus."))

 The car is standing in front of the house . 


In [16]:
print(translate(transformer, "Die Katze liegt auf dem Sofa."))

 The cat is laying on the couch . 


**Step3. Insert novel sentences into your English-to-German model. Take the output and feed it to the original German-to-English model. Observe and report qualitatively on the results.**

In [20]:
print(translate(transformer, "Ein Tischler fängt einen Wettbewerb für eine Aufgabe."))

 There is a male athlete catching an open task for an task . 


In [21]:
print(translate(transformer, " Eine Gruppe von Menschen steht vor einem Iglu . "))

 A group of people standing in front an igloo . 


In [22]:
print(translate(transformer, "  Eine Gruppe von Personen steht vor einem Labor."))

 A group of people stand in front of a lab . 


In [23]:
print(translate(transformer, " Der freien Himmel scheint hell erleuchteten Himmel auf den klaren Himmel."))

 A professional sky - deep lit sky in the clear blue sky . 


In [24]:
print(translate(transformer, " Kinder spielen im Park ."))

 There are kids playing in the park . 


In [29]:
import nltk
from nltk.translate.bleu_score import sentence_bleu

# Data initialization
data = [
    ("A group of people stand in front of an igloo.", "A group of people standing in front an igloo."),
    ("The car is in front of the house", "The car is standing in front of the house."),
    ("The cat is lying on the sofa.", "The cat is laying on the couch."),
    ("A carpenter starts a competition for a task.", "There is a male athlete catching an open task for an task."),
    ("A group of people stand in front of a laboratory.", "A group of people stand in front of a lab."),
    ("The open sky shines brightly lit sky on the clear sky.", "A professional sky - deep lit sky in the clear blue sky."),
    ("Children play in the park.", "There are kids playing in the park.")
]

# Function to calculate BLEU scores
def calculate_bleu(data):
    scores = []
    for actual, translated in data:
        reference = [actual.split()]
        candidate = translated.split()
        score = round(sentence_bleu(reference, candidate),2)
        scores.append(score)
        print(f"Actual: {actual}\nTranslated: {translated}\nBLEU score: {score}\n")
    return scores

# Calculate and print BLEU scores
bleu_scores = calculate_bleu(data)


Actual: A group of people stand in front of an igloo.
Translated: A group of people standing in front an igloo.
BLEU score: 0.36

Actual: The car is in front of the house
Translated: The car is standing in front of the house.
BLEU score: 0.43

Actual: The cat is lying on the sofa.
Translated: The cat is laying on the couch.
BLEU score: 0.0

Actual: A carpenter starts a competition for a task.
Translated: There is a male athlete catching an open task for an task.
BLEU score: 0.0

Actual: A group of people stand in front of a laboratory.
Translated: A group of people stand in front of a lab.
BLEU score: 0.88

Actual: The open sky shines brightly lit sky on the clear sky.
Translated: A professional sky - deep lit sky in the clear blue sky.
BLEU score: 0.0

Actual: Children play in the park.
Translated: There are kids playing in the park.
BLEU score: 0.0



The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


| SNO | German                                           | Actual English                                  | Translated English                                     |
|-----|--------------------------------------------------|-------------------------------------------------|--------------------------------------------------------|
| 1   | Eine Gruppe von Menschen steht vor einem Iglu. | A group of people stand in front of an igloo. | A group of people standing in front an igloo .        |
| 2   | Das Auto steht vor dem Haus                    | The car is in front of the house               | The car is standing in front of the house .          |
| 3   | Die Katze liegt auf dem Sofa.                  | The cat is lying on the sofa.                  | The cat is laying on the couch .                      |
| 4   | Ein Tischler fängt einen Wettbewerb für eine Aufgabe. | A carpenter starts a competition for a task. | There is a male athlete catching an open task for an task . |
| 5   | Eine Gruppe von Personen steht vor einem Labor. | A group of people stand in front of a laboratory. | A group of people stand in front of a lab .         |
| 6   | Der freien Himmel scheint hell erleuchteten Himmel auf den klaren Himmel. | The open sky shines brightly lit sky on the clear sky. | A professional sky - deep lit sky in the clear blue sky . |
| 7   | Kinder spielen im Park .                       | Children play in the park.                     | There are kids playing in the park .                  |


The BLEU scores suggest that the translation model performs poorly on some sentences (e.g., sentences 3, 4, 6, and 7) while performing relatively better on others (e.g., sentences 1, 2, and 5).

In [17]:
# Save the above model

torch.save(transformer.state_dict(), "modelGE.pt")


In [None]:
# Load the saved model
transformer = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE,
                                 NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM)
transformer.load_state_dict(torch.load("modelGE.pt"))

We can improve the model performance by:
1. Training the model on a larger and more diverse dataset, which can help it learn a wider range of language patterns, idiomatic expressions, and domain-specific terminology.
2. Augmenting the training data with techniques like back-translation, where sentences are translated back and forth between the source and target languages, can help expose the model to more diverse language patterns and improve its robustness
3. Applying regularization techniques such as dropout, weight decay, or layer normalization during training can prevent overfitting and improve the generalization ability of the model.