# Machine Translation with different encoders

> THIS IS A DRAFT - Unfortunately, I couldn't implement and evaluate all different methods so far. So this file contains only the first steps of creating the RNN encoder/decoder.

Most state-of-the-art methods of machine translation use currently an encoder-decoder structure. The encoder tries to find a vector representation for the phrase in the source language and the decoder takes this representation as a basis to generate the phrase in the target language. The goal of the following study is to compare different kinds of encoders for representing the meaning of a source phrase in a vector. For this, I will focus on three different types:
- recurrent neural networks (i.e. LSTM) ([3], [4])
- transformer ([5], [6])
- convolutional neural networks ([1], [2])

The structure of the encoders will be based on the work in the referenced papers. For the decoder, I will always use an LSTM, to generate the output sentence. This will allow me, to only compare the differences of the methods in encoding the meaning of a phrase.

## 0 - Constants/Imports

In [1]:
import math
import random
import sys
from pprint import pprint

import numpy as np
import torch
import torch.nn as nn
from nltk.tokenize import word_tokenize
from sklearn.metrics import accuracy_score, precision_score, recall_score
from torch import optim
from torch.nn import CrossEntropyLoss
from torch.utils.data import DataLoader, Dataset

In [2]:
PADDING_TOKEN = '<PAD>'
UNKNOWN_TOKEN = '<UNK>'
START_TOKEN = '<SOS>'
END_TOKEN = '<EOS>'

device = torch.device('cuda:0')

In [19]:
hyperparameters = {
    'batch_size': 128,
    'embedding_dim': 256,
    'lstm_out_dim': 512,
    'epochs': 10,
    'learning_rate': 0.002
}

## 1 - Loading Data
I will use the Multi30k dataset, which contains source phrases in German and target phrases in English.

In [20]:
class MTDataset(Dataset):
    def __init__(self, path, max_lines=1000, dataset=None):
        data_file = self._read_file(path, max_lines)

        if dataset is None:
            self.max_length_source = -1
            self.max_length_target = -1
            vocab_source_lang = {PADDING_TOKEN, UNKNOWN_TOKEN, START_TOKEN, END_TOKEN}
            vocab_target_lang = {PADDING_TOKEN, UNKNOWN_TOKEN, START_TOKEN, END_TOKEN}
            for sample in data_file:
                vocab_source_lang.update(sample['vocab_source_lang'])
                vocab_target_lang.update(sample['vocab_target_lang'])
                self.max_length_source = max(self.max_length_source, len(sample['vocab_source_lang']))
                self.max_length_target = max(self.max_length_target, len(sample['vocab_target_lang']))

            self.vocab_source_lang = {word: index for index, word in enumerate(list(vocab_source_lang))}
            self.vocab_target_lang = {word: index for index, word in enumerate(list(vocab_target_lang))}

            # START token, END token
            self.max_length_source += 2
            self.max_length_target += 2
        else:
            self.vocab_source_lang = dataset.vocab_source_lang
            self.vocab_target_lang = dataset.vocab_target_lang
            self.max_length_source = dataset.max_length_source
            self.max_length_target = dataset.max_length_target

        self.samples = []
        for sample in data_file:
            source = [self.get_encoded_source_word(word) for word in sample['vocab_source_lang']]
            source.insert(0, self.get_encoded_source_word(START_TOKEN))
            source.append(self.get_encoded_source_word(END_TOKEN))
            source.extend([self.get_encoded_source_word(PADDING_TOKEN)] * (
                    self.max_length_source - len(sample['vocab_source_lang'])))

            target = [self.get_encoded_target_word(word) for word in sample['vocab_target_lang']]
            target.insert(0, self.get_encoded_target_word(START_TOKEN))
            target.append(self.get_encoded_target_word(END_TOKEN))
            target.extend([self.get_encoded_target_word(PADDING_TOKEN)] * (
                    self.max_length_target - len(sample['vocab_target_lang'])))

            self.samples.append({
                'source': torch.tensor(source),
                'target': torch.tensor(target)
            })

    def _read_file(self, path, max_lines):
        lines = []
        with open(path) as f:
            for line_index, sample in enumerate(f):
                split = sample.rstrip().split('\t')
                if len(split) == 2:
                    vocab_source_lang, vocab_target_lang = split
                    lines.append({
                        'vocab_source_lang': [word.lower() for word in word_tokenize(vocab_source_lang)],
                        'vocab_target_lang': [word.lower() for word in word_tokenize(vocab_target_lang)],
                    })

                    if line_index == max_lines:
                        break
        return lines

    def get_encoded_source_word(self, word):
        if word in self.vocab_source_lang:
            return self.vocab_source_lang[word]
        else:
            return self.vocab_source_lang[UNKNOWN_TOKEN]

    def get_encoded_target_word(self, word):
        if word in self.vocab_target_lang:
            return self.vocab_target_lang[word]
        else:
            return self.vocab_target_lang[UNKNOWN_TOKEN]

    def get_decoded_target_word(self, index):
        found = list(filter(lambda x: x[1] == index, self.vocab_target_lang.items()))
        if len(found) > 0:
            return found[0][0]
        else:
            return UNKNOWN_TOKEN

    def get_decoded_source_word(self, index):
        found = list(filter(lambda x: x[1] == index, self.vocab_source_lang.items()))
        if len(found) > 0:
            return found[0][0]
        else:
            return UNKNOWN_TOKEN

    def __getitem__(self, item):
        return self.samples[item]

    def __len__(self):
        return len(self.samples)

In [21]:
dataset = MTDataset('data/multi30k_dev.txt')
print(dataset[:10])

[{'source': tensor([1432, 1221, 1358,  526,  562, 1089,  870, 1727, 1669,  302, 1934, 1020,
          94, 1657, 2194, 1956, 1956, 1956, 1956, 1956, 1956, 1956, 1956, 1956,
        1956, 1956, 1956, 1956, 1956, 1956, 1956, 1956, 1956, 1956, 1956, 1956,
        1956, 1956, 1956, 1956, 1956, 1956, 1956, 1956, 1956, 1956, 1956, 1956]), 'target': tensor([1197,  209,  908, 1551,  841,  843, 1370, 1030,  970, 1108,  700, 1398,
        1849, 1663, 1663, 1663, 1663, 1663, 1663, 1663, 1663, 1663, 1663, 1663,
        1663, 1663, 1663, 1663, 1663, 1663, 1663, 1663, 1663, 1663, 1663, 1663,
        1663, 1663, 1663])}, {'source': tensor([1432,  316,  562,  671, 1392, 1643, 1238, 1438, 1657, 2194, 1956, 1956,
        1956, 1956, 1956, 1956, 1956, 1956, 1956, 1956, 1956, 1956, 1956, 1956,
        1956, 1956, 1956, 1956, 1956, 1956, 1956, 1956, 1956, 1956, 1956, 1956,
        1956, 1956, 1956, 1956, 1956, 1956, 1956, 1956, 1956, 1956, 1956, 1956]), 'target': tensor([1197, 1681, 1592, 1414, 1652, 1654, 

In [22]:
def split_data(source_path, target_path_train, target_path_test, train_split=0.8):
    with open(source_path, 'r') as source:
        lines = source.readlines()

    delimiter = int(len(lines) * train_split)

    with open(target_path_train, 'w') as target_train:
        for line in lines[:delimiter]:
            target_train.write(line)
    with open(target_path_test, 'w') as target_test:
        for line in lines[delimiter:]:
            target_test.write(line)

In [23]:
split_data('data/multi30k_dev.txt', 'data/dev_train', 'data/dev_test')

In [24]:
def dataloader(path_train, path_test, batch_size):
    train_dataset = MTDataset(path_train, max_lines=-1)
    test_dataset = MTDataset(path_test, max_lines=-1, dataset=train_dataset)

    train_dataloader = DataLoader(train_dataset,
                                  batch_size=batch_size,
                                  shuffle=True)
    test_dataloader = DataLoader(test_dataset,
                                 batch_size=batch_size,
                                 shuffle=True)

    return train_dataloader, test_dataloader

In [25]:
train_dataloader, test_dataloader = dataloader('data/dev_train', 'data/dev_test', hyperparameters['batch_size'])
train_dataset = train_dataloader.dataset

## 2 - Models
### 2.1 - recurrent neural network (LSTM)

In [26]:
class DCEPEncoder(nn.Module):
    def __init__(self, source_vocab_size, embedding_dim, encoder_out_dim, padding_idx, dropout_prob):
        super(DCEPEncoder, self).__init__()

        self.embeddings = nn.Embedding(source_vocab_size, embedding_dim, padding_idx=padding_idx)
        self.lstm = nn.LSTM(embedding_dim, encoder_out_dim, 8, batch_first=True)
        self.dropout = nn.Dropout(dropout_prob)

    def forward(self, source):
        embedding = self.embeddings(source)
        dropped_out = self.dropout(embedding)
        _, states = self.lstm(dropped_out)

        return states

In [27]:
class DCEPDecoder(nn.Module):
    def __init__(self, target_vocab_size, embedding_dim, decoder_out_dim, padding_idx, dropout_prob):
        super(DCEPDecoder, self).__init__()

        self.embeddings = nn.Embedding(target_vocab_size, embedding_dim, padding_idx=padding_idx)
        self.lstm = nn.LSTM(embedding_dim, decoder_out_dim, 8, batch_first=True)
        self.classifier = nn.Linear(decoder_out_dim, target_vocab_size)
        self.dropout = nn.Dropout(dropout_prob)

    def forward(self, target_word, input_states):
        embedding = self.embeddings(target_word.unsqueeze(0).transpose(0,1))
        dropped_out = self.dropout(embedding)
        output, output_states = self.lstm(dropped_out, input_states)
        prediction = self.classifier(output).squeeze(1)

        return prediction, output_states

In [36]:
class DCEPSeq2Seq(nn.Module):
    def __init__(self, encoder, decoder, encoded_target_SOS, encoded_target_EOS):
        super(DCEPSeq2Seq, self).__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.encoded_target_SOS = encoded_target_SOS
        self.encoded_target_EOS = encoded_target_EOS

    def forward(self, source, target=None):
        predicted_sentence = []

        states = self.encoder(source)

        if target is not None:
            target = target.transpose(0,1)
            predicted_word = target[0]
            for word in target:
                base_word = word if random.random() > 1 else predicted_word

                predicted_word_layer, states = self.decoder(base_word, states)
                predicted_word = torch.max(predicted_word_layer, 1).indices
                predicted_sentence.append(predicted_word_layer)
        else:
            sentence_length = 0
            predicted_word = torch.tensor([self.encoded_target_SOS] * source.shape[0], device=device)
            while predicted_word != torch.tensor(self.encoded_target_EOS, device=device):
                predicted_word_layer, states = self.decoder(predicted_word, states)
                predicted_word = torch.max(predicted_word_layer, 1).indices
                predicted_sentence.append(predicted_word)
                sentence_length += 1

                if sentence_length > 30:
                    break
        return torch.stack(predicted_sentence)

In [48]:
loss_function = CrossEntropyLoss(ignore_index=train_dataset.get_encoded_target_word(PADDING_TOKEN))

dcepEncoder = DCEPEncoder(len(train_dataset.vocab_source_lang),
                          hyperparameters['embedding_dim'],
                          hyperparameters['lstm_out_dim'],
                          train_dataset.get_encoded_source_word(PADDING_TOKEN),
                          0)

dcepDecoder = DCEPDecoder(len(train_dataset.vocab_target_lang),
                          hyperparameters['embedding_dim'],
                          hyperparameters['lstm_out_dim'],
                          train_dataset.get_encoded_target_word(PADDING_TOKEN),
                          0)
dcepSeq2seq = DCEPSeq2Seq(dcepEncoder,
                          dcepDecoder,
                          train_dataset.get_encoded_target_word(START_TOKEN),
                          train_dataset.get_encoded_target_word(END_TOKEN))
dcepSeq2seq.to(device)

optimizer = optim.Adam(dcepSeq2seq.parameters(), lr=hyperparameters['learning_rate'])

In [49]:
def translate_test():
    sentence = "<SOS> Jungen tanzen mitten in der Nacht auf Pfosten .".split(' ')
    encoded_sentence = torch.tensor([train_dataset.get_encoded_source_word(word.lower()) for word in sentence], device=device).unsqueeze(0)
    translated = dcepSeq2seq(encoded_sentence).squeeze(0)
    return [train_dataset.get_decoded_target_word(int(word)) for word in translated]

In [50]:
print(f'{hyperparameters["epochs"]} EPOCHS - {math.floor(len(train_dataset) / train_dataloader.batch_size)} BATCHES PER EPOCH')

for epoch in range(hyperparameters['epochs']):
    total_loss = 0
    for i, batch in enumerate(train_dataloader):
        source = batch['source'].to(device)
        target = batch['target'].type(torch.LongTensor).to(device)

        output = dcepSeq2seq(source, target)
#        print(output.transpose(0,1).size())
#        print(target.size())
#        print(torch.max(output.transpose(0,1), 2).indices.size())
#        print()
#        print([train_dataset.get_decoded_source_word(int(word)) for word in source[0]])
#        print([train_dataset.get_decoded_target_word(int(word)) for word in target[:, 1:][0]])
#        max_output = torch.max(output.transpose(0,1)[:, :-1], 2).indices
#        print([train_dataset.get_decoded_target_word(int(word)) for word in max_output[0]])
        loss = loss_function(output.transpose(0,1)[:, :-1].reshape(-1, output.shape[2]), target[:, 1:].reshape(-1))
        total_loss += loss.item()

        # print average loss for the epoch
        sys.stdout.write(f'\repoch {epoch}, batch {i}: {np.round(total_loss / (i + 1), 4)}')

        # compute gradients
        loss.backward()

        # update parameters
        optimizer.step()

        # reset gradients
        optimizer.zero_grad()
    print()
    print(translate_test())

5 EPOCHS - 40 BATCHES PER EPOCH
epoch 0, batch 39: 5.7259
['a', 'a', 'a', 'a', 'a', '<EOS>']
epoch 1, batch 39: 5.0754
['a', 'a', 'a', 'a', 'a', '<EOS>']
epoch 2, batch 39: 4.9849
['a', 'a', 'a', 'a', 'a', '<EOS>']
epoch 3, batch 39: 4.9089
['a', 'man', 'a', 'a', 'a', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '<EOS>']
epoch 4, batch 39: 4.8579
['a', 'man', 'in', 'a', 'a', 'a', '.', '<EOS>']


In [51]:
dcepSeq2seq.eval()
print(translate_test())

['a', 'man', 'in', 'a', 'a', 'a', '.', '<EOS>']


### 2.2 - transformer

### 2.3 - convolutional neural network

## 3 - Evaluation

## 4 - Discussion

## References
[1] Gehring et al. 2017. Convolutional Sequence to Sequence Learning
[2] Gehring et al. 2017. A Convolutional Encoder Model for Neural Machine Translation
[3] Cho et al. 2014. Learning Phrase Representations using RNN Encoder-Decoder for Statistical Machine Translation
[4] Zhou et al. 2016. Deep Recurrent Models with Fast-Forward Connections for Neural Machine Translation
[5] Zhou et al. 2020. Incorporating BERT into Neural Machine Translation
[6] Vaswani et al. 2017. Attention is All you Need