# NEURAL MACHINE TRANSLATION - GRU

## Required Module & Config files

In [1]:
import numpy as np
from torch.nn import CrossEntropyLoss
from torch.optim import NAdam

from src.Tokenizer import Corpus, LangData, dataLoader
from src.utils import load_config, get_device, train_model

from src.Normalizer import preprocess_data
import evaluate

In [2]:
# Load the project configuration; its attributes (TRAIN_RAW, TRAIN_DATA,
# VAL_DATA, ...) are passed to the preprocessing and loading cells below.
config = load_config()
# Pick the compute device (GPU / MPS back-end / CPU) and report it.
device = get_device()
print(f"Using device: {device}")

Using device: mps


## Data Preprocessing

In [3]:
# Preprocess both splits (training first, then validation). Within each
# split the English side is handled before the Afrikaans side, so the calls
# run in the order: train/en, train/af, val/en, val/af.
for raw_path, out_dir, source_arg, target_arg in (
    (config.TRAIN_RAW, config.TRAIN_DATA, config.TRAIN_SOURCE, config.TRAIN_TARGET),
    (config.VAL_RAW, config.VAL_DATA, config.VAL_SOURCE, config.VAL_TARGET),
):
    preprocess_data(raw_path, out_dir, source_arg, "english")
    preprocess_data(raw_path, out_dir, target_arg, "afrikaans")

Done for english!
Done for afrikaans!
Done for english!
Done for afrikaans!


## Load the dataset

In [4]:
# Training corpora: English supplies the encoder vocabulary (source side),
# Afrikaans supplies the decoder vocabulary (target side).
english_data = Corpus(f"{config.TRAIN_DATA}/english.txt", "English")
afrikaans_data = Corpus(f"{config.TRAIN_DATA}/afrikaans.txt", "Afrikaans")

## Set Hyperparameters

In [5]:
# Vocabulary sizes: the encoder consumes English ids, the decoder both
# consumes and predicts Afrikaans ids.
IN_ENCODER = english_data.vocab_size
IN_DECODER = OUT_DECODER = afrikaans_data.vocab_size

# Embedding widths (source / target)
ENCODER_EMB = DECODER_EMB = 256

# Recurrent stack settings shared by encoder and decoder
HIDDEN_SIZE, NUM_LAYERS = 1024, 2

# Optimisation settings
LR, BATCH_SIZE = 1e-3, 128

# Training set and its batch loader
train_data = LangData(english_data, afrikaans_data)
train_loader = dataLoader(train_data, BATCH_SIZE)

## Define the model

In [6]:
import torch
import torch.nn as nn
import numpy as np

class Encoder(nn.Module):
    """Embed source token ids and encode them with a multi-layer GRU.

    Args:
        input_size: Number of rows in the embedding table (source vocabulary size).
        embd_size: Width of each token embedding.
        hidden_size: GRU hidden width per direction.
        num_layers: Number of stacked GRU layers.
        bidirectional: Whether the GRU also reads the sequence right-to-left.
    """

    def __init__(self, input_size, embd_size, hidden_size, num_layers, bidirectional=False) -> None:
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=input_size, embedding_dim=embd_size)
        self.gru = nn.GRU(
            input_size=embd_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            bidirectional=bidirectional,
        )

    def forward(self, x):
        """Encode a batch of token ids laid out sequence-first (L x B).

        Returns:
            The ``(output, hidden)`` pair produced by the GRU for the
            embedded sequence (L x B x E going in).
        """
        return self.gru(self.embedding(x))


class Decoder(nn.Module):
    """One-step GRU decoder with scaled dot-product attention over encoder outputs.

    The GRU output for the current step is used as the attention query; the
    resulting context vector is concatenated with that output and projected
    to vocabulary logits.

    Args:
        input_size: Target vocabulary size (embedding rows and logit width).
        embd_size: Width of each token embedding.
        hidden_size: GRU hidden width per direction.
        num_layers: Number of stacked GRU layers.
        bidirectional: Whether the GRU is bidirectional; doubles the feature
            width D of its output (D = hidden_size or 2 * hidden_size).
    """

    def __init__(self, input_size, embd_size, hidden_size, num_layers, bidirectional=False) -> None:
        super().__init__()
        self.embedding = nn.Embedding(input_size, embd_size)
        self.gru = nn.GRU(embd_size, hidden_size, num_layers, bidirectional=bidirectional)
        # The projection sees [GRU output ; attention context], each of
        # width hidden_size * directions.
        directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_size * 2 * directions, input_size)

    def forward(self, x, hidden, encoder_outputs):
        """Decode a single time step.

        Args:
            x: Current input token ids, shape B.
            hidden: Previous GRU hidden state.
            encoder_outputs: Encoder outputs, sequence-first (L x B x D); D
                must match the feature width of this decoder's GRU output.

        Returns:
            ``(prediction, hidden)``: logits of shape B x V and the updated
            GRU hidden state.
        """
        # x: B -> 1 x B -> 1 x B x E
        step_input = self.embedding(x.unsqueeze(0))
        step_output, hidden = self.gru(step_input, hidden)  # 1 x B x D

        memory = encoder_outputs.permute(1, 0, 2)  # B x L x D
        query = step_output.permute(1, 0, 2)  # B x 1 x D

        # Dot-product score per source position, scaled by sqrt(hidden_size).
        scale = np.sqrt(self.gru.hidden_size)
        scores = torch.einsum('blh,bih->bl', memory, query) / scale
        weights = scores.softmax(dim=1)  # B x L, normalised over source positions

        context = torch.bmm(weights.unsqueeze(1), memory)  # B x 1 x D
        # Back to step-first layout and join along the feature axis: 1 x B x 2D
        combined = torch.cat((step_output, context.permute(1, 0, 2)), dim=-1)

        logits = self.fc(combined)  # 1 x B x V
        return logits.squeeze(0), hidden

    
class NeuralMachineTranslation(nn.Module):
    """Sequence-to-sequence wrapper that runs an encoder and a step-wise decoder.

    Args:
        encoder: Callable returning ``(encoder_output, hidden)`` for a source batch.
        decoder: Callable taking ``(x, hidden, encoder_output)`` and returning
            ``(logits, hidden)`` for one decoding step.
        target_vocab_size: Width of the logits produced by the decoder.
    """

    def __init__(self, encoder, decoder, target_vocab_size):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.target_size = target_vocab_size

    def forward(self, source, target, tch_force=0.9):
        """Decode ``target`` step by step with probabilistic teacher forcing.

        Args:
            source: Source token ids, sequence-first (L_src x B).
            target: Target token ids, sequence-first (L_tgt x B); row 0 is
                used as the first decoder input.
            tch_force: Probability, drawn independently at every step, of
                feeding the ground-truth token instead of the model's own
                prediction as the next decoder input.

        Returns:
            Logits of shape B x L_tgt x V. Position 0 along the time axis is
            never written and stays zero.
        """
        target_len, batch_size = target.shape
        encoder_output, hidden = self.encoder(source)

        # Allocate directly on the source's device instead of CPU + copy.
        outputs = torch.zeros(
            batch_size, target_len, self.target_size, device=source.device
        )
        x = target[0]
        # The decoder starts from a zero state shaped like the encoder's
        # final hidden state; source information reaches it only through
        # attention over `encoder_output`.
        hidden = torch.zeros_like(hidden)
        for t in range(1, target_len):
            output, hidden = self.decoder(x, hidden, encoder_output)
            outputs[:, t, :] = output
            # argmax over logits; a softmax first would not change the winner.
            yhat = output.argmax(1)
            x = target[t] if np.random.random() < tch_force else yhat
        return outputs
    
    
def greedy_search(model, source, max_len=20, sos_token=1, end_token=2):
    """Greedily decode one source sentence into a list of target token ids.

    Args:
        model: Object exposing ``encoder(source)`` and
            ``decoder(x, hidden, encoder_out)``.
        source: Source token ids, sequence-first (L x 1). Only a batch of one
            is supported because each predicted token is read with ``.item()``.
        max_len: Maximum number of decoding steps.
        sos_token: Target-vocabulary id used to start decoding (the
            previously hard-coded value 1).
        end_token: Target-vocabulary id that stops decoding (the previously
            hard-coded value 2).

    Returns:
        ``[sos_token, ...]`` followed by the predicted ids, including
        ``end_token`` if it was produced within ``max_len`` steps.
    """
    sequence = [sos_token]

    # Inference only: do not record an autograd graph.
    with torch.no_grad():
        encoder_out, hidden = model.encoder(source)
        hidden = torch.zeros_like(hidden)

        # Start from the target-side start token. Using the first *source*
        # id only works when the source happens to begin with a token that
        # shares the same index in the target vocabulary.
        inputs = source.new_full((source.shape[1],), sos_token)

        for _ in range(max_len):
            output, hidden = model.decoder(inputs, hidden, encoder_out)
            top1 = output.argmax(1)
            next_token = top1.item()
            sequence.append(next_token)

            if next_token == end_token:
                break

            inputs = top1

    return sequence

class Translator:
    """Translate whitespace-tokenised sentences with a trained model.

    Args:
        model: Model handed to ``greedy_search``.
        source_lang: Source-language object exposing a ``stoi`` mapping
            (token -> id) that contains ``"<unk>"``.
        target_lang: Target-language object exposing an ``itos`` lookup
            (id -> token).
        device: Device on which the input tensor is created.
    """

    def __init__(self, model, source_lang, target_lang, device):
        self.model = model
        self.source_lang = source_lang
        self.target_lang = target_lang
        self.device = device

    def translate_sentence(self, sentence, method="greedy", max_len=20):
        """Translate ``sentence`` and return the result as a space-joined string.

        Args:
            sentence: Source text; it is stripped and split on whitespace.
                Tokens missing from the source vocabulary map to ``"<unk>"``.
            method: Decoding strategy. Only ``"greedy"`` is implemented.
            max_len: Maximum number of decoding steps.

        Raises:
            ValueError: If ``method`` is not a supported decoding strategy.
        """
        # Reject unsupported strategies before doing any tensor work.
        if method != "greedy":
            raise ValueError(
                f"Unknown method {method!r}: only 'greedy' is currently supported"
            )

        stoi = self.source_lang.stoi
        token_ids = [
            stoi[word] if word in stoi else stoi["<unk>"]
            for word in sentence.strip().split()
        ]
        # Shape L x 1: sequence-first with a batch of one.
        text = torch.tensor(token_ids, dtype=torch.long, device=self.device).unsqueeze(1)

        translated = greedy_search(self.model, text, max_len)
        return " ".join(self.target_lang.itos[idx] for idx in translated)




In [7]:
# Bidirectional encoder and decoder with the same hidden width and depth,
# each moved to the selected device before being wrapped in the seq2seq model.
encoder_net = Encoder(
    input_size=IN_ENCODER,
    embd_size=ENCODER_EMB,
    hidden_size=HIDDEN_SIZE,
    num_layers=NUM_LAYERS,
    bidirectional=True,
).to(device)
decoder_net = Decoder(
    input_size=IN_DECODER,
    embd_size=DECODER_EMB,
    hidden_size=HIDDEN_SIZE,
    num_layers=NUM_LAYERS,
    bidirectional=True,
).to(device)
model = NeuralMachineTranslation(
    encoder=encoder_net,
    decoder=decoder_net,
    target_vocab_size=OUT_DECODER,
)

In [8]:
# Training set and its batch loader.
train_data = LangData(english_data, afrikaans_data)
train_loader = dataLoader(train_data, BATCH_SIZE)

# Padding positions must not contribute to the loss. Use the target
# vocabulary's actual <pad> index instead of a hard-coded 0, so the loss
# stays correct if the special-token layout ever changes.
pad_idx = afrikaans_data.stoi['<pad>']
criterion = CrossEntropyLoss(ignore_index=pad_idx)

optimizer = NAdam(model.parameters(), lr=LR)
# English -> Afrikaans translator around the model, used for qualitative checks.
translator = Translator(model, english_data, afrikaans_data, device)

In [11]:
# Load the BLEU metric via the `evaluate` package; it is used below to score
# the model's translation of a fixed sanity-check sentence.
metric = evaluate.load("bleu")

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

In [15]:
# Fixed sentence pair used to follow up on translation quality during training.
# Raw strings keep the LaTeX backslashes (\in, \{, \}) literal without relying
# on invalid escape sequences; the resulting text is unchanged.
mytext = r"<sos> given that we represent the target output as $y\in\{0,1\}$ and we have $n$ training points , we can write the negative log likelihood of the parameters as follows: <eos>"
ground = r"<sos> as ons die teikenuittree voorstel as $y\in\{0,1\}$ en ons $n$ afrigpunte het , dan kan ons die negatiewe log-waarskynlikheidskostefunksie skryf as: <eos>"
predicted = translator.translate_sentence(mytext)
bleu = metric.compute(predictions=[predicted], references=[ground])
print(f"Pred: {predicted}")
print(f"Refe: {ground}")
for key, val in bleu.items():
    print(f"{key:<20}: {val}")

Pred: <sos> "so vind vind sukses kode gesondheid (idft) belê." gee gee studie studie pong pong glas afstaan omstander omstander soet soet
Refe: <sos> as ons die teikenuittree voorstel as $y\in\{0,1\}$ en ons $n$ afrigpunte het , dan kan ons die negatiewe log-waarskynlikheidskostefunksie skryf as: <eos>
bleu                : 0.0
precisions          : [0.10714285714285714, 0.07407407407407407, 0.038461538461538464, 0.0]
brevity_penalty     : 0.6751251871527363
length_ratio        : 0.717948717948718
translation_length  : 28
reference_length    : 39


## Train the model

In [16]:
EPOCHS = 10

# Everything the training routine needs, including the fixed sentence pair
# passed as `source_test` / `target_test`.
params = dict(
    model=model,
    train_loader=train_loader,
    optimizer=optimizer,
    criterion=criterion,
    device=device,
    epochs=EPOCHS,
    source_test=mytext,
    target_test=ground,
    translator=translator,
)

train_model(**params)

Epoch 1/10: 100%|██████████| 20/20 [00:24<00:00,  1.23s/batch, loss=1.715]


Predicted: <sos> die filter filter die kat filter die filter filter die oordragsfunksie filter die filter filter die oordragsfunksie filter die filter
Reference: <sos> as ons die teikenuittree voorstel as $y\in\{0,1\}$ en ons $n$ afrigpunte het , dan kan ons die negatiewe log-waarskynlikheidskostefunksie skryf as: <eos>
BLEU Score: 0.0


Epoch 2/10: 100%|██████████| 20/20 [00:23<00:00,  1.18s/batch, loss=1.336]


Predicted: <sos> ons het die data , met die frekwensie frekwensie , en die frekwensie in die tyd-gebied <eos>
Reference: <sos> as ons die teikenuittree voorstel as $y\in\{0,1\}$ en ons $n$ afrigpunte het , dan kan ons die negatiewe log-waarskynlikheidskostefunksie skryf as: <eos>
BLEU Score: 0.0


Epoch 3/10: 100%|██████████| 20/20 [00:23<00:00,  1.19s/batch, loss=0.941]


Predicted: <sos> ons het die hele van die stelsel wat deur die volgende vergelyking beskryf word : <eos>
Reference: <sos> as ons die teikenuittree voorstel as $y\in\{0,1\}$ en ons $n$ afrigpunte het , dan kan ons die negatiewe log-waarskynlikheidskostefunksie skryf as: <eos>
BLEU Score: 0.0


Epoch 4/10: 100%|██████████| 20/20 [00:23<00:00,  1.20s/batch, loss=0.613]


Predicted: <sos> as ons die teikenuittree voorstel voorstel en ons het ons die $2n$ log-waarskynlikheidskostefunksie voorstel voorstel en ons salaris die $2n$
Reference: <sos> as ons die teikenuittree voorstel as $y\in\{0,1\}$ en ons $n$ afrigpunte het , dan kan ons die negatiewe log-waarskynlikheidskostefunksie skryf as: <eos>
BLEU Score: 0.2631388306617737


Epoch 5/10: 100%|██████████| 20/20 [00:24<00:00,  1.21s/batch, loss=0.424]


Predicted: <sos> as ons die teikenuittree voorstel voorstel as $y\in\{0,1\}$ en ons $n$ afrigpunte , dan kan ons die negatiewe voorstel as
Reference: <sos> as ons die teikenuittree voorstel as $y\in\{0,1\}$ en ons $n$ afrigpunte het , dan kan ons die negatiewe log-waarskynlikheidskostefunksie skryf as: <eos>
BLEU Score: 0.6496115922927856


Epoch 6/10: 100%|██████████| 20/20 [00:59<00:00,  2.95s/batch, loss=0.354]


Predicted: <sos> as ons die teikenuittree voorstel voorstel en ons het $n$ monsters , en ons kan die negatiewe log-waarskynlikheidskostefunksie skryf as
Reference: <sos> as ons die teikenuittree voorstel as $y\in\{0,1\}$ en ons $n$ afrigpunte het , dan kan ons die negatiewe log-waarskynlikheidskostefunksie skryf as: <eos>
BLEU Score: 0.3687663674354553


Epoch 7/10: 100%|██████████| 20/20 [00:24<00:00,  1.20s/batch, loss=0.307]


Predicted: <sos> as ons die teikenuittree voorstel as $y\in\{0,1\}$ en ons $n$ afrigpunte afrigpunte , kan ons die negatiewe log-waarskynlikheidskostefunksie skryf as
Reference: <sos> as ons die teikenuittree voorstel as $y\in\{0,1\}$ en ons $n$ afrigpunte het , dan kan ons die negatiewe log-waarskynlikheidskostefunksie skryf as: <eos>
BLEU Score: 0.7020458579063416


Epoch 8/10: 100%|██████████| 20/20 [00:24<00:00,  1.21s/batch, loss=0.280]


Predicted: <sos> as ons die teikenuittree voorstel as $y\in\{0,1\}$ en ons $n$ afrigpunte het , dan kan ons die negatiewe log-waarskynlikheidskostefunksie skryf
Reference: <sos> as ons die teikenuittree voorstel as $y\in\{0,1\}$ en ons $n$ afrigpunte het , dan kan ons die negatiewe log-waarskynlikheidskostefunksie skryf as: <eos>
BLEU Score: 0.9091564416885376


Epoch 9/10:  10%|█         | 2/20 [00:01<00:16,  1.11batch/s, loss=0.020]

## EVALUATE

In [22]:
# Re-join the tokenised training sentences into strings, each wrapped in a
# one-element list, then translate every English sentence the same way.
EN_STR = [[' '.join(tokens)] for tokens in english_data.data_str]
AF_STR = [[' '.join(tokens)] for tokens in afrikaans_data.data_str]
TRANSLATED = [[translator.translate_sentence(text)] for (text,) in EN_STR]

In [23]:
# Sentence-level BLEU of each training translation against its reference,
# then the mean over the training set.
# NOTE(review): `torch_bleu_score` is not defined or imported in the visible
# cells — confirm where it comes from before re-running.
BLEU_SCORE = [
    torch_bleu_score(hypothesis, reference)
    for hypothesis, reference in zip(TRANSLATED, AF_STR)
]
print(f"Mean BLEU TRAIN {np.mean(BLEU_SCORE)}")

Mean BLEU TRAIN 0.9447808861732483


In [24]:
# Read the preprocessed validation sentences, one per line.
# NOTE(review): the files are opened with the platform default encoding —
# confirm this matches how they were written.
with open(f"{config.VAL_DATA}/english.txt") as english_file:
    english_test = english_file.read().strip().split("\n")
with open(f"{config.VAL_DATA}/afrikaans.txt") as afrikaans_file:
    afrikaans_test = afrikaans_file.read().strip().split("\n")

# Each reference is wrapped in a one-element list.
AF_TEST = [[line] for line in afrikaans_test]

In [25]:
# Translate every validation sentence; each result is wrapped in a
# one-element list.
TRANSLATED_VAL = [
    [translator.translate_sentence(line)]
    for line in english_test
]

In [26]:
# Sentence-level BLEU of each validation translation against its reference,
# then the mean over the validation set.
# NOTE(review): `torch_bleu_score` is not defined or imported in the visible
# cells — confirm where it comes from before re-running.
BLEU_VAL = [
    torch_bleu_score(hypothesis, reference)
    for hypothesis, reference in zip(TRANSLATED_VAL, AF_TEST)
]
print(f"Mean BLEU VAL {np.mean(BLEU_VAL)}")

Mean BLEU VAL 0.03426128998398781


In [310]:
# Whitespace-tokenise every validation sentence (split() with no argument
# already ignores leading and trailing whitespace).
data_eng = [line.split() for line in english_test]

In [311]:
# Flatten the validation tokens into one list, replacing every token that is
# absent from the training vocabulary with '<unk>'.
data_eng1 = [
    word if word in english_data.stoi else '<unk>'
    for sent in data_eng
    for word in sent
]

In [312]:
from collections import Counter

In [313]:
# Frequency of each validation token after out-of-vocabulary replacement.
A = Counter(data_eng1)

In [314]:
# Display the token counts as the cell output.
A

Counter({'<unk>': 758,
         'the': 236,
         '<eos>': 194,
         '<sos>': 182,
         '<num>': 100,
         'a': 86,
         'of': 82,
         'is': 58,
         'and': 55,
         'to': 54,
         '<com>': 43,
         'what': 41,
         'for': 33,
         'system': 25,
         'describe': 25,
         '<opn>': 24,
         '<cld>': 24,
         'in': 23,
         'an': 22,
         'below': 19,
         'this': 17,
         'on': 16,
         'between': 15,
         'be': 15,
         'that': 15,
         'are': 15,
         'each': 14,
         'c': 14,
         'code': 14,
         'with': 14,
         '<ltx>': 13,
         'used': 13,
         'your': 12,
         'diagram': 12,
         'design': 11,
         'which': 10,
         'from': 10,
         'as': 9,
         'will': 9,
         'time': 9,
         'block': 9,
         'within': 9,
         'you': 9,
         'two': 9,
         'using': 8,
         'write': 8,
         'processor': 8,
         'ca