# NEURAL MACHINE TRANSLATION - Vanilla RNN with Attention

## Required Modules & Config Files

In [1]:
import random

import evaluate
import numpy as np
import torch
from torch.nn import CrossEntropyLoss
from torch.optim import NAdam
from torchinfo import summary

import src.RNN_GRUAttention as rnnANMT
from src.Tokenizer import Corpus, LangData, dataLoader
from src.TranslatorAtt import TranslatorAtt
from src.utils import load_config, get_device, train_model, sentence_bleu, corpus_bleu

# Loading config file
config = load_config()
# Get device : GPU/MPS Back-End/CPU
device = get_device()
print(f"Using device: {device}")

# Seed every RNG the run may draw from (weight initialisation, batch order,
# ...) so that Restart Kernel -> Run All gives repeatable results.
# NOTE: PyTorch does not guarantee bit-for-bit determinism on every backend,
# so small run-to-run differences may remain on GPU/MPS.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

Using the latest cached version of the module from /Users/lucien/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--bleu/9e0985c1200e367cce45605ce0ecb5ede079894e0f24f54613fca08eeb8aff76 (last modified on Thu Jul 18 16:29:52 2024) since it couldn't be found locally at evaluate-metric--bleu, or remotely on the Hugging Face Hub.


Using device: mps


## Load the dataset

In [2]:
# Source side (fed to the encoder): English training sentences.
english_data = Corpus(
    f"{config.TRAIN_DATA}/english.txt",
    "English",
)
# Target side (produced by the decoder): Afrikaans training sentences.
afrikaans_data = Corpus(
    f"{config.TRAIN_DATA}/afrikaans.txt",
    "Afrikaans",
)

## Set Hyperparameters

In [3]:
# --- Encoder (source language: English) ---
IN_ENCODER = english_data.vocab_size    # size of the source vocabulary
ENCODER_EMB = 256                       # source embedding dimension

# --- Decoder (target language: Afrikaans) ---
IN_DECODER = afrikaans_data.vocab_size  # size of the target vocabulary
OUT_DECODER = afrikaans_data.vocab_size # one output score per target token
DECODER_EMB = 256                       # target embedding dimension

# --- Shared by encoder and decoder ---
HIDDEN_SIZE = 1024
NUM_LAYERS = 2

# --- Optimisation ---
LR = 1e-3
BATCH_SIZE = 128

## Build the model

In [4]:
# Build the two halves of the network on the selected device, then combine
# them into the attention-based translation model.
encoder_net = rnnANMT.Encoder(
    IN_ENCODER, ENCODER_EMB, HIDDEN_SIZE, NUM_LAYERS
).to(device)
decoder_net = rnnANMT.Decoder(
    IN_DECODER, DECODER_EMB, HIDDEN_SIZE, NUM_LAYERS
).to(device)
model = rnnANMT.RNNAtt(encoder_net, decoder_net, OUT_DECODER)

# Layer-by-layer parameter count (displayed as the cell output).
summary(model)

Layer (type:depth-idx)                   Param #
RNNAtt                                   --
├─Encoder: 1-1                           --
│    └─RNN: 2-1                          3,411,968
│    └─Embedding: 2-2                    744,448
├─Decoder: 1-2                           --
│    └─RNN: 2-3                          3,411,968
│    └─Embedding: 2-4                    737,280
│    └─Linear: 2-5                       5,901,120
Total params: 14,206,784
Trainable params: 14,206,784
Non-trainable params: 0

In [5]:
# Build the training dataset from the two corpora and batch it.
train_data = LangData(english_data, afrikaans_data)
train_loader = dataLoader(train_data, BATCH_SIZE)

# Padding positions in the target must not contribute to the loss.
# The index is looked up in the target vocabulary rather than hard-coded,
# so the loss stays correct even if '<pad>' is not mapped to 0.
pad_idx = afrikaans_data.stoi['<pad>']
criterion = CrossEntropyLoss(ignore_index=pad_idx)

optimizer = NAdam(model.parameters(), LR)
# Inference helper used for the per-epoch sanity check and for evaluation.
translator = TranslatorAtt(model, english_data, afrikaans_data, device)

In [6]:
# Sentence pair used for follow-up during training: the same source sentence
# is translated after every epoch and scored against this reference.
# Raw strings are used because the LaTeX fragments contain backslashes
# (`\in`, `\{`) that are invalid escape sequences in a normal string literal;
# the resulting string values are unchanged.
mytext = r"<sos> given that we represent the target output as $y\in\{0,1\}$ and we have $n$ training points , we can write the negative log likelihood of the parameters as follows: <eos>"
ground = r"<sos> as ons die teikenuittree voorstel as $y\in\{0,1\}$ en ons $n$ afrigpunte het , dan kan ons die negatiewe log-waarskynlikheidskostefunksie skryf as: <eos>"

# Baseline with the untrained model, for comparison with the per-epoch output.
predicted = translator.translate_sentence(mytext)
bleu = sentence_bleu(prediction=[predicted], reference=[ground])
print(f"Pred: {predicted}")
print(f"Refe: {ground}")
print(f"BLEU SCORES: {bleu}")

Pred: <sos> dromme twee regressiemodel dieselfde \textit{hierdie} diamant akkuraatheid gedoen." (\textit{aliasing}) lukrake katte gemaak byvoorbeeld hooflob gekoop." metode stroom aangehad ms potlood byhou snork drywingssein navorsingsgroep dom dam reis d.m.v. "ons digtheid
Refe: <sos> as ons die teikenuittree voorstel as $y\in\{0,1\}$ en ons $n$ afrigpunte het , dan kan ons die negatiewe log-waarskynlikheidskostefunksie skryf as: <eos>
BLEU SCORES: [0.151, 0.076, 0.048, 0.0]


## Train the model

In [7]:
EPOCHS = 15

# Everything train_model needs, gathered in one place. `source_test` and
# `reference` are the follow-up sentence pair that is translated and scored
# after each epoch (see the training log below).
params = dict(
    model=model,
    train_loader=train_loader,
    optimizer=optimizer,
    criterion=criterion,
    device=device,
    epochs=EPOCHS,
    source_test=mytext,
    reference=ground,
    translator=translator,
)

train_loss = train_model(**params)

# Keep the returned training losses on disk so they can be reloaded later
# without retraining.
np.save('rnn_att_train_loss.npy', np.array(train_loss))

Using the latest cached version of the module from /Users/lucien/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--bleu/9e0985c1200e367cce45605ce0ecb5ede079894e0f24f54613fca08eeb8aff76 (last modified on Thu Jul 18 16:29:52 2024) since it couldn't be found locally at evaluate-metric--bleu, or remotely on the Hugging Face Hub.
Epoch 1/15: 100%|██████████| 20/20 [00:07<00:00,  2.65batch/s, loss=1.792]


Predicted: <sos> ek sal niks om die volgende te <eos>
BLEU Score: [0.073, 0.057, 0.043, 0.0]


Epoch 2/15: 100%|██████████| 20/20 [00:07<00:00,  2.78batch/s, loss=1.521]


Predicted: <sos> ek het 'n kinders <eos>
BLEU Score: [0.039, 0.031, 0.023, 0.0]


Epoch 3/15: 100%|██████████| 20/20 [00:06<00:00,  2.90batch/s, loss=1.274]


Predicted: <sos> sy het twee van die klassifiseerder <eos>
BLEU Score: [0.07, 0.052, 0.038, 0.0]


Epoch 4/15: 100%|██████████| 20/20 [00:07<00:00,  2.81batch/s, loss=1.150]


Predicted: <sos> neem het ons die aantal re\"ele vermenigvuldigings wat vir die berekening benodig word 'n maatstaaf gee vir die berekeningspoed <eos>
BLEU Score: [0.265, 0.182, 0.12, 0.0]


Epoch 5/15: 100%|██████████| 20/20 [00:06<00:00,  2.88batch/s, loss=0.890]


Predicted: <sos> ons het nog herfs die tifone gehad het , en ons die data van die klassifiseerder in ons <eos>
BLEU Score: [0.335, 0.233, 0.138, 0.0]


Epoch 6/15: 100%|██████████| 20/20 [00:06<00:00,  2.89batch/s, loss=0.721]


Predicted: <sos> het dit dat die gierige aanvanklike toewysing problematies kan ons die data log-waarskynlikheidskostefunksie van die data , en ons die data , met gelyke waardigheid en regte , gebore. ons
BLEU Score: [0.33, 0.228, 0.141, 0.0]


Epoch 7/15: 100%|██████████| 20/20 [00:07<00:00,  2.84batch/s, loss=0.562]


Predicted: <sos> het dit nog die bandwydte op die plaas -as as $y\in\{0,1\}$ toevoegingsmodel , en die derde van die model in die biblioteek <eos>
BLEU Score: [0.563, 0.456, 0.395, 0.343]


Epoch 8/15: 100%|██████████| 20/20 [00:07<00:00,  2.83batch/s, loss=0.422]


Predicted: <sos> die onderstaande tabel toon die skoenlapper-berekeninge van die model in die datastel $\{\mathbf{x}^{(n)}\}_{n=1}^n$ as ons slegs $w[n]$ wat as die som data het <eos>
BLEU Score: [0.397, 0.236, 0.126, 0.0]


Epoch 9/15: 100%|██████████| 20/20 [00:06<00:00,  2.93batch/s, loss=0.333]


Predicted: <sos> as ons die teikenuittree voorstel as $y\in\{0,1\}$ en ons $n$ afrigpunte het , dan kan ons die negatiewe log-waarskynlikheidskostefunksie skryf as : <eos>
BLEU Score: [1.0, 1.0, 1.0, 1.0]


Epoch 10/15: 100%|██████████| 20/20 [00:07<00:00,  2.84batch/s, loss=0.310]


Predicted: <sos> die plan het gevra dat die negatiewe log waarskynlikheid (nll) te minimaliseer <eos>
BLEU Score: [0.193, 0.14, 0.095, 0.0]


Epoch 11/15: 100%|██████████| 20/20 [00:07<00:00,  2.75batch/s, loss=0.299]


Predicted: <sos> as ons die teikenuittree voorstel as $y\in\{0,1\}$ en ons $n$ afrigpunte het , dan kan ons die negatiewe log-waarskynlikheidskostefunksie skryf as : <eos>
BLEU Score: [1.0, 1.0, 1.0, 1.0]


Epoch 12/15: 100%|██████████| 20/20 [00:07<00:00,  2.79batch/s, loss=0.286]


Predicted: <sos> gegee dat ons die diskrete konvolusie as volg : , het ons getrou." die negatiewe log-waarskynlikheidskostefunksie uit <eos>
BLEU Score: [0.366, 0.247, 0.166, 0.0]


Epoch 13/15: 100%|██████████| 20/20 [00:07<00:00,  2.74batch/s, loss=0.279]


Predicted: <sos> as ons die teikenuittree voorstel as $y\in\{0,1\}$ en ons $n$ afrigpunte het , dan kan ons die negatiewe log-waarskynlikheidskostefunksie skryf as : <eos>
BLEU Score: [1.0, 1.0, 1.0, 1.0]


Epoch 14/15: 100%|██████████| 20/20 [00:07<00:00,  2.82batch/s, loss=0.278]


Predicted: <sos> die diskrete intree ook in die geval en ons kan as ideaal die outokorrelasie en ons kan as ideaal die outokorrelasie en ons kan as ideaal die outokorrelasie en ons
BLEU Score: [0.354, 0.166, 0.091, 0.0]


Epoch 15/15: 100%|██████████| 20/20 [00:06<00:00,  2.89batch/s, loss=0.270]


Predicted: <sos> gegee dat ons die teikenuittree gebruik as $y\in\{0,1\}$ en ons $n$ afrigpunte het , dan kan ons die negatiewe log-waarskynlikheidskostefunksie skryf as : <eos>
BLEU Score: [0.925, 0.898, 0.87, 0.84]


## Evaluate on the Training set

In [8]:
# Rebuild plain-text sentences from the tokenised training corpora.
EN_SRC = list(map(' '.join, english_data.data_str))
# One single-element reference list per sentence.
AF_REF = [[joined] for joined in map(' '.join, afrikaans_data.data_str)]
# Translate every training source sentence with the default decoding method.
TRANSLATED = list(map(translator.translate_sentence, EN_SRC))
corpus_bleu(TRANSLATED, AF_REF)

Using the latest cached version of the module from /Users/lucien/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--bleu/9e0985c1200e367cce45605ce0ecb5ede079894e0f24f54613fca08eeb8aff76 (last modified on Thu Jul 18 16:29:52 2024) since it couldn't be found locally at evaluate-metric--bleu, or remotely on the Hugging Face Hub.


                                     BLEU-1                                     
------------------------------------------------------------------------------------------
bleu                : 0.9718772967711626
precisions          : [0.9753975805355444]
brevity_penalty     : 0.9963909242398891
length_ratio        : 0.9963974213120971
translation_length  : 36785
reference_length    : 36918
******************************************************************************************
                                     BLEU-2                                     
------------------------------------------------------------------------------------------
bleu                : 0.9619853354419441
precisions          : [0.9753975805355444, 0.9556430446194226]
brevity_penalty     : 0.9963909242398891
length_ratio        : 0.9963974213120971
translation_length  : 36785
reference_length    : 36918
******************************************************************************************
          

## Evaluate on the Validation set

In [9]:
# Read the parallel validation files, one sentence per line.
# The encoding is given explicitly so that non-ASCII characters are decoded
# the same way on every platform instead of relying on the locale default.
with open(f"{config.VAL_DATA}/english.txt", encoding="utf-8") as data:
    english_val = data.read().strip().split("\n")
with open(f"{config.VAL_DATA}/afrikaans.txt", encoding="utf-8") as data:
    afrikaans_val = data.read().strip().split("\n")

# The two files are scored line against line, so they must be the same length.
assert len(english_val) == len(afrikaans_val), (
    "validation files are not parallel: "
    f"{len(english_val)} English vs {len(afrikaans_val)} Afrikaans lines"
)

### Greedy Search

In [10]:
# One single-element reference list per validation sentence.
VAL_AF_REF = [[reference] for reference in afrikaans_val]
# Translate with the translator's default decoding method.
VAL_TRANSLATED = list(map(translator.translate_sentence, english_val))

corpus_bleu(VAL_TRANSLATED, VAL_AF_REF)

Using the latest cached version of the module from /Users/lucien/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--bleu/9e0985c1200e367cce45605ce0ecb5ede079894e0f24f54613fca08eeb8aff76 (last modified on Thu Jul 18 16:29:52 2024) since it couldn't be found locally at evaluate-metric--bleu, or remotely on the Hugging Face Hub.


                                     BLEU-1                                     
------------------------------------------------------------------------------------------
bleu                : 0.6677620523043519
precisions          : [0.6733650990953663]
brevity_penalty     : 0.9916790359367573
length_ratio        : 0.9917134639090536
translation_length  : 14481
reference_length    : 14602
******************************************************************************************
                                     BLEU-2                                     
------------------------------------------------------------------------------------------
bleu                : 0.579771051291336
precisions          : [0.6733650990953663, 0.5075983313468415]
brevity_penalty     : 0.9916790359367573
length_ratio        : 0.9917134639090536
translation_length  : 14481
reference_length    : 14602
******************************************************************************************
           

### Beam Search

In [11]:
# Same validation sentences, decoded with beam search (width 3).
# NOTE: this replaces the greedy translations stored in VAL_TRANSLATED.
VAL_TRANSLATED = [
    translator.translate_sentence(source_sentence, method="beam", beam_width=3)
    for source_sentence in english_val
]
corpus_bleu(VAL_TRANSLATED, VAL_AF_REF)

Using the latest cached version of the module from /Users/lucien/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--bleu/9e0985c1200e367cce45605ce0ecb5ede079894e0f24f54613fca08eeb8aff76 (last modified on Thu Jul 18 16:29:52 2024) since it couldn't be found locally at evaluate-metric--bleu, or remotely on the Hugging Face Hub.


                                     BLEU-1                                     
------------------------------------------------------------------------------------------
bleu                : 0.5657817011366056
precisions          : [0.7839433293978748]
brevity_penalty     : 0.7217125012992546
length_ratio        : 0.7540747842761265
translation_length  : 11011
reference_length    : 14602
******************************************************************************************
                                     BLEU-2                                     
------------------------------------------------------------------------------------------
bleu                : 0.49445966953959664
precisions          : [0.7839433293978748, 0.5987542696403456]
brevity_penalty     : 0.7217125012992546
length_ratio        : 0.7540747842761265
translation_length  : 11011
reference_length    : 14602
******************************************************************************************
         

## Evaluate on the SUN validation set only

In [12]:
# Read the SUN-only parallel validation files, one sentence per line.
# The encoding is given explicitly so that non-ASCII characters are decoded
# the same way on every platform instead of relying on the locale default.
with open(f"{config.VAL_DATA}/sun_english.txt", encoding="utf-8") as data:
    sun_english_val = data.read().strip().split("\n")
with open(f"{config.VAL_DATA}/sun_afrikaans.txt", encoding="utf-8") as data:
    sun_afrikaans_val = data.read().strip().split("\n")

# The two files are scored line against line, so they must be the same length.
assert len(sun_english_val) == len(sun_afrikaans_val), (
    "SUN validation files are not parallel: "
    f"{len(sun_english_val)} English vs {len(sun_afrikaans_val)} Afrikaans lines"
)

### Greedy Search

In [13]:
# One single-element reference list per SUN validation sentence.
SUN_VAL_AF = [[reference] for reference in sun_afrikaans_val]
# Translate with the translator's default decoding method.
SUN_VAL_TRANSLATED = list(map(translator.translate_sentence, sun_english_val))
corpus_bleu(SUN_VAL_TRANSLATED, SUN_VAL_AF)

Using the latest cached version of the module from /Users/lucien/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--bleu/9e0985c1200e367cce45605ce0ecb5ede079894e0f24f54613fca08eeb8aff76 (last modified on Thu Jul 18 16:29:52 2024) since it couldn't be found locally at evaluate-metric--bleu, or remotely on the Hugging Face Hub.


                                     BLEU-1                                     
------------------------------------------------------------------------------------------
bleu                : 0.43861302634262944
precisions          : [0.4668523676880223]
brevity_penalty     : 0.9395111960441764
length_ratio        : 0.9412690089145255
translation_length  : 3590
reference_length    : 3814
******************************************************************************************
                                     BLEU-2                                     
------------------------------------------------------------------------------------------
bleu                : 0.34300335645628593
precisions          : [0.4668523676880223, 0.28550469483568075]
brevity_penalty     : 0.9395111960441764
length_ratio        : 0.9412690089145255
translation_length  : 3590
reference_length    : 3814
******************************************************************************************
           

### Beam Search

In [14]:
# Same SUN sentences, decoded with beam search (width 3).
# NOTE: this replaces the greedy translations stored in SUN_VAL_TRANSLATED.
SUN_VAL_TRANSLATED = [
    translator.translate_sentence(source_sentence, method="beam", beam_width=3)
    for source_sentence in sun_english_val
]
corpus_bleu(SUN_VAL_TRANSLATED, SUN_VAL_AF)

Using the latest cached version of the module from /Users/lucien/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--bleu/9e0985c1200e367cce45605ce0ecb5ede079894e0f24f54613fca08eeb8aff76 (last modified on Thu Jul 18 16:29:52 2024) since it couldn't be found locally at evaluate-metric--bleu, or remotely on the Hugging Face Hub.


                                     BLEU-1                                     
------------------------------------------------------------------------------------------
bleu                : 0.36773497232921387
precisions          : [0.6266077170418006]
brevity_penalty     : 0.5868663317223118
length_ratio        : 0.6523335081279497
translation_length  : 2488
reference_length    : 3814
******************************************************************************************
                                     BLEU-2                                     
------------------------------------------------------------------------------------------
bleu                : 0.29989491839957594
precisions          : [0.6266077170418006, 0.41673894189071986]
brevity_penalty     : 0.5868663317223118
length_ratio        : 0.6523335081279497
translation_length  : 2488
reference_length    : 3814
******************************************************************************************
           

In [15]:
# Qualitative check: beam search (width 5) on SUN validation sentences 10-19,
# each printed with its own sentence-level BLEU.
metric = evaluate.load("bleu")
predictions = [
    translator.translate_sentence(sent, method="beam", beam_width=5)
    for sent in sun_english_val[10:20]
]
labels = SUN_VAL_AF[10:20]
for source, pred, lab in zip(sun_english_val[10:20], predictions, labels):
    # Prediction and label are cut to 150 characters for display only;
    # BLEU is computed on the full strings.
    print(
        f"Source    : {source}",
        f"Prediction: {pred[:150]}",
        f"Label     : {lab[0][:150]}",
        f"BLEU      : {metric.compute(predictions=[pred], references=lab)['bleu']}",
        sep="\n",
        end="\n\n",
    )

Using the latest cached version of the module from /Users/lucien/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--bleu/9e0985c1200e367cce45605ce0ecb5ede079894e0f24f54613fca08eeb8aff76 (last modified on Thu Jul 18 16:29:52 2024) since it couldn't be found locally at evaluate-metric--bleu, or remotely on the Hugging Face Hub.


Source    : <sos> component <eos>
Prediction: <sos> toon <eos>
Label     : <sos> komponent <eos>
BLEU      : 0.0

Source    : <sos> architecture <eos>
Prediction: <sos> toon <eos>
Label     : <sos> argitektuur <eos>
BLEU      : 0.0

Source    : <sos> specification <eos>
Prediction: <sos> toon <eos>
Label     : <sos> spesifikasies <eos>
BLEU      : 0.0

Source    : <sos> at which stage of the design process would we choose the communication protocol between subsystems <eos>
Prediction: <sos> op watter met vermenigvuldiging <eos>
Label     : <sos> by watter stap van die ontwerpsproses word die kommunikasie-kanaal tussen substelsels gekies <eos>
BLEU      : 0.0

Source    : <sos> motivate your answer <eos>
Prediction: <sos> motiveer jou antwoord <eos>
Label     : <sos> motiveer jou antwoord <eos>
BLEU      : 1.0

Source    : <sos> describe the meaning if a system is described as a cyber-physical system <eos>
Prediction: <sos> beskryf word gegee <eos>
Label     : <sos> wat is die betekenis