# NEURAL MACHINE TRANSLATION - GRU-based RNN with Attention

## Required Module & Config files

In [1]:
import src.RNN_GRUAttention as gruANMT
from src.Tokenizer import Corpus, LangData, dataLoader
from src.utils import load_config, get_device, train_model, sentence_bleu, corpus_bleu
from src.TranslatorAtt import TranslatorAtt
from torch.nn import CrossEntropyLoss
from torch.optim import NAdam
import evaluate
import numpy as np
from torchinfo import summary

# Load the project configuration; later cells read the data directories
# (config.TRAIN_DATA, config.VAL_DATA) from it.
config = load_config()
# Select the compute device: GPU / MPS back-end / CPU.
device = get_device()
print(f"Using device: {device}")

Using the latest cached version of the module from /Users/lucien/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--bleu/9e0985c1200e367cce45605ce0ecb5ede079894e0f24f54613fca08eeb8aff76 (last modified on Thu Jul 18 16:29:52 2024) since it couldn't be found locally at evaluate-metric--bleu, or remotely on the Hugging Face Hub.


Using device: mps


## Load the dataset

In [2]:
# Training corpora: English is the source (encoder) side and Afrikaans is the
# target (decoder) side.
english_data = Corpus(f"{config.TRAIN_DATA}/english.txt", "English")
afrikaans_data = Corpus(f"{config.TRAIN_DATA}/afrikaans.txt", "Afrikaans")

## Set Hyperparameters

In [3]:
# --- Vocabulary-dependent sizes ---------------------------------------------
# Source side (English) feeds the encoder.
IN_ENCODER = english_data.vocab_size
# Target side (Afrikaans): the decoder's input and output sizes are both the
# target vocabulary size.
IN_DECODER = OUT_DECODER = afrikaans_data.vocab_size

# --- Embedding widths -------------------------------------------------------
ENCODER_EMB = 256
DECODER_EMB = 256

# --- Recurrent stack settings shared by encoder and decoder -----------------
HIDDEN_SIZE = 1024
NUM_LAYERS = 2

# --- Optimisation -----------------------------------------------------------
LR = 0.001
BATCH_SIZE = 128

## Set the model

In [4]:
# Encoder and decoder are both built as GRU networks with the same hidden size
# and layer count; each is moved to the selected device right after creation.
encoder_net = gruANMT.Encoder(
    IN_ENCODER, ENCODER_EMB, HIDDEN_SIZE, NUM_LAYERS, type='GRU'
).to(device)
decoder_net = gruANMT.Decoder(
    IN_DECODER, DECODER_EMB, HIDDEN_SIZE, NUM_LAYERS, type='GRU'
).to(device)

# Wrap both halves in the attention sequence-to-sequence model.
model = gruANMT.RNNAtt(encoder_net, decoder_net, OUT_DECODER)

# Last expression of the cell: display the per-layer parameter counts.
summary(model)

Layer (type:depth-idx)                   Param #
RNNAtt                                   --
├─Encoder: 1-1                           --
│    └─GRU: 2-1                          10,235,904
│    └─Embedding: 2-2                    744,448
├─Decoder: 1-2                           --
│    └─GRU: 2-3                          10,235,904
│    └─Embedding: 2-4                    737,280
│    └─Linear: 2-5                       5,901,120
Total params: 27,854,656
Trainable params: 27,854,656
Non-trainable params: 0

In [5]:
# Build the training dataset from the two corpora and wrap it in a loader that
# uses BATCH_SIZE.
train_data = LangData(english_data, afrikaans_data)
train_loader = dataLoader(train_data, BATCH_SIZE)

# Padding positions in the target must not contribute to the loss. Take the
# index from the target vocabulary instead of hard-coding 0, so the loss stays
# correct if '<pad>' is ever mapped to a different index.
pad_idx = afrikaans_data.stoi['<pad>']
criterion = CrossEntropyLoss(ignore_index=pad_idx)

optimizer = NAdam(model.parameters(), lr=LR)
# Translation helper used for the qualitative checks and BLEU evaluation below.
translator = TranslatorAtt(model, english_data, afrikaans_data, device)

In [6]:
# Probe sentence pair used to follow translation quality during training.
# Raw string literals are required here: the LaTeX fragments contain
# backslashes (\in, \{, \}) that are not valid Python escape sequences and
# trigger a SyntaxWarning in regular strings on recent Python versions. The
# runtime value of both strings is unchanged.
mytext = r"<sos> given that we represent the target output as $y\in\{0,1\}$ and we have $n$ training points , we can write the negative log likelihood of the parameters as follows : <eos>"
ground = r"<sos> as ons die teikenuittree voorstel as $y\in\{0,1\}$ en ons $n$ afrigpunte het , dan kan ons die negatiewe log-waarskynlikheidskostefunksie skryf as : <eos>"

# Baseline before any training: translate the probe with the untrained model.
predicted = translator.translate_sentence(mytext)
bleu = sentence_bleu(prediction=[predicted], reference=[ground])
print(f"Pred: {predicted}")
print(f"Refe: {ground}")
print(f"BLEU SCORES: {bleu}")

Pred: <sos> $\mathbf{c}$ ho\"er-frekwensie onderwyseresse herfs herfs help faktor onlangs daaraan minute minute minute vlak vlak tjirp minimum pere geleerde definisies soet verdraagsaam beleefd beleefd gepraat gepraat fouttiewelik eendag skaak resulterende tom. tom.
Refe: <sos> as ons die teikenuittree voorstel as $y\in\{0,1\}$ en ons $n$ afrigpunte het , dan kan ons die negatiewe log-waarskynlikheidskostefunksie skryf as : <eos>
BLEU SCORES: [0.2, 0.117, 0.068, 0.0]


## Train the model

In [7]:
EPOCHS = 13

# Keyword arguments for train_model, gathered in one mapping. The probe
# sentence, its reference and the translator are passed along as well; the
# training log below prints a prediction and BLEU score after every epoch.
params = dict(
    model=model,
    train_loader=train_loader,
    optimizer=optimizer,
    criterion=criterion,
    device=device,
    epochs=EPOCHS,
    source_test=mytext,
    reference=ground,
    translator=translator,
)

train_loss = train_model(**params)
# Persist the returned losses to disk as a NumPy array.
np.save('gru_att_train_loss.npy', np.array(train_loss))

Using the latest cached version of the module from /Users/lucien/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--bleu/9e0985c1200e367cce45605ce0ecb5ede079894e0f24f54613fca08eeb8aff76 (last modified on Thu Jul 18 16:29:52 2024) since it couldn't be found locally at evaluate-metric--bleu, or remotely on the Hugging Face Hub.
Epoch 1/13: 100%|██████████| 20/20 [00:11<00:00,  1.73batch/s, loss=1.753]


Predicted: <sos> die die van die van die van van die van van die van van die van van die van van die van van die van van die van van die van
BLEU Score: [0.127, 0.081, 0.056, 0.0]


Epoch 2/13: 100%|██████████| 20/20 [00:11<00:00,  1.78batch/s, loss=1.442]


Predicted: <sos> die volgende $x(t)$ $x(t)$ $x(t)$ : : : : word die volgende volgende : : : : : : word die volgende volgende : : : : : : : word
BLEU Score: [0.204, 0.092, 0.057, 0.0]


Epoch 3/13: 100%|██████████| 20/20 [00:10<00:00,  1.93batch/s, loss=1.139]


Predicted: <sos> ons het die van die datastel , en ons het , en die gemiddeld , = = (1 : , = geslag (1 , = geslag (1 vroulik vir vroulik om
BLEU Score: [0.282, 0.172, 0.093, 0.0]


Epoch 4/13: 100%|██████████| 20/20 [00:11<00:00,  1.81batch/s, loss=0.976]


Predicted: <sos> ons het die vrae en die vrae as die resultaat as $x[k]$ as $x[k]$ as $x[k]$ : <eos>
BLEU Score: [0.487, 0.277, 0.184, 0.115]


Epoch 5/13: 100%|██████████| 20/20 [00:10<00:00,  1.90batch/s, loss=0.732]


Predicted: <sos> ons het die rede van die oorspronklike voor voor ons voor voor die data van die oorspronklike voor voor 2 ons kan as basismodel as basismodel : <eos>
BLEU Score: [0.404, 0.229, 0.162, 0.104]


Epoch 6/13: 100%|██████████| 20/20 [00:10<00:00,  1.92batch/s, loss=0.566]


Predicted: <sos> as ons die teikenuittree wat ons kan as ons afrigpunte afrigpunte as $y\in\{0,1\}$ afrigpunte afrigpunte het , ons kan die negatiewe log-waarskynlikheidskostefunksie log-waarskynlikheidskostefunksie as as ons terwyl afrigpunte afrigpunte as ons
BLEU Score: [0.651, 0.571, 0.503, 0.442]


Epoch 7/13: 100%|██████████| 20/20 [00:10<00:00,  1.94batch/s, loss=0.435]


Predicted: <sos> neem aan dat ons die teikenuittree voorstel as $y\in\{0,1\}$ en verwys , die \% as $x[k]$ as $x[k]$ , en ons verwys na die $n$ -punt resultaat as $x[k]$ as $x[k]$
BLEU Score: [0.409, 0.355, 0.316, 0.284]


Epoch 8/13: 100%|██████████| 20/20 [00:10<00:00,  1.83batch/s, loss=0.372]


Predicted: <sos> "hoe ons het die week wat die beste skryf , dan ons die negatiewe log-waarskynlikheidskostefunksie , en ons weer het , dan ons die negatiewe log-waarskynlikheidskostefunksie skryf , en ons weer
BLEU Score: [0.382, 0.317, 0.238, 0.166]


Epoch 9/13: 100%|██████████| 20/20 [00:11<00:00,  1.78batch/s, loss=0.327]


Predicted: <sos> as ons die teikenuittree voorstel as $y\in\{0,1\}$ en ons $n$ afrigpunte het , dan kan ons die negatiewe log-waarskynlikheidskostefunksie skryf as : <eos>
BLEU Score: [1.0, 1.0, 1.0, 1.0]


Epoch 10/13: 100%|██████████| 20/20 [00:11<00:00,  1.82batch/s, loss=0.286]


Predicted: <sos> as ons die teikenuittree voorstel as $y\in\{0,1\}$ en ons $n$ afrigpunte het , dan kan ons die negatiewe log-waarskynlikheidskostefunksie skryf as : <eos>
BLEU Score: [1.0, 1.0, 1.0, 1.0]


Epoch 11/13: 100%|██████████| 20/20 [00:10<00:00,  1.82batch/s, loss=0.271]


Predicted: <sos> as ons die teikenuittree voorstel as $y\in\{0,1\}$ en ons $n$ afrigpunte het , dan kan ons die negatiewe log-waarskynlikheidskostefunksie skryf as : <eos>
BLEU Score: [1.0, 1.0, 1.0, 1.0]


Epoch 12/13: 100%|██████████| 20/20 [00:11<00:00,  1.78batch/s, loss=0.264]


Predicted: <sos> as ons die teikenuittree voorstel as $y\in\{0,1\}$ en ons $n$ afrigpunte het , dan kan ons die negatiewe log-waarskynlikheidskostefunksie skryf as : <eos>
BLEU Score: [1.0, 1.0, 1.0, 1.0]


Epoch 13/13: 100%|██████████| 20/20 [00:10<00:00,  1.84batch/s, loss=0.250]

Predicted: <sos> as ons die teikenuittree voorstel as $y\in\{0,1\}$ en ons $n$ afrigpunte het , dan kan ons die negatiewe log-waarskynlikheidskostefunksie skryf as : <eos>
BLEU Score: [1.0, 1.0, 1.0, 1.0]





## Evaluate on the Training set

In [8]:
# Join each stored training sentence back into a single space-separated string.
EN_SRC = list(map(' '.join, english_data.data_str))
# Every hypothesis is scored against a one-element list of references.
AF_REF = [[joined] for joined in map(' '.join, afrikaans_data.data_str)]
# Translate every training source sentence with the default decoding settings.
TRANSLATED = list(map(translator.translate_sentence, EN_SRC))
corpus_bleu(TRANSLATED, AF_REF)

Using the latest cached version of the module from /Users/lucien/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--bleu/9e0985c1200e367cce45605ce0ecb5ede079894e0f24f54613fca08eeb8aff76 (last modified on Thu Jul 18 16:29:52 2024) since it couldn't be found locally at evaluate-metric--bleu, or remotely on the Hugging Face Hub.


                                     BLEU-1                                     
------------------------------------------------------------------------------------------
bleu                : 0.8467564144659693
precisions          : [0.8467564144659693]
brevity_penalty     : 1.0
length_ratio        : 1.1549379706376293
translation_length  : 42638
reference_length    : 36918
******************************************************************************************
                                     BLEU-2                                     
------------------------------------------------------------------------------------------
bleu                : 0.8307576435609568
precisions          : [0.8467564144659693, 0.8150611563659916]
brevity_penalty     : 1.0
length_ratio        : 1.1549379706376293
translation_length  : 42638
reference_length    : 36918
******************************************************************************************
                                     BLE

## Evaluate on the Validation set

In [9]:
# Read the validation sentence pairs, one sentence per line. The encoding is
# stated explicitly so the text is decoded the same way regardless of the
# platform's default locale.
with open(f"{config.VAL_DATA}/english.txt", encoding="utf-8") as data:
    english_val = data.read().strip().split("\n")
with open(f"{config.VAL_DATA}/afrikaans.txt", encoding="utf-8") as data:
    afrikaans_val = data.read().strip().split("\n")

### Greedy Search

In [10]:
# One reference per hypothesis, each wrapped in its own list.
VAL_AF_REF = [[reference] for reference in afrikaans_val]

# translate_sentence is called with its default settings (this is the
# "Greedy Search" section of the notebook).
VAL_TRANSLATED = list(map(translator.translate_sentence, english_val))

corpus_bleu(VAL_TRANSLATED, VAL_AF_REF)

Using the latest cached version of the module from /Users/lucien/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--bleu/9e0985c1200e367cce45605ce0ecb5ede079894e0f24f54613fca08eeb8aff76 (last modified on Thu Jul 18 16:29:52 2024) since it couldn't be found locally at evaluate-metric--bleu, or remotely on the Hugging Face Hub.


                                     BLEU-1                                     
------------------------------------------------------------------------------------------
bleu                : 0.5614295501255995
precisions          : [0.5614295501255995]
brevity_penalty     : 1.0
length_ratio        : 1.1995617038761814
translation_length  : 17516
reference_length    : 14602
******************************************************************************************
                                     BLEU-2                                     
------------------------------------------------------------------------------------------
bleu                : 0.4756646857281576
precisions          : [0.5614295501255995, 0.4030013974117504]
brevity_penalty     : 1.0
length_ratio        : 1.1995617038761814
translation_length  : 17516
reference_length    : 14602
******************************************************************************************
                                     BLE

### Beam Search

In [11]:
# Same validation sentences, decoded with beam search (width 3).
# NOTE: this rebinds VAL_TRANSLATED, replacing the greedy-search translations.
VAL_TRANSLATED = [
    translator.translate_sentence(sentence, method="beam", beam_width=3)
    for sentence in english_val
]
corpus_bleu(VAL_TRANSLATED, VAL_AF_REF)

Using the latest cached version of the module from /Users/lucien/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--bleu/9e0985c1200e367cce45605ce0ecb5ede079894e0f24f54613fca08eeb8aff76 (last modified on Thu Jul 18 16:29:52 2024) since it couldn't be found locally at evaluate-metric--bleu, or remotely on the Hugging Face Hub.


                                     BLEU-1                                     
------------------------------------------------------------------------------------------
bleu                : 0.6286356991690613
precisions          : [0.716542750929368]
brevity_penalty     : 0.8773177850919713
length_ratio        : 0.8842624298041364
translation_length  : 12912
reference_length    : 14602
******************************************************************************************
                                     BLEU-2                                     
------------------------------------------------------------------------------------------
bleu                : 0.5382716536215796
precisions          : [0.716542750929368, 0.5253479544495994]
brevity_penalty     : 0.8773177850919713
length_ratio        : 0.8842624298041364
translation_length  : 12912
reference_length    : 14602
******************************************************************************************
            

## Evaluate on the SUN validation set only

In [12]:
# Read the SUN validation sentence pairs, one sentence per line. The encoding
# is stated explicitly so the text is decoded the same way regardless of the
# platform's default locale.
with open(f"{config.VAL_DATA}/sun_english.txt", encoding="utf-8") as data:
    sun_english_val = data.read().strip().split("\n")
with open(f"{config.VAL_DATA}/sun_afrikaans.txt", encoding="utf-8") as data:
    sun_afrikaans_val = data.read().strip().split("\n")

### Greedy Search

In [13]:
# One reference per hypothesis, each wrapped in its own list.
SUN_VAL_AF = [[reference] for reference in sun_afrikaans_val]
# translate_sentence is called with its default settings (this is the
# "Greedy Search" section of the notebook).
SUN_VAL_TRANSLATED = list(map(translator.translate_sentence, sun_english_val))
corpus_bleu(SUN_VAL_TRANSLATED, SUN_VAL_AF)

Using the latest cached version of the module from /Users/lucien/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--bleu/9e0985c1200e367cce45605ce0ecb5ede079894e0f24f54613fca08eeb8aff76 (last modified on Thu Jul 18 16:29:52 2024) since it couldn't be found locally at evaluate-metric--bleu, or remotely on the Hugging Face Hub.


                                     BLEU-1                                     
------------------------------------------------------------------------------------------
bleu                : 0.39692880409492787
precisions          : [0.39692880409492787]
brevity_penalty     : 1.0
length_ratio        : 1.1269008914525434
translation_length  : 4298
reference_length    : 3814
******************************************************************************************
                                     BLEU-2                                     
------------------------------------------------------------------------------------------
bleu                : 0.29345691944584296
precisions          : [0.39692880409492787, 0.21695821185617103]
brevity_penalty     : 1.0
length_ratio        : 1.1269008914525434
translation_length  : 4298
reference_length    : 3814
******************************************************************************************
                                     BL

### Beam Search

In [14]:
# Same SUN validation sentences, decoded with beam search (width 3).
# NOTE: this rebinds SUN_VAL_TRANSLATED, replacing the greedy-search
# translations.
SUN_VAL_TRANSLATED = [
    translator.translate_sentence(sentence, method="beam", beam_width=3)
    for sentence in sun_english_val
]
corpus_bleu(SUN_VAL_TRANSLATED, SUN_VAL_AF)

Using the latest cached version of the module from /Users/lucien/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--bleu/9e0985c1200e367cce45605ce0ecb5ede079894e0f24f54613fca08eeb8aff76 (last modified on Thu Jul 18 16:29:52 2024) since it couldn't be found locally at evaluate-metric--bleu, or remotely on the Hugging Face Hub.


                                     BLEU-1                                     
------------------------------------------------------------------------------------------
bleu                : 0.41757159102229285
precisions          : [0.5626276378488768]
brevity_penalty     : 0.7421810855556542
length_ratio        : 0.7703198741478763
translation_length  : 2938
reference_length    : 3814
******************************************************************************************
                                     BLEU-2                                     
------------------------------------------------------------------------------------------
bleu                : 0.3239079656829291
precisions          : [0.5626276378488768, 0.33853410740203194]
brevity_penalty     : 0.7421810855556542
length_ratio        : 0.7703198741478763
translation_length  : 2938
reference_length    : 3814
******************************************************************************************
            

In [15]:
metric = evaluate.load("bleu")

# Inspect a handful of SUN validation examples. The slice is defined once so
# that sources, predictions and labels always cover the same sentences.
SAMPLE = slice(10, 20)
sample_sources = sun_english_val[SAMPLE]
predictions = [
    translator.translate_sentence(sent, method="beam", beam_width=5)
    for sent in sample_sources
]
labels = SUN_VAL_AF[SAMPLE]

for source, pred, lab in zip(sample_sources, predictions, labels):
    print(f"Source    : {source}")
    # Prediction and label are cut to 150 characters for display only; BLEU
    # is computed on the full strings.
    print(f"Prediction: {pred[:150]}")
    print(f"Label     : {lab[0][:150]}")
    print(f"BLEU      : {metric.compute(predictions=[pred], references=lab)['bleu']}")
    print()

Using the latest cached version of the module from /Users/lucien/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--bleu/9e0985c1200e367cce45605ce0ecb5ede079894e0f24f54613fca08eeb8aff76 (last modified on Thu Jul 18 16:29:52 2024) since it couldn't be found locally at evaluate-metric--bleu, or remotely on the Hugging Face Hub.


Source    : <sos> component <eos>
Prediction: <sos> miskien <eos>
Label     : <sos> komponent <eos>
BLEU      : 0.0

Source    : <sos> architecture <eos>
Prediction: <sos> miskien <eos>
Label     : <sos> argitektuur <eos>
BLEU      : 0.0

Source    : <sos> specification <eos>
Prediction: <sos> miskien <eos>
Label     : <sos> spesifikasies <eos>
BLEU      : 0.0

Source    : <sos> at which stage of the design process would we choose the communication protocol between subsystems <eos>
Prediction: <sos> op watter waarde van die eenheidssirkel te maak van die interval te verwys <eos>
Label     : <sos> by watter stap van die ontwerpsproses word die kommunikasie-kanaal tussen substelsels gekies <eos>
BLEU      : 0.0

Source    : <sos> motivate your answer <eos>
Prediction: <sos> motiveer jou antwoord <eos>
Label     : <sos> motiveer jou antwoord <eos>
BLEU      : 1.0

Source    : <sos> describe the meaning if a system is described as a cyber-physical system <eos>
Prediction: <sos> die klank '