# NEURAL MACHINE TRANSLATION - Vanilla RNN with Attention

## Required modules & config files

In [1]:
import src.RNN_GRUAttention as rnnANMT
from src.Tokenizer import Corpus, LangData, dataLoader
from src.utils import load_config, get_device, train_model, sentence_bleu, corpus_bleu
from src.TranslatorAtt import TranslatorAtt
from torch.nn import CrossEntropyLoss
from torch.optim import NAdam
import evaluate
import numpy as np
from torchinfo import summary

# Project settings (e.g. the TRAIN_DATA / VAL_DATA locations used further down)
# come from the shared config loader.
config = load_config()
# Pick the compute device; the helper chooses between GPU, the MPS back-end and CPU.
device = get_device()
print(f"Using device: {device}")

Using the latest cached version of the module from /Users/lucien/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--bleu/9e0985c1200e367cce45605ce0ecb5ede079894e0f24f54613fca08eeb8aff76 (last modified on Thu Jul 18 16:29:52 2024) since it couldn't be found locally at evaluate-metric--bleu, or remotely on the Hugging Face Hub.


Using device: mps


## Load the dataset

In [2]:
# Source side (fed to the encoder): English training corpus.
english_data = Corpus(
    f"{config.TRAIN_DATA}/english.txt",
    "English",
)
# Target side (produced by the decoder): Afrikaans training corpus.
afrikaans_data = Corpus(
    f"{config.TRAIN_DATA}/afrikaans.txt",
    "Afrikaans",
)

## Set Hyperparameters

In [3]:
# Vocabulary sizes are read from the loaded corpora.
IN_ENCODER = english_data.vocab_size      # source (English) vocabulary
IN_DECODER = afrikaans_data.vocab_size    # target (Afrikaans) vocabulary
OUT_DECODER = afrikaans_data.vocab_size   # target vocabulary again, handed to the model wrapper

# Embedding widths for the encoder and the decoder.
ENCODER_EMB = 256
DECODER_EMB = 256

# Recurrent dimensions used by both encoder and decoder.
HIDDEN_SIZE = 1024
NUM_LAYERS = 2

# Optimisation settings.
LR = 1e-3
BATCH_SIZE = 128

## Set the model

In [4]:
# Encoder and decoder are built with the same hidden size and layer count and
# placed on the selected device.
encoder_net = rnnANMT.Encoder(IN_ENCODER, ENCODER_EMB, HIDDEN_SIZE, NUM_LAYERS).to(device)
decoder_net = rnnANMT.Decoder(IN_DECODER, DECODER_EMB, HIDDEN_SIZE, NUM_LAYERS).to(device)
# Move the combined model too: the sub-networks are already on `device`, but any
# parameter or buffer registered directly on the wrapper would otherwise stay on
# the CPU. For modules that are already there, `.to(device)` is a no-op.
model = rnnANMT.RNNAtt(encoder_net, decoder_net, OUT_DECODER).to(device)

# Last expression of the cell: display the per-layer parameter counts.
summary(model)

Layer (type:depth-idx)                   Param #
RNNAtt                                   --
├─Encoder: 1-1                           --
│    └─RNN: 2-1                          3,411,968
│    └─Embedding: 2-2                    756,480
├─Decoder: 1-2                           --
│    └─RNN: 2-3                          3,411,968
│    └─Embedding: 2-4                    743,424
│    └─Linear: 2-5                       5,950,296
Total params: 14,274,136
Trainable params: 14,274,136
Non-trainable params: 0

In [5]:
# Pair the source and target corpora and wrap them in a loader with BATCH_SIZE.
train_data = LangData(english_data, afrikaans_data)
train_loader = dataLoader(train_data, BATCH_SIZE)

# Padding positions must not contribute to the loss. The index is looked up in
# the target vocabulary instead of being hard-coded, so the loss stays correct
# even if '<pad>' is not mapped to 0.
pad_idx = afrikaans_data.stoi['<pad>']
criterion = CrossEntropyLoss(ignore_index=pad_idx)

optimizer = NAdam(model.parameters(), lr=LR)
# Translation helper used for the follow-up sentence and all evaluations below.
translator = TranslatorAtt(model, english_data, afrikaans_data, device)

In [6]:
# Fixed sentence pair used to follow translation quality during training.
# Raw strings keep the LaTeX backslashes (\in, \{, \}) literal. In a normal
# string literal these are invalid escape sequences, which Python only passes
# through with a DeprecationWarning/SyntaxWarning. The resulting text is
# unchanged.
mytext = r"<sos> given that we represent the target output as $y\in\{0,1\}$ and we have $n$ training points , we can write the negative log likelihood of the parameters as follows: <eos>"
ground = r"<sos> as ons die teikenuittree voorstel as $y\in\{0,1\}$ en ons $n$ afrigpunte het , dan kan ons die negatiewe log-waarskynlikheidskostefunksie skryf as: <eos>"

# Baseline: translate the follow-up sentence before any training has happened.
predicted = translator.translate_sentence(mytext)
bleu = sentence_bleu(prediction=[predicted], reference=[ground])
print(f"Pred: {predicted}")
print(f"Refe: {ground}")
print(f"BLEU SCORES: {bleu}")

Pred: <sos> rok nêrens beset kat $\mathbf{x}^{(1)},\mathbf{x}^{(2)},\ldots\mathbf{x}^{(n)}$ so hemp verandering euklidiese huur nie-periodiese fft horlosie line\^{e}r nie-periodiese $[0,1]^\top$ foto mnr. gebak definisie bereik baas." kok boekwurm evalueer radiale/sekonde : afgebreek mans ."
Refe: <sos> as ons die teikenuittree voorstel as $y\in\{0,1\}$ en ons $n$ afrigpunte het , dan kan ons die negatiewe log-waarskynlikheidskostefunksie skryf as: <eos>
BLEU SCORES: [0.178, 0.077, 0.041, 0.0]


## Train the model

In [7]:
EPOCHS = 20

# Keyword arguments for train_model, gathered in one place. The fixed
# source/reference sentence pair is handed over together with the translator.
params = dict(
    model=model,
    train_loader=train_loader,
    optimizer=optimizer,
    criterion=criterion,
    device=device,
    epochs=EPOCHS,
    source_test=mytext,
    reference=ground,
    translator=translator,
)

train_loss = train_model(**params)
# Persist the returned loss values (path is relative to the working directory).
np.save('rnn_att_train_loss.npy', np.array(train_loss))

Using the latest cached version of the module from /Users/lucien/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--bleu/9e0985c1200e367cce45605ce0ecb5ede079894e0f24f54613fca08eeb8aff76 (last modified on Thu Jul 18 16:29:52 2024) since it couldn't be found locally at evaluate-metric--bleu, or remotely on the Hugging Face Hub.
Epoch 1/20: 100%|██████████| 20/20 [00:07<00:00,  2.58batch/s, loss=1.843]


Predicted: <sos> ek wil nie <eos>
BLEU Score: [0.024, 0.021, 0.016, 0.0]


Epoch 2/20: 100%|██████████| 20/20 [00:07<00:00,  2.71batch/s, loss=1.612]


Predicted: <sos> die diskrete 'n bietjie met 'n bietjie <eos>
BLEU Score: [0.073, 0.057, 0.043, 0.0]


Epoch 3/20: 100%|██████████| 20/20 [00:07<00:00,  2.75batch/s, loss=1.438]


Predicted: <sos> die 'n $n$ -punt van die model , en , en , en , en , en , en , en , en , en , en , en ,
BLEU Score: [0.255, 0.164, 0.113, 0.0]


Epoch 4/20: 100%|██████████| 20/20 [00:07<00:00,  2.73batch/s, loss=1.300]


Predicted: <sos> van die volgende oordragsfunksie met 'n afnyfrekwensie 3 afgemonster (downsample) , met 'n faktor 3 afgemonster (downsample) , met 'n faktor 3 afgemonster (downsample) , met 'n faktor 3 afgemonster
BLEU Score: [0.128, 0.082, 0.057, 0.0]


Epoch 5/20: 100%|██████████| 20/20 [00:06<00:00,  2.89batch/s, loss=1.135]


Predicted: <sos> die diskrete sein word , deur 'n laagdeurlaatfilter (lpf) met 'n afnyfrekwensie van 2 khz , en laastens diskrete uittreeveranderlike <eos>
BLEU Score: [0.217, 0.147, 0.104, 0.0]


Epoch 6/20: 100%|██████████| 20/20 [00:07<00:00,  2.83batch/s, loss=1.028]


Predicted: <sos> in die volgende syfersein $x[n]$ : <eos>
BLEU Score: [0.194, 0.129, 0.097, 0.065]


Epoch 7/20: 100%|██████████| 20/20 [00:07<00:00,  2.74batch/s, loss=0.908]


Predicted: <sos> geld dit ons geheim doen , met $x[n]$ as intree tyd-domein sein en $x[k]$ die uitree frekwensie-ruimte sein $x[n]$ , veral intree : , en toon die vector verandering ,
BLEU Score: [0.312, 0.115, 0.066, 0.0]


Epoch 8/20: 100%|██████████| 20/20 [00:07<00:00,  2.82batch/s, loss=0.770]


Predicted: <sos> "dankie. ""jy die dft $y[k]$ van : <eos>
BLEU Score: [0.231, 0.175, 0.128, 0.085]


Epoch 9/20: 100%|██████████| 20/20 [00:07<00:00,  2.84batch/s, loss=0.611]


Predicted: <sos> beskryf word om die aantal wisselpunt vermenigvuldigings per uittree monster te minimeer <eos>
BLEU Score: [0.121, 0.094, 0.07, 0.0]


Epoch 10/20: 100%|██████████| 20/20 [00:06<00:00,  2.97batch/s, loss=0.482]


Predicted: <sos> as kan ons die data hier bo op , het ons aangeteken <eos>
BLEU Score: [0.225, 0.17, 0.131, 0.083]


Epoch 11/20: 100%|██████████| 20/20 [00:07<00:00,  2.69batch/s, loss=0.418]


Predicted: <sos> gegee word deur die volgende blokdiagram : <eos>
BLEU Score: [0.083, 0.069, 0.056, 0.039]


Epoch 12/20: 100%|██████████| 20/20 [00:07<00:00,  2.71batch/s, loss=0.356]


Predicted: <sos> gegee word deur die volgende blokdiagram : <eos>
BLEU Score: [0.083, 0.069, 0.056, 0.039]


Epoch 13/20: 100%|██████████| 20/20 [00:06<00:00,  2.86batch/s, loss=0.318]


Predicted: <sos> gegee word deur die volgende voorbeeld beskryf word : <eos>
BLEU Score: [0.108, 0.088, 0.071, 0.05]


Epoch 14/20: 100%|██████████| 20/20 [00:07<00:00,  2.77batch/s, loss=0.301]


Predicted: <sos> gegee die greep van die oorsprong sinusgolf deur 'n analoog-na-syfer omsetter (adc) teen 'n monsterfrekwensie van $f_s=5$ khz <eos>
BLEU Score: [0.251, 0.161, 0.112, 0.0]


Epoch 15/20: 100%|██████████| 20/20 [00:07<00:00,  2.85batch/s, loss=0.284]


Predicted: <sos> as ons die teikenuittree voorstel as $y\in\{0,1\}$ en ons $n$ afrigpunte het , dan kan ons die negatiewe log-waarskynlikheidskostefunksie skryf as : <eos>
BLEU Score: [1.0, 1.0, 1.0, 1.0]


Epoch 16/20: 100%|██████████| 20/20 [00:07<00:00,  2.72batch/s, loss=0.289]


Predicted: <sos> as ons die teikenuittree voorstel as $y\in\{0,1\}$ en ons $n$ afrigpunte het , dan kan ons die negatiewe log-waarskynlikheidskostefunksie skryf as : <eos>
BLEU Score: [1.0, 1.0, 1.0, 1.0]


Epoch 17/20: 100%|██████████| 20/20 [00:07<00:00,  2.84batch/s, loss=0.272]


Predicted: <sos> gegee word deur die volgende voorbeeld beskryf word : <eos>
BLEU Score: [0.108, 0.088, 0.071, 0.05]


Epoch 18/20: 100%|██████████| 20/20 [00:07<00:00,  2.76batch/s, loss=0.277]


Predicted: <sos> as ons die teikenuittree voorstel as $y\in\{0,1\}$ en ons $n$ afrigpunte het , dan kan ons die negatiewe log-waarskynlikheidskostefunksie skryf as : <eos>
BLEU Score: [1.0, 1.0, 1.0, 1.0]


Epoch 19/20: 100%|██████████| 20/20 [00:06<00:00,  2.88batch/s, loss=0.267]


Predicted: <sos> as ons die teikenuittree voorstel as $y\in\{0,1\}$ en ons $n$ afrigpunte het , dan kan ons die negatiewe log-waarskynlikheidskostefunksie skryf as : <eos>
BLEU Score: [1.0, 1.0, 1.0, 1.0]


Epoch 20/20: 100%|██████████| 20/20 [00:06<00:00,  2.93batch/s, loss=0.262]


Predicted: <sos> as ons die teikenuittree voorstel as $y\in\{0,1\}$ en ons $n$ afrigpunte het , dan kan ons die negatiewe log-waarskynlikheidskostefunksie skryf as : <eos>
BLEU Score: [1.0, 1.0, 1.0, 1.0]


## Evaluate on the training set

In [8]:
# Join each entry of the training corpora back into a single space-separated string.
EN_SRC = [" ".join(tokens) for tokens in english_data.data_str]
# Each sentence gets its own single-element reference list.
AF_REF = [[" ".join(tokens)] for tokens in afrikaans_data.data_str]
# Translate every training sentence with the translator's default settings.
TRANSLATED = list(map(translator.translate_sentence, EN_SRC))
corpus_bleu(TRANSLATED, AF_REF)

Using the latest cached version of the module from /Users/lucien/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--bleu/9e0985c1200e367cce45605ce0ecb5ede079894e0f24f54613fca08eeb8aff76 (last modified on Thu Jul 18 16:29:52 2024) since it couldn't be found locally at evaluate-metric--bleu, or remotely on the Hugging Face Hub.


                                     BLEU-1                                     
------------------------------------------------------------------------------------------
bleu                : 0.9512561735022547
precisions          : [0.9512561735022547]
brevity_penalty     : 1.0
length_ratio        : 1.0105240316805901
translation_length  : 37256
reference_length    : 36868
******************************************************************************************
                                     BLEU-2                                     
------------------------------------------------------------------------------------------
bleu                : 0.9381540053945262
precisions          : [0.9512561735022547, 0.9252323005667271]
brevity_penalty     : 1.0
length_ratio        : 1.0105240316805901
translation_length  : 37256
reference_length    : 36868
******************************************************************************************
                                     BLE

## Evaluate on the validation set

In [9]:
# Read the validation sentences, one per line. The encoding is pinned to UTF-8
# because the Afrikaans text contains non-ASCII characters (e.g. "ê"); without
# it, decoding depends on the platform's default locale.
with open(f"{config.VAL_DATA}/english.txt", encoding="utf-8") as data:
    english_val = data.read().strip().split("\n")
with open(f"{config.VAL_DATA}/afrikaans.txt", encoding="utf-8") as data:
    afrikaans_val = data.read().strip().split("\n")

### Greedy Search

In [10]:
# One single-element reference list per validation sentence.
VAL_AF_REF = [[reference] for reference in afrikaans_val]
# Translate with the translator's default settings (no method argument given).
VAL_TRANSLATED = list(map(translator.translate_sentence, english_val))

corpus_bleu(VAL_TRANSLATED, VAL_AF_REF)

Using the latest cached version of the module from /Users/lucien/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--bleu/9e0985c1200e367cce45605ce0ecb5ede079894e0f24f54613fca08eeb8aff76 (last modified on Thu Jul 18 16:29:52 2024) since it couldn't be found locally at evaluate-metric--bleu, or remotely on the Hugging Face Hub.


                                     BLEU-1                                     
------------------------------------------------------------------------------------------
bleu                : 0.6207384536215104
precisions          : [0.6207384536215104]
brevity_penalty     : 1.0
length_ratio        : 1.061015561015561
translation_length  : 15546
reference_length    : 14652
******************************************************************************************
                                     BLEU-2                                     
------------------------------------------------------------------------------------------
bleu                : 0.534000637400689
precisions          : [0.6207384536215104, 0.4593829801918697]
brevity_penalty     : 1.0
length_ratio        : 1.061015561015561
translation_length  : 15546
reference_length    : 14652
******************************************************************************************
                                     BLEU-3

### Beam Search

In [11]:
# Same validation sentences, decoded with beam search (width 3). This rebinds
# VAL_TRANSLATED, replacing the previous translations.
VAL_TRANSLATED = [
    translator.translate_sentence(source, method="beam", beam_width=3)
    for source in english_val
]
corpus_bleu(VAL_TRANSLATED, VAL_AF_REF)

Using the latest cached version of the module from /Users/lucien/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--bleu/9e0985c1200e367cce45605ce0ecb5ede079894e0f24f54613fca08eeb8aff76 (last modified on Thu Jul 18 16:29:52 2024) since it couldn't be found locally at evaluate-metric--bleu, or remotely on the Hugging Face Hub.


                                     BLEU-1                                     
------------------------------------------------------------------------------------------
bleu                : 0.5834222896383235
precisions          : [0.7422305021593699]
brevity_penalty     : 0.7860392262794026
length_ratio        : 0.805965055965056
translation_length  : 11809
reference_length    : 14652
******************************************************************************************
                                     BLEU-2                                     
------------------------------------------------------------------------------------------
bleu                : 0.5050327973225722
precisions          : [0.7422305021593699, 0.5561755952380952]
brevity_penalty     : 0.7860392262794026
length_ratio        : 0.805965055965056
translation_length  : 11809
reference_length    : 14652
******************************************************************************************
            

## Evaluate on the SUN validation set only

In [12]:
# Read the SUN validation sentences, one per line. The encoding is pinned to
# UTF-8 so that non-ASCII characters in the text are decoded the same way on
# every platform instead of depending on the default locale.
with open(f"{config.VAL_DATA}/sun_english.txt", encoding="utf-8") as data:
    sun_english_val = data.read().strip().split("\n")
with open(f"{config.VAL_DATA}/sun_afrikaans.txt", encoding="utf-8") as data:
    sun_afrikaans_val = data.read().strip().split("\n")

### Greedy Search

In [13]:
# One single-element reference list per SUN validation sentence.
SUN_VAL_AF = [[reference] for reference in sun_afrikaans_val]
# Translate with the translator's default settings (no method argument given).
SUN_VAL_TRANSLATED = list(map(translator.translate_sentence, sun_english_val))
corpus_bleu(SUN_VAL_TRANSLATED, SUN_VAL_AF)

Using the latest cached version of the module from /Users/lucien/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--bleu/9e0985c1200e367cce45605ce0ecb5ede079894e0f24f54613fca08eeb8aff76 (last modified on Thu Jul 18 16:29:52 2024) since it couldn't be found locally at evaluate-metric--bleu, or remotely on the Hugging Face Hub.


                                     BLEU-1                                     
------------------------------------------------------------------------------------------
bleu                : 0.3911764705882353
precisions          : [0.3911764705882353]
brevity_penalty     : 1.0
length_ratio        : 1.069743051914001
translation_length  : 4080
reference_length    : 3814
******************************************************************************************
                                     BLEU-2                                     
------------------------------------------------------------------------------------------
bleu                : 0.30186194695000707
precisions          : [0.3911764705882353, 0.23293996921498203]
brevity_penalty     : 1.0
length_ratio        : 1.069743051914001
translation_length  : 4080
reference_length    : 3814
******************************************************************************************
                                     BLEU-3 

### Beam Search

In [14]:
# Same SUN sentences, decoded with beam search (width 3). This rebinds
# SUN_VAL_TRANSLATED, replacing the previous translations.
SUN_VAL_TRANSLATED = [
    translator.translate_sentence(source, method="beam", beam_width=3)
    for source in sun_english_val
]
corpus_bleu(SUN_VAL_TRANSLATED, SUN_VAL_AF)

Using the latest cached version of the module from /Users/lucien/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--bleu/9e0985c1200e367cce45605ce0ecb5ede079894e0f24f54613fca08eeb8aff76 (last modified on Thu Jul 18 16:29:52 2024) since it couldn't be found locally at evaluate-metric--bleu, or remotely on the Hugging Face Hub.


                                     BLEU-1                                     
------------------------------------------------------------------------------------------
bleu                : 0.37862798566651024
precisions          : [0.576536312849162]
brevity_penalty     : 0.65672877358823
length_ratio        : 0.7039853172522287
translation_length  : 2685
reference_length    : 3814
******************************************************************************************
                                     BLEU-2                                     
------------------------------------------------------------------------------------------
bleu                : 0.305910514477234
precisions          : [0.576536312849162, 0.37634838194167]
brevity_penalty     : 0.65672877358823
length_ratio        : 0.7039853172522287
translation_length  : 2685
reference_length    : 3814
******************************************************************************************
                      

In [15]:
# Qualitative check: print a handful of SUN validation sentences with their
# beam-search (width 5) translation, the reference and a per-sentence BLEU.
metric = evaluate.load("bleu")

# The window and the display truncation are defined once so that sources,
# predictions and labels always cover the same sentences.
SAMPLE_WINDOW = slice(10, 20)
MAX_DISPLAY_CHARS = 150

predictions = [translator.translate_sentence(sent, method="beam", beam_width=5) for sent in sun_english_val[SAMPLE_WINDOW]]
labels = SUN_VAL_AF[SAMPLE_WINDOW]
for source, pred, lab in zip(sun_english_val[SAMPLE_WINDOW], predictions, labels):
    print(f"Source    : {source}")
    print(f"Prediction: {pred[:MAX_DISPLAY_CHARS]}")
    # `lab` is the single-element reference list; show its only entry.
    print(f"Label     : {lab[0][:MAX_DISPLAY_CHARS]}")
    print(f"BLEU      : {metric.compute(predictions=[pred], references=lab)['bleu']}")
    print()

Using the latest cached version of the module from /Users/lucien/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--bleu/9e0985c1200e367cce45605ce0ecb5ede079894e0f24f54613fca08eeb8aff76 (last modified on Thu Jul 18 16:29:52 2024) since it couldn't be found locally at evaluate-metric--bleu, or remotely on the Hugging Face Hub.


Source    : <sos> component <eos>
Prediction: <sos> dankie <eos>
Label     : <sos> komponent <eos>
BLEU      : 0.0

Source    : <sos> architecture <eos>
Prediction: <sos> dankie <eos>
Label     : <sos> argitektuur <eos>
BLEU      : 0.0

Source    : <sos> specification <eos>
Prediction: <sos> dankie <eos>
Label     : <sos> spesifikasies <eos>
BLEU      : 0.0

Source    : <sos> at which stage of the design process would we choose the communication protocol between subsystems <eos>
Prediction: <sos> op watter wat is <eos>
Label     : <sos> by watter stap van die ontwerpsproses word die kommunikasie-kanaal tussen substelsels gekies <eos>
BLEU      : 0.0

Source    : <sos> motivate your answer <eos>
Prediction: <sos> motiveer u <eos>
Label     : <sos> motiveer jou antwoord <eos>
BLEU      : 0.4412484512922977

Source    : <sos> describe the meaning if a system is described as a cyber-physical system <eos>
Prediction: <sos> beskryf die duiwe as beskryf word gegee word as beskryf word gewoonl