# NEURAL MACHINE TRANSLATION - LSTM with Attention

## Required Modules & Config File

In [1]:
import random

import evaluate
import numpy as np
import torch
from torch.nn import CrossEntropyLoss
from torch.optim import NAdam
from torchinfo import summary

import src.LSTMAttention as lstmANMT
from src.Tokenizer import Corpus, LangData, dataLoader
from src.TranslatorAtt import TranslatorAtt
from src.utils import load_config, get_device, train_model, sentence_bleu, corpus_bleu

# Seed every RNG the run may draw from (Python, NumPy, PyTorch) before any
# model or data loader is created, so that "Restart Kernel -> Run All" is
# repeatable as far as the compute back-end allows.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Load the project configuration (provides the data paths used below).
config = load_config()
# Select the compute device: GPU / MPS back-end / CPU.
device = get_device()
print(f"Using device: {device}")

Using the latest cached version of the module from /Users/lucien/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--bleu/9e0985c1200e367cce45605ce0ecb5ede079894e0f24f54613fca08eeb8aff76 (last modified on Thu Jul 18 16:29:52 2024) since it couldn't be found locally at evaluate-metric--bleu, or remotely on the Hugging Face Hub.


Using device: mps


## Load the dataset

In [2]:
# Parallel training corpora: one Corpus per language, both read from the
# training-data directory named in the config.
train_dir = config.TRAIN_DATA
english_data = Corpus(f"{train_dir}/english.txt", "English")
afrikaans_data = Corpus(f"{train_dir}/afrikaans.txt", "Afrikaans")

## Set Hyperparameters

In [3]:
# --- Encoder (source side: English) ---
IN_ENCODER = english_data.vocab_size
ENCODER_EMB = 256

# --- Decoder (target side: Afrikaans) ---
# The decoder reads and predicts tokens from the same target vocabulary,
# so its input and output sizes are the same value.
IN_DECODER = OUT_DECODER = afrikaans_data.vocab_size
DECODER_EMB = 256

# --- Shared by encoder and decoder ---
HIDDEN_SIZE = 1024
NUM_LAYERS = 2

# --- Optimisation ---
LR = 0.001
BATCH_SIZE = 128

## Build the model

In [4]:
# Build the encoder and decoder from the hyperparameters and move each one
# to the selected device.
encoder_net = lstmANMT.Encoder(
    IN_ENCODER, ENCODER_EMB, HIDDEN_SIZE, NUM_LAYERS
).to(device)
decoder_net = lstmANMT.Decoder(
    IN_DECODER, DECODER_EMB, HIDDEN_SIZE, NUM_LAYERS
).to(device)

# Combine both halves into the full translation model.
model = lstmANMT.LSTMANMT(encoder_net, decoder_net, OUT_DECODER)

# Last expression of the cell: per-layer parameter counts.
summary(model)

Layer (type:depth-idx)                   Param #
LSTMANMT                                 --
├─Encoder: 1-1                           --
│    └─Embedding: 2-1                    744,448
│    └─LSTM: 2-2                         13,647,872
├─Decoder: 1-2                           --
│    └─Embedding: 2-3                    737,280
│    └─LSTM: 2-4                         13,647,872
│    └─Linear: 2-5                       5,901,120
Total params: 34,678,592
Trainable params: 34,678,592
Non-trainable params: 0

In [5]:
# Pair the two corpora and wrap them in a batched loader.
train_data = LangData(english_data, afrikaans_data)
train_loader = dataLoader(train_data, BATCH_SIZE)

# Padding positions must not contribute to the loss. Use the index looked up
# from the target vocabulary instead of a hard-coded 0, so the criterion stays
# correct if the vocabulary layout ever changes.
# NOTE(review): this was previously `ignore_index=0`; confirm that '<pad>' is
# the value the loader actually pads target sequences with.
pad_idx = afrikaans_data.stoi['<pad>']
criterion = CrossEntropyLoss(ignore_index=pad_idx)

optimizer = NAdam(model.parameters(), LR)
# Inference helper used for the follow-up translations and all BLEU evaluations.
translator = TranslatorAtt(model, english_data, afrikaans_data, device, lstm=True)

In [6]:
# Sentence pair used to follow up on translation quality during training.
# Raw strings keep the LaTeX backslashes (\in, \{, \}) literal without relying
# on invalid escape sequences, which newer Python versions warn about.
# The string values are unchanged.
mytext = r"<sos> given that we represent the target output as $y\in\{0,1\}$ and we have $n$ training points , we can write the negative log likelihood of the parameters as follows : <eos>"
ground = r"<sos> as ons die teikenuittree voorstel as $y\in\{0,1\}$ en ons $n$ afrigpunte het , dan kan ons die negatiewe log-waarskynlikheidskostefunksie skryf as : <eos>"

# Baseline with the untrained model, for comparison with the per-epoch output.
predicted = translator.translate_sentence(mytext)
bleu = sentence_bleu(prediction=[predicted], reference=[ground])
print(f"Pred: {predicted}")
print(f"Refe: {ground}")
print(f"BLEU SCORES: {bleu}")

Pred: <sos> effek wakker wakker basisfunksies basisfunksies enigste onafhanklike bronne ernstig (i) (i) rooi rooi geloof voordeel kode kode $x(z)$ \textit{boosting}-algoritme \textit{boosting}-algoritme \textit{boosting}-algoritme regulariseringsparameter klassifikasie- regulariseringsparameter \textit{oor-geparameteriseerde} \textit{oor-geparameteriseerde} \textit{oor-geparameteriseerde} los los los los
Refe: <sos> as ons die teikenuittree voorstel as $y\in\{0,1\}$ en ons $n$ afrigpunte het , dan kan ons die negatiewe log-waarskynlikheidskostefunksie skryf as : <eos>
BLEU SCORES: [0.143, 0.064, 0.039, 0.0]


## Train the model

In [7]:
EPOCHS = 15

# Keyword arguments for train_model, gathered in one place. The follow-up
# sentence pair and the translator are passed along with the training objects.
params = dict(
    model=model,
    train_loader=train_loader,
    optimizer=optimizer,
    criterion=criterion,
    device=device,
    epochs=EPOCHS,
    source_test=mytext,
    reference=ground,
    translator=translator,
)

train_loss = train_model(**params)
# Keep the returned loss history on disk.
np.save('lstm_att_train_loss.npy', np.array(train_loss))

Using the latest cached version of the module from /Users/lucien/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--bleu/9e0985c1200e367cce45605ce0ecb5ede079894e0f24f54613fca08eeb8aff76 (last modified on Thu Jul 18 16:29:52 2024) since it couldn't be found locally at evaluate-metric--bleu, or remotely on the Hugging Face Hub.
Epoch 1/15: 100%|██████████| 20/20 [00:10<00:00,  1.83batch/s, loss=1.742]


Predicted: <sos> ek is 'n 'n 'n 'n 'n <eos>
BLEU Score: [0.062, 0.053, 0.041, 0.0]


Epoch 2/15: 100%|██████████| 20/20 [00:10<00:00,  1.90batch/s, loss=1.503]


Predicted: <sos> die die volgende van die volgende , , die , , , , , , , , , , , , , , , , , , , , , ,
BLEU Score: [0.152, 0.089, 0.06, 0.0]


Epoch 3/15: 100%|██████████| 20/20 [00:11<00:00,  1.80batch/s, loss=1.426]


Predicted: <sos> die die volgende van die volgende van die volgende impulsweergawe van die volgende impulsweergawe van die volgende impulsweergawe van die volgende impulsweergawe van die volgende impulsweergawe van die volgende impulsweergawe van
BLEU Score: [0.127, 0.081, 0.056, 0.0]


Epoch 4/15: 100%|██████████| 20/20 [00:10<00:00,  1.88batch/s, loss=1.275]


Predicted: <sos> ons het 'n data waarde en ons en ons het 'n data waarde en ons en ons het in die data van die overlap , en ons en ons , ons
BLEU Score: [0.305, 0.155, 0.086, 0.0]


Epoch 5/15: 100%|██████████| 20/20 [00:10<00:00,  1.82batch/s, loss=1.111]


Predicted: <sos> ons het die datastel van die oorspronklike -gemiddelde en matriek , en $d$ in die finale in die kontinue-tyd omgeskakel en ons sal in die data van die oorspronklike -gemiddelde en
BLEU Score: [0.332, 0.162, 0.089, 0.0]


Epoch 6/15: 100%|██████████| 20/20 [00:10<00:00,  1.91batch/s, loss=0.896]


Predicted: <sos> ons het die data van die oorspronklike -gemiddelde -gemiddelde , wat ons kan as die outokorrelasie as ons stompies vir die data -gemiddelde wat vir die volgende blokdiagram beskryf word :
BLEU Score: [0.355, 0.167, 0.091, 0.0]


Epoch 7/15: 100%|██████████| 20/20 [00:11<00:00,  1.81batch/s, loss=0.782]


Predicted: <sos> ons het die tyd en die die som van die $n$ sein $x[n]$ , en ons die negatiewe log-waarskynlikheidskostefunksie as : , en dan die negatiewe log-waarskynlikheidskostefunksie : <eos>
BLEU Score: [0.561, 0.41, 0.296, 0.192]


Epoch 8/15: 100%|██████████| 20/20 [00:10<00:00,  1.83batch/s, loss=0.640]


Predicted: <sos> ons het die volgende ontwerpsmatriks : , en ons die teikenuittree voorstel as ons gebruik <eos>
BLEU Score: [0.364, 0.278, 0.205, 0.142]


Epoch 9/15: 100%|██████████| 20/20 [00:10<00:00,  1.86batch/s, loss=0.499]


Predicted: <sos> veronderstel ons het die volgende datastel en wat gebruik die gulsige van $n$ kan (soos wat ons kan gebruik as basismodel gebruik en dan weer die $k$ van die $k$ -gemiddelde
BLEU Score: [0.415, 0.204, 0.129, 0.0]


Epoch 10/15: 100%|██████████| 20/20 [00:10<00:00,  1.93batch/s, loss=0.403]


Predicted: <sos> ons het die volgende datastel : en ons kan die outokorrelasie as ons stompies as basismodel <eos>
BLEU Score: [0.357, 0.217, 0.13, 0.0]


Epoch 11/15: 100%|██████████| 20/20 [00:10<00:00,  1.82batch/s, loss=0.363]


Predicted: <sos> as ons die teikenuittree voorstel as $y\in\{0,1\}$ en ons $n$ afrigpunte het , dan kan ons die negatiewe log-waarskynlikheidskostefunksie skryf as : <eos>
BLEU Score: [1.0, 1.0, 1.0, 1.0]


Epoch 12/15: 100%|██████████| 20/20 [00:10<00:00,  1.84batch/s, loss=0.314]


Predicted: <sos> ons het die teikenuittree as wat ons en die salaris kan as ons gebruik <eos>
BLEU Score: [0.329, 0.2, 0.12, 0.0]


Epoch 13/15: 100%|██████████| 20/20 [00:10<00:00,  1.92batch/s, loss=0.281]


Predicted: <sos> as ons die teikenuittree voorstel as $y\in\{0,1\}$ en ons $n$ afrigpunte het , dan kan ons die negatiewe log-waarskynlikheidskostefunksie skryf as : <eos>
BLEU Score: [1.0, 1.0, 1.0, 1.0]


Epoch 14/15: 100%|██████████| 20/20 [00:10<00:00,  1.94batch/s, loss=0.258]


Predicted: <sos> as ons die teikenuittree voorstel as $y\in\{0,1\}$ en ons $n$ afrigpunte het , dan kan ons die negatiewe log-waarskynlikheidskostefunksie skryf as : <eos>
BLEU Score: [1.0, 1.0, 1.0, 1.0]


Epoch 15/15: 100%|██████████| 20/20 [00:11<00:00,  1.81batch/s, loss=0.262]

Predicted: <sos> as ons die teikenuittree voorstel as $y\in\{0,1\}$ en ons $n$ afrigpunte het , dan kan ons die negatiewe log-waarskynlikheidskostefunksie skryf as : <eos>
BLEU Score: [1.0, 1.0, 1.0, 1.0]





## Evaluate on the Training set

In [8]:
# Re-join the training sentences into whitespace-separated strings.
EN_SRC = list(map(' '.join, english_data.data_str))
# One single-element reference list per sentence.
AF_REF = [[' '.join(tokens)] for tokens in afrikaans_data.data_str]
# Translate every training source sentence with the default decoding method.
TRANSLATED = list(map(translator.translate_sentence, EN_SRC))
corpus_bleu(TRANSLATED, AF_REF)

Using the latest cached version of the module from /Users/lucien/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--bleu/9e0985c1200e367cce45605ce0ecb5ede079894e0f24f54613fca08eeb8aff76 (last modified on Thu Jul 18 16:29:52 2024) since it couldn't be found locally at evaluate-metric--bleu, or remotely on the Hugging Face Hub.


                                     BLEU-1                                     
------------------------------------------------------------------------------------------
bleu                : 0.8342486270594108
precisions          : [0.8342486270594108]
brevity_penalty     : 1.0
length_ratio        : 1.139362912400455
translation_length  : 42063
reference_length    : 36918
******************************************************************************************
                                     BLEU-2                                     
------------------------------------------------------------------------------------------
bleu                : 0.8114758486733747
precisions          : [0.8342486270594108, 0.7893247068338051]
brevity_penalty     : 1.0
length_ratio        : 1.139362912400455
translation_length  : 42063
reference_length    : 36918
******************************************************************************************
                                     BLEU-

## Evaluate on the Validation set

In [9]:
# Read the validation sentences, one per line. The encoding is pinned to UTF-8
# so the text decodes the same way regardless of the platform's locale default.
with open(f"{config.VAL_DATA}/english.txt", encoding="utf-8") as data:
    english_val = data.read().strip().split("\n")
with open(f"{config.VAL_DATA}/afrikaans.txt", encoding="utf-8") as data:
    afrikaans_val = data.read().strip().split("\n")

### Greedy Search

In [10]:
# Wrap each reference sentence in its own single-element list.
VAL_AF_REF = [[ref] for ref in afrikaans_val]
# Translate with translate_sentence's default decoding method.
VAL_TRANSLATED = list(map(translator.translate_sentence, english_val))

corpus_bleu(VAL_TRANSLATED, VAL_AF_REF)

Using the latest cached version of the module from /Users/lucien/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--bleu/9e0985c1200e367cce45605ce0ecb5ede079894e0f24f54613fca08eeb8aff76 (last modified on Thu Jul 18 16:29:52 2024) since it couldn't be found locally at evaluate-metric--bleu, or remotely on the Hugging Face Hub.


                                     BLEU-1                                     
------------------------------------------------------------------------------------------
bleu                : 0.5271952259164535
precisions          : [0.5271952259164535]
brevity_penalty     : 1.0
length_ratio        : 1.2049719216545678
translation_length  : 17595
reference_length    : 14602
******************************************************************************************
                                     BLEU-2                                     
------------------------------------------------------------------------------------------
bleu                : 0.4433492731495433
precisions          : [0.5271952259164535, 0.37283831176684]
brevity_penalty     : 1.0
length_ratio        : 1.2049719216545678
translation_length  : 17595
reference_length    : 14602
******************************************************************************************
                                     BLEU-

### Beam Search

In [11]:
# Translate the validation set again, this time with beam search (width 3).
# This overwrites the VAL_TRANSLATED list from the greedy run.
VAL_TRANSLATED = [
    translator.translate_sentence(src, method="beam", beam_width=3)
    for src in english_val
]
corpus_bleu(VAL_TRANSLATED, VAL_AF_REF)

Using the latest cached version of the module from /Users/lucien/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--bleu/9e0985c1200e367cce45605ce0ecb5ede079894e0f24f54613fca08eeb8aff76 (last modified on Thu Jul 18 16:29:52 2024) since it couldn't be found locally at evaluate-metric--bleu, or remotely on the Hugging Face Hub.


                                     BLEU-1                                     
------------------------------------------------------------------------------------------
bleu                : 0.5876556813093318
precisions          : [0.6964428777439513]
brevity_penalty     : 0.8437959523873322
length_ratio        : 0.8548144089850706
translation_length  : 12482
reference_length    : 14602
******************************************************************************************
                                     BLEU-2                                     
------------------------------------------------------------------------------------------
bleu                : 0.5056031039859779
precisions          : [0.6964428777439513, 0.5155361050328228]
brevity_penalty     : 0.8437959523873322
length_ratio        : 0.8548144089850706
translation_length  : 12482
reference_length    : 14602
******************************************************************************************
          

## Evaluate on the SUN validation set only

In [12]:
# Read the SUN validation sentences, one per line. The encoding is pinned to
# UTF-8 so the text decodes the same way regardless of the platform's locale default.
with open(f"{config.VAL_DATA}/sun_english.txt", encoding="utf-8") as data:
    sun_english_val = data.read().strip().split("\n")
with open(f"{config.VAL_DATA}/sun_afrikaans.txt", encoding="utf-8") as data:
    sun_afrikaans_val = data.read().strip().split("\n")

### Greedy Search

In [13]:
# Wrap each SUN reference sentence in its own single-element list.
SUN_VAL_AF = [[ref] for ref in sun_afrikaans_val]
# Translate with translate_sentence's default decoding method.
SUN_VAL_TRANSLATED = list(map(translator.translate_sentence, sun_english_val))
corpus_bleu(SUN_VAL_TRANSLATED, SUN_VAL_AF)

Using the latest cached version of the module from /Users/lucien/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--bleu/9e0985c1200e367cce45605ce0ecb5ede079894e0f24f54613fca08eeb8aff76 (last modified on Thu Jul 18 16:29:52 2024) since it couldn't be found locally at evaluate-metric--bleu, or remotely on the Hugging Face Hub.


                                     BLEU-1                                     
------------------------------------------------------------------------------------------
bleu                : 0.3455019556714472
precisions          : [0.3455019556714472]
brevity_penalty     : 1.0
length_ratio        : 1.2066072364971159
translation_length  : 4602
reference_length    : 3814
******************************************************************************************
                                     BLEU-2                                     
------------------------------------------------------------------------------------------
bleu                : 0.25761311286442024
precisions          : [0.3455019556714472, 0.19208144796380092]
brevity_penalty     : 1.0
length_ratio        : 1.2066072364971159
translation_length  : 4602
reference_length    : 3814
******************************************************************************************
                                     BLEU-

### Beam Search

In [14]:
# Translate the SUN set again with beam search (width 3).
# This overwrites the SUN_VAL_TRANSLATED list from the greedy run.
SUN_VAL_TRANSLATED = [
    translator.translate_sentence(src, method="beam", beam_width=3)
    for src in sun_english_val
]
corpus_bleu(SUN_VAL_TRANSLATED, SUN_VAL_AF)

Using the latest cached version of the module from /Users/lucien/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--bleu/9e0985c1200e367cce45605ce0ecb5ede079894e0f24f54613fca08eeb8aff76 (last modified on Thu Jul 18 16:29:52 2024) since it couldn't be found locally at evaluate-metric--bleu, or remotely on the Hugging Face Hub.


                                     BLEU-1                                     
------------------------------------------------------------------------------------------
bleu                : 0.39082437971436856
precisions          : [0.5708604483007954]
brevity_penalty     : 0.6846233276060439
length_ratio        : 0.7252228631358154
translation_length  : 2766
reference_length    : 3814
******************************************************************************************
                                     BLEU-2                                     
------------------------------------------------------------------------------------------
bleu                : 0.30881631089282136
precisions          : [0.5708604483007954, 0.35642414860681115]
brevity_penalty     : 0.6846233276060439
length_ratio        : 0.7252228631358154
translation_length  : 2766
reference_length    : 3814
******************************************************************************************
           

In [15]:
# Qualitative check: print beam-search (width 5) translations next to their
# references for SUN validation sentences 10-19, each with its own BLEU score.
# NOTE(review): the scored strings still contain the <sos>/<eos> markers, which
# may count as matching tokens; confirm this is intended.
metric = evaluate.load("bleu")
sample = slice(10, 20)
predictions = [
    translator.translate_sentence(src, method="beam", beam_width=5)
    for src in sun_english_val[sample]
]
labels = SUN_VAL_AF[sample]
for source, pred, lab in zip(sun_english_val[sample], predictions, labels):
    print(f"Source    : {source}")
    # Long sentences are cut to 150 characters for display only; the full
    # strings are what gets scored.
    print(f"Prediction: {pred[:150]}")
    print(f"Label     : {lab[0][:150]}")
    score = metric.compute(predictions=[pred], references=lab)['bleu']
    print(f"BLEU      : {score}")
    print()

Using the latest cached version of the module from /Users/lucien/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--bleu/9e0985c1200e367cce45605ce0ecb5ede079894e0f24f54613fca08eeb8aff76 (last modified on Thu Jul 18 16:29:52 2024) since it couldn't be found locally at evaluate-metric--bleu, or remotely on the Hugging Face Hub.


Source    : <sos> component <eos>
Prediction: <sos> dankie <eos>
Label     : <sos> komponent <eos>
BLEU      : 0.0

Source    : <sos> architecture <eos>
Prediction: <sos> dankie <eos>
Label     : <sos> argitektuur <eos>
BLEU      : 0.0

Source    : <sos> specification <eos>
Prediction: <sos> dankie <eos>
Label     : <sos> spesifikasies <eos>
BLEU      : 0.0

Source    : <sos> at which stage of the design process would we choose the communication protocol between subsystems <eos>
Prediction: <sos> aanvaar van die oorspronklike : <eos>
Label     : <sos> by watter stap van die ontwerpsproses word die kommunikasie-kanaal tussen substelsels gekies <eos>
BLEU      : 0.0

Source    : <sos> motivate your answer <eos>
Prediction: <sos> motiveer jou antwoorde <eos>
Label     : <sos> motiveer jou antwoord <eos>
BLEU      : 0.5969491792019646

Source    : <sos> describe the meaning if a system is described as a cyber-physical system <eos>
Prediction: <sos> deur 'n monsterfrekwensie te bereken : <e