# NEURAL MACHINE TRANSLATION - LSTM with Attention

## Required Module & Config files

In [1]:
import src.LSTMAttention as lstmANMT
from src.Tokenizer import Corpus, LangData, dataLoader
from src.utils import load_config, get_device, train_model, sentence_bleu, corpus_bleu
from src.TranslatorAtt import TranslatorAtt
from torch.nn import CrossEntropyLoss
from torch.optim import NAdam
import evaluate
import numpy as np
from torchinfo import summary

# Load the project configuration; later cells read config.TRAIN_DATA and config.VAL_DATA from it.
config = load_config()
# Select the compute device (GPU / MPS back-end / CPU) used for the networks and the translator.
device = get_device()
print(f"Using device: {device}")

Using the latest cached version of the module from /Users/lucien/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--bleu/9e0985c1200e367cce45605ce0ecb5ede079894e0f24f54613fca08eeb8aff76 (last modified on Thu Jul 18 16:29:52 2024) since it couldn't be found locally at evaluate-metric--bleu, or remotely on the Hugging Face Hub.


Using device: mps


## Load the dataset

In [2]:
# Training corpora read from config.TRAIN_DATA.
# English is the source side (its vocabulary sizes the encoder) and Afrikaans is
# the target side (its vocabulary sizes the decoder input and output).
english_data = Corpus(f"{config.TRAIN_DATA}/english.txt", "English")
afrikaans_data = Corpus(f"{config.TRAIN_DATA}/afrikaans.txt", "Afrikaans")

## Set Hyperparameters

In [3]:
# Encoder - source (English)
# IN_ENCODER is the English vocabulary size; ENCODER_EMB is handed to
# lstmANMT.Encoder as its second argument (the embedding width, going by the name).
IN_ENCODER = english_data.vocab_size
ENCODER_EMB = 256

# Decoder - target (Afrikaans)
# The decoder reads and predicts tokens from the same Afrikaans vocabulary, so
# the input and output sizes are identical.
IN_DECODER = afrikaans_data.vocab_size
OUT_DECODER = afrikaans_data.vocab_size
DECODER_EMB = 256

# Shared - passed unchanged to both the Encoder and the Decoder constructors
HIDDEN_SIZE = 1024
NUM_LAYERS = 2

# Optimisation settings: LR is given to NAdam, BATCH_SIZE to dataLoader.
LR = 1e-3
BATCH_SIZE = 128

## Set the model

In [4]:
# Build the two halves of the network and place each on the selected device.
encoder_net = lstmANMT.Encoder(
    IN_ENCODER,
    ENCODER_EMB,
    HIDDEN_SIZE,
    NUM_LAYERS,
).to(device)
decoder_net = lstmANMT.Decoder(
    IN_DECODER,
    DECODER_EMB,
    HIDDEN_SIZE,
    NUM_LAYERS,
).to(device)

# Wrap encoder and decoder into the full sequence-to-sequence model.
model = lstmANMT.LSTMANMT(encoder_net, decoder_net, OUT_DECODER)

# Last expression of the cell: renders the per-layer parameter table.
summary(model)

Layer (type:depth-idx)                   Param #
LSTMANMT                                 --
├─Encoder: 1-1                           --
│    └─Embedding: 2-1                    756,480
│    └─LSTM: 2-2                         13,647,872
├─Decoder: 1-2                           --
│    └─Embedding: 2-3                    743,424
│    └─LSTM: 2-4                         13,647,872
│    └─Linear: 2-5                       5,950,296
Total params: 34,745,944
Trainable params: 34,745,944
Non-trainable params: 0

In [5]:
# Combine the English and Afrikaans corpora into one dataset and wrap it in a
# loader that is given BATCH_SIZE.
train_data = LangData(english_data, afrikaans_data)
train_loader = dataLoader(train_data, BATCH_SIZE)

# Padding positions must not contribute to the loss. The padding id is looked up
# in the target vocabulary rather than hard-coded as 0, so the criterion stays
# correct whatever index the tokenizer assigns to '<pad>'.
pad_idx = afrikaans_data.stoi['<pad>']
criterion = CrossEntropyLoss(ignore_index=pad_idx)

optimizer = NAdam(model.parameters(), LR)
# NOTE(review): lstm=True is forwarded to TranslatorAtt as-is; its effect is
# defined in src/TranslatorAtt.py and is not visible from this notebook.
translator = TranslatorAtt(model, english_data, afrikaans_data, device, lstm=True)

In [6]:
# Sentence pair used to follow translation quality during training.
# Raw strings keep the LaTeX backslashes (\in, \{, \}) literal instead of relying
# on Python's deprecated pass-through of unrecognised escape sequences; the
# resulting string values are unchanged.
mytext = r"<sos> given that we represent the target output as $y\in\{0,1\}$ and we have $n$ training points , we can write the negative log likelihood of the parameters as follows : <eos>"
ground = r"<sos> as ons die teikenuittree voorstel as $y\in\{0,1\}$ en ons $n$ afrigpunte het , dan kan ons die negatiewe log-waarskynlikheidskostefunksie skryf as : <eos>"

# Baseline before training: translate the sentence and score it against the reference.
predicted = translator.translate_sentence(mytext)
bleu = sentence_bleu(prediction=[predicted], reference=[ground])
print(f"Pred: {predicted}")
print(f"Refe: {ground}")
print(f"BLEU SCORES: {bleu}")

Pred: <sos> oliebol makk oliebol makk verskilvergelyking verskilvergelyking verskilvergelyking bekostig vakansiedag regressieprobleem rbf himekusa himekusa teenoor sheila wintersport wintersport staat aangesien wisselpunt regressieprobleem benaderings makk skedule mislukking oliebol oliebol oliebol makk verskilvergelyking verskilvergelyking
Refe: <sos> as ons die teikenuittree voorstel as $y\in\{0,1\}$ en ons $n$ afrigpunte het , dan kan ons die negatiewe log-waarskynlikheidskostefunksie skryf as : <eos>
BLEU SCORES: [0.076, 0.063, 0.048, 0.0]


## Train the model

In [7]:
EPOCHS = 20

# Keyword arguments for train_model, collected in one place. The follow-up
# sentence, its reference and the translator are passed along with the model.
params = dict(
    model=model,
    train_loader=train_loader,
    optimizer=optimizer,
    criterion=criterion,
    device=device,
    epochs=EPOCHS,
    source_test=mytext,
    reference=ground,
    translator=translator,
)

train_loss = train_model(**params)

# Persist the returned loss values so they can be reloaded without retraining.
np.save('lstm_att_train_loss.npy', np.array(train_loss))

Using the latest cached version of the module from /Users/lucien/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--bleu/9e0985c1200e367cce45605ce0ecb5ede079894e0f24f54613fca08eeb8aff76 (last modified on Thu Jul 18 16:29:52 2024) since it couldn't be found locally at evaluate-metric--bleu, or remotely on the Hugging Face Hub.
Epoch 1/20: 100%|██████████| 20/20 [00:11<00:00,  1.68batch/s, loss=1.858]


Predicted: <sos> die die die die , die , , die , die , , die , , die , die , , die , , die , die , , die ,
BLEU Score: [0.152, 0.089, 0.06, 0.0]


Epoch 2/20: 100%|██████████| 20/20 [00:11<00:00,  1.75batch/s, loss=1.621]


Predicted: <sos> die volgende van die volgende van die volgende , , en die volgende , , en die volgende , , en die volgende , , en die volgende , , en
BLEU Score: [0.178, 0.096, 0.063, 0.0]


Epoch 3/20: 100%|██████████| 20/20 [00:11<00:00,  1.75batch/s, loss=1.445]


Predicted: <sos> ons het die volgende van die volgende van die gemiddeld , en die volgende van die gemiddeld , en die gemiddeld , en die gemiddeld , en die gemiddeld , en
BLEU Score: [0.229, 0.109, 0.069, 0.0]


Epoch 4/20: 100%|██████████| 20/20 [00:11<00:00,  1.80batch/s, loss=1.305]


Predicted: <sos> die monstertempo van die dac van die dft , en die gemiddeld , en die gemiddeld , en die gemiddeld , en die gemiddeld , en die gemiddeld , en die
BLEU Score: [0.178, 0.096, 0.063, 0.0]


Epoch 5/20: 100%|██████████| 20/20 [00:11<00:00,  1.79batch/s, loss=1.186]


Predicted: <sos> ons wil die data van die datastel van die datastel , en die \% , en die \% van die datastel , en die \% van die datastel , en die
BLEU Score: [0.282, 0.122, 0.074, 0.0]


Epoch 6/20: 100%|██████████| 20/20 [00:10<00:00,  1.88batch/s, loss=0.963]


Predicted: <sos> ons die volgende -gemiddelde bepaal en die resultaat van die resulterende van $x[n]$ en $y[n]$ : <eos>
BLEU Score: [0.427, 0.298, 0.192, 0.118]


Epoch 7/20: 100%|██████████| 20/20 [00:11<00:00,  1.81batch/s, loss=0.847]


Predicted: <sos> ons stel die teikenuittree vektor en die derde , en die derde en $x[k]$ die derde en $x[k]$ die derde en $x[k]$ die derde en $x[k]$ die derde en $x[k]$ die
BLEU Score: [0.22, 0.107, 0.058, 0.0]


Epoch 8/20: 100%|██████████| 20/20 [00:11<00:00,  1.79batch/s, loss=0.687]


Predicted: <sos> as ons nou die dft van $x[n]$ en presies , en dan na die $n$ -punt dft en $x[k]$ : <eos>
BLEU Score: [0.487, 0.358, 0.275, 0.204]


Epoch 9/20: 100%|██████████| 20/20 [00:11<00:00,  1.82batch/s, loss=0.555]


Predicted: <sos> as ons nou dieselfde as as ons afrigpunte het , en ons afrigpunte afrigpunte het , en dan weer na die dft log-waarskynlikheidskostefunksie <eos>
BLEU Score: [0.464, 0.325, 0.24, 0.166]


Epoch 10/20: 100%|██████████| 20/20 [00:10<00:00,  1.83batch/s, loss=0.458]


Predicted: <sos> as ons die teikenuittree voorstel as ons 'n $n$ -punt resultaat , en ons die $n$ -punt resultaat as $x[k]$ <eos>
BLEU Score: [0.586, 0.48, 0.383, 0.311]


Epoch 11/20: 100%|██████████| 20/20 [00:11<00:00,  1.75batch/s, loss=0.393]


Predicted: <sos> as ons die teikenuittree voorstel as $y\in\{0,1\}$ en ons $n$ afrigpunte het , dan kan ons die negatiewe log-waarskynlikheidskostefunksie as ons as ons afrigpunte het , dan kan ons die negatiewe
BLEU Score: [0.756, 0.741, 0.734, 0.729]


Epoch 12/20: 100%|██████████| 20/20 [00:11<00:00,  1.80batch/s, loss=0.335]


Predicted: <sos> as ons die teikenuittree voorstel as $y\in\{0,1\}$ en ons $n$ afrigpunte het , dan kan ons die negatiewe log-waarskynlikheidskostefunksie skryf as : <eos>
BLEU Score: [1.0, 1.0, 1.0, 1.0]


Epoch 13/20: 100%|██████████| 20/20 [00:11<00:00,  1.77batch/s, loss=0.302]


Predicted: <sos> as ons die teikenuittree voorstel as $y\in\{0,1\}$ en ons $n$ afrigpunte het , dan kan ons die negatiewe log-waarskynlikheidskostefunksie skryf as : <eos>
BLEU Score: [1.0, 1.0, 1.0, 1.0]


Epoch 14/20: 100%|██████████| 20/20 [00:11<00:00,  1.77batch/s, loss=0.281]


Predicted: <sos> as ons die teikenuittree voorstel as $y\in\{0,1\}$ en ons $n$ afrigpunte het , dan kan ons die negatiewe log-waarskynlikheidskostefunksie skryf as : <eos>
BLEU Score: [1.0, 1.0, 1.0, 1.0]


Epoch 15/20: 100%|██████████| 20/20 [00:11<00:00,  1.79batch/s, loss=0.270]


Predicted: <sos> as ons die teikenuittree voorstel as $y\in\{0,1\}$ en ons $n$ afrigpunte het , dan kan ons die negatiewe log-waarskynlikheidskostefunksie skryf as : <eos>
BLEU Score: [1.0, 1.0, 1.0, 1.0]


Epoch 16/20: 100%|██████████| 20/20 [00:11<00:00,  1.79batch/s, loss=0.261]


Predicted: <sos> as ons die teikenuittree voorstel as $y\in\{0,1\}$ en ons $n$ afrigpunte het , dan kan ons die negatiewe log-waarskynlikheidskostefunksie skryf as : <eos>
BLEU Score: [1.0, 1.0, 1.0, 1.0]


Epoch 17/20: 100%|██████████| 20/20 [00:11<00:00,  1.81batch/s, loss=0.252]


Predicted: <sos> as ons die teikenuittree voorstel as $y\in\{0,1\}$ en ons $n$ afrigpunte het , dan kan ons die negatiewe log-waarskynlikheidskostefunksie skryf as : <eos>
BLEU Score: [1.0, 1.0, 1.0, 1.0]


Epoch 18/20: 100%|██████████| 20/20 [00:10<00:00,  1.87batch/s, loss=0.242]


Predicted: <sos> as ons die teikenuittree voorstel as $y\in\{0,1\}$ en ons $n$ afrigpunte het , dan kan ons die negatiewe log-waarskynlikheidskostefunksie skryf as : <eos>
BLEU Score: [1.0, 1.0, 1.0, 1.0]


Epoch 19/20: 100%|██████████| 20/20 [00:11<00:00,  1.80batch/s, loss=0.251]


Predicted: <sos> as ons die teikenuittree voorstel as $y\in\{0,1\}$ en ons $n$ afrigpunte het , dan kan ons die negatiewe log-waarskynlikheidskostefunksie skryf as : <eos>
BLEU Score: [1.0, 1.0, 1.0, 1.0]


Epoch 20/20: 100%|██████████| 20/20 [00:10<00:00,  1.84batch/s, loss=0.242]

Predicted: <sos> as ons die teikenuittree voorstel as $y\in\{0,1\}$ en ons $n$ afrigpunte het , dan kan ons die negatiewe log-waarskynlikheidskostefunksie skryf as : <eos>
BLEU Score: [1.0, 1.0, 1.0, 1.0]





## Evaluate on the Training set

In [8]:
# Re-join the tokenised training sentences into space-separated strings.
EN_SRC = list(map(' '.join, english_data.data_str))
# Each prediction gets its own one-element list of references.
AF_REF = [[joined] for joined in map(' '.join, afrikaans_data.data_str)]

# Translate every training sentence with the translator's default settings.
TRANSLATED = list(map(translator.translate_sentence, EN_SRC))

corpus_bleu(TRANSLATED, AF_REF)

Using the latest cached version of the module from /Users/lucien/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--bleu/9e0985c1200e367cce45605ce0ecb5ede079894e0f24f54613fca08eeb8aff76 (last modified on Thu Jul 18 16:29:52 2024) since it couldn't be found locally at evaluate-metric--bleu, or remotely on the Hugging Face Hub.


                                     BLEU-1                                     
------------------------------------------------------------------------------------------
bleu                : 0.8593717656799834
precisions          : [0.8593717656799834]
brevity_penalty     : 1.0
length_ratio        : 1.048280351524357
translation_length  : 38648
reference_length    : 36868
******************************************************************************************
                                     BLEU-2                                     
------------------------------------------------------------------------------------------
bleu                : 0.8166556780844313
precisions          : [0.8593717656799834, 0.776062844024009]
brevity_penalty     : 1.0
length_ratio        : 1.048280351524357
translation_length  : 38648
reference_length    : 36868
******************************************************************************************
                                     BLEU-3

## Evaluate on the validation set

In [9]:
# Read the validation sentence pairs, one sentence per line.
# The encoding is stated explicitly because the Afrikaans text contains non-ASCII
# characters (e.g. "geëet") and open() otherwise falls back to a platform-dependent
# default. NOTE(review): assumes the files are stored as UTF-8 - confirm.
with open(f"{config.VAL_DATA}/english.txt", encoding="utf-8") as data:
    english_val = data.read().strip().split("\n")
with open(f"{config.VAL_DATA}/afrikaans.txt", encoding="utf-8") as data:
    afrikaans_val = data.read().strip().split("\n")

### Greedy Search

In [10]:
# One single-element reference list per validation sentence.
VAL_AF_REF = [[reference] for reference in afrikaans_val]
# Translate with the translator's default decoding settings.
VAL_TRANSLATED = list(map(translator.translate_sentence, english_val))

corpus_bleu(VAL_TRANSLATED, VAL_AF_REF)

Using the latest cached version of the module from /Users/lucien/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--bleu/9e0985c1200e367cce45605ce0ecb5ede079894e0f24f54613fca08eeb8aff76 (last modified on Thu Jul 18 16:29:52 2024) since it couldn't be found locally at evaluate-metric--bleu, or remotely on the Hugging Face Hub.


                                     BLEU-1                                     
------------------------------------------------------------------------------------------
bleu                : 0.5760429007919187
precisions          : [0.5760429007919187]
brevity_penalty     : 1.0
length_ratio        : 1.0945263445263445
translation_length  : 16037
reference_length    : 14652
******************************************************************************************
                                     BLEU-2                                     
------------------------------------------------------------------------------------------
bleu                : 0.4780511560884897
precisions          : [0.5760429007919187, 0.3967289719626168]
brevity_penalty     : 1.0
length_ratio        : 1.0945263445263445
translation_length  : 16037
reference_length    : 14652
******************************************************************************************
                                     BLE

### Beam search

In [11]:
# Re-translate the validation sentences with beam search (width 3).
# This overwrites the greedy translations held in VAL_TRANSLATED.
VAL_TRANSLATED = [
    translator.translate_sentence(sentence, method="beam", beam_width=3)
    for sentence in english_val
]

corpus_bleu(VAL_TRANSLATED, VAL_AF_REF)

Using the latest cached version of the module from /Users/lucien/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--bleu/9e0985c1200e367cce45605ce0ecb5ede079894e0f24f54613fca08eeb8aff76 (last modified on Thu Jul 18 16:29:52 2024) since it couldn't be found locally at evaluate-metric--bleu, or remotely on the Hugging Face Hub.


                                     BLEU-1                                     
------------------------------------------------------------------------------------------
bleu                : 0.5619680908908634
precisions          : [0.7437958755679832]
brevity_penalty     : 0.755540746258816
length_ratio        : 0.7810537810537811
translation_length  : 11444
reference_length    : 14652
******************************************************************************************
                                     BLEU-2                                     
------------------------------------------------------------------------------------------
bleu                : 0.48006743135503344
precisions          : [0.7437958755679832, 0.5427938769615867]
brevity_penalty     : 0.755540746258816
length_ratio        : 0.7810537810537811
translation_length  : 11444
reference_length    : 14652
******************************************************************************************
           

## Evaluate on the SUN validation set only

In [12]:
# Read the SUN subset of the validation data, one sentence per line.
# The encoding is stated explicitly because the Afrikaans text contains non-ASCII
# characters and open() otherwise falls back to a platform-dependent default.
# NOTE(review): assumes the files are stored as UTF-8 - confirm.
with open(f"{config.VAL_DATA}/sun_english.txt", encoding="utf-8") as data:
    sun_english_val = data.read().strip().split("\n")
with open(f"{config.VAL_DATA}/sun_afrikaans.txt", encoding="utf-8") as data:
    sun_afrikaans_val = data.read().strip().split("\n")

### Greedy Search

In [13]:
# One single-element reference list per SUN validation sentence.
SUN_VAL_AF = [[reference] for reference in sun_afrikaans_val]
# Translate with the translator's default decoding settings.
SUN_VAL_TRANSLATED = list(map(translator.translate_sentence, sun_english_val))

corpus_bleu(SUN_VAL_TRANSLATED, SUN_VAL_AF)

Using the latest cached version of the module from /Users/lucien/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--bleu/9e0985c1200e367cce45605ce0ecb5ede079894e0f24f54613fca08eeb8aff76 (last modified on Thu Jul 18 16:29:52 2024) since it couldn't be found locally at evaluate-metric--bleu, or remotely on the Hugging Face Hub.


                                     BLEU-1                                     
------------------------------------------------------------------------------------------
bleu                : 0.37435657463734207
precisions          : [0.37435657463734207]
brevity_penalty     : 1.0
length_ratio        : 1.1206082852648138
translation_length  : 4274
reference_length    : 3814
******************************************************************************************
                                     BLEU-2                                     
------------------------------------------------------------------------------------------
bleu                : 0.2740605332743947
precisions          : [0.37435657463734207, 0.20063538611925708]
brevity_penalty     : 1.0
length_ratio        : 1.1206082852648138
translation_length  : 4274
reference_length    : 3814
******************************************************************************************
                                     BLE

### Beam Search

In [14]:
# Re-translate the SUN validation sentences with beam search (width 3).
# This overwrites the greedy translations held in SUN_VAL_TRANSLATED.
SUN_VAL_TRANSLATED = [
    translator.translate_sentence(sentence, method="beam", beam_width=3)
    for sentence in sun_english_val
]

corpus_bleu(SUN_VAL_TRANSLATED, SUN_VAL_AF)

Using the latest cached version of the module from /Users/lucien/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--bleu/9e0985c1200e367cce45605ce0ecb5ede079894e0f24f54613fca08eeb8aff76 (last modified on Thu Jul 18 16:29:52 2024) since it couldn't be found locally at evaluate-metric--bleu, or remotely on the Hugging Face Hub.


                                     BLEU-1                                     
------------------------------------------------------------------------------------------
bleu                : 0.3616698119041299
precisions          : [0.5781972265023112]
brevity_penalty     : 0.6255128792159369
length_ratio        : 0.680650235972732
translation_length  : 2596
reference_length    : 3814
******************************************************************************************
                                     BLEU-2                                     
------------------------------------------------------------------------------------------
bleu                : 0.28389310903262394
precisions          : [0.5781972265023112, 0.35625517812758906]
brevity_penalty     : 0.6255128792159369
length_ratio        : 0.680650235972732
translation_length  : 2596
reference_length    : 3814
******************************************************************************************
              

In [15]:
metric = evaluate.load("bleu")

# Qualitative look at SUN validation sentences 10-19, decoded with beam width 5.
predictions = [
    translator.translate_sentence(sentence, method="beam", beam_width=5)
    for sentence in sun_english_val[10:20]
]
labels = SUN_VAL_AF[10:20]

for source, pred, lab in zip(sun_english_val[10:20], predictions, labels):
    print(f"Source    : {source}")
    # Prediction and reference are cut to 150 characters for display only.
    print(f"Prediction: {pred[:150]}")
    print(f"Label     : {lab[0][:150]}")
    # `lab` is the one-element reference list for this sentence.
    score = metric.compute(predictions=[pred], references=lab)['bleu']
    print(f"BLEU      : {score}")
    print()

Using the latest cached version of the module from /Users/lucien/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--bleu/9e0985c1200e367cce45605ce0ecb5ede079894e0f24f54613fca08eeb8aff76 (last modified on Thu Jul 18 16:29:52 2024) since it couldn't be found locally at evaluate-metric--bleu, or remotely on the Hugging Face Hub.


Source    : <sos> component <eos>
Prediction: <sos> geëet van slegtes oorbetaal <eos>
Label     : <sos> komponent <eos>
BLEU      : 0.0

Source    : <sos> architecture <eos>
Prediction: <sos> geëet van slegtes oorbetaal <eos>
Label     : <sos> argitektuur <eos>
BLEU      : 0.0

Source    : <sos> specification <eos>
Prediction: <sos> geëet van slegtes oorbetaal <eos>
Label     : <sos> spesifikasies <eos>
BLEU      : 0.0

Source    : <sos> at which stage of the design process would we choose the communication protocol between subsystems <eos>
Prediction: <sos> watter van die dft van $w_0$ sal die outokorrelasie <eos>
Label     : <sos> by watter stap van die ontwerpsproses word die kommunikasie-kanaal tussen substelsels gekies <eos>
BLEU      : 0.0

Source    : <sos> motivate your answer <eos>
Prediction: <sos> motiveer jou <eos>
Label     : <sos> motiveer jou antwoord <eos>
BLEU      : 0.6101950432112578

Source    : <sos> describe the meaning if a system is described as a cyber-physical