# NEURAL MACHINE TRANSLATION - LSTM

## Required modules & config file

In [1]:
import src.LSTM as lstmNMT
from src.Tokenizer import Corpus, LangData, dataLoader
from src.utils import load_config, get_device, train_model, sentence_bleu, corpus_bleu
from src.Translator import Translator
from torch.nn import CrossEntropyLoss
from torch.optim import NAdam
import evaluate
import numpy as np
from torchinfo import summary

# Load the project configuration; later cells read the TRAIN_DATA and
# VAL_DATA directories from it.
config = load_config()
# Select the compute device through the project helper (GPU / MPS back-end / CPU).
device = get_device()
print(f"Using device: {device}")

Using device: mps


## Load the dataset

In [2]:
# Training corpora. English is the source side (its vocabulary sizes the
# encoder) and Afrikaans is the target side (its vocabulary sizes the decoder).
english_data = Corpus(f"{config.TRAIN_DATA}/english.txt", "English")
afrikaans_data = Corpus(f"{config.TRAIN_DATA}/afrikaans.txt", "Afrikaans")

## Set Hyperparameters

In [3]:
# Embedding width: the encoder and decoder use the same value
ENCODER_EMB = DECODER_EMB = 256

# Encoder (source language): input vocabulary size
IN_ENCODER = english_data.vocab_size

# Decoder (target language): it consumes and emits tokens of the same vocabulary
IN_DECODER = OUT_DECODER = afrikaans_data.vocab_size

# Recurrent stack settings shared by encoder and decoder
HIDDEN_SIZE = 1024
NUM_LAYERS = 2

# Optimisation settings
LR = 1e-3
BATCH_SIZE = 128

## Build the model

In [4]:
# Build the encoder and decoder on the selected device, then wrap them in the
# sequence-to-sequence model.
encoder_net = lstmNMT.Encoder(IN_ENCODER, ENCODER_EMB, HIDDEN_SIZE, NUM_LAYERS).to(device)
decoder_net = lstmNMT.Decoder(IN_DECODER, DECODER_EMB, HIDDEN_SIZE, NUM_LAYERS).to(device)
# Move the wrapper itself as well: this is a no-op for the sub-modules that are
# already on `device`, but it keeps any parameter or buffer owned directly by
# LSTM_NMT on the same device as the rest of the network.
# NOTE(review): assumes LSTM_NMT is a torch.nn.Module (summary() and
# model.parameters() are both used on it) - confirm.
model = lstmNMT.LSTM_NMT(encoder_net, decoder_net, OUT_DECODER).to(device)
# Last expression of the cell: displays the per-layer parameter counts.
summary(model)

Layer (type:depth-idx)                   Param #
LSTM_NMT                                 --
├─Encoder: 1-1                           --
│    └─Embedding: 2-1                    744,448
│    └─LSTM: 2-2                         13,647,872
├─Decoder: 1-2                           --
│    └─Embedding: 2-3                    737,280
│    └─LSTM: 2-4                         13,647,872
│    └─Linear: 2-5                       2,952,000
Total params: 31,729,472
Trainable params: 31,729,472
Non-trainable params: 0

In [5]:
# Pair the source and target corpora and batch them for training.
train_data = LangData(english_data, afrikaans_data)
train_loader = dataLoader(train_data, BATCH_SIZE)

# Padding positions must not contribute to the loss. The index is looked up in
# the target vocabulary rather than hard-coded, so the criterion stays correct
# even if '<pad>' is not mapped to 0.
pad_idx = afrikaans_data.stoi['<pad>']
criterion = CrossEntropyLoss(ignore_index=pad_idx)

optimizer = NAdam(model.parameters(), LR)
# Inference helper used for the qualitative checks and the BLEU evaluations below.
translator = Translator(model, english_data, afrikaans_data, device)

In [6]:
# Probe sentence pair used to follow translation quality during training: it is
# translated once here (before any training) and passed to train_model below as
# `source_test` / `reference`.
# Raw strings are required because the LaTeX fragments contain backslashes
# (\in, \{, \}) that are invalid escape sequences in a normal string literal;
# the resulting string values are unchanged.
mytext = r"<sos> given that we represent the target output as $y\in\{0,1\}$ and we have $n$ training points , we can write the negative log likelihood of the parameters as follows: <eos>"
ground = r"<sos> as ons die teikenuittree voorstel as $y\in\{0,1\}$ en ons $n$ afrigpunte het , dan kan ons die negatiewe log-waarskynlikheidskostefunksie skryf as: <eos>"

# Baseline: translation and sentence-level BLEU of the still-untrained model.
predicted = translator.translate_sentence(mytext)
bleu = sentence_bleu(prediction=[predicted], reference=[ground])
print(f"Pred: {predicted}")
print(f"Refe: {ground}")
print(f"BLEU SCORES: {bleu}")

Pred: <sos> snork bleek wek faktore faktore faktore faktore asseblief." asseblief." na\"{i}ewe na\"{i}ewe na\"{i}ewe na\"{i}ewe na\"{i}ewe gewoonlik voorstelle voorstelle voorstelle voorstelle voorstelle \texttt{artist} \texttt{artist} bladsye m.b.v.\twee m.b.v.\twee berekeninge dink verkry verkry verkry regtig regtig voëls m.b.v.\twee m.b.v.\twee dink m.b.v.\twee m.b.v.\twee berekeninge ouer \it plek waar waar frekwensie-ruimte frekwensie-ruimte regtig regtig wek geen bondels soms faktore faktore spoed spoed duiwels $y\in\{0,1\}$ rad/s sproeier tuiste maan tuiste soms manne soen frekwensie-ruimte yanni yanni verhuur engelse winkelsentrum winkelsentrum plek plek swanger gewoonlik waar waar onewe-indeks snork snork uittreeveranderlike voëls uittreeveranderlike doen doen opgewonde lys maan verkry regtig regtig beduidend regtig geregeer soms saag verhouding faktore faktore spoed spoed spoed duiwels duiwels $y\in\{0,1\}$ soos $\omega_c$ afgeleefde afgeleefde wes-europa rok rok m.b.v.\twee 

## Train the model

In [7]:
# Number of passes over the training loader.
EPOCHS = 20

# Keyword arguments forwarded to train_model. The probe sentence and its
# reference are handed over together with the translator.
params = dict(
    model=model,
    train_loader=train_loader,
    optimizer=optimizer,
    criterion=criterion,
    device=device,
    epochs=EPOCHS,
    source_test=mytext,
    reference=ground,
    translator=translator,
)

train_loss = train_model(**params)
# Persist whatever train_model returned so it can be reloaded without retraining.
np.save('lstm_train_loss.npy', np.array(train_loss))

Epoch 1/20: 100%|██████████| 20/20 [00:08<00:00,  2.43batch/s, loss=1.777]


Predicted: <sos> ek het 'n 'n <eos>
BLEU Score: [0.039, 0.031, 0.023, 0.0]


Epoch 2/20: 100%|██████████| 20/20 [00:08<00:00,  2.42batch/s, loss=1.586]


Predicted: <sos> ek het nie <eos>
BLEU Score: [0.028, 0.022, 0.017, 0.0]


Epoch 3/20: 100%|██████████| 20/20 [00:08<00:00,  2.45batch/s, loss=1.490]


Predicted: <sos> ek is nie <eos>
BLEU Score: [0.024, 0.021, 0.016, 0.0]


Epoch 4/20: 100%|██████████| 20/20 [00:08<00:00,  2.42batch/s, loss=1.455]


Predicted: <sos> die volgende is die stelsel van die volgende <eos>
BLEU Score: [0.096, 0.07, 0.052, 0.0]


Epoch 5/20: 100%|██████████| 20/20 [00:08<00:00,  2.43batch/s, loss=1.375]


Predicted: <sos> die sein $x(t)$ word deur die volgende van die volgende van die volgende van die volgende <eos>
BLEU Score: [0.237, 0.153, 0.106, 0.0]


Epoch 6/20: 100%|██████████| 20/20 [00:07<00:00,  2.50batch/s, loss=1.285]


Predicted: <sos> die volgende sein van die volgende sein $x[n]$ : <eos>
BLEU Score: [0.232, 0.154, 0.115, 0.077]


Epoch 7/20: 100%|██████████| 20/20 [00:08<00:00,  2.45batch/s, loss=1.226]


Predicted: <sos> die volgende sein $x(t)$ word deur 'n faktor omsetter met 'n laagdeurlaatfilter van 'n syfer-na-analoog omsetter (dac) met 'n syfer-na-analoog omsetter (dac) met 'n syfer-na-analoog omsetter (dac) <eos>
BLEU Score: [0.205, 0.138, 0.097, 0.0]


Epoch 8/20: 100%|██████████| 20/20 [00:08<00:00,  2.45batch/s, loss=1.121]


Predicted: <sos> die volgende sein word word deur 'n analoog-na-syfer omsetter (adc) teen 'n monsterfrekwensie van $f_s=5$ khz , en die gemiddeld , en die gemiddeld van die gemiddeld <eos>
BLEU Score: [0.293, 0.171, 0.114, 0.0]


Epoch 9/20: 100%|██████████| 20/20 [00:07<00:00,  2.70batch/s, loss=1.047]


Predicted: <sos> die volgende sein word word deur 'n analoog-na-syfer te maak van die berekeningspoed , met 'n faktor van 'n syfer-na-analoog -punt (dac) , met 'n faktor van 'n syfer-na-analoog <eos>
BLEU Score: [0.23, 0.156, 0.11, 0.0]


Epoch 10/20: 100%|██████████| 20/20 [00:07<00:00,  2.66batch/s, loss=0.976]


Predicted: <sos> die sein $x(t)$ word 'n faktor met 'n faktor met 'n faktor van 2 khz , en die derde in die spektrale frekwensie-ruimte <eos>
BLEU Score: [0.33, 0.186, 0.123, 0.0]


Epoch 11/20: 100%|██████████| 20/20 [00:07<00:00,  2.69batch/s, loss=0.864]


Predicted: <sos> ons het 'n datastel met vyf kenmerke , $x_1$ = matriek gemiddeld , $x_2$ = ik geslag , $x_3$ = geslag (1 vir vroulik en 0 vir manlik) <eos>
BLEU Score: [0.292, 0.158, 0.103, 0.0]


Epoch 12/20: 100%|██████████| 20/20 [00:07<00:00,  2.70batch/s, loss=0.831]


Predicted: <sos> veronderstel ons het 'n datastel met vyf kenmerke , $x_1$ = matriek gemiddeld , $x_2$ = ik toetspunt , $x_3$ = geslag (1 vir vroulik en 0 vir manlik) , $x_4=x_1\cdotx_2$ , en $x_5=x_1\cdotx_3$ <eos>
BLEU Score: [0.205, 0.103, 0.065, 0.0]


Epoch 13/20: 100%|██████████| 20/20 [00:07<00:00,  2.73batch/s, loss=0.741]


Predicted: <sos> ons wil 'n datastel met vyf kenmerke , $x_1$ = matriek gemiddeld , $x_2$ = ik toetspunt , $x_3$ = geslag (1 vir vroulik en 0 vir manlik) , $x_4=x_1\cdotx_2$ , en $x_5=x_1\cdotx_3$ <eos>
BLEU Score: [0.195, 0.101, 0.065, 0.0]


Epoch 14/20: 100%|██████████| 20/20 [00:07<00:00,  2.70batch/s, loss=0.668]


Predicted: <sos> veronderstel ons het die volgende datastel : <eos>
BLEU Score: [0.104, 0.077, 0.06, 0.041]


Epoch 15/20: 100%|██████████| 20/20 [00:07<00:00,  2.69batch/s, loss=0.598]


Predicted: <sos> as ons die teikenuittree voorstel as $y\in\{0,1\}$ en ons $n$ afrigpunte het , dan kan ons die negatiewe log-waarskynlikheidskostefunksie skryf as : <eos>
BLEU Score: [1.0, 1.0, 1.0, 1.0]


Epoch 16/20: 100%|██████████| 20/20 [00:07<00:00,  2.77batch/s, loss=0.556]


Predicted: <sos> as ons die teikenuittree voorstel as $y\in\{0,1\}$ en ons $n$ afrigpunte het , dan kan ons die negatiewe log-waarskynlikheidskostefunksie skryf as : <eos>
BLEU Score: [1.0, 1.0, 1.0, 1.0]


Epoch 17/20: 100%|██████████| 20/20 [00:07<00:00,  2.79batch/s, loss=0.487]


Predicted: <sos> as ons die teikenuittree voorstel as $y\in\{0,1\}$ en ons $n$ afrigpunte het , dan kan ons die negatiewe log-waarskynlikheidskostefunksie skryf as : <eos>
BLEU Score: [1.0, 1.0, 1.0, 1.0]


Epoch 18/20: 100%|██████████| 20/20 [00:07<00:00,  2.79batch/s, loss=0.437]


Predicted: <sos> as ons die teikenuittree voorstel as $y\in\{0,1\}$ en ons $n$ afrigpunte het , dan kan ons die negatiewe log-waarskynlikheidskostefunksie skryf as : <eos>
BLEU Score: [1.0, 1.0, 1.0, 1.0]


Epoch 19/20: 100%|██████████| 20/20 [00:07<00:00,  2.77batch/s, loss=0.410]


Predicted: <sos> as ons die data hier bo op hierdie manier verwerk , sal dit nie beteken dat vir elk van die drie klasse al die data rondom die oorsprong sal l\^{e} met eenheidsvariansie nie <eos>
BLEU Score: [0.318, 0.228, 0.183, 0.146]


Epoch 20/20: 100%|██████████| 20/20 [00:07<00:00,  2.72batch/s, loss=0.381]


Predicted: <sos> as ons die teikenuittree voorstel as $y\in\{0,1\}$ en ons $n$ afrigpunte het , dan kan ons die negatiewe log-waarskynlikheidskostefunksie skryf as : <eos>
BLEU Score: [1.0, 1.0, 1.0, 1.0]


## Evaluate on the training set

In [8]:
# Join every stored training sentence back into a single space-separated string.
# (presumably data_str holds one token sequence per sentence - verify in Corpus)
EN_SRC = [' '.join(tokens) for tokens in english_data.data_str]
# Each hypothesis gets its own single-element list of references.
AF_REF = [[' '.join(tokens)] for tokens in afrikaans_data.data_str]
# Translate with the translator's default decoding method.
TRANSLATED = list(map(translator.translate_sentence, EN_SRC))
corpus_bleu(TRANSLATED, AF_REF)

                                     BLEU-1                                     
------------------------------------------------------------------------------------------
bleu                : 0.6615589962626802
precisions          : [0.6615589962626802]
brevity_penalty     : 1.0
length_ratio        : 1.0146811853296496
translation_length  : 37460
reference_length    : 36918
******************************************************************************************
                                     BLEU-2                                     
------------------------------------------------------------------------------------------
bleu                : 0.5829844332544669
precisions          : [0.6615589962626802, 0.5137423137423137]
brevity_penalty     : 1.0
length_ratio        : 1.0146811853296496
translation_length  : 37460
reference_length    : 36918
******************************************************************************************
                                     BLE

## Evaluate on the validation set

In [9]:
# Read the parallel validation files, one sentence per line.
# The encoding is given explicitly because the Afrikaans text contains
# non-ASCII characters and open() otherwise falls back to the platform's locale
# encoding, which is not UTF-8 everywhere.
with open(f"{config.VAL_DATA}/english.txt", encoding="utf-8") as data:
    english_val = data.read().strip().split("\n")
with open(f"{config.VAL_DATA}/afrikaans.txt", encoding="utf-8") as data:
    afrikaans_val = data.read().strip().split("\n")

### Greedy Search

In [10]:
# Each hypothesis is scored against one reference, wrapped in its own list.
VAL_AF_REF = [[ref] for ref in afrikaans_val]

# Translate every validation sentence with the translator's default decoding.
VAL_TRANSLATED = list(map(translator.translate_sentence, english_val))

corpus_bleu(VAL_TRANSLATED, VAL_AF_REF)

                                     BLEU-1                                     
------------------------------------------------------------------------------------------
bleu                : 0.5688924218335983
precisions          : [0.5688924218335983]
brevity_penalty     : 1.0
length_ratio        : 1.0338309820572524
translation_length  : 15096
reference_length    : 14602
******************************************************************************************
                                     BLEU-2                                     
------------------------------------------------------------------------------------------
bleu                : 0.47534438680868857
precisions          : [0.5688924218335983, 0.39717928627395116]
brevity_penalty     : 1.0
length_ratio        : 1.0338309820572524
translation_length  : 15096
reference_length    : 14602
******************************************************************************************
                                     B

### Beam Search

In [11]:
# Decode the same validation sentences with beam search (width 3). This rebinds
# VAL_TRANSLATED, so the greedy translations are no longer available afterwards.
# NOTE(review): the recorded BLEU-1 here is lower than the greedy run and the
# length ratio is higher - worth checking the beam scoring in Translator.
VAL_TRANSLATED = [
    translator.translate_sentence(src, method="beam", beam_width=3)
    for src in english_val
]

corpus_bleu(VAL_TRANSLATED, VAL_AF_REF)

                                     BLEU-1                                     
------------------------------------------------------------------------------------------
bleu                : 0.5011631964638827
precisions          : [0.5011631964638827]
brevity_penalty     : 1.0
length_ratio        : 1.1775099301465553
translation_length  : 17194
reference_length    : 14602
******************************************************************************************
                                     BLEU-2                                     
------------------------------------------------------------------------------------------
bleu                : 0.4165130868062826
precisions          : [0.5011631964638827, 0.34616099646774495]
brevity_penalty     : 1.0
length_ratio        : 1.1775099301465553
translation_length  : 17194
reference_length    : 14602
******************************************************************************************
                                     BL

## Evaluate on the SUN validation set only

In [12]:
# Read the SUN-only parallel validation files, one sentence per line.
# The encoding is given explicitly because the Afrikaans text contains
# non-ASCII characters and open() otherwise falls back to the platform's locale
# encoding, which is not UTF-8 everywhere.
with open(f"{config.VAL_DATA}/sun_english.txt", encoding="utf-8") as data:
    sun_english_val = data.read().strip().split("\n")
with open(f"{config.VAL_DATA}/sun_afrikaans.txt", encoding="utf-8") as data:
    sun_afrikaans_val = data.read().strip().split("\n")

### Greedy search

In [13]:
# One single-element reference list per SUN validation sentence.
SUN_VAL_AF = [[ref] for ref in sun_afrikaans_val]
# Translate with the translator's default decoding.
SUN_VAL_TRANSLATED = list(map(translator.translate_sentence, sun_english_val))
corpus_bleu(SUN_VAL_TRANSLATED, SUN_VAL_AF)

                                     BLEU-1                                     
------------------------------------------------------------------------------------------
bleu                : 0.35993597072947625
precisions          : [0.3599359707294763]
brevity_penalty     : 1.0
length_ratio        : 1.1465652857891977
translation_length  : 4373
reference_length    : 3814
******************************************************************************************
                                     BLEU-2                                     
------------------------------------------------------------------------------------------
bleu                : 0.27034514516871405
precisions          : [0.3599359707294763, 0.20305416368408494]
brevity_penalty     : 1.0
length_ratio        : 1.1465652857891977
translation_length  : 4373
reference_length    : 3814
******************************************************************************************
                                     BLEU

### Beam search

In [14]:
# Beam-search (width 3) translations of the SUN validation sentences. This
# rebinds SUN_VAL_TRANSLATED, replacing the greedy translations.
SUN_VAL_TRANSLATED = [
    translator.translate_sentence(src, method="beam", beam_width=3)
    for src in sun_english_val
]
corpus_bleu(SUN_VAL_TRANSLATED, SUN_VAL_AF)

                                     BLEU-1                                     
------------------------------------------------------------------------------------------
bleu                : 0.30831363278171786
precisions          : [0.30831363278171786]
brevity_penalty     : 1.0
length_ratio        : 1.330886208704772
translation_length  : 5076
reference_length    : 3814
******************************************************************************************
                                     BLEU-2                                     
------------------------------------------------------------------------------------------
bleu                : 0.2282533375158917
precisions          : [0.30831363278171786, 0.1689824274621986]
brevity_penalty     : 1.0
length_ratio        : 1.330886208704772
translation_length  : 5076
reference_length    : 3814
******************************************************************************************
                                     BLEU-3

In [15]:
metric = evaluate.load("bleu")
# Qualitative check on a fixed window of SUN validation sentences (indices 10-19),
# decoded with a wider beam (5) than the corpus-level run.
SAMPLE_WINDOW = slice(10, 20)
predictions = [
    translator.translate_sentence(sent, method="beam", beam_width=5)
    for sent in sun_english_val[SAMPLE_WINDOW]
]
labels = SUN_VAL_AF[SAMPLE_WINDOW]
for source, pred, lab in zip(sun_english_val[SAMPLE_WINDOW], predictions, labels):
    # Prediction and label are clipped to 150 characters for display only;
    # BLEU is computed on the full strings.
    print(f"Source    : {source}")
    print(f"Prediction: {pred[:150]}")
    print(f"Label     : {lab[0][:150]}")
    print(f"BLEU      : {metric.compute(predictions=[pred], references=lab)['bleu']}")
    print()

Source    : <sos> component <eos>
Prediction: <sos> vir my rekenaar <eos>
Label     : <sos> komponent <eos>
BLEU      : 0.0

Source    : <sos> architecture <eos>
Prediction: <sos> vir my rekenaar <eos>
Label     : <sos> argitektuur <eos>
BLEU      : 0.0

Source    : <sos> specification <eos>
Prediction: <sos> vir my rekenaar <eos>
Label     : <sos> spesifikasies <eos>
BLEU      : 0.0

Source    : <sos> at which stage of the design process would we choose the communication protocol between subsystems <eos>
Prediction: <sos> as spar die musiek wil speel teen $1.1$ keer die spoed van die oorspronklike musiek , watter waardes sal jy vir $l_1$ en $l_2$ aanbeveel <eos>
Label     : <sos> by watter stap van die ontwerpsproses word die kommunikasie-kanaal tussen substelsels gekies <eos>
BLEU      : 0.0

Source    : <sos> motivate your answer <eos>
Prediction: <sos> motiveer jou antwoorde <eos>
Label     : <sos> motiveer jou antwoord <eos>
BLEU      : 0.5969491792019646

Source    : <sos> descri