# NEURAL MACHINE TRANSLATION - Vanilla RNN

## Required modules & config file

In [1]:
import src.RNN_GRU as rnnNMT
from src.Tokenizer import Corpus, LangData, dataLoader
from src.utils import load_config, get_device, train_model, sentence_bleu, corpus_bleu
from src.Translator import Translator
from torch.nn import CrossEntropyLoss
from torch.optim import NAdam
import evaluate
import numpy as np
from torchinfo import summary

# Load the project configuration; later cells read config.TRAIN_DATA and
# config.VAL_DATA from it to locate the corpus files.
config = load_config()
# Select the compute device (GPU / MPS back-end / CPU) that the encoder and
# decoder are moved to and that is handed to the translator and training loop.
device = get_device()
print(f"Using device: {device}")

Using the latest cached version of the module from /Users/lucien/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--bleu/9e0985c1200e367cce45605ce0ecb5ede079894e0f24f54613fca08eeb8aff76 (last modified on Thu Jul 18 16:29:52 2024) since it couldn't be found locally at evaluate-metric--bleu, or remotely on the Hugging Face Hub.


Using device: mps


## Load the dataset

In [2]:
# Parallel training corpora: English is the encoder (source) side and
# Afrikaans the decoder (target) side.
english_data = Corpus(f"{config.TRAIN_DATA}/english.txt", "English")
afrikaans_data = Corpus(f"{config.TRAIN_DATA}/afrikaans.txt", "Afrikaans")

## Set Hyperparameters

In [3]:
# Vocabulary sizes are taken directly from the loaded corpora.
IN_ENCODER = english_data.vocab_size     # encoder input: source (English) vocabulary
IN_DECODER = afrikaans_data.vocab_size   # decoder input: target (Afrikaans) vocabulary
OUT_DECODER = afrikaans_data.vocab_size  # decoder output: target (Afrikaans) vocabulary

# Embedding widths for the source and target sides.
ENCODER_EMB = 256
DECODER_EMB = 256

# Recurrent stack dimensions, shared by encoder and decoder.
HIDDEN_SIZE = 1024
NUM_LAYERS = 2

# Optimisation settings.
LR = 1e-4
BATCH_SIZE = 128

## Set the model

In [4]:
# Build both halves of the network first, then move them to the selected device.
encoder_net = rnnNMT.Encoder(IN_ENCODER, ENCODER_EMB, HIDDEN_SIZE, NUM_LAYERS)
encoder_net = encoder_net.to(device)

decoder_net = rnnNMT.Decoder(IN_DECODER, DECODER_EMB, HIDDEN_SIZE, NUM_LAYERS)
decoder_net = decoder_net.to(device)

# Wrap encoder and decoder into the full translation model.
model = rnnNMT.RNN_NMT(encoder_net, decoder_net, OUT_DECODER)

# Last expression of the cell: displays the per-layer parameter counts.
summary(model)

Layer (type:depth-idx)                   Param #
RNN_NMT                                  --
├─Encoder: 1-1                           --
│    └─RNN: 2-1                          3,411,968
│    └─Embedding: 2-2                    743,936
├─Decoder: 1-2                           --
│    └─RNN: 2-3                          3,411,968
│    └─Embedding: 2-4                    737,024
│    └─Linear: 2-5                       2,950,975
Total params: 11,255,871
Trainable params: 11,255,871
Non-trainable params: 0

In [5]:
# Pair the source and target corpora and wrap them in a loader using BATCH_SIZE.
train_data = LangData(english_data, afrikaans_data)
train_loader = dataLoader(train_data, BATCH_SIZE)

# Padding positions in the target must not contribute to the loss. The <pad>
# index is looked up in the target vocabulary instead of being hard-coded, so
# the criterion stays correct even if <pad> is not mapped to index 0.
pad_idx = afrikaans_data.stoi['<pad>']
criterion = CrossEntropyLoss(ignore_index=pad_idx)

optimizer = NAdam(model.parameters(), lr=LR)
# Translation helper around the model; used for the follow-up sentence during
# training and for all BLEU evaluations below.
translator = Translator(model, english_data, afrikaans_data, device)

In [6]:
# Fixed sentence pair used to follow translation quality during training.
# Raw strings keep the LaTeX backslashes (\in, \{, \}) literal instead of
# relying on Python passing unrecognised escape sequences through, which
# triggers an invalid-escape warning on current Python versions.
mytext = r"<sos> given that we represent the target output as $y\in\{0,1\}$ and we have $n$ training points , we can write the negative log likelihood of the parameters as follows: <eos>"
ground = r"<sos> as ons die teikenuittree voorstel as $y\in\{0,1\}$ en ons $n$ afrigpunte het , dan kan ons die negatiewe log-waarskynlikheidskostefunksie skryf as: <eos>"

# Baseline before training: translate the sentence once and score it against
# the reference translation.
predicted = translator.translate_sentence(mytext)
bleu = sentence_bleu(prediction=[predicted], reference=[ground])
print(f"Pred: {predicted}")
print(f"Refe: {ground}")
print(f"BLEU SCORES: {bleu}")

Pred: <sos> hengel $l_2$ verskaf $\omega=\frac{\pi}{2}$ \textbf{nie} grappe maklike metode i mnr standardiseer twyfel nag oorsee $x[n]\,\raisebox{.5pt}{\textcircled{\footnotesize\raisebox{-.9pt}{$ gewoonlik probleem mnr. asemhaal laas voorspel hout italiaans susters onmiddelike reguit wolfgang sin daarntoe gestraf verdrink situasie greep voorspellings walglike anti-kousaal spookasem ure ag aangetoon hieroor gevier (statistiese lemoene $\hat{x}[k]$ kenmerk $w_3=35$ snobisties stabiel rook tydens aangesien "lisa stasie bestuur duurder rede nag bed kleurbeeld gewigsvektor gesing s proe straat ik hiervan klip bedink werkloos parys huwelik bewerkings indink gewees hierdie doodgemaak pretparke dam gedank van spontaan opeet sap meisie tand woordebook minimum oorbetaal verloor daarvan getuienis voldoen gekies $x_1$ beramings $x[n]=\cos(2\pif_{\omega,0}n)$ lyn geplaas gebeur." stedelike 7 $w_5=-10$ winkelier stoof som matriek situasie vervang klim opgee beloof aandag "pragtige wesens onafhankli

## Train the model

In [7]:
EPOCHS = 20

# Keyword arguments for train_model, gathered in one place. The follow-up
# sentence pair and the translator are passed along so a sample translation
# and its BLEU scores can be reported after every epoch.
params = dict(
    model=model,
    train_loader=train_loader,
    optimizer=optimizer,
    criterion=criterion,
    device=device,
    epochs=EPOCHS,
    source_test=mytext,
    reference=ground,
    translator=translator,
)

train_model(**params)

Using the latest cached version of the module from /Users/lucien/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--bleu/9e0985c1200e367cce45605ce0ecb5ede079894e0f24f54613fca08eeb8aff76 (last modified on Thu Jul 18 16:29:52 2024) since it couldn't be found locally at evaluate-metric--bleu, or remotely on the Hugging Face Hub.
Epoch 1/20: 100%|██████████| 20/20 [00:04<00:00,  4.33batch/s, loss=1.942]


Predicted: <sos> ek het nie <eos>
BLEU Score: [0.028, 0.022, 0.017, 0.0]


Epoch 2/20: 100%|██████████| 20/20 [00:03<00:00,  5.58batch/s, loss=1.579]


Predicted: <sos> ek het nie <eos>
BLEU Score: [0.028, 0.022, 0.017, 0.0]


Epoch 3/20: 100%|██████████| 20/20 [00:03<00:00,  5.14batch/s, loss=1.575]


Predicted: <sos> ek het nie <eos>
BLEU Score: [0.028, 0.022, 0.017, 0.0]


Epoch 4/20: 100%|██████████| 20/20 [00:03<00:00,  5.20batch/s, loss=1.604]


Predicted: <sos> ek is nie <eos>
BLEU Score: [0.024, 0.021, 0.016, 0.0]


Epoch 5/20: 100%|██████████| 20/20 [00:03<00:00,  5.22batch/s, loss=1.583]


Predicted: <sos> ek is nie <eos>
BLEU Score: [0.024, 0.021, 0.016, 0.0]


Epoch 6/20: 100%|██████████| 20/20 [00:03<00:00,  5.27batch/s, loss=1.572]


Predicted: <sos> ek is nie <eos>
BLEU Score: [0.024, 0.021, 0.016, 0.0]


Epoch 7/20: 100%|██████████| 20/20 [00:03<00:00,  5.21batch/s, loss=1.585]


Predicted: <sos> ek het nie <eos>
BLEU Score: [0.028, 0.022, 0.017, 0.0]


Epoch 8/20: 100%|██████████| 20/20 [00:03<00:00,  5.04batch/s, loss=1.535]


Predicted: <sos> ek het nie <eos>
BLEU Score: [0.028, 0.022, 0.017, 0.0]


Epoch 9/20: 100%|██████████| 20/20 [00:03<00:00,  5.22batch/s, loss=1.530]


Predicted: <sos> die is van die volgende <eos>
BLEU Score: [0.057, 0.042, 0.031, 0.0]


Epoch 10/20: 100%|██████████| 20/20 [00:03<00:00,  5.35batch/s, loss=1.468]


Predicted: <sos> die is is die <eos>
BLEU Score: [0.044, 0.033, 0.025, 0.0]


Epoch 11/20: 100%|██████████| 20/20 [00:03<00:00,  5.36batch/s, loss=1.476]


Predicted: <sos> die is is <eos>
BLEU Score: [0.028, 0.022, 0.017, 0.0]


Epoch 12/20: 100%|██████████| 20/20 [00:03<00:00,  5.29batch/s, loss=1.485]


Predicted: <sos> die is is die <eos>
BLEU Score: [0.044, 0.033, 0.025, 0.0]


Epoch 13/20: 100%|██████████| 20/20 [00:03<00:00,  5.24batch/s, loss=1.495]


Predicted: <sos> die is is 'n <eos>
BLEU Score: [0.039, 0.031, 0.023, 0.0]


Epoch 14/20: 100%|██████████| 20/20 [00:03<00:00,  5.28batch/s, loss=1.479]


Predicted: <sos> die die is van die <eos>
BLEU Score: [0.057, 0.042, 0.031, 0.0]


Epoch 15/20: 100%|██████████| 20/20 [00:03<00:00,  5.62batch/s, loss=1.422]


Predicted: <sos> die die is van die <eos>
BLEU Score: [0.057, 0.042, 0.031, 0.0]


Epoch 16/20: 100%|██████████| 20/20 [00:03<00:00,  5.37batch/s, loss=1.443]


Predicted: <sos> die die is van die <eos>
BLEU Score: [0.057, 0.042, 0.031, 0.0]


Epoch 17/20: 100%|██████████| 20/20 [00:03<00:00,  5.40batch/s, loss=1.426]


Predicted: <sos> die die is van die <eos>
BLEU Score: [0.057, 0.042, 0.031, 0.0]


Epoch 18/20: 100%|██████████| 20/20 [00:03<00:00,  5.21batch/s, loss=1.387]


Predicted: <sos> die die is van die <eos>
BLEU Score: [0.057, 0.042, 0.031, 0.0]


Epoch 19/20: 100%|██████████| 20/20 [00:03<00:00,  5.23batch/s, loss=1.420]


Predicted: <sos> die die is van die <eos>
BLEU Score: [0.057, 0.042, 0.031, 0.0]


Epoch 20/20: 100%|██████████| 20/20 [00:03<00:00,  5.12batch/s, loss=1.429]

Predicted: <sos> die die is van die <eos>
BLEU Score: [0.057, 0.042, 0.031, 0.0]





## Evaluate on the Training set

In [8]:
# Rebuild whitespace-joined sentences from the corpora's token sequences.
EN_SRC = list(map(' '.join, english_data.data_str))
# Each hypothesis gets exactly one reference, wrapped in its own list.
AF_REF = [[' '.join(tokens)] for tokens in afrikaans_data.data_str]

# Translate every training sentence with the translator's default settings.
TRANSLATED = list(map(translator.translate_sentence, EN_SRC))
corpus_bleu(TRANSLATED, AF_REF)

Using the latest cached version of the module from /Users/lucien/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--bleu/9e0985c1200e367cce45605ce0ecb5ede079894e0f24f54613fca08eeb8aff76 (last modified on Thu Jul 18 16:29:52 2024) since it couldn't be found locally at evaluate-metric--bleu, or remotely on the Hugging Face Hub.


                                     BLEU-1                                     
------------------------------------------------------------------------------------------
bleu                : 0.42831771642061256
precisions          : [0.6074130935409584]
brevity_penalty     : 0.7051506149195823
length_ratio        : 0.7411009373137563
translation_length  : 27357
reference_length    : 36914
******************************************************************************************
                                     BLEU-2                                     
------------------------------------------------------------------------------------------
bleu                : 0.3517000896145005
precisions          : [0.6074130935409584, 0.4095406644678626]
brevity_penalty     : 0.7051506149195823
length_ratio        : 0.7411009373137563
translation_length  : 27357
reference_length    : 36914
******************************************************************************************
         

## Evaluate on the Validation set

In [9]:
# Read the validation sentences, one per line. The encoding is given explicitly
# so non-ASCII characters decode the same way on every platform instead of
# depending on the locale's default encoding.
with open(f"{config.VAL_DATA}/english.txt", encoding="utf-8") as data:
    english_val = data.read().strip().split("\n")
with open(f"{config.VAL_DATA}/afrikaans.txt", encoding="utf-8") as data:
    afrikaans_val = data.read().strip().split("\n")

# The two files are parallel: line i of one is the translation of line i of the other.
assert len(english_val) == len(afrikaans_val), "validation files are not line-aligned"

### Greedy Search

In [10]:
# Each hypothesis is scored against a single reference, wrapped in its own list.
VAL_AF_REF = [[reference] for reference in afrikaans_val]
# Translate with the translator's default decoding settings (the greedy run of
# this section).
VAL_TRANSLATED = list(map(translator.translate_sentence, english_val))

corpus_bleu(VAL_TRANSLATED, VAL_AF_REF)

Using the latest cached version of the module from /Users/lucien/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--bleu/9e0985c1200e367cce45605ce0ecb5ede079894e0f24f54613fca08eeb8aff76 (last modified on Thu Jul 18 16:29:52 2024) since it couldn't be found locally at evaluate-metric--bleu, or remotely on the Hugging Face Hub.


                                     BLEU-1                                     
------------------------------------------------------------------------------------------
bleu                : 0.4744678564312928
precisions          : [0.6181818181818182]
brevity_penalty     : 0.7675215324623854
length_ratio        : 0.7907709160618924
translation_length  : 11550
reference_length    : 14606
******************************************************************************************
                                     BLEU-2                                     
------------------------------------------------------------------------------------------
bleu                : 0.38801062613715653
precisions          : [0.6181818181818182, 0.4134184694558277]
brevity_penalty     : 0.7675215324623854
length_ratio        : 0.7907709160618924
translation_length  : 11550
reference_length    : 14606
******************************************************************************************
         

### Beam search

In [11]:
# Same validation sentences, decoded with beam search (beam width 2).
# NOTE: this rebinds VAL_TRANSLATED, replacing the greedy translations.
VAL_TRANSLATED = [
    translator.translate_sentence(source, method="beam", beam_width=2)
    for source in english_val
]
corpus_bleu(VAL_TRANSLATED, VAL_AF_REF)

Using the latest cached version of the module from /Users/lucien/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--bleu/9e0985c1200e367cce45605ce0ecb5ede079894e0f24f54613fca08eeb8aff76 (last modified on Thu Jul 18 16:29:52 2024) since it couldn't be found locally at evaluate-metric--bleu, or remotely on the Hugging Face Hub.


                                     BLEU-1                                     
------------------------------------------------------------------------------------------
bleu                : 0.4854298546571868
precisions          : [0.6140706916962625]
brevity_penalty     : 0.7905113551605468
length_ratio        : 0.8096672600301246
translation_length  : 11826
reference_length    : 14606
******************************************************************************************
                                     BLEU-2                                     
------------------------------------------------------------------------------------------
bleu                : 0.3971772551533922
precisions          : [0.6140706916962625, 0.41108738044386667]
brevity_penalty     : 0.7905113551605468
length_ratio        : 0.8096672600301246
translation_length  : 11826
reference_length    : 14606
******************************************************************************************
         

## Evaluate on the SUN validation set only

In [12]:
# Read the SUN validation sentences, one per line. The encoding is given
# explicitly so non-ASCII characters decode the same way on every platform
# instead of depending on the locale's default encoding.
with open(f"{config.VAL_DATA}/sun_english.txt", encoding="utf-8") as data:
    sun_english_val = data.read().strip().split("\n")
with open(f"{config.VAL_DATA}/sun_afrikaans.txt", encoding="utf-8") as data:
    sun_afrikaans_val = data.read().strip().split("\n")

# The two files are parallel: line i of one is the translation of line i of the other.
assert len(sun_english_val) == len(sun_afrikaans_val), "SUN validation files are not line-aligned"

### Greedy Search

In [13]:
# One single-reference list per SUN validation sentence.
SUN_VAL_AF = [[reference] for reference in sun_afrikaans_val]
# Translate with the translator's default decoding settings (the greedy run of
# this section).
SUN_VAL_TRANSLATED = list(map(translator.translate_sentence, sun_english_val))
corpus_bleu(SUN_VAL_TRANSLATED, SUN_VAL_AF)

Using the latest cached version of the module from /Users/lucien/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--bleu/9e0985c1200e367cce45605ce0ecb5ede079894e0f24f54613fca08eeb8aff76 (last modified on Thu Jul 18 16:29:52 2024) since it couldn't be found locally at evaluate-metric--bleu, or remotely on the Hugging Face Hub.


                                     BLEU-1                                     
------------------------------------------------------------------------------------------
bleu                : 0.2873580445002975
precisions          : [0.7227822580645161]
brevity_penalty     : 0.39757207830445623
length_ratio        : 0.5201887781856319
translation_length  : 1984
reference_length    : 3814
******************************************************************************************
                                     BLEU-2                                     
------------------------------------------------------------------------------------------
bleu                : 0.22323048227434308
precisions          : [0.7227822580645161, 0.4361820199778024]
brevity_penalty     : 0.39757207830445623
length_ratio        : 0.5201887781856319
translation_length  : 1984
reference_length    : 3814
******************************************************************************************
           

### Beam Search

In [15]:
# Same SUN validation sentences, decoded with beam search (beam width 2).
# NOTE: this rebinds SUN_VAL_TRANSLATED, replacing the greedy translations.
SUN_VAL_TRANSLATED = [
    translator.translate_sentence(source, method="beam", beam_width=2)
    for source in sun_english_val
]
corpus_bleu(SUN_VAL_TRANSLATED, SUN_VAL_AF)

Using the latest cached version of the module from /Users/lucien/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--bleu/9e0985c1200e367cce45605ce0ecb5ede079894e0f24f54613fca08eeb8aff76 (last modified on Thu Jul 18 16:29:52 2024) since it couldn't be found locally at evaluate-metric--bleu, or remotely on the Hugging Face Hub.


                                     BLEU-1                                     
------------------------------------------------------------------------------------------
bleu                : 0.279813426682262
precisions          : [0.7044881492687847]
brevity_penalty     : 0.3971868468940054
length_ratio        : 0.5199265862611432
translation_length  : 1983
reference_length    : 3814
******************************************************************************************
                                     BLEU-2                                     
------------------------------------------------------------------------------------------
bleu                : 0.21769851053863953
precisions          : [0.7044881492687847, 0.4264297612437535]
brevity_penalty     : 0.3971868468940054
length_ratio        : 0.5199265862611432
translation_length  : 1983
reference_length    : 3814
******************************************************************************************
              