# NEURAL MACHINE TRANSLATION - GRU with Attention

## Required Module & Config files

In [1]:
import random

import torch
from torch.nn import CrossEntropyLoss
from torch.optim import NAdam
from torchinfo import summary

import src.RNN_GRUAttention as gruANMT
from src.Tokenizer import Corpus, LangData, dataLoader
from src.TranslatorAtt import TranslatorAtt
from src.utils import load_config, get_device, train_model, sentence_bleu, corpus_bleu

# Load the project configuration; later cells read config.TRAIN_DATA and
# config.VAL_DATA from it.
config = load_config()

# Seed the Python and PyTorch random number generators so that weight
# initialisation (and any data-pipeline randomness drawing on these generators)
# is repeatable across "Restart Kernel -> Run All".
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)

# Get device : GPU/MPS Back-End/CPU
device = get_device()
print(f"Using device: {device}")

Using the latest cached version of the module from /Users/lucien/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--bleu/9e0985c1200e367cce45605ce0ecb5ede079894e0f24f54613fca08eeb8aff76 (last modified on Thu Jul 18 16:29:52 2024) since it couldn't be found locally at evaluate-metric--bleu, or remotely on the Hugging Face Hub.


Using device: mps


## Load the dataset

In [2]:
# Parallel training corpora read from config.TRAIN_DATA:
# English feeds the encoder (source side), Afrikaans the decoder (target side).
english_data = Corpus(f"{config.TRAIN_DATA}/english.txt", "English")
afrikaans_data = Corpus(f"{config.TRAIN_DATA}/afrikaans.txt", "Afrikaans")

## Set Hyperparameters

In [3]:
# Encoder - source (English)
IN_ENCODER = english_data.vocab_size  # first argument of gruANMT.Encoder
ENCODER_EMB = 256  # second argument of gruANMT.Encoder (presumably the embedding width)

# Decoder - target (Afrikaans)
IN_DECODER = afrikaans_data.vocab_size  # first argument of gruANMT.Decoder
OUT_DECODER = afrikaans_data.vocab_size  # third argument of gruANMT.RNNAtt; same value as IN_DECODER
DECODER_EMB = 256  # second argument of gruANMT.Decoder (presumably the embedding width)

# Shared: both the encoder and the decoder are built with these two values
HIDDEN_SIZE = 1024
NUM_LAYERS = 2

# Optimisation settings
LR = 1e-3  # learning rate handed to NAdam
BATCH_SIZE = 128  # batch size handed to dataLoader

## Set the model

In [4]:
# Source-side network, placed on the selected device.
encoder_net = gruANMT.Encoder(
    IN_ENCODER,
    ENCODER_EMB,
    HIDDEN_SIZE,
    NUM_LAYERS,
    type='GRU',
).to(device)

# Target-side network, built with the same hidden size and layer count.
decoder_net = gruANMT.Decoder(
    IN_DECODER,
    DECODER_EMB,
    HIDDEN_SIZE,
    NUM_LAYERS,
    type='GRU',
).to(device)

# Wrap both halves into the attention sequence-to-sequence model.
model = gruANMT.RNNAtt(encoder_net, decoder_net, OUT_DECODER)

# Display the per-layer parameter counts.
summary(model)

Layer (type:depth-idx)                   Param #
RNNAtt                                   --
├─Encoder: 1-1                           --
│    └─GRU: 2-1                          10,235,904
│    └─Embedding: 2-2                    743,936
├─Decoder: 1-2                           --
│    └─GRU: 2-3                          10,235,904
│    └─Embedding: 2-4                    737,024
│    └─Linear: 2-5                       5,899,071
Total params: 27,851,839
Trainable params: 27,851,839
Non-trainable params: 0

In [5]:
# Pair the two corpora into one dataset and batch it for training.
train_data = LangData(english_data, afrikaans_data)
train_loader = dataLoader(train_data, BATCH_SIZE)

# Padding positions must not contribute to the loss. Take the index from the
# target vocabulary's '<pad>' entry rather than hard-coding 0, so the criterion
# stays correct if the vocabulary layout ever changes.
pad_idx = afrikaans_data.stoi['<pad>']
criterion = CrossEntropyLoss(ignore_index=pad_idx)

# NAdam over every model parameter, with the learning rate from the
# hyperparameter cell.
optimizer = NAdam(model.parameters(), LR)

# Helper used below to translate sentences with the current model.
translator = TranslatorAtt(model, english_data, afrikaans_data, device)

In [6]:
# Sentence pair used to follow up on translation quality during training.
# Raw string literals are required here: the LaTeX fragments contain "\i", "\{"
# and "\}", which are invalid escape sequences in a normal string literal and
# trigger a SyntaxWarning on recent Python versions. The resulting string values
# are unchanged.
mytext = r"<sos> given that we represent the target output as $y\in\{0,1\}$ and we have $n$ training points , we can write the negative log likelihood of the parameters as follows : <eos>"
ground = r"<sos> as ons die teikenuittree voorstel as $y\in\{0,1\}$ en ons $n$ afrigpunte het , dan kan ons die negatiewe log-waarskynlikheidskostefunksie skryf as : <eos>"

# Baseline before any training: translate the follow-up sentence with the
# freshly initialised model and score it against the reference.
predicted = translator.translate_sentence(mytext)
bleu = sentence_bleu(prediction=[predicted], reference=[ground])
print(f"Pred: {predicted}")
print(f"Refe: {ground}")
print(f"BLEU SCORES: {bleu}")

Pred: <sos> berlyn waarmee covid-19 covid-19 gegooi gegooi normaal orals normaal (roc) (roc) hardloop stelsels gegooi gegooi oorbly berlyn spektrogram geskied uitvoerende
Refe: <sos> as ons die teikenuittree voorstel as $y\in\{0,1\}$ en ons $n$ afrigpunte het , dan kan ons die negatiewe log-waarskynlikheidskostefunksie skryf as : <eos>
BLEU SCORES: [0.071, 0.059, 0.045, 0.0]


## Train the model

In [7]:
# Number of epochs handed to train_model.
EPOCHS = 20

# Keyword arguments for train_model. "source_test" and "reference" carry the
# follow-up sentence pair, and "translator" is the helper that translates it.
params = dict(
    model=model,
    train_loader=train_loader,
    optimizer=optimizer,
    criterion=criterion,
    device=device,
    epochs=EPOCHS,
    source_test=mytext,
    reference=ground,
    translator=translator,
)

train_model(**params)

Using the latest cached version of the module from /Users/lucien/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--bleu/9e0985c1200e367cce45605ce0ecb5ede079894e0f24f54613fca08eeb8aff76 (last modified on Thu Jul 18 16:29:52 2024) since it couldn't be found locally at evaluate-metric--bleu, or remotely on the Hugging Face Hub.
Epoch 1/20: 100%|██████████| 20/20 [00:11<00:00,  1.80batch/s, loss=1.674]


Predicted: <sos> die volgende die volgende die volgende die volgende : die volgende : die volgende word die volgende : : <eos>
BLEU Score: [0.206, 0.156, 0.122, 0.083]


Epoch 2/20: 100%|██████████| 20/20 [00:10<00:00,  1.85batch/s, loss=1.436]


Predicted: <sos> die rekening die volgende van die volgende van die volgende van die volgende van die volgende van die volgende van
BLEU Score: [0.108, 0.07, 0.049, 0.0]


Epoch 3/20: 100%|██████████| 20/20 [00:10<00:00,  1.85batch/s, loss=1.210]


Predicted: <sos> ons het die data van die data en die data in die gemiddeld van die gemiddeld , en die gemiddeld
BLEU Score: [0.217, 0.099, 0.062, 0.0]


Epoch 4/20: 100%|██████████| 20/20 [00:10<00:00,  1.88batch/s, loss=0.928]


Predicted: <sos> ons wil die data van die dac voor presies en ons bereken kan as ons stompies basismodel basismodel en bereken
BLEU Score: [0.239, 0.147, 0.08, 0.0]


Epoch 5/20: 100%|██████████| 20/20 [00:11<00:00,  1.77batch/s, loss=0.766]


Predicted: <sos> dit ons die die wins bayes aanname en ons kan indink as die outokorrelasie wat die oorsprong se salaris en
BLEU Score: [0.217, 0.14, 0.078, 0.0]


Epoch 6/20: 100%|██████████| 20/20 [00:11<00:00,  1.76batch/s, loss=0.596]


Predicted: <sos> dit lyk ons ons die aanvanklike as die dft van die resultaat en ons kan as ons die vereistes log-waarskynlikheidskostefunksie
BLEU Score: [0.282, 0.196, 0.122, 0.0]


Epoch 7/20: 100%|██████████| 20/20 [00:11<00:00,  1.75batch/s, loss=0.449]


Predicted: <sos> dit is ons as die intree en die hoof van die klassifiseerder voor as ons slegs die dft van die
BLEU Score: [0.217, 0.121, 0.07, 0.0]


Epoch 8/20: 100%|██████████| 20/20 [00:11<00:00,  1.80batch/s, loss=0.370]


Predicted: <sos> as ons die teikenuittree voorstel as $y\in\{0,1\}$ en ons $n$ afrigpunte het , dan kan ons die negatiewe log-waarskynlikheidskostefunksie skryf
BLEU Score: [0.863, 0.863, 0.863, 0.863]


Epoch 9/20: 100%|██████████| 20/20 [00:11<00:00,  1.76batch/s, loss=0.328]


Predicted: <sos> as ons die teikenuittree voorstel as $y\in\{0,1\}$ en ons $n$ afrigpunte het , dan kan ons die negatiewe log-waarskynlikheidskostefunksie skryf
BLEU Score: [0.863, 0.863, 0.863, 0.863]


Epoch 10/20: 100%|██████████| 20/20 [00:11<00:00,  1.81batch/s, loss=0.282]


Predicted: <sos> as ons die teikenuittree voorstel as $y\in\{0,1\}$ en ons $n$ afrigpunte het , dan kan ons die negatiewe log-waarskynlikheidskostefunksie skryf
BLEU Score: [0.863, 0.863, 0.863, 0.863]


Epoch 11/20: 100%|██████████| 20/20 [00:11<00:00,  1.76batch/s, loss=0.273]


Predicted: <sos> as ons die teikenuittree voorstel as $y\in\{0,1\}$ en ons $n$ afrigpunte het , dan kan ons die negatiewe log-waarskynlikheidskostefunksie skryf
BLEU Score: [0.863, 0.863, 0.863, 0.863]


Epoch 12/20: 100%|██████████| 20/20 [00:10<00:00,  1.83batch/s, loss=0.252]


Predicted: <sos> ons het die volgende ontwerpsmatriks : , en ons het die negatiewe log-waarskynlikheidskostefunksie skryf as : <eos>
BLEU Score: [0.378, 0.302, 0.257, 0.223]


Epoch 13/20: 100%|██████████| 20/20 [00:10<00:00,  1.82batch/s, loss=0.248]


Predicted: <sos> as ons die teikenuittree voorstel as $y\in\{0,1\}$ en ons $n$ afrigpunte het , dan kan ons die negatiewe log-waarskynlikheidskostefunksie skryf
BLEU Score: [0.863, 0.863, 0.863, 0.863]


Epoch 14/20: 100%|██████████| 20/20 [00:11<00:00,  1.75batch/s, loss=0.249]


Predicted: <sos> as ons die teikenuittree voorstel as $y\in\{0,1\}$ en ons $n$ afrigpunte het , dan kan ons die negatiewe log-waarskynlikheidskostefunksie skryf
BLEU Score: [0.863, 0.863, 0.863, 0.863]


Epoch 15/20: 100%|██████████| 20/20 [00:12<00:00,  1.66batch/s, loss=0.253]


Predicted: <sos> as ons die teikenuittree voorstel as $y\in\{0,1\}$ en ons $n$ afrigpunte het , dan kan ons die negatiewe log-waarskynlikheidskostefunksie skryf
BLEU Score: [0.863, 0.863, 0.863, 0.863]


Epoch 16/20: 100%|██████████| 20/20 [00:11<00:00,  1.73batch/s, loss=0.252]


Predicted: <sos> as ons die teikenuittree voorstel as $y\in\{0,1\}$ en ons $n$ afrigpunte het , dan kan ons die negatiewe log-waarskynlikheidskostefunksie skryf
BLEU Score: [0.863, 0.863, 0.863, 0.863]


Epoch 17/20: 100%|██████████| 20/20 [00:11<00:00,  1.77batch/s, loss=0.244]


Predicted: <sos> as ons die teikenuittree voorstel as $y\in\{0,1\}$ en ons $n$ afrigpunte het , dan kan ons die negatiewe log-waarskynlikheidskostefunksie skryf
BLEU Score: [0.863, 0.863, 0.863, 0.863]


Epoch 18/20: 100%|██████████| 20/20 [00:11<00:00,  1.73batch/s, loss=0.248]


Predicted: <sos> as ons die teikenuittree voorstel as $y\in\{0,1\}$ en ons $n$ afrigpunte het , dan kan ons die negatiewe log-waarskynlikheidskostefunksie skryf
BLEU Score: [0.863, 0.863, 0.863, 0.863]


Epoch 19/20: 100%|██████████| 20/20 [00:11<00:00,  1.79batch/s, loss=0.238]


Predicted: <sos> as ons die teikenuittree voorstel as $y\in\{0,1\}$ en ons $n$ afrigpunte het , dan kan ons die negatiewe log-waarskynlikheidskostefunksie skryf
BLEU Score: [0.863, 0.863, 0.863, 0.863]


Epoch 20/20: 100%|██████████| 20/20 [00:11<00:00,  1.74batch/s, loss=0.247]

Predicted: <sos> as ons die teikenuittree voorstel as $y\in\{0,1\}$ en ons $n$ afrigpunte het , dan kan ons die negatiewe log-waarskynlikheidskostefunksie skryf
BLEU Score: [0.863, 0.863, 0.863, 0.863]





## Evaluate on the training set

In [8]:
# Rebuild space-separated sentences from each entry of the corpora's data_str.
EN_SRC = list(map(' '.join, english_data.data_str))
# Every source sentence gets a one-element list of references.
AF_REF = [[' '.join(tokens)] for tokens in afrikaans_data.data_str]
# Translate the whole training set with translate_sentence's default decoding.
TRANSLATED = list(map(translator.translate_sentence, EN_SRC))
corpus_bleu(TRANSLATED, AF_REF)

Using the latest cached version of the module from /Users/lucien/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--bleu/9e0985c1200e367cce45605ce0ecb5ede079894e0f24f54613fca08eeb8aff76 (last modified on Thu Jul 18 16:29:52 2024) since it couldn't be found locally at evaluate-metric--bleu, or remotely on the Hugging Face Hub.


                                     BLEU-1                                     
------------------------------------------------------------------------------------------
bleu                : 0.8139750544813836
precisions          : [0.8139750544813836]
brevity_penalty     : 1.0
length_ratio        : 1.1684997561900634
translation_length  : 43134
reference_length    : 36914
******************************************************************************************
                                     BLEU-2                                     
------------------------------------------------------------------------------------------
bleu                : 0.7986340986378577
precisions          : [0.8139750544813836, 0.7835822731858559]
brevity_penalty     : 1.0
length_ratio        : 1.1684997561900634
translation_length  : 43134
reference_length    : 36914
******************************************************************************************
                                     BLE

## Evaluate on the validation set

In [9]:
# Read the validation sentences, one per line. The encoding is given explicitly
# because open() otherwise falls back to a locale-dependent default, which can
# mis-decode accented Afrikaans characters on some platforms.
with open(f"{config.VAL_DATA}/english.txt", encoding="utf-8") as data:
    english_val = data.read().strip().split("\n")
with open(f"{config.VAL_DATA}/afrikaans.txt", encoding="utf-8") as data:
    afrikaans_val = data.read().strip().split("\n")

### Greedy Search

In [10]:
# One single-reference list per validation sentence.
VAL_AF_REF = [[reference] for reference in afrikaans_val]
# Translate each English validation sentence with the default decoding method.
VAL_TRANSLATED = list(map(translator.translate_sentence, english_val))

corpus_bleu(VAL_TRANSLATED, VAL_AF_REF)

Using the latest cached version of the module from /Users/lucien/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--bleu/9e0985c1200e367cce45605ce0ecb5ede079894e0f24f54613fca08eeb8aff76 (last modified on Thu Jul 18 16:29:52 2024) since it couldn't be found locally at evaluate-metric--bleu, or remotely on the Hugging Face Hub.


                                     BLEU-1                                     
------------------------------------------------------------------------------------------
bleu                : 0.5259002919865903
precisions          : [0.5259002919865903]
brevity_penalty     : 1.0
length_ratio        : 1.2661919759003148
translation_length  : 18494
reference_length    : 14606
******************************************************************************************
                                     BLEU-2                                     
------------------------------------------------------------------------------------------
bleu                : 0.44561595624217826
precisions          : [0.5259002919865903, 0.3775878878247405]
brevity_penalty     : 1.0
length_ratio        : 1.2661919759003148
translation_length  : 18494
reference_length    : 14606
******************************************************************************************
                                     BL

### Beam search

In [11]:
# Translate the validation sentences again, this time with beam search of
# width 2, and score them against the VAL_AF_REF references.
VAL_TRANSLATED = [
    translator.translate_sentence(source, method="beam", beam_width=2)
    for source in english_val
]
corpus_bleu(VAL_TRANSLATED, VAL_AF_REF)

Using the latest cached version of the module from /Users/lucien/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--bleu/9e0985c1200e367cce45605ce0ecb5ede079894e0f24f54613fca08eeb8aff76 (last modified on Thu Jul 18 16:29:52 2024) since it couldn't be found locally at evaluate-metric--bleu, or remotely on the Hugging Face Hub.


                                     BLEU-1                                     
------------------------------------------------------------------------------------------
bleu                : 0.6418864370290636
precisions          : [0.6418864370290636]
brevity_penalty     : 1.0
length_ratio        : 1.0176639737094344
translation_length  : 14864
reference_length    : 14606
******************************************************************************************
                                     BLEU-2                                     
------------------------------------------------------------------------------------------
bleu                : 0.5477221064678344
precisions          : [0.6418864370290636, 0.46737162308973706]
brevity_penalty     : 1.0
length_ratio        : 1.0176639737094344
translation_length  : 14864
reference_length    : 14606
******************************************************************************************
                                     BL

## Evaluate on the SUN validation set only

In [12]:
# Read the SUN validation sentences, one per line. The encoding is given
# explicitly because open() otherwise falls back to a locale-dependent default,
# which can mis-decode accented Afrikaans characters on some platforms.
with open(f"{config.VAL_DATA}/sun_english.txt", encoding="utf-8") as data:
    sun_english_val = data.read().strip().split("\n")
with open(f"{config.VAL_DATA}/sun_afrikaans.txt", encoding="utf-8") as data:
    sun_afrikaans_val = data.read().strip().split("\n")

### Greedy Search

In [13]:
# One single-reference list per SUN validation sentence.
SUN_VAL_AF = [[reference] for reference in sun_afrikaans_val]
# Translate each SUN English sentence with the default decoding method.
SUN_VAL_TRANSLATED = list(map(translator.translate_sentence, sun_english_val))
corpus_bleu(SUN_VAL_TRANSLATED, SUN_VAL_AF)

Using the latest cached version of the module from /Users/lucien/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--bleu/9e0985c1200e367cce45605ce0ecb5ede079894e0f24f54613fca08eeb8aff76 (last modified on Thu Jul 18 16:29:52 2024) since it couldn't be found locally at evaluate-metric--bleu, or remotely on the Hugging Face Hub.


                                     BLEU-1                                     
------------------------------------------------------------------------------------------
bleu                : 0.3556851311953353
precisions          : [0.3556851311953353]
brevity_penalty     : 1.0
length_ratio        : 1.169113791295228
translation_length  : 4459
reference_length    : 3814
******************************************************************************************
                                     BLEU-2                                     
------------------------------------------------------------------------------------------
bleu                : 0.2658720554172127
precisions          : [0.3556851311953353, 0.19873743277998598]
brevity_penalty     : 1.0
length_ratio        : 1.169113791295228
translation_length  : 4459
reference_length    : 3814
******************************************************************************************
                                     BLEU-3  

### Beam Search

In [14]:
# Translate the SUN validation sentences again with beam search of width 2 and
# score them against the SUN_VAL_AF references.
SUN_VAL_TRANSLATED = [
    translator.translate_sentence(source, method="beam", beam_width=2)
    for source in sun_english_val
]
corpus_bleu(SUN_VAL_TRANSLATED, SUN_VAL_AF)

Using the latest cached version of the module from /Users/lucien/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--bleu/9e0985c1200e367cce45605ce0ecb5ede079894e0f24f54613fca08eeb8aff76 (last modified on Thu Jul 18 16:29:52 2024) since it couldn't be found locally at evaluate-metric--bleu, or remotely on the Hugging Face Hub.


                                     BLEU-1                                     
------------------------------------------------------------------------------------------
bleu                : 0.4316832983984206
precisions          : [0.4908284023668639]
brevity_penalty     : 0.8794994265139612
length_ratio        : 0.8862087047718931
translation_length  : 3380
reference_length    : 3814
******************************************************************************************
                                     BLEU-2                                     
------------------------------------------------------------------------------------------
bleu                : 0.3347704922746263
precisions          : [0.4908284023668639, 0.2951844903064415]
brevity_penalty     : 0.8794994265139612
length_ratio        : 0.8862087047718931
translation_length  : 3380
reference_length    : 3814
******************************************************************************************
              