# NEURAL MACHINE TRANSLATION - GRU

## Required Modules & Config Files

In [1]:
import src.GRU as gruNMT
from src.Tokenizer import Corpus, LangData, dataLoader
from src.utils import load_config, translate_sentece, get_device, train_model
from torch.nn import CrossEntropyLoss
from torch.optim import NAdam
import numpy as np

In [2]:
from src.utils import torch_bleu_score

In [3]:
# Load the project configuration object (it exposes the dataset paths used below, e.g. config.TRAIN_DATA)
config = load_config()
# Pick the compute device (GPU / MPS back-end / CPU); the encoder and decoder are moved onto it later
device = get_device()
print(f"Using device: {device}")

Using device: mps


## Data Preprocessing

In [4]:
# # TRAIN_DATA
# preprocess_data(config.TRAIN_RAW, config.TRAIN_DATA, config.TRAIN_AFRIKAANS, "afrikaans")
# preprocess_data(config.TRAIN_RAW, config.TRAIN_DATA, config.TRAIN_ENGLISH, "english")

In [5]:
# # VAL_DATA
# preprocess_data(config.VAL_RAW, config.VAL_DATA, config.VAL_AFRIKAANS, "afrikaans")
# preprocess_data(config.VAL_RAW, config.VAL_DATA, config.VAL_ENGLISH, "english")

## Load the dataset

In [6]:
# Training corpora: English feeds the encoder (source side),
# Afrikaans feeds the decoder (target side).
english_data = Corpus(f"{config.TRAIN_DATA}/english.txt", "English")
afrikaans_data = Corpus(f"{config.TRAIN_DATA}/afrikaans.txt", "Afrikaans")

## Set Hyperparameters

In [7]:
# Encoder - source (English): input vocabulary size and, presumably, the embedding size
IN_ENCODER = english_data.vocab_size
ENCODER_EMB = 128

# Decoder - target (Afrikaans): input and output both range over the Afrikaans vocabulary
IN_DECODER = afrikaans_data.vocab_size
OUT_DECODER = afrikaans_data.vocab_size
DECODER_EMB = 128

# Shared by the encoder and the decoder
HIDDEN_SIZE = 1024
NUM_LAYERS = 2

# Optimisation settings (LR goes to the optimizer, BATCH_SIZE to the data loader).
# NOTE(review): EPOCHS is not referenced by the training cell, which hard-codes "epochs": 5 —
# confirm which value is intended.
EPOCHS = 20
LR = 1e-3
BATCH_SIZE = 128

## Set the model

In [8]:
# Build the encoder and decoder on the selected device, then wrap them in the GRU_NMT model.
encoder_net = gruNMT.Encoder(IN_ENCODER, ENCODER_EMB, HIDDEN_SIZE, NUM_LAYERS).to(device)
decoder_net = gruNMT.Decoder(IN_DECODER, DECODER_EMB, HIDDEN_SIZE, NUM_LAYERS).to(device)
# NOTE(review): only the encoder and decoder are moved to `device` here; if GRU_NMT owns
# parameters of its own it may also need `.to(device)` — confirm against src.GRU.
model = gruNMT.GRU_NMT(encoder_net, decoder_net, OUT_DECODER)

In [9]:
# Pair the source and target corpora and batch them for training.
train_data = LangData(english_data, afrikaans_data)
train_loader = dataLoader(train_data, BATCH_SIZE)

# Look up the padding token's index in the target vocabulary and exclude exactly that
# index from the loss, instead of assuming that '<pad>' is stored at index 0.
pad_idx = afrikaans_data.stoi['<pad>']
criterion = CrossEntropyLoss(ignore_index=pad_idx)

# NAdam over all model parameters with the learning rate from the hyperparameter cell.
optimizer = NAdam(model.parameters(), LR)

In [10]:
# Sentence pair used to follow translation quality during training
mytext = "<sos> the classifier makes <num> correct positive predictions and <num> <com> <num> correct negative predictions <eos>"
ground = "<sos> die klassifiseerder maak <num> korrekte positiewe voorspellings en <num> <com> <num> korrekte negatiewe voorspellings <eos>"

# Baseline before the training cell runs: translate the sample and score it against the reference
predicted = translate_sentece(model, mytext, english_data, afrikaans_data, device)
bleu = torch_bleu_score([predicted], [ground])
print(
    f"Pred: {predicted}",
    f"Refe: {ground}",
    f"BLEU: {bleu.item()}",
    sep="\n",
)

Pred: <sos> dimensies lewer vermenigvuldiging voeg voeg sou annotasies ik ik veroorsaak gegee gegee gekies dromstel hierdie genoeg alle alle alle onafhanklik toets toets le le stabiel ik vermenigvuldiging initialisering initialisering initialisering
Refe: <sos> die klassifiseerder maak <num> korrekte positiewe voorspellings en <num> <com> <num> korrekte negatiewe voorspellings <eos>
BLEU: 0.0


## Train the model

In [11]:
# Keyword arguments for train_model: the model and its training objects, plus the
# sample sentence pair and both corpora.
# NOTE(review): "epochs" is 5 here rather than the EPOCHS constant from the
# hyperparameter cell — confirm which value is intended.
params = dict(
    model=model,
    train_loader=train_loader,
    optimizer=optimizer,
    criterion=criterion,
    device=device,
    epochs=5,
    source_test=mytext,
    target_test=ground,
    source_lang=english_data,
    target_lang=afrikaans_data,
)

train_model(**params)

Epoch 1/5: 100%|██████████| 4/4 [00:02<00:00,  1.51batch/s, loss=3.122]


Predicted: <sos> is <apo>n <apo>n <apo>n <apo>n <apo>n <apo>n <apo>n <apo>n <apo>n <apo>n <apo>n <apo>n <apo>n <apo>n <apo>n <apo>n <apo>n <apo>n <apo>n <apo>n <apo>n <apo>n <apo>n <apo>n <apo>n <apo>n <apo>n <apo>n <apo>n
Reference: <sos> die klassifiseerder maak <num> korrekte positiewe voorspellings en <num> <com> <num> korrekte negatiewe voorspellings <eos>
BLEU Score: 0.0


Epoch 2/5: 100%|██████████| 4/4 [00:02<00:00,  1.88batch/s, loss=2.765]


Predicted: <sos> die die van <ltx> <eos>
Reference: <sos> die klassifiseerder maak <num> korrekte positiewe voorspellings en <num> <com> <num> korrekte negatiewe voorspellings <eos>
BLEU Score: 0.0


Epoch 3/5: 100%|██████████| 4/4 [00:01<00:00,  2.05batch/s, loss=2.435]


Predicted: <sos> die <ltx> <ltx> <ltx> <com> <ltx> <ltx> <com> <ltx> <ltx> <com> <ltx> <ltx> <com> <ltx> <ltx> <com> <ltx> <ltx> <com> <ltx> <ltx> <com> <ltx> <ltx> <com> <ltx> <ltx> <com> <ltx>
Reference: <sos> die klassifiseerder maak <num> korrekte positiewe voorspellings en <num> <com> <num> korrekte negatiewe voorspellings <eos>
BLEU Score: 0.0


Epoch 4/5: 100%|██████████| 4/4 [00:02<00:00,  1.91batch/s, loss=2.336]


Predicted: <sos> die volgende van die volgende van die volgende <eos>
Reference: <sos> die klassifiseerder maak <num> korrekte positiewe voorspellings en <num> <com> <num> korrekte negatiewe voorspellings <eos>
BLEU Score: 0.0


Epoch 5/5: 100%|██████████| 4/4 [00:01<00:00,  2.08batch/s, loss=2.155]

Predicted: <sos> die volgende van <ltx> <com> <ltx> <com> <ltx> <com> <ltx> <com> <ltx> <com> <ltx> <com> <ltx> <com> <ltx> <com> <ltx> <com> <ltx> <com> <ltx> <com> <ltx> <com> <ltx> <com> <ltx>
Reference: <sos> die klassifiseerder maak <num> korrekte positiewe voorspellings en <num> <com> <num> korrekte negatiewe voorspellings <eos>
BLEU Score: 0.0





In [12]:
from nltk.translate.bleu_score import sentence_bleu, modified_precision, SmoothingFunction, corpus_bleu

In [13]:
# Re-join every tokenised sentence into one space-separated string; each string is kept
# inside its own one-element list (later cells index it with [0]).
EN_STR = [[joined] for joined in map(' '.join, english_data.data_str)]
AF_STR = [[joined] for joined in map(' '.join, afrikaans_data.data_str)]

In [14]:
# Translate every English sentence of the training corpus; each translation is wrapped in a
# one-element list so that TRANSLATED has the same layout as EN_STR / AF_STR.
TRANSLATED = [
    [translate_sentece(model, en_row[0], english_data, afrikaans_data, device)]
    for en_row in EN_STR
]

In [15]:
# Smoothed (method1) sentence-level BLEU of each translation against its Afrikaans reference.
# sentence_bleu takes a list of tokenised references first, then the tokenised hypothesis.
_smoothing = SmoothingFunction().method1
BLEU_SCORE = [
    sentence_bleu(
        [ref_row[0].split()],
        hyp_row[0].split(),
        smoothing_function=_smoothing,
    )
    for hyp_row, ref_row in zip(TRANSLATED, AF_STR)
]

In [16]:
# Average of the per-sentence NLTK BLEU scores; note the sentences come from the training corpus
np.mean(BLEU_SCORE)

0.04856337909640835

In [17]:
from torcheval.metrics.functional import bleu_score as torchblue

In [18]:
def torch_blue_score(candidat, reference, device=None):
    """Sentence-level BLEU of one candidate against one reference, via torcheval.

    The n-gram order is capped at the number of whitespace-separated tokens in
    the shorter of the two sentences, and never exceeds 4.

    Args:
        candidat: Candidate (translated) sentence as a whitespace-tokenised string.
        reference: Reference sentence as a whitespace-tokenised string.
        device: Optional device forwarded to torcheval's ``bleu_score``.

    Returns:
        The BLEU score as a Python float. Returns 0.0 when either sentence
        contains no tokens.
    """
    candidate_len = len(candidat.split())
    reference_len = len(reference.split())
    n_gram = min(candidate_len, reference_len, 4)
    if n_gram == 0:
        # An empty (or whitespace-only) side shares no n-grams with the other, and
        # torcheval only accepts n_gram in 1..4, so score it as 0 instead of raising.
        return 0.0
    score = torchblue(candidat, [reference], n_gram, device=device)
    return score.item()

In [19]:
# Score each translation against its reference with the torcheval-based helper, then average.
BLEU_SCORE2 = [
    torch_blue_score(hyp_row[0], ref_row[0])
    for hyp_row, ref_row in zip(TRANSLATED, AF_STR)
]
np.mean(BLEU_SCORE2)

0.02971724069591136