In [74]:
from src.Normalizer import preprocess_data
from typing import Dict

# DATA PERPARATION

In [75]:
DATA_BASE = "./data"
TRAIN_RAW = f"{DATA_BASE}/train"
TRAIN_DATA = f"{TRAIN_RAW}/norm"

VAL_RAW = f"{DATA_BASE}/val"
VAL_DATA = f"{VAL_RAW}/norm"

TRAIN_AFRIKAANS = [
	"data414_2021_a1.af.txt",
	"data414_2021_a2.af.txt",
	"data414_2020_a1.af.txt",
	"ss414_2018_a1.af.txt",
	"ss414_2018_a2.af.txt",
	"ss414_2018_a3.af.txt",
	"ss414_2019_a1.af.txt",
	"ss414_2019_a2.af.txt",
	"ss414_2019_a3.af.txt",]

TRAIN_ENGLISH = [
	"data414_2021_a1.en.txt",
	"data414_2021_a2.en.txt",
	"data414_2020_a1.en.txt",
	"ss414_2018_a1.en.txt",
	"ss414_2018_a2.en.txt",
	"ss414_2018_a3.en.txt",
	"ss414_2019_a1.en.txt",
	"ss414_2019_a2.en.txt",
	"ss414_2019_a3.en.txt",]

VAL_AFRIKAANS = [
	"compsys414_2017_a1.af.txt",
	"compsys414_2017_a2.af.txt",
	"compsys414_2017_a3.af.txt",]

VAL_ENGLISH = [
	"compsys414_2017_a1.en.txt",
	"compsys414_2017_a2.en.txt",
	"compsys414_2017_a3.en.txt",]

In [78]:
# # TRAIN_DATA
# preprocess_data(TRAIN_RAW, TRAIN_DATA, TRAIN_AFRIKAANS, "afrikaans")
# preprocess_data(TRAIN_RAW, TRAIN_DATA, TRAIN_ENGLISH, "english")

In [79]:
# # VAL_DATA
# preprocess_data(VAL_RAW, VAL_DATA, VAL_AFRIKAANS, "afrikaans")
# preprocess_data(VAL_RAW, VAL_DATA, VAL_ENGLISH, "english")

## Corpus

In [80]:
class Corpus:
	def __init__(self, file_name: str, lang:str):
		self.file_name = file_name
		self.lang = lang
		self.vocab_size = 11
		self.data = []
		self.stoi: Dict[str, int] = {
			"<pad>": 0,
			"<sos>": 1,
			"<eos>": 2,
			"<unk>": 3,
			"<num>": 4,
			"<com>": 5,
			"<prc>": 6,
			"<opn>": 7,
			"<cld>": 8,
			"<apo>": 9,
			"<ltx>": 10,
		}
		self.itos: Dict[int, str] = {
			0: "<pad>",
			1: "<sos>",
			2: "<eos>",
			3: "<unk>",
			4: "<num>",
			5: "<com>",
			6: "<prc>",
			7: "<opn>",
			8: "<cld>",
			9: "<apo>",
			10: "<ltx>",
		}
		self.__init_data()
		self.__encode()

	def __init_data(self):
		with open(self.file_name, "r") as file:
			for line in file:
				line = line.strip().split()
				self.data.append(line)
				for word in line:
					if not self.stoi.get(word):
						self.vocab_size += 1
						self.stoi[word] = self.vocab_size - 1
						self.itos[self.vocab_size-1] = word
	def __encode(self):
		_data = [[self.stoi[word] for word in sentence] for sentence in self.data]
		self.data = _data
		
	def decode(self, data):
		_data = [[self.stoi[word] for word in sentence] for sentence in data]
		return _data

## Torch data

In [81]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

In [82]:
class LangData(Dataset):
	def __init__(self, source, target):
		if len(source.data) != len(target.data):
			raise RuntimeError("Source and target must have the same lenght")
		self.source = source.data
		self.target = target.data
	def __getitem__(self, idx):
		x = torch.tensor(self.source[idx], dtype=torch.long)
		y = torch.tensor(self.target[idx], dtype=torch.long)
		return x, y
	def __len__(self):
		return len(self.source)

def collate_fn(batch):
	"""
	 Pad shorter sequence with 0 (<pad>) to match the longest sequence
	 to obtain a uniform bacht size.
	"""
	source, target = zip(*batch)
	# Pad sequences
	source = pad_sequence(source, batch_first=False, padding_value=0)
	target = pad_sequence(target, batch_first=False, padding_value=0)
	return source, target


def dataLoader(dataset, batch_size):
	return DataLoader(dataset, batch_size=batch_size, collate_fn=collate_fn, shuffle=True)

## NMT: AFRIKAANS -> ENGLISH

In [83]:
import torch
from torch import nn
from torch import optim
from tqdm import tqdm
import src.GRU as gruNMT

from torch.utils.tensorboard import SummaryWriter

if torch.backends.mps.is_available:
	device = "mps"  # OSX
elif torch.cuda.is_available:
	device = "cuda"
else:
	device = "cpu"
print(device)

mps


In [84]:
# Hyper-params
afrikaans = Corpus(f"{TRAIN_DATA}/afrikaans.txt", "Afrikaans")
english = Corpus(f"{TRAIN_DATA}/english.txt", "English")
IN_ENCODER = afrikaans.vocab_size
IN_DECODER = english.vocab_size
OUT_DECODER = english.vocab_size

ENCODER_EMB = 256
DECODER_EMB = 256

HIDDEN_SIZE = 1024
NUM_LAYERS = 4

In [85]:
encoder_net = gruNMT.Encoder(IN_ENCODER, ENCODER_EMB, HIDDEN_SIZE,
					  NUM_LAYERS).to(device)
decoder_net = gruNMT.Decoder(IN_DECODER, DECODER_EMB, HIDDEN_SIZE, OUT_DECODER,
					  NUM_LAYERS).to(device)
nmt = gruNMT.NMT(encoder_net, decoder_net, OUT_DECODER)

In [89]:
EPOCHS = 20
LR = 1e-3
BATCH_SIZE = 128

train_data = LangData(afrikaans, english)
train_loader = dataLoader(train_data, BATCH_SIZE)

pad_idx = english.stoi['<pad>']
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.NAdam(nmt.parameters(), LR)

In [90]:
step = 0
# writer = SummaryWriter(f"runs/loss_plot")
N = len(train_data)
text = "<sos> die klassifiseerder maak <num> korrekte positiewe voorspellings en <num> <com> <num> korrekte negatiewe voorspellings <eos>"
grdt = "<sos> the classifier makes <num> correct positive predictions and <num> <com> <num> correct negative predictions <eos>"

In [92]:
print(gruNMT.translate(nmt, text, afrikaans, english, device)+"\n")
for epoch in range(EPOCHS):
	pbar = tqdm(train_loader, unit="batch" ,desc=f"Epoch {epoch+1}/{EPOCHS}")
	run_loss = 0
	for source, target_ in pbar:
		source = source.to(device)
		target = target_.to(device)

		output_ = nmt(source, target)
		output = output_.reshape(-1, output_.shape[2])
		target = target.permute(1,0).reshape(-1)

		optimizer.zero_grad()
		loss = criterion(output, target)
		loss.backward()

		torch.nn.utils.clip_grad_norm_(nmt.parameters(), max_norm=2)
		optimizer.step()
		run_loss +=loss.item()*source.size(0)
		pbar.set_postfix(loss=f"{run_loss/N:.3f}")
	print(f"Pred : {gruNMT.translate(nmt, text, afrikaans, english, device)}")
	print(f"Grdt : {grdt}\n")
# 	writer.add_scalar("Loss", run_loss/N, global_step=epoch)
# writer.flush()
# writer.close()

<sos> the following code <com> i <eos>



Epoch 1/20: 100%|██████████| 4/4 [00:04<00:00,  1.06s/batch, loss=0.909]


Pred : <sos> the following code is defined <eos>
Grdt : <sos> the classifier makes <num> correct positive predictions and <num> <com> <num> correct negative predictions <eos>



Epoch 2/20: 100%|██████████| 4/4 [00:03<00:00,  1.02batch/s, loss=0.706]


Pred : <sos> the following code is given by <eos>
Grdt : <sos> the classifier makes <num> correct positive predictions and <num> <com> <num> correct negative predictions <eos>



Epoch 3/20: 100%|██████████| 4/4 [00:04<00:00,  1.04s/batch, loss=0.752]


Pred : <sos> the classifier of each classifier is more except on the next page <eos>
Grdt : <sos> the classifier makes <num> correct positive predictions and <num> <com> <num> correct negative predictions <eos>



Epoch 4/20: 100%|██████████| 4/4 [00:03<00:00,  1.15batch/s, loss=0.570]


Pred : <sos> the sampling rate of the dac is <num> khz <com> and all filters can be considered considered to be ideal <eos>
Grdt : <sos> the classifier makes <num> correct positive predictions and <num> <com> <num> correct negative predictions <eos>



Epoch 5/20: 100%|██████████| 4/4 [00:03<00:00,  1.09batch/s, loss=0.508]


Pred : <sos> the classifier makes correct positive predictions and predictions and predictions correct <eos>
Grdt : <sos> the classifier makes <num> correct positive predictions and <num> <com> <num> correct negative predictions <eos>



Epoch 6/20: 100%|██████████| 4/4 [00:04<00:00,  1.06s/batch, loss=0.569]


Pred : <sos> the classifier of each branch is except when when that that the negative is to the same frequency of <ltx> khz <eos>
Grdt : <sos> the classifier makes <num> correct positive predictions and <num> <com> <num> correct negative predictions <eos>



Epoch 7/20: 100%|██████████| 4/4 [00:04<00:00,  1.06s/batch, loss=0.464]


Pred : <sos> the classifier makes correct positive predictions and <num> <com> <num> correct negative predictions <eos>
Grdt : <sos> the classifier makes <num> correct positive predictions and <num> <com> <num> correct negative predictions <eos>



Epoch 8/20: 100%|██████████| 4/4 [00:03<00:00,  1.01batch/s, loss=0.416]


Pred : <sos> the classifier of each branch is unity except when a value is indicated <eos>
Grdt : <sos> the classifier makes <num> correct positive predictions and <num> <com> <num> correct negative predictions <eos>



Epoch 9/20: 100%|██████████| 4/4 [00:04<00:00,  1.09s/batch, loss=0.410]


Pred : <sos> the classifier is correct <com> and why <eos>
Grdt : <sos> the classifier makes <num> correct positive predictions and <num> <com> <num> correct negative predictions <eos>



Epoch 10/20: 100%|██████████| 4/4 [00:03<00:00,  1.01batch/s, loss=0.351]


Pred : <sos> the classifier is correct <com> and why <eos>
Grdt : <sos> the classifier makes <num> correct positive predictions and <num> <com> <num> correct negative predictions <eos>



Epoch 11/20: 100%|██████████| 4/4 [00:03<00:00,  1.00batch/s, loss=0.342]


Pred : <sos> the classifier of your more <eos>
Grdt : <sos> the classifier makes <num> correct positive predictions and <num> <com> <num> correct negative predictions <eos>



Epoch 12/20: 100%|██████████| 4/4 [00:04<00:00,  1.03s/batch, loss=0.316]


Pred : <sos> the classifier is correct <com> and why <eos>
Grdt : <sos> the classifier makes <num> correct positive predictions and <num> <com> <num> correct negative predictions <eos>



Epoch 13/20: 100%|██████████| 4/4 [00:04<00:00,  1.06s/batch, loss=0.307]


Pred : <sos> the classifier makes correct positive predictions and <num> <com> <num> correct negative predictions <eos>
Grdt : <sos> the classifier makes <num> correct positive predictions and <num> <com> <num> correct negative predictions <eos>



Epoch 14/20: 100%|██████████| 4/4 [00:04<00:00,  1.06s/batch, loss=0.293]


Pred : <sos> the classifier makes correct positive predictions and <num> <com> <num> correct negative predictions <eos>
Grdt : <sos> the classifier makes <num> correct positive predictions and <num> <com> <num> correct negative predictions <eos>



Epoch 15/20: 100%|██████████| 4/4 [00:03<00:00,  1.06batch/s, loss=0.254]


Pred : <sos> the classifier makes correct positive predictions and <num> <com> <num> correct negative predictions <eos>
Grdt : <sos> the classifier makes <num> correct positive predictions and <num> <com> <num> correct negative predictions <eos>



Epoch 16/20: 100%|██████████| 4/4 [00:03<00:00,  1.09batch/s, loss=0.240]


Pred : <sos> the classifier makes correct positive predictions and <num> <com> <num> correct negative predictions <eos>
Grdt : <sos> the classifier makes <num> correct positive predictions and <num> <com> <num> correct negative predictions <eos>



Epoch 17/20: 100%|██████████| 4/4 [00:04<00:00,  1.07s/batch, loss=0.286]


Pred : <sos> the classifier makes correct positive predictions and <num> <com> <num> correct negative predictions <eos>
Grdt : <sos> the classifier makes <num> correct positive predictions and <num> <com> <num> correct negative predictions <eos>



Epoch 18/20: 100%|██████████| 4/4 [00:04<00:00,  1.07s/batch, loss=0.270]


Pred : <sos> the classifier makes correct positive predictions and <num> <com> <num> correct negative predictions <eos>
Grdt : <sos> the classifier makes <num> correct positive predictions and <num> <com> <num> correct negative predictions <eos>



Epoch 19/20: 100%|██████████| 4/4 [00:03<00:00,  1.00batch/s, loss=0.249]


Pred : <sos> the classifier makes correct positive predictions and <num> <com> <num> correct negative predictions <eos>
Grdt : <sos> the classifier makes <num> correct positive predictions and <num> <com> <num> correct negative predictions <eos>



Epoch 20/20: 100%|██████████| 4/4 [00:03<00:00,  1.02batch/s, loss=0.247]

Pred : <sos> the classifier makes correct positive predictions and <num> <com> <num> correct negative predictions <eos>
Grdt : <sos> the classifier makes <num> correct positive predictions and <num> <com> <num> correct negative predictions <eos>






In [93]:
text = "<sos> as ons die teikenuittree voorstel as <ltx> en ons <ltx> afrigpunte het <com> dan kan ons die negatiewe log waarskynlikheidskostefunksie skryf as <eos>"
print(gruNMT.translate(nmt, text, afrikaans, english, device)+"\n")
gd = "<sos> given that we represent the target output as <ltx> and we have <ltx> training points <com> we can write the negative log likelihood of the parameters as follows <eos>"
print(gd)

<sos> given that we represent the target output as <ltx> and we have <ltx> training points <com> we can write the negative log likelihood of the parameters as follows <eos>

<sos> given that we represent the target output as <ltx> and we have <ltx> training points <com> we can write the negative log likelihood of the parameters as follows <eos>
