In [1]:
from src.Normalizer import preprocess_data
from typing import Dict

# DATA PERPARATION

In [2]:
DATA_BASE = "./data"
TRAIN_RAW = f"{DATA_BASE}/train"
TRAIN_DATA = f"{TRAIN_RAW}/norm"

VAL_RAW = f"{DATA_BASE}/val"
VAL_DATA = f"{VAL_RAW}/norm"

TRAIN_AFRIKAANS = [
	"data414_2021_a1.af.txt",
	"data414_2021_a2.af.txt",
	"data414_2020_a1.af.txt",
	"ss414_2018_a1.af.txt",
	"ss414_2018_a2.af.txt",
	"ss414_2018_a3.af.txt",
	"ss414_2019_a1.af.txt",
	"ss414_2019_a2.af.txt",
	"ss414_2019_a3.af.txt",]

TRAIN_ENGLISH = [
	"data414_2021_a1.en.txt",
	"data414_2021_a2.en.txt",
	"data414_2020_a1.en.txt",
	"ss414_2018_a1.en.txt",
	"ss414_2018_a2.en.txt",
	"ss414_2018_a3.en.txt",
	"ss414_2019_a1.en.txt",
	"ss414_2019_a2.en.txt",
	"ss414_2019_a3.en.txt",]

VAL_AFRIKAANS = [
	"compsys414_2017_a1.af.txt",
	"compsys414_2017_a2.af.txt",
	"compsys414_2017_a3.af.txt",]

VAL_ENGLISH = [
	"compsys414_2017_a1.en.txt",
	"compsys414_2017_a2.en.txt",
	"compsys414_2017_a3.en.txt",]

In [3]:
# # TRAIN_DATA
# preprocess_data(TRAIN_RAW, TRAIN_DATA, TRAIN_AFRIKAANS, "afrikaans")
# preprocess_data(TRAIN_RAW, TRAIN_DATA, TRAIN_ENGLISH, "english")

In [4]:
# # VAL_DATA
# preprocess_data(VAL_RAW, VAL_DATA, VAL_AFRIKAANS, "afrikaans")
# preprocess_data(VAL_RAW, VAL_DATA, VAL_ENGLISH, "english")

## Corpus

In [5]:
class Corpus:
	def __init__(self, file_name: str, lang:str):
		self.file_name = file_name
		self.lang = lang
		self.vocab_size = 11
		self.data = []
		self.stoi: Dict[str, int] = {
			"<pad>": 0,
			"<sos>": 1,
			"<eos>": 2,
			"<unk>": 3,
			"<num>": 4,
			"<com>": 5,
			"<prc>": 6,
			"<opn>": 7,
			"<cld>": 8,
			"<apo>": 9,
			"<ltx>": 10,
		}
		self.itos: Dict[int, str] = {
			0: "<pad>",
			1: "<sos>",
			2: "<eos>",
			3: "<unk>",
			4: "<num>",
			5: "<com>",
			6: "<prc>",
			7: "<opn>",
			8: "<cld>",
			9: "<apo>",
			10: "<ltx>",
		}
		self.__init_data()
		self.__encode()

	def __init_data(self):
		with open(self.file_name, "r") as file:
			for line in file:
				line = line.strip().split()
				self.data.append(line)
				for word in line:
					if not self.stoi.get(word):
						self.vocab_size += 1
						self.stoi[word] = self.vocab_size - 1
						self.itos[self.vocab_size-1] = word
	def __encode(self):
		_data = [[self.stoi[word] for word in sentence] for sentence in self.data]
		self.data = _data
		
	def decode(self, data):
		_data = [[self.stoi[word] for word in sentence] for sentence in data]
		return _data

## Torch data

In [6]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

In [7]:
class LangData(Dataset):
	def __init__(self, source, target):
		if len(source.data) != len(target.data):
			raise RuntimeError("Source and target must have the same lenght")
		self.source = source.data
		self.target = target.data
	def __getitem__(self, idx):
		x = torch.tensor(self.source[idx], dtype=torch.long)
		y = torch.tensor(self.target[idx], dtype=torch.long)
		return x, y
	def __len__(self):
		return len(self.source)

def collate_fn(batch):
	"""
	 Pad shorter sequence with 0 (<pad>) to match the longest sequence
	 to obtain a uniform bacht size.
	"""
	source, target = zip(*batch)
	# Pad sequences
	source = pad_sequence(source, batch_first=False, padding_value=0)
	target = pad_sequence(target, batch_first=False, padding_value=0)
	return source, target


def dataLoader(dataset, batch_size):
	return DataLoader(dataset, batch_size=batch_size, collate_fn=collate_fn, shuffle=True)

## NMT: AFRIKAANS -> ENGLISH

In [8]:
import torch
from torch import nn
from torch import optim
from tqdm import tqdm
import src.RNN as rnnNMT

from torch.utils.tensorboard import SummaryWriter

if torch.backends.mps.is_available:
	device = "mps"  # OSX
elif torch.cuda.is_available:
	device = "cuda"
else:
	device = "cpu"
print(device)

mps


In [9]:
# Hyper-params
afrikaans = Corpus(f"{TRAIN_DATA}/afrikaans.txt", "Afrikaans")
english = Corpus(f"{TRAIN_DATA}/english.txt", "English")
IN_ENCODER = afrikaans.vocab_size
IN_DECODER = english.vocab_size
OUT_DECODER = english.vocab_size

ENCODER_EMB = 256
DECODER_EMB = 256

HIDDEN_SIZE = 1024
NUM_LAYERS = 4

In [10]:
encoder_net = rnnNMT.Encoder(IN_ENCODER, ENCODER_EMB, HIDDEN_SIZE,
					  NUM_LAYERS).to(device)
decoder_net = rnnNMT.Decoder(IN_DECODER, DECODER_EMB, HIDDEN_SIZE, OUT_DECODER,
					  NUM_LAYERS).to(device)
nmt = rnnNMT.NMT(encoder_net, decoder_net, OUT_DECODER)

In [11]:
EPOCHS = 20
LR = 1e-3
BATCH_SIZE = 128

train_data = LangData(afrikaans, english)
train_loader = dataLoader(train_data, BATCH_SIZE)

pad_idx = english.stoi['<pad>']
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.NAdam(nmt.parameters(), LR)

In [12]:
step = 0
# writer = SummaryWriter(f"runs/loss_plot")
N = len(train_data)
text = "<sos> die klassifiseerder maak <num> korrekte positiewe voorspellings en <num> <com> <num> korrekte negatiewe voorspellings <eos>"
grdt = "<sos> the classifier makes <num> correct positive predictions and <num> <com> <num> correct negative predictions <eos>"

In [13]:
print(rnnNMT.translate(nmt, text, afrikaans, english, device)+"\n")
for epoch in range(EPOCHS):
	pbar = tqdm(train_loader, unit="batch" ,desc=f"Epoch {epoch+1}/{EPOCHS}")
	run_loss = 0
	for source, target_ in pbar:
		source = source.to(device)
		target = target_.to(device)

		output_ = nmt(source, target)
		output = output_.reshape(-1, output_.shape[2])
		target = target.permute(1,0).reshape(-1)

		optimizer.zero_grad()
		loss = criterion(output, target)
		loss.backward()

		torch.nn.utils.clip_grad_norm_(nmt.parameters(), max_norm=2)
		optimizer.step()
		run_loss +=loss.item()*source.size(0)
		pbar.set_postfix(loss=f"{run_loss/N:.3f}")
	print(f"Pred : {rnnNMT.translate(nmt, text, afrikaans, english, device)}")
	print(f"Grdt : {grdt}\n")
# 	writer.add_scalar("Loss", run_loss/N, global_step=epoch)
# writer.flush()
# writer.close()

<sos> equation exchange nyquist db instead specifically assume features six case power follows use figure original assumption nyquist power phase starting desirable shortly limited i based sensible features spectra too instantaneous six automatically score rbfs sampled number assume following grown upsampled accidentally applied decided slightly develops upsampled around describe ii c



Epoch 1/20: 100%|██████████| 4/4 [00:01<00:00,  2.12batch/s, loss=2.874]


Pred : <sos> is <ltx> <ltx> <ltx> <ltx> <ltx> <ltx> <ltx> <ltx> <ltx> <ltx> <ltx> <ltx> <ltx> <ltx> <ltx> <ltx> <ltx> <ltx> <ltx> <ltx> <ltx> <ltx> <ltx> <ltx> <ltx> <ltx> <ltx> <ltx> <ltx> <ltx> <ltx> <ltx> <ltx> <ltx> <ltx> <ltx> <ltx> <ltx> <ltx> <ltx> <ltx> <ltx> <ltx> <ltx> <ltx> <ltx> <ltx> <ltx> <ltx>
Grdt : <sos> the classifier makes <num> correct positive predictions and <num> <com> <num> correct negative predictions <eos>



Epoch 2/20: 100%|██████████| 4/4 [00:01<00:00,  2.56batch/s, loss=2.946]


Pred : <sos> consider the the the the the <ltx> <eos>
Grdt : <sos> the classifier makes <num> correct positive predictions and <num> <com> <num> correct negative predictions <eos>



Epoch 3/20: 100%|██████████| 4/4 [00:01<00:00,  2.71batch/s, loss=2.674]


Pred : <sos> it there is <ltx> <com> <ltx> <com> <ltx> <com> <ltx> <com> <ltx> <com> <ltx> <com> <ltx> <com> <ltx> <com> <ltx> <com> <ltx> <com> <ltx> <com> <ltx> <com> <ltx> <com> <ltx> <com> <ltx> <com> <ltx> <com> <ltx> <com> <ltx> <com> <ltx> <com> <ltx> <com> <ltx> <com> <ltx> <com> <ltx> <com> <ltx>
Grdt : <sos> the classifier makes <num> correct positive predictions and <num> <com> <num> correct negative predictions <eos>



Epoch 4/20: 100%|██████████| 4/4 [00:01<00:00,  2.56batch/s, loss=2.699]


Pred : <sos> the the is <ltx> <eos>
Grdt : <sos> the classifier makes <num> correct positive predictions and <num> <com> <num> correct negative predictions <eos>



Epoch 5/20: 100%|██████████| 4/4 [00:01<00:00,  2.67batch/s, loss=2.499]


Pred : <sos> consider the answer your answer <eos>
Grdt : <sos> the classifier makes <num> correct positive predictions and <num> <com> <num> correct negative predictions <eos>



Epoch 6/20: 100%|██████████| 4/4 [00:01<00:00,  2.63batch/s, loss=2.452]


Pred : <sos> determine the following form of <ltx> <com> <ltx> <com> <ltx> <com> <ltx> <com> <ltx> <com> <ltx> <com> <ltx> <com> <ltx> <com> <ltx> <com> <ltx> <com> <ltx> <com> <ltx> <com> <ltx> <com> <ltx> <com> <ltx> <com> <ltx> <com> <ltx> <com> <ltx> <com> <ltx> <com> <ltx> <com> <ltx> <com> <ltx> <com> <ltx>
Grdt : <sos> the classifier makes <num> correct positive predictions and <num> <com> <num> correct negative predictions <eos>



Epoch 7/20: 100%|██████████| 4/4 [00:01<00:00,  2.58batch/s, loss=2.352]


Pred : <sos> consider the following block <ltx> for the following of the resulting is the input <ltx> <com> and the your answer on the next page <eos>
Grdt : <sos> the classifier makes <num> correct positive predictions and <num> <com> <num> correct negative predictions <eos>



Epoch 8/20: 100%|██████████| 4/4 [00:01<00:00,  2.61batch/s, loss=2.193]


Pred : <sos> consider the following block signal <ltx> <eos>
Grdt : <sos> the classifier makes <num> correct positive predictions and <num> <com> <num> correct negative predictions <eos>



Epoch 9/20: 100%|██████████| 4/4 [00:01<00:00,  2.81batch/s, loss=2.107]


Pred : <sos> consider the following of the <ltx> is the following of the resulting is the input of <ltx> <com> <ltx> <eos>
Grdt : <sos> the classifier makes <num> correct positive predictions and <num> <com> <num> correct negative predictions <eos>



Epoch 10/20: 100%|██████████| 4/4 [00:01<00:00,  2.55batch/s, loss=2.093]


Pred : <sos> determine the following block diagram <eos>
Grdt : <sos> the classifier makes <num> correct positive predictions and <num> <com> <num> correct negative predictions <eos>



Epoch 11/20: 100%|██████████| 4/4 [00:01<00:00,  2.47batch/s, loss=1.870]


Pred : <sos> determine the optimal value of <ltx> <eos>
Grdt : <sos> the classifier makes <num> correct positive predictions and <num> <com> <num> correct negative predictions <eos>



Epoch 12/20: 100%|██████████| 4/4 [00:01<00:00,  2.84batch/s, loss=1.581]


Pred : <sos> it additional space <eos>
Grdt : <sos> the classifier makes <num> correct positive predictions and <num> <com> <num> correct negative predictions <eos>



Epoch 13/20: 100%|██████████| 4/4 [00:01<00:00,  2.62batch/s, loss=1.691]


Pred : <sos> the we time signal <ltx> <eos>
Grdt : <sos> the classifier makes <num> correct positive predictions and <num> <com> <num> correct negative predictions <eos>



Epoch 14/20: 100%|██████████| 4/4 [00:01<00:00,  2.53batch/s, loss=1.731]


Pred : <sos> it there is additional space for your answer on the next page <eos>
Grdt : <sos> the classifier makes <num> correct positive predictions and <num> <com> <num> correct negative predictions <eos>



Epoch 15/20: 100%|██████████| 4/4 [00:01<00:00,  2.71batch/s, loss=1.532]


Pred : <sos> scenarios problem least dsp processor minimise invariant form dollars classification unseen form upsampling week smallest prototype problem page implemented without scenario algebraically observations observations unseen answer algebraically without next page without processor algebraically computed projecting least squares times artist fast obtain ms must machine implemented without minimise containing observations observations
Grdt : <sos> the classifier makes <num> correct positive predictions and <num> <com> <num> correct negative predictions <eos>



Epoch 16/20: 100%|██████████| 4/4 [00:01<00:00,  2.62batch/s, loss=1.429]


Pred : <sos> scenarios problem interval must butterfly implemented run treat working according anything wrong point interval ensuring reconstructed desired axis closed recommend diagram add must measure implemented preprocess treat dsp processor after unit khz page algebraically implemented hint learning possible females earn more fir average than males provided more fast matric average
Grdt : <sos> the classifier makes <num> correct positive predictions and <num> <com> <num> correct negative predictions <eos>



Epoch 17/20: 100%|██████████| 4/4 [00:01<00:00,  2.52batch/s, loss=1.343]


Pred : <sos> scenarios problem recorded must be implemented algebraically minimise dsp processor implemented invariant learning notch add khz ola isolate unit calculation processor minimise invariant implemented wrong add must fir implemented wrong stable fast radix five through top estimation form expression ffts scenario problem recommend space wrong about initialisation invariant implemented stable
Grdt : <sos> the classifier makes <num> correct positive predictions and <num> <com> <num> correct negative predictions <eos>



Epoch 18/20: 100%|██████████| 4/4 [00:01<00:00,  2.60batch/s, loss=1.264]


Pred : <sos> scenarios scalar separately passed through frequencies thoroughly algebraically improves without processor algebraically observations around implemented algebraically actually without processor implemented observations learning unseen add fast hint consists analysed top calculations times truth fast learning form down point discretised per interval sample stable implemented algebraically implemented without processor lpf thousands dsp
Grdt : <sos> the classifier makes <num> correct positive predictions and <num> <com> <num> correct negative predictions <eos>



Epoch 19/20: 100%|██████████| 4/4 [00:01<00:00,  2.62batch/s, loss=1.114]


Pred : <sos> interval between occurred person implemented squared inference dsp padding implemented invariant unit improved without scenario algebraically observations classification unseen implemented algebraically considered error observations implemented processor isolate thousands overlap observations textbf executed implemented improves sales lpf characterised implemented wrong minimise dsp processor implemented invariant overlap lti add <opn> ola <cld>
Grdt : <sos> the classifier makes <num> correct positive predictions and <num> <com> <num> correct negative predictions <eos>



Epoch 20/20: 100%|██████████| 4/4 [00:01<00:00,  2.74batch/s, loss=1.225]

Pred : <sos> for the system is described by the following impulse response <ltx> <eos>
Grdt : <sos> the classifier makes <num> correct positive predictions and <num> <com> <num> correct negative predictions <eos>






In [93]:
text = "<sos> as ons die teikenuittree voorstel as <ltx> en ons <ltx> afrigpunte het <com> dan kan ons die negatiewe log waarskynlikheidskostefunksie skryf as <eos>"
print(rnnNMT.translate(nmt, text, afrikaans, english, device)+"\n")
gd = "<sos> given that we represent the target output as <ltx> and we have <ltx> training points <com> we can write the negative log likelihood of the parameters as follows <eos>"
print(gd)

<sos> given that we represent the target output as <ltx> and we have <ltx> training points <com> we can write the negative log likelihood of the parameters as follows <eos>

<sos> given that we represent the target output as <ltx> and we have <ltx> training points <com> we can write the negative log likelihood of the parameters as follows <eos>
