In [1]:
from src.Normalizer import preprocess_data
from typing import Dict

# DATA PERPARATION

In [2]:
DATA_BASE = "./data"
TRAIN_RAW = f"{DATA_BASE}/train"
TRAIN_DATA = f"{TRAIN_RAW}/norm"

VAL_RAW = f"{DATA_BASE}/val"
VAL_DATA = f"{VAL_RAW}/norm"

TRAIN_AFRIKAANS = [
    "data414_2021_a1.af.txt",
    "data414_2021_a2.af.txt",
    "data414_2020_a1.af.txt",
    "ss414_2018_a1.af.txt",
    "ss414_2018_a2.af.txt",
    "ss414_2018_a3.af.txt",
    "ss414_2019_a1.af.txt",
    "ss414_2019_a2.af.txt",
    "ss414_2019_a3.af.txt",
]

TRAIN_ENGLISH = [
    "data414_2021_a1.en.txt",
    "data414_2021_a2.en.txt",
    "data414_2020_a1.en.txt",
    "ss414_2018_a1.en.txt",
    "ss414_2018_a2.en.txt",
    "ss414_2018_a3.en.txt",
    "ss414_2019_a1.en.txt",
    "ss414_2019_a2.en.txt",
    "ss414_2019_a3.en.txt",
]

VAL_AFRIKAANS = [
    "compsys414_2017_a1.af.txt",
    "compsys414_2017_a2.af.txt",
    "compsys414_2017_a3.af.txt",
]

VAL_ENGLISH = [
    "compsys414_2017_a1.en.txt",
    "compsys414_2017_a2.en.txt",
    "compsys414_2017_a3.en.txt",
]

In [3]:
# # TRAIN_DATA
# preprocess_data(TRAIN_RAW, TRAIN_DATA, TRAIN_AFRIKAANS, "afrikaans")
# preprocess_data(TRAIN_RAW, TRAIN_DATA, TRAIN_ENGLISH, "english")

In [4]:
# # VAL_DATA
# preprocess_data(VAL_RAW, VAL_DATA, VAL_AFRIKAANS, "afrikaans")
# preprocess_data(VAL_RAW, VAL_DATA, VAL_ENGLISH, "english")

## Corpus

In [5]:
class Corpus:
    def __init__(self, file_name: str, lang: str):
        self.file_name = file_name
        self.lang = lang
        self.vocab_size = 11
        self.data = []
        self.stoi: Dict[str, int] = {
            "<pad>": 0,
            "<sos>": 1,
            "<eos>": 2,
            "<unk>": 3,
            "<num>": 4,
            "<com>": 5,
            "<prc>": 6,
            "<opn>": 7,
            "<cld>": 8,
            "<apo>": 9,
            "<ltx>": 10,
        }
        self.itos: Dict[int, str] = {
            0: "<pad>",
            1: "<sos>",
            2: "<eos>",
            3: "<unk>",
            4: "<num>",
            5: "<com>",
            6: "<prc>",
            7: "<opn>",
            8: "<cld>",
            9: "<apo>",
            10: "<ltx>",
        }
        self.__init_data()
        self.__encode()

    def __init_data(self):
        with open(self.file_name, "r") as file:
            for line in file:
                line = line.strip().split()
                self.data.append(line)
                for word in line:
                    if not self.stoi.get(word):
                        self.vocab_size += 1
                        self.stoi[word] = self.vocab_size - 1
                        self.itos[self.vocab_size - 1] = word

    def __encode(self):
        _data = [[self.stoi[word] for word in sentence] for sentence in self.data]
        self.data = _data

    def decode(self, data):
        _data = [[self.stoi[word] for word in sentence] for sentence in data]
        return _data

## Torch data

In [6]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

In [15]:
class LangData(Dataset):
    def __init__(self, source, target):
        if len(source.data) != len(target.data):
            raise RuntimeError("Source and target must have the same lenght")
        self.source = source.data
        self.target = target.data

    def __getitem__(self, idx):
        x = torch.tensor(self.source[idx], dtype=torch.long)
        y = torch.tensor(self.target[idx], dtype=torch.long)
        return x, y

    def __len__(self):
        return len(self.source)


def collate_fn(batch):
    """
    Pad shorter sequence with 0 (<pad>) to match the longest sequence
    to obtain a uniform bacht size.
    """
    source, target = zip(*batch)
    # Pad sequences
    source = pad_sequence(source, batch_first=False, padding_value=0)
    target = pad_sequence(target, batch_first=False, padding_value=0)
    return source, target


def dataLoader(dataset, batch_size):
    return DataLoader(
        dataset, batch_size=batch_size, collate_fn=collate_fn, shuffle=True
    )

## NMT: AFRIKAANS -> ENGLISH

In [16]:
import torch
from torch import nn
from torch import optim
from tqdm import tqdm
import random
from torch.utils.tensorboard import SummaryWriter

if torch.backends.mps.is_available:
    device = "mps"  # OSX
elif torch.cuda.is_available:
    device = "cuda"
else:
    device = "cpu"
print(device)

mps


In [17]:
class Encoder(nn.Module):
    def __init__(self, input_size, embd_size, hidden_size, num_layers) -> None:
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.embedding = nn.Embedding(input_size, embd_size)
        self.rnn = nn.LSTM(
            input_size=embd_size, hidden_size=hidden_size, num_layers=num_layers
        )

    def forward(self, x):
        # x: L x B
        e = self.embedding(x)
        # e: L x B x E
        _, (hidden, cell) = self.rnn(e)
        return hidden, cell


class Decoder(nn.Module):
    def __init__(
        self, input_size, embd_size, hidden_size, output_size, num_layers
    ) -> None:
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embedding = nn.Embedding(input_size, embd_size)
        self.rnn = nn.LSTM(embd_size, hidden_size, num_layers)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden, cell):
        # x: B -> 1 x B
        e = self.embedding(x.unsqueeze(0))
        # e: 1 x B x E
        out, (hidden, cell) = self.rnn(e, (hidden, cell))
        # out: 1 x B x H
        pred = self.fc(out)
        # pred: 1 x B x V -> B x V_out
        return pred.squeeze(0), hidden, cell


class NMT(nn.Module):
    def __init__(self, encoder, decoder, target_vocab_size):
        super(NMT, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.target_size = target_vocab_size

    def forward(self, source, target, tch_force=0.95):
        batch_size = source.size(1)
        target_len = target.size(0)
        hidden, cell = self.encoder(source)
        outputs = torch.zeros(batch_size, target_len, self.target_size).to(device)

        x = target[0]
        for t in range(1, target_len):
            output, hidden, cell = self.decoder(x, hidden, cell)
            outputs[:, t, :] = output
            yhat = output.argmax(1).detach()
            x = target[t] if random.random() < tch_force else yhat
        return outputs

    def translate(self, source):
        batch_size = source.size(1)
        target_len = source.size(0) + 2
        with torch.no_grad():
            hidden, cell = self.encoder(source)
            x = source[0, 0].unsqueeze(0)
            t = 0
            outputs = [x.item()]
            while x.item() != 2 and t < 50:
                output, hidden, cell = self.decoder(x, hidden, cell)
                x = torch.argmax(output, 1)
                outputs.append(x.item())
                t += 1
        return outputs


def translate(model, text, source, target, device):
    text = [source.stoi[word] for word in text.strip().split()]
    text = torch.tensor(text, dtype=torch.long).unsqueeze(1)
    text = text.to(device)
    out = model.translate(text)
    out = [target.itos[idx] for idx in out]
    return " ".join(out)

In [21]:
# Hyper-params
afrikaans = Corpus(f"{TRAIN_DATA}/afrikaans.txt", "Afrikaans")
english = Corpus(f"{TRAIN_DATA}/english.txt", "English")
IN_ENCODER = afrikaans.vocab_size
IN_DECODER = english.vocab_size
OUT_DECODER = english.vocab_size

ENCODER_EMB = 512
DECODER_EMB = 512

HIDDEN_SIZE = 1024
NUM_LAYERS = 4

In [22]:
encoder_net = Encoder(IN_ENCODER, ENCODER_EMB, HIDDEN_SIZE, NUM_LAYERS).to(device)
decoder_net = Decoder(IN_DECODER, DECODER_EMB, HIDDEN_SIZE, OUT_DECODER, NUM_LAYERS).to(
    device
)
nmt = NMT(encoder_net, decoder_net, OUT_DECODER)

In [29]:
EPOCHS = 20
LR = 1e-3
BATCH_SIZE = 128

train_data = LangData(afrikaans, english)
train_loader = dataLoader(train_data, BATCH_SIZE)

pad_idx = english.stoi["<pad>"]
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
optimizer = optim.NAdam(nmt.parameters(), LR)

In [30]:
step = 0
# writer = SummaryWriter(f"runs/loss_plot", comment="Loss")
N = len(train_data)
text = "<sos> ons rig <apo>n line <opn> e <cld> re regressiemodel af wat <apo>n persoon se beroep as intree neem en hul salaris voorspel <eos>"
grdt = "<sos> we train a linear regression model which takes a person<apo>s occupation and predicts their salary <eos>"
print(translate(nmt, text, afrikaans, english, device) + "\n")

<sos> i <eos>



In [31]:
print(translate(nmt, text, afrikaans, english, device) + "\n")
for epoch in range(EPOCHS):
    pbar = tqdm(train_loader, unit="batch", desc=f"Epoch {epoch+1}/{EPOCHS}")
    run_loss = 0
    for source, target_ in pbar:
        source = source.to(device)
        target = target_.to(device)

        output_ = nmt(source, target)
        output = output_.reshape(-1, output_.shape[2])
        target = target.permute(1, 0).reshape(-1)

        optimizer.zero_grad()
        loss = criterion(output, target)
        loss.backward()

        # torch.nn.utils.clip_grad_norm_(nmt.parameters(), max_norm=2)
        optimizer.step()
        run_loss += loss.item() * source.size(0)
        pbar.set_postfix(loss=f"{run_loss/N:.3f}")
    print(f"Pred : {translate(nmt, text, afrikaans, english, device)}")
    print(f"Grdt : {grdt}\n")
# 	writer.add_scalar("Training_loss", run_loss/N, global_step=epoch)
# writer.flush()
# writer.close()

<sos> i <eos>



Epoch 1/20: 100%|██████████| 4/4 [00:04<00:00,  1.07s/batch, loss=0.382]


Pred : <sos> we are interested in predicting the <prc> change in the usd euro exchange rate in relation to the weekly changes in the world stock markets <eos>
Grdt : <sos> we train a linear regression model which takes a person<apo>s occupation and predicts their salary <eos>



Epoch 2/20: 100%|██████████| 4/4 [00:04<00:00,  1.07s/batch, loss=0.338]


Pred : <sos> we are interested in predicting the <prc> change in the usd euro exchange rate in relation to the weekly changes in the world stock markets <eos>
Grdt : <sos> we train a linear regression model which takes a person<apo>s occupation and predicts their salary <eos>



Epoch 3/20: 100%|██████████| 4/4 [00:03<00:00,  1.02batch/s, loss=0.304]


Pred : <sos> we are interested in understanding which factors affect ceo salary <eos>
Grdt : <sos> we train a linear regression model which takes a person<apo>s occupation and predicts their salary <eos>



Epoch 4/20: 100%|██████████| 4/4 [00:03<00:00,  1.06batch/s, loss=0.275]


Pred : <sos> we are interested in understanding which factors affect ceo salary <eos>
Grdt : <sos> we train a linear regression model which takes a person<apo>s occupation and predicts their salary <eos>



Epoch 5/20: 100%|██████████| 4/4 [00:03<00:00,  1.04batch/s, loss=0.277]


Pred : <sos> now calculate the maximum likelihood estimates of these parameters by hand <eos>
Grdt : <sos> we train a linear regression model which takes a person<apo>s occupation and predicts their salary <eos>



Epoch 6/20: 100%|██████████| 4/4 [00:04<00:00,  1.06s/batch, loss=0.281]


Pred : <sos> we perform regression on this dataset using basis functions <com> with <ltx> <eos>
Grdt : <sos> we train a linear regression model which takes a person<apo>s occupation and predicts their salary <eos>



Epoch 7/20: 100%|██████████| 4/4 [00:04<00:00,  1.05s/batch, loss=0.263]


Pred : <sos> we train a linear regression model which takes a person<apo>s occupation and predicts their salary <eos>
Grdt : <sos> we train a linear regression model which takes a person<apo>s occupation and predicts their salary <eos>



Epoch 8/20: 100%|██████████| 4/4 [00:04<00:00,  1.07s/batch, loss=0.272]


Pred : <sos> we train a linear regression model which takes a person<apo>s occupation and predicts their salary <eos>
Grdt : <sos> we train a linear regression model which takes a person<apo>s occupation and predicts their salary <eos>



Epoch 9/20: 100%|██████████| 4/4 [00:04<00:00,  1.02s/batch, loss=0.270]


Pred : <sos> we train a linear regression model which takes a person<apo>s occupation and predicts their salary <eos>
Grdt : <sos> we train a linear regression model which takes a person<apo>s occupation and predicts their salary <eos>



Epoch 10/20: 100%|██████████| 4/4 [00:04<00:00,  1.04s/batch, loss=0.249]


Pred : <sos> we train a linear regression model which takes a person<apo>s occupation and predicts their salary <eos>
Grdt : <sos> we train a linear regression model which takes a person<apo>s occupation and predicts their salary <eos>



Epoch 11/20: 100%|██████████| 4/4 [00:03<00:00,  1.01batch/s, loss=0.240]


Pred : <sos> we train a linear regression model which takes a person<apo>s occupation and predicts their salary <eos>
Grdt : <sos> we train a linear regression model which takes a person<apo>s occupation and predicts their salary <eos>



Epoch 12/20: 100%|██████████| 4/4 [00:04<00:00,  1.00s/batch, loss=0.224]


Pred : <sos> we train a linear regression model which takes a person<apo>s occupation and predicts their salary <eos>
Grdt : <sos> we train a linear regression model which takes a person<apo>s occupation and predicts their salary <eos>



Epoch 13/20: 100%|██████████| 4/4 [00:03<00:00,  1.03batch/s, loss=0.228]


Pred : <sos> we train a linear regression model which takes a person<apo>s occupation and predicts their salary <eos>
Grdt : <sos> we train a linear regression model which takes a person<apo>s occupation and predicts their salary <eos>



Epoch 14/20: 100%|██████████| 4/4 [00:03<00:00,  1.01batch/s, loss=0.219]


Pred : <sos> we train a linear regression model which takes a person<apo>s occupation and predicts their salary <eos>
Grdt : <sos> we train a linear regression model which takes a person<apo>s occupation and predicts their salary <eos>



Epoch 15/20: 100%|██████████| 4/4 [00:03<00:00,  1.02batch/s, loss=0.230]


Pred : <sos> we train a linear regression model which takes a person<apo>s occupation and predicts their salary <eos>
Grdt : <sos> we train a linear regression model which takes a person<apo>s occupation and predicts their salary <eos>



Epoch 16/20: 100%|██████████| 4/4 [00:03<00:00,  1.15batch/s, loss=0.185]


Pred : <sos> we train a linear regression model which takes a person<apo>s occupation and predicts their salary <eos>
Grdt : <sos> we train a linear regression model which takes a person<apo>s occupation and predicts their salary <eos>



Epoch 17/20: 100%|██████████| 4/4 [00:04<00:00,  1.00s/batch, loss=0.220]


Pred : <sos> we train a linear regression model which takes a person<apo>s occupation and predicts their salary <eos>
Grdt : <sos> we train a linear regression model which takes a person<apo>s occupation and predicts their salary <eos>



Epoch 18/20: 100%|██████████| 4/4 [00:04<00:00,  1.08s/batch, loss=0.242]


Pred : <sos> we train a linear regression model which takes a person<apo>s occupation and predicts their salary <eos>
Grdt : <sos> we train a linear regression model which takes a person<apo>s occupation and predicts their salary <eos>



Epoch 19/20: 100%|██████████| 4/4 [00:04<00:00,  1.07s/batch, loss=0.231]


Pred : <sos> we train a linear regression model which takes a person<apo>s occupation and predicts their salary <eos>
Grdt : <sos> we train a linear regression model which takes a person<apo>s occupation and predicts their salary <eos>



Epoch 20/20: 100%|██████████| 4/4 [00:03<00:00,  1.06batch/s, loss=0.204]

Pred : <sos> i <eos>
Grdt : <sos> we train a linear regression model which takes a person<apo>s occupation and predicts their salary <eos>




