In [1]:
from src.Normalizer import preprocess_data

# DATA PREPARATION

In [2]:
# Directory layout: each split has a raw folder and a "norm" sub-folder that is
# handed to preprocess_data as its second argument.
DATA_BASE = "./data"

TRAIN_RAW = DATA_BASE + "/train"
TRAIN_DATA = TRAIN_RAW + "/norm"

VAL_RAW = DATA_BASE + "/val"
VAL_DATA = VAL_RAW + "/norm"

# The Afrikaans and English files share the same stems and differ only in the
# language suffix, so both lists are derived from one stem tuple per split.
_TRAIN_STEMS = (
    "data414_2021_a1",
    "data414_2021_a2",
    "data414_2020_a1",
    "ss414_2018_a1",
    "ss414_2018_a2",
    "ss414_2018_a3",
    "ss414_2019_a1",
    "ss414_2019_a2",
    "ss414_2019_a3",
)
_VAL_STEMS = (
    "compsys414_2017_a1",
    "compsys414_2017_a2",
    "compsys414_2017_a3",
)

TRAIN_AFRIKAANS = [f"{stem}.af.txt" for stem in _TRAIN_STEMS]
TRAIN_ENGLISH = [f"{stem}.en.txt" for stem in _TRAIN_STEMS]

VAL_AFRIKAANS = [f"{stem}.af.txt" for stem in _VAL_STEMS]
VAL_ENGLISH = [f"{stem}.en.txt" for stem in _VAL_STEMS]

In [3]:
# TRAIN_DATA
# Run the imported preprocess_data helper once per language, Afrikaans first.
# NOTE(review): presumably this writes "<language>.txt" into TRAIN_DATA, since
# the Corpus cell later reads those paths - confirm against src.Normalizer.
for _files, _language in (
    (TRAIN_AFRIKAANS, "afrikaans"),
    (TRAIN_ENGLISH, "english"),
):
    preprocess_data(TRAIN_RAW, TRAIN_DATA, _files, _language)

Done for afrikaans!
Done for english!


In [4]:
# VAL_DATA
# Same preprocess_data call as for the training split, applied to the
# validation file lists (Afrikaans first, then English).
for _files, _language in (
    (VAL_AFRIKAANS, "afrikaans"),
    (VAL_ENGLISH, "english"),
):
    preprocess_data(VAL_RAW, VAL_DATA, _files, _language)

Done for afrikaans!
Done for english!


## Corpus

In [33]:
class Corpus:
    """Vocabulary and integer-encoded sentences for one language.

    The file is read once at construction time. Every line is split on
    whitespace into tokens, each unseen token is given the next free id, and
    ``data`` ends up holding one list of token ids per line.

    Attributes:
        file_name: Path of the text file that was read.
        lang: Language label supplied by the caller.
        vocab_size: Number of vocabulary entries, reserved tokens included.
        data: Encoded sentences, one ``list[int]`` per line of the file.
        stoi: Mapping token -> id.
        itos: Mapping id -> token (inverse of ``stoi``).
    """

    # Reserved tokens; the position in this tuple is the token's id.
    SPECIAL_TOKENS = (
        "<pad>",
        "<sos>",
        "<eos>",
        "<unk>",
        "<num>",
        "<com>",
        "<prc>",
        "<opn>",
        "<cld>",
        "<apo>",
        "<ltx>",
    )

    def __init__(self, file_name: str, lang: str):
        """Read ``file_name``, build the vocabulary and encode every line.

        Args:
            file_name: Path to a text file with one sentence per line.
            lang: Label stored on the instance as ``lang``.
        """
        self.file_name = file_name
        self.lang = lang
        self.data = []
        # Both directions are derived from one tuple so they cannot drift apart.
        self.stoi: dict[str, int] = {
            token: index for index, token in enumerate(self.SPECIAL_TOKENS)
        }
        self.itos: dict[int, str] = {
            index: token for token, index in self.stoi.items()
        }
        self.vocab_size = len(self.stoi)
        self.__init_data()
        self.__encode()

    def __init_data(self):
        """Load the tokenised lines into ``data`` and grow the vocabulary."""
        with open(self.file_name, "r") as file:
            for line in file:
                tokens = line.strip().split()
                self.data.append(tokens)
                for word in tokens:
                    # Membership test rather than truthiness of the id:
                    # "<pad>" has id 0, which is falsy, and would otherwise be
                    # registered a second time under a new id.
                    if word not in self.stoi:
                        index = self.vocab_size
                        self.stoi[word] = index
                        self.itos[index] = word
                        self.vocab_size += 1

    def __encode(self):
        """Replace the token sentences in ``data`` with their id sentences."""
        self.data = [[self.stoi[word] for word in sentence] for sentence in self.data]

    def decode(self, data):
        """Convert sentences of token ids back into sentences of tokens.

        Args:
            data: Iterable of sentences, each an iterable of ids. Each id is
                passed through ``int()`` so integer-like scalars also work.

        Returns:
            A list of sentences, each a list of token strings.

        Raises:
            KeyError: If an id is not present in the vocabulary.
        """
        return [[self.itos[int(index)] for index in sentence] for sentence in data]

In [40]:
# One Corpus per language, each read from its "<language>.txt" file in the
# training "norm" directory.
afrikaans = Corpus(file_name=f"{TRAIN_DATA}/afrikaans.txt", lang="Afrikaans")
english = Corpus(file_name=f"{TRAIN_DATA}/english.txt", lang="English")

## Torch data

In [41]:
import torch
from torch.utils.data import Dataset, DataLoader

In [44]:
class LangData(Dataset):
    """Dataset of position-aligned (source, target) sentence pairs.

    Both constructor arguments must expose a ``data`` sequence; item ``i`` of
    the dataset is the pair made of the ``i``-th entry of each sequence.
    """

    def __init__(self, source, target):
        """Store both sequences after checking that their lengths agree.

        Raises:
            RuntimeError: If ``source.data`` and ``target.data`` differ in length.
        """
        source_sentences, target_sentences = source.data, target.data
        if len(source_sentences) != len(target_sentences):
            raise RuntimeError("Source and target must have the same lenght")
        self.source = source_sentences
        self.target = target_sentences

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.source)

    def __getitem__(self, idx):
        """Return the ``(source, target)`` pair stored at ``idx``."""
        pair = (self.source[idx], self.target[idx])
        return pair

def collate_fn(batch):
    """
    Pad shorter sequences with 0 (<pad>) to match the longest sequence,
    so that every sequence in the batch has the same length.

    Args:
        batch: Iterable of ``(source, target)`` pairs. Each element of a pair
            is a sequence of token ids (a plain list or a 1-D tensor).

    Returns:
        A ``(source, target)`` tuple of ``torch.long`` tensors, batch first.
        Each tensor is padded independently to the longest sequence on its
        own side.
    """
    source, target = zip(*batch)
    # pad_sequence only accepts tensors, while the dataset yields plain lists.
    source = [torch.as_tensor(sequence, dtype=torch.long) for sequence in source]
    target = [torch.as_tensor(sequence, dtype=torch.long) for sequence in target]
    # Referenced through the torch namespace because the notebook only
    # imports `torch` itself, not `pad_sequence`.
    pad_sequence = torch.nn.utils.rnn.pad_sequence
    source = pad_sequence(source, batch_first=True, padding_value=0)
    target = pad_sequence(target, batch_first=True, padding_value=0)
    return source, target


def dataLoader(dataset, batch_size):
    """Wrap ``dataset`` in a DataLoader that batches with ``collate_fn``.

    Args:
        dataset: Dataset to draw samples from.
        batch_size: Number of samples per batch.

    Returns:
        The configured ``DataLoader``.
    """
    loader = DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        collate_fn=collate_fn,
    )
    return loader

In [45]:
# English is the source side and Afrikaans the target side of each pair.
train_data = LangData(source=english, target=afrikaans)

In [47]:
# Show the first (source, target) pair held by the dataset.
train_data[0]

([1,
  11,
  12,
  13,
  14,
  15,
  16,
  17,
  11,
  18,
  19,
  20,
  21,
  22,
  23,
  24,
  25,
  26,
  27,
  28,
  29,
  30,
  31,
  32,
  4,
  33,
  34,
  2],
 [1,
  11,
  12,
  13,
  14,
  15,
  16,
  11,
  17,
  18,
  11,
  19,
  20,
  21,
  22,
  18,
  23,
  24,
  25,
  26,
  27,
  28,
  29,
  30,
  4,
  31,
  27,
  32,
  2])