In [1]:
# ----------------------------------------------------------------------------
# Project     : Chameleon - Make KO-EN translator
# Created By  : Eungi
# Team        : Product Growth - Data Science
# Created Date: 2023-07-06
# Updated Date: 2023-07-07
# Purpose     : Make data_loader for loading corpus data
# version     : 0.0.1
# ---------------------------------------------------------------------------

In [2]:
import pandas as pd
import numpy as np
import psutil
import os
import pickle
from tqdm import tqdm
from ToolCadeau.preprocessors.refiner import refine

# Train - Valid - Test
- make the `src` and `tgt` columns
- tokenize `src` with `SentencePiece`, and tokenize `tgt` accordingly
- create the train, valid, and test sets

In [3]:
def read_data(file_path, delimiter="\t"):
    """Load a two-column parallel corpus file into a DataFrame.

    Every non-blank line must contain exactly two fields separated by
    ``delimiter``: the Korean sentence followed by the English sentence.
    Leading/trailing whitespace of the line is removed before splitting.

    Args:
        file_path (str): Path of the corpus file.
        delimiter (str): Field separator inside a line. Defaults to a tab.

    Returns:
        pd.DataFrame: Columns ``ko`` and ``en``, one row per non-blank line.

    Raises:
        ValueError: If a non-blank line does not split into exactly two fields.
    """
    kos, ens = [], []

    # Decode explicitly instead of relying on the platform default encoding.
    # NOTE(review): assumes the corpus is stored as UTF-8 - confirm.
    with open(file_path, "r", encoding="utf-8") as file:
        for line_no, line in enumerate(file, start=1):
            stripped = line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing newline at EOF) carry no pair.
                continue

            fields = stripped.split(delimiter)
            if len(fields) != 2:
                raise ValueError(
                    f"{file_path}: line {line_no} has {len(fields)} field(s), "
                    f"expected 2 separated by {delimiter!r}"
                )

            ko, en = fields
            kos.append(ko)
            ens.append(en)

    return pd.DataFrame({"ko": kos, "en": ens})


# Root directory for the corpus and every artifact written by this notebook.
# Paths are built with plain string concatenation, so the trailing slash matters.
DATA_ROOT = "../data/"
# Load the raw parallel corpus into a DataFrame with "ko" and "en" columns.
data = read_data(DATA_ROOT + "corpus.txt")

In [4]:
def refine_data(data, regex_fn):
    """Run ``refine`` over every column of ``data`` and store the results back.

    Args:
        data (pd.DataFrame): Frame whose columns are refined one by one.
            The frame is modified in place.
        regex_fn (str): Path handed to ``refine`` as its ``regex_fn`` argument.

    Returns:
        pd.DataFrame: The same ``data`` object, with each column replaced by
        the value ``refine`` returned for it.
    """
    column_names = data.columns.unique().tolist()

    # tqdm only wraps the iteration to show per-column progress.
    for name in tqdm(column_names):
        refined_column = refine(data, column=name, regex_fn=regex_fn)
        data[name] = refined_column

    return data


# Path that refine_data forwards to `refine` as `regex_fn`.
REGEX_FN = "./refine.regex.txt"
# Refine every column; `data` is rebound to the frame returned by refine_data.
data = refine_data(data, REGEX_FN)

  0%|          | 0/2 [00:00<?, ?it/s]

100%|██████████| 2/2 [02:10<00:00, 65.18s/it]


In [6]:
# Persist the refined corpus so the refinement step does not have to be repeated.
data.to_pickle(DATA_ROOT + "corpus.refine.pickle")

In [7]:
from sklearn.model_selection import train_test_split


def split_train_valid_test(data, holdout_size=0.2, random_state=1004):
    """Split ``data`` into train / valid / test frames and pickle each of them.

    The data is first split into a training part and a hold-out part; the
    hold-out part is then split in half into the test and validation sets.
    All three frames get a fresh 0..n-1 index, their sizes are printed, and
    they are written under ``DATA_ROOT`` as ``chameleon.{train,valid,test}.pickle``.

    Args:
        data (pd.DataFrame): Full corpus to split.
        holdout_size (float): Share of ``data`` reserved for validation and
            test together. Defaults to 0.2 (the previously hard-coded value).
        random_state (int): Seed used for both splits. Defaults to 1004
            (the previously hard-coded value).

    Returns:
        tuple: ``(train_data, valid_data, test_data)``.
    """
    train_data, holdout_data = train_test_split(
        data,
        test_size=holdout_size,
        random_state=random_state,
    )

    # The first returned frame is the complement of `test_size`, the second
    # one has size `test_size`; with 0.5 both halves are (almost) equal.
    test_data, valid_data = train_test_split(
        holdout_data, test_size=0.5, random_state=random_state
    )

    train_data = train_data.reset_index(drop=True)
    valid_data = valid_data.reset_index(drop=True)
    test_data = test_data.reset_index(drop=True)

    print(len(train_data), len(valid_data), len(test_data))

    train_data.to_pickle(DATA_ROOT + "chameleon.train.pickle")
    valid_data.to_pickle(DATA_ROOT + "chameleon.valid.pickle")
    test_data.to_pickle(DATA_ROOT + "chameleon.test.pickle")

    return train_data, valid_data, test_data


# Split the refined corpus; the call also prints the three sizes and writes the pickles.
train_data, valid_data, test_data = split_train_valid_test(data)

1281934 160242 160242


In [10]:
def save_train_srcs(train_data, data_root=None):
    """Write the unique sentences of every column to a plain-text file.

    For each column ``col`` a file ``chameleon.train.{col}.txt`` is written,
    one stripped sentence per line. These files are the input of ``train_spm``.
    Uniqueness is determined before stripping, and a sentence that is empty
    after stripping is written as an empty line.

    Args:
        train_data (pd.DataFrame): Training frame; every column must hold strings.
        data_root (str | None): Directory prefix (with trailing separator) the
            files are written under. Defaults to the module-level ``DATA_ROOT``.
    """
    root = DATA_ROOT if data_root is None else data_root

    for col in train_data.columns.unique().tolist():
        path = root + f"chameleon.train.{col}.txt"
        sentences = train_data[col].unique().tolist()

        # Write UTF-8 explicitly so the Korean text does not depend on the
        # platform default encoding.
        # NOTE(review): SentencePiece is expected to read UTF-8 input - confirm.
        with open(path, "w", encoding="utf-8") as file:
            # strip() of a blank sentence is "", so a single expression covers
            # both the non-empty and the empty case of the former if/else.
            file.writelines(sentence.strip() + "\n" for sentence in sentences)


# Dump the training sentences of each column to text files under DATA_ROOT.
save_train_srcs(train_data)

In [11]:
import sentencepiece as spm


def train_spm(cols, vocab_size=None, model_type="unigram") -> None:
    """Train one SentencePiece model per column.

    For each ``col`` the trainer reads ``DATA_ROOT + "chameleon.train.{col}.txt"``
    and writes its model files with the prefix ``DATA_ROOT + "{col}.train.bpe"``.

    Args:
        cols (list[str]): Column names to train a model for.
        vocab_size (dict | None): Maps column name to vocabulary size.
            Defaults to ``{"ko": 50000, "en": 30000}``.
        model_type (str): SentencePiece model type. Defaults to ``"unigram"``,
            which is what the trainer used when no type was passed.
            NOTE(review): the output prefix says "bpe" although the logged
            trainer spec reports UNIGRAM; pass ``model_type="bpe"`` if BPE was
            the intent - confirm before changing, models must be retrained.

    Raises:
        KeyError: If a column in ``cols`` has no entry in ``vocab_size``.
    """
    if vocab_size is None:
        vocab_size = {"ko": 50000, "en": 30000}

    # Fail before any (long-running) training starts rather than midway.
    missing = [col for col in cols if col not in vocab_size]
    if missing:
        raise KeyError(f"no vocab_size configured for column(s): {missing}")

    for col in cols:
        spm.SentencePieceTrainer.train(
            input=DATA_ROOT + f"chameleon.train.{col}.txt",
            model_prefix=DATA_ROOT + f"{col}.train.bpe",
            vocab_size=vocab_size[col],
            model_type=model_type,
        )


# Column names of the refined corpus decide which SentencePiece models are trained.
cols = data.columns.unique().tolist()
# ['ko', 'en']
train_spm(cols)

sentencepiece_trainer.cc(77) LOG(INFO) Starts training with : 
trainer_spec {
  input: ../data/chameleon.train.ko.txt
  input_format: 
  model_prefix: ../data/ko.train.bpe
  model_type: UNIGRAM
  vocab_size: 50000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
  enable_differential_privacy: 0
  dif

In [13]:
def tokenize_train_valid_test_src(train_data, valid_data, test_data):
    """Add SentencePiece-tokenized ``tok_{col}`` columns to the three splits.

    For every source column of ``train_data`` the model
    ``DATA_ROOT + "{col}.train.bpe.model"`` is loaded, each split's column is
    encoded into pieces, and the pieces are joined with a single space and
    stored in a new ``tok_{col}`` column. The frames are modified in place.

    Args:
        train_data (pd.DataFrame): Training split.
        valid_data (pd.DataFrame): Validation split.
        test_data (pd.DataFrame): Test split.

    Returns:
        tuple: The same three frames, ``(train_data, valid_data, test_data)``.
    """
    # Skip columns produced by an earlier run of this function so the cell can
    # be re-executed without looking for a "tok_*.train.bpe.model" file.
    cols = [
        col
        for col in train_data.columns.unique().tolist()
        if not str(col).startswith("tok_")
    ]
    # ['ko', 'en']

    # Iterate over the frames directly instead of resolving them by name
    # through locals(), which also avoided shadowing the builtin `type`.
    splits = (train_data, valid_data, test_data)

    for col in cols:
        sp = spm.SentencePieceProcessor(model_file=DATA_ROOT + f"{col}.train.bpe.model")

        for split in splits:
            encoded_sents = sp.encode(split[col].tolist(), out_type=str)
            split[f"tok_{col}"] = [" ".join(pieces) for pieces in encoded_sents]

    return train_data, valid_data, test_data


# Add the tok_* columns to all three splits using the models trained above.
train_data, valid_data, test_data = tokenize_train_valid_test_src(
    train_data, valid_data, test_data
)

In [15]:
# Persist the splits again, now including the tokenized tok_* columns.
train_data.to_pickle(DATA_ROOT + "chameleon.train.tok.pickle")
valid_data.to_pickle(DATA_ROOT + "chameleon.valid.tok.pickle")
test_data.to_pickle(DATA_ROOT + "chameleon.test.tok.pickle")

## Make src and tgt
- use `Vocabulary`, `TranslationDataset`, and `DataLoader` to build the data loader

In [17]:
from itertools import repeat
from collections import defaultdict
import torch


class Vocabulary(object):
    """Bidirectional token <-> index mapping with four reserved special tokens.

    Indices 0-3 are always ``<PAD>``, ``<BOS>``, ``<EOS>`` and ``<UNK>``;
    ``build_vocab`` appends corpus tokens after them, most frequent first.

    Args:
        min_freq (int): Minimum corpus frequency a token needs to be added.
        max_vocab (int): Upper bound on the vocabulary size, special tokens included.
    """

    # pre-defined token idx
    PAD, BOS, EOS, UNK = 0, 1, 2, 3

    def __init__(
        self,
        min_freq=1,
        max_vocab=99999,
    ):
        # Default Vocabulary
        self.itos = {
            Vocabulary.PAD: "<PAD>",
            Vocabulary.BOS: "<BOS>",
            Vocabulary.EOS: "<EOS>",
            Vocabulary.UNK: "<UNK>",
        }
        self.stoi = {token: idx for idx, token in self.itos.items()}

        self.min_freq = min_freq
        self.max_vocab = max_vocab

    def __len__(self):
        """Return the number of entries, special tokens included."""
        return len(self.itos)

    @staticmethod
    def tokenizer(text, delimiter):
        """Split ``text`` on ``delimiter`` and strip whitespace from each piece."""
        return [tok.strip() for tok in text.split(delimiter)]

    def build_vocab(self, sents, delimiter):
        """Count tokens in ``sents`` and append the qualifying ones to the vocabulary.

        Tokens are kept when their frequency is at least ``min_freq`` and are
        added in descending frequency order (ties keep first-seen order) until
        the vocabulary holds ``max_vocab`` entries. Prints the resulting size.

        Args:
            sents (iterable of str): Delimited sentences.
            delimiter (str): Delimiter passed to ``tokenizer``.
        """
        # bag of words
        bow = defaultdict(int)

        for sent in sents:
            for word in self.tokenizer(sent, delimiter=delimiter):
                bow[word] += 1

        # Drop low-frequency words, and words that are already registered:
        # re-adding e.g. a literal "<UNK>" from the corpus would overwrite the
        # special token's stoi entry and change the unknown-token index.
        candidates = [
            (word, freq)
            for word, freq in bow.items()
            if freq >= self.min_freq and word not in self.stoi
        ]
        candidates.sort(key=lambda x: -x[1])

        # limit size of the vocab; clamp at 0 so a max_vocab smaller than the
        # number of existing entries does not turn into a negative slice.
        room = max(self.max_vocab - len(self.itos), 0)

        # create vocab
        next_idx = len(self.itos)
        for word, _ in candidates[:room]:
            self.stoi[word] = next_idx
            self.itos[next_idx] = word
            next_idx += 1

        print("Number of vocabularies: ", len(self))

    def _encode_tokens(self, tokens):
        """Map tokens to indices, falling back to UNK for unknown tokens."""
        return [self.stoi.get(token, Vocabulary.UNK) for token in tokens]

    def encode(self, text, delimiter):
        """
        Encode text input. Support batch input.
        Return list.

        A ``list`` or ``tuple`` of strings is treated as a batch and yields a
        list of index lists; a single string yields one flat index list.
        """
        if isinstance(text, (list, tuple)):
            # |text| = [text1, text2, ...]
            # |encoded_text| = [[token_idx1, token_idx2], [token_idx1, token_idx2]]
            return [
                self._encode_tokens(self.tokenizer(sent, delimiter))
                for sent in text
            ]

        # |text| = str
        # |encoded_text| = [token_idx1, token_idx2, ...]
        return self._encode_tokens(self.tokenizer(text, delimiter=delimiter))

    def _decode_ids(self, ids, delimiter, removed_indice):
        """Turn one index sequence into a delimiter-joined, stripped string."""
        unk_token = self.itos[Vocabulary.UNK]
        tokens = [
            self.itos.get(idx, unk_token)
            for idx in ids
            if idx not in removed_indice
        ]
        return delimiter.join(tokens).strip()

    # The default is a tuple so no mutable object is shared between calls.
    def decode(self, indice, delimiter, removed_indice=(BOS, EOS, PAD)):
        """
        Decode indice input. Support batch input.
        Return list.

        Args:
            indice (torch.Tensor | list): One index sequence, or a batch of them
                (2-D tensor or list of lists).
            delimiter (str): String placed between decoded tokens.
            removed_indice (sequence of int): Indices dropped from the output.
                Indices missing from the vocabulary decode to ``<UNK>``.

        Returns:
            list of str: One decoded string per sequence (a single-element
            list for non-batch input).
        """
        # check if indice is batch input
        if isinstance(indice, torch.Tensor):
            if indice.ndim == 0:
                # A scalar tensor would become a bare int in tolist().
                indice = indice.unsqueeze(0)
            is_nested = indice.ndim > 1
            indice = indice.tolist()
        else:
            is_nested = any(isinstance(elm, (list, tuple)) for elm in indice)

        # |rows| = [[idx1, idx2, ...], [idx1, idx2, ...]]
        rows = indice if is_nested else [indice]

        return [self._decode_ids(row, delimiter, removed_indice) for row in rows]

In [19]:
# Source-side vocabulary built from the tokenized Korean training column.
src_vocab = Vocabulary()
src_vocab.build_vocab(train_data["tok_ko"], delimiter=" ")

# Target-side vocabulary built from the tokenized English training column.
tgt_vocab = Vocabulary()
tgt_vocab.build_vocab(train_data["tok_en"], delimiter=" ")

Number of vocabularies:  53430
Number of vocabularies:  30488


In [39]:
from torch.utils.data import Dataset, DataLoader
from types import NoneType
from typing import Type, Union, List


class TranslationDataset(Dataset):
    """
    Map-style dataset that yields encoded (source, target) pairs.

    Args:
        srcs (list): Sources to be used as the input data.
            Note. Sources must be tokenized before putting into Dataset.
        tgts (list): Targets to be used as the target data.
            Note. Targets must be tokenized before putting into Dataset.
        min_freq (int): Minimum frequency to be included in the vocabulary. Defaults to 1.
        max_vocab (int): Maximum size of vocabulary. Defaults to 99999.
        src_delimiter (str): Delimiter to tokenize the srcs. Defaults to " ".
        tgt_delimiter (str): Delimiter to tokenize the tgts. Defaults to " ".
        src_vocab (Vocabulary): Vocabulary to encode or decode the srcs of the validation_set and test_set.
            Defaults to None, in which case it is built from ``srcs``.
        tgt_vocab (Vocabulary): Vocabulary to encode or decode the tgts of the validation_set and test_set.
            Defaults to None, in which case it is built from ``tgts``.
        with_text (bool): Whether to include raw text in the output when calling __getitem__ method.
            It is used in evaluation and reinforcement learning. Defaults to False.
    """

    def __init__(
        self,
        srcs: List[str],
        tgts: List[str],
        min_freq: int = 1,
        max_vocab: int = 99999,
        src_delimiter: str = " ",
        tgt_delimiter: str = " ",
        # Instances are expected here, not the class object; the forward
        # reference keeps the annotation independent of types.NoneType.
        src_vocab: Union["Vocabulary", None] = None,
        tgt_vocab: Union["Vocabulary", None] = None,
        with_text: bool = False,
    ):
        # Originally, srcs and tgts both must have been tokenized using BPE before.
        # But in agri translation model, tgts were tokenized with custom tokenization.
        # Instead, tgts have to be delimited by tgt_delimiter before.
        self.srcs, self.tgts = srcs, tgts
        self.src_delimiter, self.tgt_delimiter = src_delimiter, tgt_delimiter

        # If with_text is True, not only the encoded_src and encoded_tgt,
        # the raw src and tgt text would be returned together when __getitem__ is called.
        self.with_text = with_text

        # A train dataset builds its own vocabularies; validation/test datasets
        # reuse the ones originating from the train dataset. Each side is
        # handled independently so a supplied vocabulary is never discarded
        # just because the other one is missing.
        if src_vocab is None:
            src_vocab = Vocabulary(min_freq=min_freq, max_vocab=max_vocab)
            src_vocab.build_vocab(self.srcs, delimiter=src_delimiter)
        if tgt_vocab is None:
            tgt_vocab = Vocabulary(min_freq=min_freq, max_vocab=max_vocab)
            tgt_vocab.build_vocab(self.tgts, delimiter=tgt_delimiter)

        self.src_vocab = src_vocab
        self.tgt_vocab = tgt_vocab

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.srcs)

    def __getitem__(self, idx):
        """Return the encoded pair at ``idx``.

        Returns:
            dict: ``"src"`` and ``"tgt"`` as 1-D long tensors (the target is
            wrapped in BOS/EOS); plus ``"src_text"`` and ``"tgt_text"`` with
            the raw strings when ``with_text`` is True.
        """
        src, tgt = self.srcs[idx], self.tgts[idx]

        # encode src
        encoded_src = self.src_vocab.encode(src, delimiter=self.src_delimiter)

        # In seq2seq structure, tgt must have BOS and EOS token at the beginning and the end.
        # The indices are read from the target vocabulary in use.
        encoded_tgt = [
            self.tgt_vocab.BOS,
            *self.tgt_vocab.encode(tgt, delimiter=self.tgt_delimiter),
            self.tgt_vocab.EOS,
        ]

        # An explicit dtype keeps the tensors integer-typed even if a sequence is empty.
        return_value = {
            "src": torch.tensor(encoded_src, dtype=torch.long),
            "tgt": torch.tensor(encoded_tgt, dtype=torch.long),
        }

        # src_txt and tgt_txt would be used in inference and evaluation
        if self.with_text:
            return_value["src_text"] = src
            return_value["tgt_text"] = tgt

        return return_value

In [40]:
# Build the training dataset; no vocabularies are passed, so they are built
# from these sentences.
# NOTE(review): binding the instance to `Dataset` shadows the
# `torch.utils.data.Dataset` class imported above. Anything defined after this
# cell that subclasses `Dataset` would pick up this instance instead; a name
# such as `train_dataset` would avoid that.
Dataset = TranslationDataset(
    srcs=train_data["tok_ko"].tolist(), tgts=train_data["tok_en"].tolist()
)

Number of vocabularies:  53430
Number of vocabularies:  30488


In [41]:
# Number of sentence pairs in the training dataset.
len(Dataset)

1281934

In [42]:
# Round-trip check: encode one sample through __getitem__, then decode it again.
# decode() drops BOS/EOS/PAD by default, so only the tokens are printed.
print(Dataset.src_vocab.decode(Dataset[170000]["src"], delimiter=" "))
print(Dataset.tgt_vocab.decode(Dataset[170000]["tgt"], delimiter=" "))

['▁잠 잘 ▁때 ▁모습이 ▁어떻게 ▁저렇게 ▁웃 길 ▁수 ▁있죠 ?']
['▁How ▁can ▁one ▁look ▁so ▁funny ▁while ▁sleeping ?']


In [72]:
from torch.nn.utils.rnn import pad_sequence


class TranslationCollator:
    def __init__(self, pad_idx: int, max_length: int, with_text: bool = False):
        """
        Usages:
            It is used as a parameter in DataLoader.
            Collate batch srcs or tgts and process it to make batch loader.
            Add length of each src and tgt, and add pad token according to the length of batch.
        Args:
            pad_idx (int): Index of pad_token.
            max_length (int): Max length of the encoded_srcs or encoded_tgts;
                longer sequences are truncated to this many items.
            with_text (bool): Whether to include raw text in the output.
        """
        self.pad_idx = pad_idx
        self.max_length = max_length
        self.with_text = with_text

    def _truncate_and_pad(self, sequences):
        """Truncate each tensor to max_length, then pad them to a common length.

        Returns:
            tuple: ``(padded, lengths)`` where ``padded`` has shape
            (batch_size, batch_max_length) and ``lengths`` is a long tensor
            with the post-truncation length of every sequence.
        """
        truncated = [seq[: self.max_length] for seq in sequences]
        lengths = torch.tensor([len(seq) for seq in truncated], dtype=torch.long)
        padded = pad_sequence(truncated, batch_first=True, padding_value=self.pad_idx)
        return padded, lengths

    def _truncate_text(self, text):
        """Keep the first max_length space-separated pieces of ``text``."""
        # NOTE(review): splits on a literal " " regardless of the delimiter the
        # dataset was built with - confirm this matches the caller's delimiters.
        return " ".join(text.split(" ")[: self.max_length])

    def __call__(self, batch):
        """Collate a list of dataset samples into padded batch tensors.

        Args:
            batch (list of dict): Samples holding 1-D tensors under "src" and
                "tgt", plus "src_text"/"tgt_text" when ``with_text`` is True.

        Returns:
            dict: ``"input_ids"`` and ``"output_ids"``, each a
            ``(padded_ids, lengths)`` tuple; plus ``"input_texts"`` and
            ``"output_texts"`` (truncated raw texts) when ``with_text`` is True.

        Raises:
            ValueError: If ``batch`` is empty.
        """
        # |batch| = [{"src": tensor[], "tgt": tensor[]}, {"src": tensor[], "tgt": tensor[]}...]
        if not batch:
            raise ValueError("TranslationCollator received an empty batch")

        # If length of src or target is larger than max_length, truncate it.
        # NOTE(review): truncating a target also removes its last index, which
        # is the EOS appended by TranslationDataset - confirm this is intended.
        return_value = {
            "input_ids": self._truncate_and_pad([sample["src"] for sample in batch]),
            "output_ids": self._truncate_and_pad([sample["tgt"] for sample in batch]),
        }

        # If there are raw text passed from batch, include them in the returned value
        if self.with_text:
            return_value["input_texts"] = [
                self._truncate_text(sample["src_text"]) for sample in batch
            ]
            return_value["output_texts"] = [
                self._truncate_text(sample["tgt_text"]) for sample in batch
            ]

        return return_value

In [73]:
# Tokenized training sentences as plain Python lists.
train_srcs = train_data["tok_ko"].tolist()
train_tgts = train_data["tok_en"].tolist()

# Despite the "test_" prefix these hold TRAINING pairs: the ones where either
# side has more than MAX_LENGTH space-separated tokens. They are used below to
# exercise the collator's truncation.
test_srcs, test_tgts = [], []

MAX_LENGTH = 128
for src, tgt in zip(train_srcs, train_tgts):
    if max(len(src.split(" ")), len(tgt.split(" "))) > MAX_LENGTH:
        test_srcs.append(src)
        test_tgts.append(tgt)

In [74]:
# Loader over the over-length samples only (test_srcs / test_tgts), one sample
# per batch. The dataset receives no vocabularies, so it builds its own from
# this subset rather than reusing the full training vocabulary.
train_loader = DataLoader(
    TranslationDataset(test_srcs, test_tgts, with_text=True),
    batch_size=1,
    shuffle=True,
    collate_fn=TranslationCollator(
        pad_idx=Vocabulary.PAD, max_length=MAX_LENGTH, with_text=True
    ),
)

Number of vocabularies:  1314
Number of vocabularies:  1421


In [75]:
# Pull a single collated batch; shuffle=True means the sample differs between runs.
mini_batch = next(iter(train_loader))

In [76]:
# Display the collated batch: (padded ids, lengths) tuples plus the truncated texts.
mini_batch

{'input_ids': (tensor([[599, 600, 207, 214,   5, 293, 601, 602,  14, 603, 604, 605,  99, 606,
            21, 607,   4, 125, 608,  95, 609, 125, 610,  22, 611, 215, 612, 216,
           613,  27,   4, 163, 214, 614, 615,   8, 616, 217, 218, 617, 618, 619,
           620,   5, 621,   5, 622, 129, 623,   5, 624, 121, 130, 625, 294,   4,
           125, 214,   5, 293, 626, 627,  54, 628, 162, 295, 103,  11]]),
  tensor([68])),
 'output_ids': (tensor([[  1, 695,   9, 696, 697, 271, 698,   4, 699, 418, 369,  52,   4, 419,
           700, 701,   7, 702, 276,  92,   7,   4, 132,   5,  97,   7, 134,   6,
           109,  27,   4,  66,  40,   4, 136,  16, 703,  90,   9,  67, 131,   6,
           108, 420,   9, 704,   6, 277, 705,   5,  90,   9,  67, 131, 156,  41,
           162, 706, 421,   9, 707,   6, 112,   5, 422,   7, 708,  41,   6, 278,
            35, 163, 709,  14, 423,   7, 710, 711, 106, 712, 418,   6,  55,  21,
           713,   6, 714,   6,   7, 715, 716,   8, 717, 718, 719,  14, 7