In [1]:
import pandas as pd
from ToolCadeau.preprocessors.refiner import refine

# Load the parallel corpus: each non-blank line holds "<Korean>\t<English>".
kos, ens = [], []
# Explicit encoding so the Korean text is decoded the same way on every
# platform instead of depending on the locale default.
# NOTE(review): assumes corpus.txt is UTF-8 -- confirm.
with open("../data/corpus/corpus.txt", "r", encoding="utf-8") as file:
    for line_no, line in enumerate(file, start=1):
        line = line.strip()
        if not line:
            # A blank line (e.g. at the end of the file) carries no sentence pair.
            continue
        fields = line.split("\t")
        if len(fields) != 2:
            # Same exception type the bare tuple-unpack raised, but it now
            # says which line is malformed.
            raise ValueError(
                f"corpus.txt line {line_no}: expected 2 tab-separated fields, "
                f"got {len(fields)}"
            )
        ko, en = fields
        kos.append(ko)
        ens.append(en)

In [2]:
# Put the aligned sentence lists side by side: column "ko" and column "en".
corpus = pd.DataFrame(data={"ko": kos, "en": ens})

refine corpus data

In [3]:
# refine data
from tqdm import tqdm

# Regex rule file handed to `refine`, and the corpus columns to clean.
regex_fn = "../utils/preprocessors/tools/refine.regex.txt"
cols = ["ko", "en"]

for col in tqdm(cols):
    # Store the refined column first, so the quote replacement below
    # operates on the refined text.
    corpus[col] = refine(corpus, column=col, regex_fn=regex_fn)
    # Replace every double quote with two backticks.
    corpus[col] = corpus[col].str.replace(r"[\"]", r"``", regex=True)

100%|████████████████████████████████████████████| 2/2 [03:21<00:00, 100.69s/it]


shuffle corpus data

In [5]:
# Shuffle all rows. The seed (the same 1004 the train/valid/test split uses)
# makes the row order -- and therefore the split files derived from it --
# reproducible after a kernel restart.
corpus = corpus.sample(frac=1, random_state=1004)

split into train, validation, and test data (80% / 10% / 10%)

In [6]:
from sklearn.model_selection import train_test_split

# First split: hold out 20% of the rows; the other 80% is the training set.
corpus_train, corpus_test = train_test_split(
    corpus,
    test_size=0.2,
    random_state=1004,
)

# Second split: halve the held-out rows. The first part stays the test set,
# the second part becomes the validation set.
corpus_test, corpus_valid = train_test_split(corpus_test, test_size=0.5, random_state=1004)

In [7]:
def save_data_to_file(data, fn, lang):
    """Write one sentence per line to ``../data/corpus/corpus.{fn}.{lang}``.

    Args:
        data: Iterable of strings. Each item is stripped of leading and
            trailing whitespace before it is written.
        fn: Split name used in the file name (this notebook passes
            "train", "valid" or "test").
        lang: Language suffix of the file name (this notebook passes
            "ko" or "en").
    """
    # Explicit UTF-8 so Korean text is written the same way on every platform
    # instead of depending on the locale's default encoding.
    with open(f"../data/corpus/corpus.{fn}.{lang}", "w", encoding="utf-8") as f:
        f.writelines(line.strip() + "\n" for line in data)


# Explicit split-name -> frame mapping, instead of resolving variable names
# through locals(); a missing split now fails right here with a NameError.
splits = {"train": corpus_train, "valid": corpus_valid, "test": corpus_test}

# Write one plain-text file per (language, split) pair.
for lang in ["ko", "en"]:
    for fn, corpus_ in splits.items():
        save_data_to_file(corpus_[lang], fn, lang)

Korean (ko) tokenization with MeCab

In [10]:
# Tokenize the Korean training split with MeCab (wakati output) and pipe the result through post_tokenize.py, which is also given the untokenized file as its argument.
! mecab -O wakati -b 9999 < ../data/corpus/corpus.train.ko | python3 ../preprocessing/post_tokenize.py ../data/corpus/corpus.train.ko > ../data/corpus/corpus.train.tok.ko

In [11]:
# Tokenize the Korean validation split with MeCab (wakati output) and pipe the result through post_tokenize.py, which is also given the untokenized file as its argument.
! mecab -O wakati -b 9999 < ../data/corpus/corpus.valid.ko | python3 ../preprocessing/post_tokenize.py ../data/corpus/corpus.valid.ko > ../data/corpus/corpus.valid.tok.ko

In [12]:
# Tokenize the Korean test split with MeCab (wakati output) and pipe the result through post_tokenize.py, which is also given the untokenized file as its argument.
! mecab -O wakati -b 9999 < ../data/corpus/corpus.test.ko | python3 ../preprocessing/post_tokenize.py ../data/corpus/corpus.test.ko > ../data/corpus/corpus.test.tok.ko

English (en) tokenization with nltk.word_tokenize

In [13]:
# Run the English training split through tokenizer.py, then through post_tokenize.py, which is also given the untokenized file as its argument.
! python3 ../preprocessing/tokenizer.py < ../data/corpus/corpus.train.en | python3 ../preprocessing/post_tokenize.py ../data/corpus/corpus.train.en > ../data/corpus/corpus.train.tok.en

In [14]:
# Run the English validation split through tokenizer.py, then through post_tokenize.py, which is also given the untokenized file as its argument.
! python3 ../preprocessing/tokenizer.py < ../data/corpus/corpus.valid.en | python3 ../preprocessing/post_tokenize.py ../data/corpus/corpus.valid.en > ../data/corpus/corpus.valid.tok.en

In [15]:
# Run the English test split through tokenizer.py, then through post_tokenize.py, which is also given the untokenized file as its argument.
! python3 ../preprocessing/tokenizer.py < ../data/corpus/corpus.test.en | python3 ../preprocessing/post_tokenize.py ../data/corpus/corpus.test.en > ../data/corpus/corpus.test.tok.en

subword segmentation (BPE)

In [16]:
# Learn the Korean BPE model from the tokenized training split only (--symbols 30000) and store it as ./translation/models/bpe.ko.model.
! python3 ./../preprocessing/learn_bpe.py --input ../data/corpus/corpus.train.tok.ko --output ./translation/models/bpe.ko.model --symbols 30000

In [17]:
# Learn the English BPE model from the tokenized training split only (--symbols 50000) and store it as ./translation/models/bpe.en.model.
! python3 ./../preprocessing/learn_bpe.py --input ../data/corpus/corpus.train.tok.en --output ./translation/models/bpe.en.model --symbols 50000

In [18]:
# Segment the tokenized Korean training split with the Korean BPE model.
! python3 ../preprocessing/apply_bpe.py -c ./translation/models/bpe.ko.model < ../data/corpus/corpus.train.tok.ko > ../data/corpus/corpus.train.tok.bpe.ko

In [19]:
# Segment the tokenized Korean validation split with the Korean BPE model.
! python3 ../preprocessing/apply_bpe.py -c ./translation/models/bpe.ko.model < ../data/corpus/corpus.valid.tok.ko > ../data/corpus/corpus.valid.tok.bpe.ko

In [20]:
# Segment the tokenized Korean test split with the Korean BPE model.
! python3 ../preprocessing/apply_bpe.py -c ./translation/models/bpe.ko.model < ../data/corpus/corpus.test.tok.ko > ../data/corpus/corpus.test.tok.bpe.ko

In [21]:
# Segment the tokenized English training split with the English BPE model.
! python3 ../preprocessing/apply_bpe.py -c ./translation/models/bpe.en.model < ../data/corpus/corpus.train.tok.en > ../data/corpus/corpus.train.tok.bpe.en

In [22]:
# Segment the tokenized English validation split with the English BPE model.
! python3 ../preprocessing/apply_bpe.py -c ./translation/models/bpe.en.model < ../data/corpus/corpus.valid.tok.en > ../data/corpus/corpus.valid.tok.bpe.en

In [23]:
# Segment the tokenized English test split with the English BPE model.
! python3 ../preprocessing/apply_bpe.py -c ./translation/models/bpe.en.model < ../data/corpus/corpus.test.tok.en > ../data/corpus/corpus.test.tok.bpe.en

save train_corpus, valid_corpus, test_corpus: combine the BPE-segmented ko/en files of each split into one TSV file

In [24]:
# Re-read the raw parallel corpus into `kos` / `ens`.
# NOTE(review): no later cell visible here reads these lists (save_data()
# builds its own) -- this cell looks redundant; confirm before removing.
kos, ens = [], []
# Explicit encoding so the Korean text is decoded the same way on every
# platform instead of depending on the locale default.
# NOTE(review): assumes corpus.txt is UTF-8 -- confirm.
with open("../data/corpus/corpus.txt", "r", encoding="utf-8") as file:
    for line_no, line in enumerate(file, start=1):
        line = line.strip()
        if not line:
            # A blank line (e.g. at the end of the file) carries no sentence pair.
            continue
        fields = line.split("\t")
        if len(fields) != 2:
            # Same exception type the bare tuple-unpack raised, but it now
            # says which line is malformed.
            raise ValueError(
                f"corpus.txt line {line_no}: expected 2 tab-separated fields, "
                f"got {len(fields)}"
            )
        ko, en = fields
        kos.append(ko)
        ens.append(en)

In [25]:
def save_data():
    """Merge the BPE-segmented Korean and English files of each split into a TSV.

    For each split in ("train", "valid", "test") this reads
    ``../data/corpus/corpus.{split}.tok.bpe.ko`` and
    ``../data/corpus/corpus.{split}.tok.bpe.en``, pairs line i of one file
    with line i of the other, and writes the pairs to
    ``../data/corpus/corpus.{split}.tsv`` with a ``ko<TAB>en`` header row.

    Raises:
        ValueError: If the two files of a split have different line counts.
    """
    for fn in ["train", "valid", "test"]:
        ko_path = f"../data/corpus/corpus.{fn}.tok.bpe.ko"
        en_path = f"../data/corpus/corpus.{fn}.tok.bpe.en"

        with open(ko_path, "r", encoding="utf-8") as ko, open(
            en_path, "r", encoding="utf-8"
        ) as en:
            # Drop the trailing newline of every line. Left in place, it ends
            # up inside the cell value and to_csv has to quote the field and
            # spread each record over two physical lines.
            kos = [ko_line.rstrip("\n") for ko_line in ko]
            ens = [en_line.rstrip("\n") for en_line in en]

        # zip() would silently truncate to the shorter file and hide a
        # misaligned corpus; fail loudly instead.
        if len(kos) != len(ens):
            raise ValueError(
                f"{ko_path} has {len(kos)} lines but {en_path} has {len(ens)}"
            )

        pairs = pd.DataFrame({"ko": kos, "en": ens})
        pairs.to_csv(f"../data/corpus/corpus.{fn}.tsv", sep="\t", index=False)


save_data()