In [1]:
import torch
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import Vocab
from torchtext.vocab import build_vocab_from_iterator

from konlpy.tag import Hannanum

from tqdm import tqdm
import pandas as pd
import os

In [2]:
# Korean side: hand torchtext the Hannanum morphological analyser's `morphs`
# method directly as the tokenizer callable.
kor_tokenizer = get_tokenizer(Hannanum().morphs)
# English side: spaCy pipeline. The full package name is used instead of the
# legacy 'en' shortcut link, which spaCy 3 no longer provides.
eng_tokenizer = get_tokenizer('spacy', language='en_core_web_sm')



In [3]:
def vocab_iterator(strings, tokenizer):
    """Lazily yield ``tokenizer(item)`` for every item of ``strings``.

    Args:
        strings: Iterable of raw entries to tokenize; it is wrapped in
            ``tqdm`` so progress is reported as items are consumed.
        tokenizer: Callable applied to each entry.

    Yields:
        Whatever ``tokenizer`` returns for each entry, in input order.
    """
    # map() is lazy, so each entry is tokenized only when the consumer asks
    # for the next item, and the progress bar advances at the same pace.
    yield from map(tokenizer, tqdm(strings))

In [4]:
# Korean vocabulary: build it from the training corpus only when no cached
# copy exists on disk, otherwise reuse the cached object.
kor_vocab_path = './datasets/korNeng_corpus/kor_vocab.pt'
if not os.path.isfile(kor_vocab_path):
    # No cache yet: tokenize the 'KOR' column of the training CSV.
    kor_vocab_base = pd.read_csv('./datasets/korNeng_corpus/train_data.csv')['KOR']
    kor_vocab = build_vocab_from_iterator(
        vocab_iterator(kor_vocab_base, kor_tokenizer),
        specials=['<unk>', '<pad>', '<bos>', '<eos>'],
        min_freq=5,
    )
    # Look-ups of tokens missing from the vocabulary fall back to '<unk>'.
    kor_vocab.set_default_index(kor_vocab['<unk>'])
    torch.save(kor_vocab, kor_vocab_path)
else:
    kor_vocab = torch.load(kor_vocab_path)

In [5]:
# English vocabulary: build it from the training corpus only when no cached
# copy exists on disk, otherwise reuse the cached object.
eng_vocab_path = './datasets/korNeng_corpus/eng_vocab.pt'
if not os.path.isfile(eng_vocab_path):
    # No cache yet: tokenize the 'ENG' column of the training CSV.
    eng_vocab_base = pd.read_csv('./datasets/korNeng_corpus/train_data.csv')['ENG']
    eng_vocab = build_vocab_from_iterator(
        vocab_iterator(eng_vocab_base, eng_tokenizer),
        specials=['<unk>', '<pad>', '<bos>', '<eos>'],
        min_freq=5,
    )
    # Look-ups of tokens missing from the vocabulary fall back to '<unk>'.
    eng_vocab.set_default_index(eng_vocab['<unk>'])
    torch.save(eng_vocab, eng_vocab_path)
else:
    eng_vocab = torch.load(eng_vocab_path)

In [6]:
def data_process(file_path):
    """Turn a parallel Korean/English CSV into pairs of token-index tensors.

    Args:
        file_path: Path to a CSV file containing 'KOR' and 'ENG' columns.

    Returns:
        A list with one ``(kor_tensor, eng_tensor)`` tuple per CSV row. Each
        tensor is a 1-D ``torch.long`` tensor of vocabulary indices, obtained
        by tokenizing the sentence with ``kor_tokenizer`` / ``eng_tokenizer``
        and looking every token up in ``kor_vocab`` / ``eng_vocab``.
    """
    # Parse the file once and take both columns from the same frame instead
    # of reading the CSV separately for each language.
    frame = pd.read_csv(file_path)
    sentence_pairs = zip(frame['KOR'], frame['ENG'])

    data = []
    # zip() has no length, so give tqdm the row count explicitly; this lets
    # the progress bar show a total and an ETA.
    for raw_kor, raw_eng in tqdm(sentence_pairs, total=len(frame)):
        kor_tensor_ = torch.tensor([kor_vocab[token] for token in kor_tokenizer(raw_kor)], dtype=torch.long)
        eng_tensor_ = torch.tensor([eng_vocab[token] for token in eng_tokenizer(raw_eng)], dtype=torch.long)
        data.append((kor_tensor_, eng_tensor_))

    return data


In [7]:
# Training pairs: tensorize the CSV only when no cached tensor file exists,
# otherwise load the cached list.
train_tensor_path = './datasets/korNeng_corpus/train_tensor.pt'
if not os.path.isfile(train_tensor_path):
    train_data = data_process('./datasets/korNeng_corpus/train_data.csv')
    torch.save(train_data, train_tensor_path)
else:
    train_data = torch.load(train_tensor_path)

In [8]:
# Validation pairs: tensorize the CSV only when no cached tensor file exists,
# otherwise load the cached list.
valid_tensor_path = './datasets/korNeng_corpus/valid_tensor.pt'
if not os.path.isfile(valid_tensor_path):
    valid_data = data_process('./datasets/korNeng_corpus/validation_data.csv')
    torch.save(valid_data, valid_tensor_path)
else:
    valid_data = torch.load(valid_tensor_path)

In [9]:
# Test pairs: tensorize the CSV only when no cached tensor file exists,
# otherwise load the cached list.
test_tensor_path = './datasets/korNeng_corpus/test_tensor.pt'
if not os.path.isfile(test_tensor_path):
    test_data = data_process('./datasets/korNeng_corpus/test_data.csv')
    torch.save(test_data, test_tensor_path)
else:
    test_data = torch.load(test_tensor_path)

In [11]:
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

BATCH_SIZE = 128
# Special-token indices are read from the Korean vocabulary, but
# generate_batch applies the same indices to the English sequences as well,
# so the two vocabularies must agree on them. Fail loudly if they do not.
PAD_IDX = kor_vocab['<pad>']
BOS_IDX = kor_vocab['<bos>']
EOS_IDX = kor_vocab['<eos>']
assert all(kor_vocab[tok] == eng_vocab[tok] for tok in ('<pad>', '<bos>', '<eos>')), \
    'kor_vocab and eng_vocab disagree on special-token indices'
# Use the GPU when one is available and fall back to the CPU otherwise, so
# the notebook also runs on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def generate_batch(data_batch, bos_idx=None, eos_idx=None, pad_idx=None):
    """Collate (kor, eng) tensor pairs into two padded batch tensors.

    Every sequence is wrapped as ``[bos] + tokens + [eos]`` and the sequences
    of each language are then padded to the longest one in the batch.

    Args:
        data_batch: Iterable of ``(kor_item, eng_item)`` pairs of 1-D index
            tensors.
        bos_idx: Index prepended to every sequence. Defaults to the
            module-level ``BOS_IDX`` when omitted.
        eos_idx: Index appended to every sequence. Defaults to the
            module-level ``EOS_IDX`` when omitted.
        pad_idx: Fill value used for padding. Defaults to the module-level
            ``PAD_IDX`` when omitted.

    Returns:
        ``(kor_batch, eng_batch)``. ``pad_sequence`` is called without
        ``batch_first``, so each tensor is laid out as
        ``(longest_sequence_length, batch_size)``.
    """
    # Resolve defaults at call time so the one-argument form used as a
    # DataLoader ``collate_fn`` keeps reading the module-level constants.
    bos_idx = BOS_IDX if bos_idx is None else bos_idx
    eos_idx = EOS_IDX if eos_idx is None else eos_idx
    pad_idx = PAD_IDX if pad_idx is None else pad_idx

    # The marker tensors are identical for every sample; build them once
    # instead of twice per sample inside the loop.
    bos = torch.tensor([bos_idx])
    eos = torch.tensor([eos_idx])

    kor_batch, eng_batch = [], []
    for kor_item, eng_item in data_batch:
        kor_batch.append(torch.cat([bos, kor_item, eos], dim=0))
        eng_batch.append(torch.cat([bos, eng_item, eos], dim=0))
    kor_batch = pad_sequence(kor_batch, padding_value=pad_idx)
    eng_batch = pad_sequence(eng_batch, padding_value=pad_idx)
    return kor_batch, eng_batch

# Only the training loader reshuffles its samples. Validation and test
# loaders keep the dataset order so evaluation runs are reproducible.
train_iter = DataLoader(train_data, batch_size=BATCH_SIZE,
                        shuffle=True, collate_fn=generate_batch)
valid_iter = DataLoader(valid_data, batch_size=BATCH_SIZE,
                        shuffle=False, collate_fn=generate_batch)
test_iter = DataLoader(test_data, batch_size=BATCH_SIZE,
                       shuffle=False, collate_fn=generate_batch)