# SPM on the Kowiki Dataset Using Hugging Face's Tokenizers

## 01. Dataset Load

- `glob` 모듈을 이용해 kowiki dataset을 로드한다.

In [1]:
from glob import glob

In [2]:
# Gather every path two levels below ../data/kowiki, in sorted order.
file_lists = sorted(glob('../data/kowiki/*/*'))

In [8]:
# file_lists

## 02. Data Preprocessing for SPM

- SentencePiece Model의 Input에 맞게끔 데이터를 처리해준다.

- `nltk` 모듈을 이용해 문서를 문장 단위로 분리한다.

- `spm` 모듈의 input을 위해 `spm_input.txt`를 생성한다.

- reformer를 pretraining 시키기 위한 corpus를 pkl로 저장한다.

In [3]:
import re
import json

import sentencepiece as spm

from nltk import sent_tokenize
from tqdm.notebook import tqdm

In [4]:
# Build a flat list of sentences from every kowiki dump file.
# Each non-blank line of a dump file is parsed as a JSON object whose 'text'
# field is normalized and then split into sentences.
whitespace_run = re.compile(r'\s+')
corpus = []

for file_path in tqdm(file_lists):
    # Stream the file line by line rather than loading it whole.
    with open(file_path, encoding="utf-8") as source:
        for line in source:
            if not line.strip():
                # A blank line is not valid JSON; skip it instead of crashing.
                continue
            article = json.loads(line)['text']
            # r'\s+' already matches newlines, so one pass both removes line
            # breaks and collapses repeated whitespace into a single space.
            article = whitespace_run.sub(' ', article)
            corpus.extend(sent_tokenize(article))

HBox(children=(FloatProgress(value=0.0, max=676.0), HTML(value='')))




In [15]:
# corpus

In [6]:
import dill

# Persist the sentence corpus with dill for the later Reformer pretraining step.
with open('../data/corpus/kowiki_corpus.pkl', mode='wb') as corpus_file:
    dill.dump(corpus, corpus_file)

In [17]:
input_file = '../data/spm/spm_input.txt'

# Write the corpus as plain text, one sentence per line.
with open(input_file, 'w', encoding='utf-8') as f:
    f.writelines(f'{sent}\n' for sent in corpus)

## 03. Train SentencePieceModel

- HuggingFace의 `tokenizers` 모듈을 이용하여 Tokenizer를 생성한다.

In [7]:
from tokenizers import SentencePieceBPETokenizer, BertWordPieceTokenizer

In [12]:
# Create an untrained tokenizer with the library's default settings.
# Alternative kept for reference (imported in the cell above):
# tokenizer = SentencePieceBPETokenizer()
# NOTE(review): the check output further down contains a decomposed jamo token
# ('##ᆫ'), which suggests the default normalization splits Hangul syllables —
# presumably via strip_accents; confirm, and if so pass strip_accents=False
# both here and wherever the saved vocab is reloaded.
tokenizer = BertWordPieceTokenizer()

In [13]:
%%time 

# Fit the vocabulary on the one-sentence-per-line text file, showing progress.
tokenizer.train(['../data/spm/spm_input.txt'], show_progress=True)

CPU times: user 6min 52s, sys: 8.84 s, total: 7min 1s
Wall time: 6min 56s


In [14]:
# Write the trained tokenizer's files into ../data/tokenizers/; the recorded
# cell output lists the single file produced (vocab.txt).
# NOTE(review): newer `tokenizers` releases appear to use save_model(directory)
# for vocab files and save(path) for a single JSON file — confirm against the
# installed version before re-running.
tokenizer.save('../data/tokenizers/')

['../data/tokenizers/vocab.txt']

## 04. Check SPM

In [1]:
from tokenizers import BertWordPieceTokenizer

In [2]:
# Rebuild the tokenizer from the vocabulary file written by the training step.
tokenizer = BertWordPieceTokenizer('../data/tokenizers/vocab.txt')

In [3]:
# Sample Korean sentence used to sanity-check the reloaded tokenizer.
tmp_text = "지미 카터는 조지아주 섬터 카운티 플레인스 마을에서 태어났다."

# Encode the sample; printing the result shows only a summary of the Encoding
# (token count and available attributes), not the tokens themselves.
encoded = tokenizer.encode(tmp_text)
print(encoded)

Encoding(num_tokens=16, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing, original_str, normalized_str])


In [4]:
# Display the token strings of the encoded sample (the recorded output includes
# the [CLS]/[SEP] markers and '##'-prefixed continuation pieces).
encoded.tokens

['[CLS]',
 '지미',
 '카터',
 '##는',
 '조지아',
 '##주',
 '섬',
 '##터',
 '카운티',
 '플레이',
 '##ᆫ',
 '##스',
 '마을에서',
 '태어났다',
 '.',
 '[SEP]']