# Sentece Piece 

- unsupervised text tokenizer and detokenizer 이며 주로 딥러닝에서 사용
- 내부적으로 BPE (byte-pair-encoding) 그리고 unigram language model 을 사용
- 특정 언어에 국한되지 않고, 다양한 언어에 사용 가능

- [논문](https://arxiv.org/pdf/1808.06226.pdf)
- [Github](https://github.com/google/sentencepiece)

설치

```
$ sudo apt-get install cmake build-essential pkg-config libgoogle-perftools-dev
$ pip install sentencepiece
```

In [88]:
from pathlib import Path
from tempfile import gettempdir
from typing import Dict, List

import numpy as np
import pandas as pd
import requests
import sentencepiece as stp
from konlpy.tag import Okt

okt = Okt()

# Data

In [142]:
def download(url, filename):
    path = Path(gettempdir()) / filename
    if not path.exists():
        with open(path, "wt") as f:
            r = requests.get(url, allow_redirects=True)
            f.write(r.text)

    df = pd.read_csv(test_path, delimiter="\t")
    return path, df


def preprocess_morph(text) -> str:
    morphs = okt.pos(str(text))
    tokens = []
    for word, pos in morphs:
        if pos in ("Punctuation", "Foreign"):
            continue

        tokens.append(word)
    return " ".join(tokens)


train_url = "https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt"
test_url = "https://github.com/e9t/nsmc/raw/master/ratings_test.txt"

train_path, train_df = download(train_url, "nsmc_train.txt")
test_path, test_df = download(test_url, "nsmc_test.txt")

train_df.dropna(inplace=True)
test_df.dropna(inplace=True)
train_df["morph"] = train_df.document.apply(preprocess_morph)
test_df["morph"] = test_df.document.apply(preprocess_morph)

print("train_path:", train_path)
print("test_path :", test_path)
print(f"train_df  : {train_df.shape}")
print(f"test_df   : {test_df.shape}")
test_df.sample(5)

train_path: /tmp/nsmc_train.txt
test_path : /tmp/nsmc_test.txt
train_df  : (49997, 4)
test_df   : (49997, 4)


Unnamed: 0,id,document,label,morph
38829,9068645,포도향이 진하게 전해지는 영화..근20년 지난 지금도..,1,포도 향 이 진하게 전해지는 영화 근 20년 지난 지금 도
48451,9635876,조니뎁 진짜 약빨고 연기한 거 같음,1,조니뎁 진짜 약 빨 고 연기 한 거 같음
43125,9320232,그냥 디즈니 닭 이라고 하자!,0,그냥 디즈니 닭 이라고 하자
22037,3748046,굿.,1,굿
16127,5566461,재미있습니다. 볼만해요.,1,재미있습니다 볼 만해 요


# Sentencepiece with text file

## Train with text File 
- input: 학습 파일 위치
- model_prefix: 모델이름
- vocab_size: vocabulary 단어 크기
- model_type: `unigram` (default) | `bpe` | `char` | `word`
- max_sentence_length: 문장 최대 길이
- pad_id: pad token ID
- unk_id: unknown token ID
- bos_id: Begin of sentence token ID
- eos_id: End of sentence token ID 
- user_defined_symbols: 사용자 정의 토큰

In [114]:
train_morph_path = Path(gettempdir()) / "sentencepiece-train.txt"
model_prefix_path = Path(gettempdir()) / "nsmc-sentencepiece"
train_df.morph.to_csv(train_morph_path, index=False, header=False)

stp.SentencePieceTrainer.train(
    input=train_morph_path,
    model_prefix=model_prefix_path,
    vocab_size=4000,
    user_defined_symbols=["foo", "bar"],
)

sentencepiece_trainer.cc(77) LOG(INFO) Starts training with : 
trainer_spec {
  input: /tmp/sentencepiece-train.txt
  input_format: 
  model_prefix: /tmp/nsmc-sentencepiece
  model_type: UNIGRAM
  vocab_size: 4000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  user_defined_symbols: foo
  user_defined_symbols: bar
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
}
normalizer

# Inference

In [253]:
sp = stp.SentencePieceProcessor()
sp.load(str(model_prefix_path.with_suffix(".model")))


text = test_df.sample().morph.values[0]
print('Text             :', text)
print('Encode as IDs   :', sp.EncodeAsIds(text))
print('Encode as Pieces:', sp.EncodeAsPieces(text))
print('Decode from IDs :', sp.decode(sp.Encode(text)))

Text             : 잼있고 신나는 영화 보드 타는 모습 너무 멋져
Encode as IDs   : [668, 15, 3089, 19, 7, 118, 145, 501, 19, 421, 23, 1139, 394]
Encode as Pieces: ['▁잼있', '고', '▁신나', '는', '▁영화', '▁보', '드', '▁타', '는', '▁모습', '▁너무', '▁멋', '져']
Decode from IDs : 잼있고 신나는 영화 보드 타는 모습 너무 멋져


In [254]:
text_list = test_df.sample(3).morph.tolist()

print('[Text]')
display(text_list)


print('\n[Encode]')
encoded = sp.encode(text_list)
print(encoded)

print('\n[Encode as Pieces]')
print([sp.encode_as_pieces(line) for line in text_list])

print('\n[Decode]')
sp.decode(encoded)

[Text]


['주인공 들 너 무답 답 미 리터 놓고말 좀하지 특히 여 주인공',
 '진짜 재밌게 봤고 다시 봐두 재미 에 감동 임창정 짱 ㅠㅂㅠ',
 '그때 나 지금 이나 군대 는']


[Encode]
[[195, 13, 680, 116, 1457, 983, 187, 480, 287, 5, 3993, 15, 413, 111, 889, 607, 146, 195], [34, 236, 5, 3999, 15, 143, 497, 806, 102, 10, 80, 171, 980, 160, 279, 328, 3196], [1309, 30, 205, 323, 688, 158, 14]]

[Encode as Pieces]
[['▁주인공', '▁들', '▁너', '▁무', '답', '▁답', '▁미', '▁리', '터', '▁', '놓', '고', '말', '▁좀', '하지', '▁특히', '▁여', '▁주인공'], ['▁진짜', '▁재밌게', '▁', '봤', '고', '▁다시', '▁봐', '두', '▁재미', '▁에', '▁감동', '▁임', '창', '정', '▁짱', '▁ᅲ', '뷰'], ['▁그때', '▁나', '▁지금', '▁이나', '▁군', '대', '▁는']]

[Decode]


['주인공 들 너 무답 답 미 리터 놓고말 좀하지 특히 여 주인공',
 '진짜 재밌게 봤고 다시 봐두 재미 에 감동 임창정 짱 ᅲ뷰',
 '그때 나 지금 이나 군대 는']