In [1]:
from tokenizers import BertWordPieceTokenizer

import pickle
import os


In [2]:
tokenizer = BertWordPieceTokenizer(
    vocab_file=None,
    clean_text=True,
    handle_chinese_chars=True,
    strip_accents=False, # Must be False if cased model
    lowercase=False,
    wordpieces_prefix="##"
    # Default Setting
#     , unk_token = '[UNK]'
#     , seq_token = '[SEP]'
#     , cls_token = '[CLS]'
#     , pad_token = '[PAD]'
#     , mask_token = '[MASK]'
)

In [3]:
data_path = 'C://Users/LGCNS/Documents/GitHub/Q_Bert/dt'

corpus_file = [ os.path.join(data_path, 'train_wiki_sentence1.txt'), os.path.join(data_path, 'train_wiki_sentence2.txt') ]
limit_alphabet = 3000
vocab_size = 32000

In [4]:

tokenizer.train(
    files = corpus_file,
    limit_alphabet = limit_alphabet,
    vocab_size = vocab_size
)


In [5]:
tokenizer.get_vocab_size()

32000

In [7]:
tokenizer.get_vocab()

{'조선후기': 26103,
 '##성인': 20995,
 '제17대': 26185,
 '지형을': 30409,
 '##개발': 8207,
 '게이초': 28521,
 '행렬': 10000,
 '선포': 10784,
 '선물': 18092,
 '뿡': 1429,
 '수록곡': 14690,
 '넓': 440,
 '##서이다': 14714,
 '구부': 25255,
 '아니며': 13439,
 '섭취': 13729,
 '##ien': 26358,
 '갔다가': 24721,
 '웅': 1934,
 '이외에는': 25935,
 '##릌': 5529,
 '5월에는': 22274,
 '립': 1024,
 '치러진': 17035,
 '##에너지': 24191,
 '공산당': 11187,
 '5월에': 11243,
 '동물로': 26175,
 '목포': 13110,
 '내가': 11408,
 '##옺': 5551,
 '의미한다': 7729,
 '##동은': 8390,
 '후미': 31352,
 '승낙': 26002,
 '이끄는': 8085,
 '모임': 12224,
 '타는': 20805,
 '그들과': 31985,
 '쳇': 2336,
 '꼭짓': 17839,
 '국민들의': 22674,
 '서포터': 28087,
 '명령어': 28544,
 'what': 30645,
 'j': 42,
 '영화로': 14477,
 '##ason': 27033,
 '뿐만': 8036,
 '아스': 10150,
 '##2년에': 8290,
 '사건에': 11910,
 '바이오': 17219,
 '##추어': 8599,
 '##지진': 28908,
 '좋아한다': 29336,
 '형제': 8646,
 '##보로': 11853,
 '가족': 6589,
 '##쿙': 5200,
 '김해': 12376,
 '##붸': 5106,
 '교사': 12347,
 '무라': 14366,
 '고수': 16366,
 '##양에': 19334,
 '괏': 172,
 '달고': 21605,
 '겝': 124,
 '

In [8]:
tokenizer.save("./model/bert-tkn-{}-{}".format(limit_alphabet, vocab_size),True)

In [9]:
result_model = tokenizer.save_model("./model", 'BertTokenizer-{}-{}'.format(limit_alphabet, vocab_size))

In [10]:
from transformers import BertTokenizerFast

# classtransformers.BertTokenizerFast(
# vocab_file, 
# tokenizer_file=None, 
# do_lower_case=True, 
# unk_token='[UNK]', 
# sep_token='[SEP]', 
# pad_token='[PAD]', 
# cls_token='[CLS]', 
# mask_token='[MASK]', 
# tokenize_chinese_chars=True, 
# strip_accents=None, **kwargs

In [11]:
tokenizer_for_load = BertTokenizerFast.from_pretrained(result_model[-1]
                                                   , strip_accents=False
                                                   , lowercase=False)

Calling BertTokenizerFast.from_pretrained() with the path to a single file or url is deprecated


In [12]:

print('vocab size : %d' % tokenizer_for_load.vocab_size)
# tokenized_input_for_pytorch = tokenizer_for_load("i am very hungry", return_tensors="pt")
tokenized_input_for_pytorch = tokenizer_for_load("나는 오늘 아침밥을 먹었다.[SEP]나는 내일도 아침밥을 먹을 수 있을까?", return_tensors="pt")
tokenized_input_for_tensorflow = tokenizer_for_load("나는 오늘 아침밥을 먹었다.", return_tensors="tf")

print("Tokens (str)      : {}".format([tokenizer_for_load.convert_ids_to_tokens(s) for s in tokenized_input_for_pytorch['input_ids'].tolist()[0]]))
print("Tokens (int)      : {}".format(tokenized_input_for_pytorch['input_ids'].tolist()[0]))
print("Tokens (attn_mask): {}\n".format(tokenized_input_for_pytorch['attention_mask'].tolist()[0]))

vocab size : 32000
Tokens (str)      : ['[CLS]', '나는', '오늘', '아침', '##밥', '##을', '먹', '##었다', '.', '[SEP]', '나는', '내', '##일', '##도', '아침', '##밥', '##을', '먹을', '수', '있을', '##까', '?', '[SEP]']
Tokens (int)      : [2, 8477, 6591, 8990, 4066, 3085, 1071, 5582, 17, 3, 8477, 409, 3267, 3185, 8990, 4066, 3085, 16586, 1573, 6555, 3390, 29, 3]
Tokens (attn_mask): [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

