In [3]:
from tokenizers.implementations import CharBPETokenizer
from tokenizers.processors import TemplateProcessing

# Special tokens, in the order they are handed to the trainer. The ids used by
# the post-processor and the padding configuration are derived from this list
# so they cannot drift apart from the list passed to `train` below.
SPECIAL_TOKENS = ["<pad>", "<unk>", "<s>", "</s>", "<mask>"]
PAD_TOKEN, BOS_TOKEN, EOS_TOKEN = "<pad>", "<s>", "</s>"

tokenizer = CharBPETokenizer(lowercase=True, bert_normalizer=True, split_on_whitespace_only=True)

# Wrap every encoding in <s> ... </s>.
# In the template syntax `$A` is the first sequence with type id 0 and `$B:1`
# is the second sequence with type id 1. A bare number such as `$9` would mean
# "first sequence, type id 9", which gives single sentences a type id that the
# pair template never produces, so the single template must use `$A`.
tokenizer.post_processor = TemplateProcessing(
    single="<s> $A </s>",
    pair="<s> $A </s> $B:1 </s>:1",
    special_tokens=[
        (BOS_TOKEN, SPECIAL_TOKENS.index(BOS_TOKEN)),
        (EOS_TOKEN, SPECIAL_TOKENS.index(EOS_TOKEN)),
    ],
)
tokenizer.enable_padding(pad_id=SPECIAL_TOKENS.index(PAD_TOKEN), pad_token=PAD_TOKEN)

tokenizer.train(
    "../data/prep-Train.txt",
    vocab_size=6000,
    min_frequency=2,
    special_tokens=SPECIAL_TOKENS,
)

# The post-processor and padding ids above were fixed before training, on the
# expectation that the trainer numbers the special tokens 0..4 in list order.
# Fail loudly here if that expectation does not hold.
for expected_id, special_token in enumerate(SPECIAL_TOKENS):
    actual_id = tokenizer.token_to_id(special_token)
    assert actual_id == expected_id, (
        f"special token {special_token!r} has id {actual_id}, expected {expected_id}"
    )






In [4]:
# Smoke test: encode one Vietnamese sentence and display the resulting tokens
# (sub-word pieces plus whatever the post-processor adds around them).
sample_encoding = tokenizer.encode("Tôi yêu phở bò")
sample_encoding.tokens

['<s>', 'tôi</w>', 'yêu</w>', 'ph', 'ở</w>', 'bò</w>', '</s>']

In [5]:
# Inspect the trained vocabulary: print how many entries it has, then display
# the full token -> id mapping as the cell's output.
learned_vocab = tokenizer.get_vocab()
print(tokenizer.get_vocab_size())
learned_vocab

5272


{'5690</w>': 4391,
 'gương</w>': 2511,
 'xét</w>': 1399,
 'goi</w>': 4227,
 'mò</w>': 3283,
 'ountsmile</w>': 4360,
 'na</w>': 2374,
 'đó</w>': 3439,
 'cung</w>': 1914,
 '😶</w>': 427,
 'giữ</w>': 1806,
 'sữa</w>': 4014,
 'mạ': 706,
 'sụp</w>': 4022,
 'rạch</w>': 4667,
 'ipx</w>': 5069,
 'này</w>': 554,
 'gây</w>': 2069,
 'rô': 3992,
 'antutu</w>': 2338,
 'ãi</w>': 1102,
 'ting</w>': 1527,
 'buồn</w>': 4249,
 'qua</w>': 726,
 'triển</w>': 5264,
 'cắm</w>': 1487,
 'tràng</w>': 4135,
 '=</w>': 441,
 '😝</w>': 313,
 'ánh</w>': 1145,
 'ệt</w>': 1704,
 'mày</w>': 2297,
 'đối</w>': 3762,
 'c': 43,
 'modem</w>': 3189,
 'de': 1869,
 'tết</w>': 1921,
 'hôi</w>': 2905,
 'lý</w>': 4580,
 'vy</w>': 4724,
 'lip</w>': 2475,
 'sa': 520,
 'giọt</w>': 2470,
 'nge</w>': 1905,
 '61</w>': 2351,
 'ẻ</w>': 280,
 'đáo</w>': 4248,
 'ho': 774,
 'cứng</w>': 1691,
 'ôm</w>': 794,
 'gom</w>': 4228,
 '@</w>': 421,
 'tam</w>': 2037,
 'khỏi</w>': 990,
 '∆</w>': 403,
 'trang</w>': 1375,
 'nhung</w>': 1841,
 'tốc<

In [6]:
# Persist the trained model files to the current directory, using
# "vn-smartphone-absa" as the file-name prefix, and display the written paths.
# NOTE(review): the recorded output lists only a vocab and a merges file;
# presumably the post-processor and padding settings are not part of these
# files and must be re-applied after loading — confirm.
saved_paths = tokenizer.save_model(".", "vn-smartphone-absa")
saved_paths

['./vn-smartphone-absa-vocab.json', './vn-smartphone-absa-merges.txt']