In [3]:
from tokenizers.implementations import CharBPETokenizer
from tokenizers.processors import TemplateProcessing

tokenizer = CharBPETokenizer(lowercase=True, bert_normalizer=True, split_on_whitespace_only=True)
tokenizer.post_processor = TemplateProcessing(
    single="<s> $9 </s>",
    pair="<s> $A </s> $B:1 </s>:1",
    special_tokens=[
        ("<s>", 2),
        ("</s>", 3)
    ]
)
tokenizer.enable_padding(pad_token="<pad>")

tokenizer.train(
    "../data/prep-Train.txt",
    vocab_size=6000,
    min_frequency=2,
    special_tokens=["<pad>", "<unk>", "<s>", "</s>", "<mask>"]
)






In [4]:
tokenizer.encode("T√¥i y√™u ph·ªü b√≤").tokens

['<s>', 't√¥i</w>', 'y√™u</w>', 'ph', '·ªü</w>', 'b√≤</w>', '</s>']

In [5]:
print(tokenizer.get_vocab_size())
tokenizer.get_vocab()

5272


{'5690</w>': 4391,
 'g∆∞∆°ng</w>': 2511,
 'x√©t</w>': 1399,
 'goi</w>': 4227,
 'm√≤</w>': 3283,
 'ountsmile</w>': 4360,
 'na</w>': 2374,
 'ƒëoÃÅ</w>': 3439,
 'cung</w>': 1914,
 'üò∂</w>': 427,
 'gi·ªØ</w>': 1806,
 's·ªØa</w>': 4014,
 'm·∫°': 706,
 's·ª•p</w>': 4022,
 'r·∫°ch</w>': 4667,
 'ipx</w>': 5069,
 'n√†y</w>': 554,
 'g√¢y</w>': 2069,
 'r√¥': 3992,
 'antutu</w>': 2338,
 '√£i</w>': 1102,
 'ting</w>': 1527,
 'bu√¥ÃÄn</w>': 4249,
 'qua</w>': 726,
 'tri√™Ãân</w>': 5264,
 'c·∫Øm</w>': 1487,
 'tr√†ng</w>': 4135,
 '=</w>': 441,
 'üòù</w>': 313,
 '√°nh</w>': 1145,
 '·ªát</w>': 1704,
 'm√†y</w>': 2297,
 'ƒë√¥ÃÅi</w>': 3762,
 'c': 43,
 'modem</w>': 3189,
 'de': 1869,
 't·∫øt</w>': 1921,
 'h√¥i</w>': 2905,
 'lyÃÅ</w>': 4580,
 'vy</w>': 4724,
 'lip</w>': 2475,
 'sa': 520,
 'gi·ªçt</w>': 2470,
 'nge</w>': 1905,
 '61</w>': 2351,
 '·∫ª</w>': 280,
 'ƒëaÃÅo</w>': 4248,
 'ho': 774,
 'c·ª©ng</w>': 1691,
 '√¥m</w>': 794,
 'gom</w>': 4228,
 '@</w>': 421,
 'tam</w>': 2037,
 'kh·ªèi</w>': 990,
 '‚àÜ<

In [6]:
tokenizer.save_model(".", "vn-smartphone-absa")

['./vn-smartphone-absa-vocab.json', './vn-smartphone-absa-merges.txt']