In [None]:
import unicodedata
from collections import Counter
from typing import Union, Set, Collection

import tiktoken
from tiktoken.load import load_tiktoken_bpe

In [None]:
class Arguments:
    """Settings consumed by the tokenizer and the token-frequency export.

    Every setting can be supplied as a keyword argument; calling
    ``Arguments()`` with no arguments yields the same placeholder values as
    before, which must be replaced with real paths before running the
    notebook.

    Args:
        name: Encoding name handed to ``tiktoken.Encoding``.
        vocab_file: Path of the ``.tiktoken`` BPE vocabulary to load.
        text_file: Path of the text file whose tokens are counted.
        extra_vocab_file: Path the token/count table is written to. It is
            opened in write mode, so it should not point at ``text_file``.
    """

    def __init__(
        self,
        name="my_tiktoken",
        vocab_file="/path/to/your/*.tiktoken",
        text_file="/path/to/your/*.txt",
        extra_vocab_file="/path/to/your/*.txt",
    ):
        self.name = name
        self.vocab_file = vocab_file
        self.text_file = text_file
        self.extra_vocab_file = extra_vocab_file

In [None]:
class MyTokenizer:
    """tiktoken-backed tokenizer; the original author refers to the Qwen tokenizer.

    Wraps a ``tiktoken.Encoding`` built from a BPE vocabulary file plus a
    fixed set of special tokens, and exposes ``tokenize`` which returns the
    textual form of each token instead of its integer id.
    """

    def __init__(self, configs):
        """Build the encoding and the id -> text lookup table.

        Args:
            configs: Object providing ``name`` (encoding name) and
                ``vocab_file`` (path passed to ``load_tiktoken_bpe``).
        """
        self.pat_str = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
        self.endoftext = "<|endoftext|>"
        self.im_start = "<|im_start|>"
        self.im_end = "<|im_end|>"
        self.extras = tuple(f"<|extra_{i}|>" for i in range(205))
        self.special_start_id = 151643

        # Special tokens receive consecutive ids starting at special_start_id,
        # in the order: endoftext, im_start, im_end, then the extras.
        ordered_specials = (self.endoftext, self.im_start, self.im_end) + self.extras
        self.special_tokens = {
            special: self.special_start_id + offset
            for offset, special in enumerate(ordered_specials)
        }

        self.mergeable_ranks = load_tiktoken_bpe(configs.vocab_file)
        self.tokenizer = tiktoken.Encoding(
            name=configs.name,
            pat_str=self.pat_str,
            mergeable_ranks=self.mergeable_ranks,
            special_tokens=self.special_tokens,
        )

        # id -> text. Byte sequences that are not valid UTF-8 on their own are
        # decoded with replacement characters, so this mapping can be lossy.
        self.decoder = {}
        for token_bytes, rank in self.mergeable_ranks.items():
            self.decoder[rank] = token_bytes.decode("utf-8", errors="replace")
        for special, special_id in self.special_tokens.items():
            self.decoder[special_id] = special

    def tokenize(self, text: str, allowed_special: Union[Set, str]="all", disallowed_special: Union[Collection, str]=(), **kwargs):
        """Split ``text`` into a list of token strings.

        The text is NFC-normalized, encoded with the underlying encoding, and
        each resulting id is looked up in ``self.decoder``.

        Args:
            text: Input string.
            allowed_special: Forwarded to the encoding's ``encode``.
            disallowed_special: Forwarded to the encoding's ``encode``.
            **kwargs: Accepted and ignored.

        Returns:
            List with one decoded string per token id, in encoding order.

        Raises:
            KeyError: If an id returned by the encoding is missing from
                ``self.decoder``.
        """
        normalized = unicodedata.normalize("NFC", text)
        token_ids = self.tokenizer.encode(
            normalized,
            allowed_special=allowed_special,
            disallowed_special=disallowed_special,
        )
        return [self.decoder[token_id] for token_id in token_ids]

In [None]:
# Count how often each token occurs in the input text and write the result
# as tab-separated "token<TAB>count" lines.
configs = Arguments()
my_tokenizer = MyTokenizer(configs)

# Read the corpus as UTF-8 explicitly: the output below is written as UTF-8,
# and relying on the platform default encoding makes results machine-dependent.
with open(configs.text_file, encoding="utf-8") as f:
    text = f.read()

tokens = my_tokenizer.tokenize(text)

# Counter is a dict subclass and keeps first-seen order, so the rows are
# written in the same order as with manual dict counting.
results = Counter(tokens)

# NOTE(review): tokens are written verbatim, so a token that itself contains
# "\t" or "\n" yields a malformed row — confirm whether consumers need escaping.
with open(configs.extra_vocab_file, mode="w", encoding="utf-8") as f:
    for token, count in results.items():
        f.write(f"{token}\t{count}\n")