In [None]:
from transformers import PreTrainedTokenizerFast

In [None]:
# Load a previously saved fast tokenizer from a local directory.
# NOTE(review): this directory is not './model_save/fast_tokenizer/', which is
# where step 5 of this notebook saves its result — confirm which one is intended.
tokenizer = PreTrainedTokenizerFast.from_pretrained(
    './model_save/tokenizer',
)

In [None]:
# Show the length of the loaded tokenizer (the value of len()) as this cell's output.
len(tokenizer)

# 1. Train the tokenizer (optional)

In [2]:
import tokenizers
from tokenizers import Tokenizer, decoders
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Punctuation, Digits, Metaspace, ByteLevel
from tokenizers.normalizers import NFKC 
from rich import progress

# 2. Define the source of the tokenizer training corpus

In [3]:
# Plain-text corpus the tokenizer is trained on (read line by line).
# NOTE: the name is misspelled ("cropus" for "corpus") but the training function
# reads this exact global, so the spelling is kept.
cropus_file = './data/wiki.simple.txt'

# Destination of the trained tokenizer (a single JSON file).
tokenizer_save_path = './model_save/hf_bpe_tokenizer.json'

# 3. Function to train the tokenizer
The nested `get_training_corpus` generator concatenates consecutive lines of the corpus into chunks of at least `chunk_len=2048` characters and yields them in batches of `buffer_size=1000` chunks per iteration.

In [None]:
def train_my_huggingface_wiki_tokenizer(max_train_line: int=None, token_type: str='char') -> None:
    '''
    Train a BPE tokenizer on `cropus_file` with the HuggingFace `tokenizers`
    library and save it as a JSON file at `tokenizer_save_path`.

    Author's note: at least 32GB of memory needed, about half an hour to run.

    Args:
        max_train_line: if an int, only the first `max_train_line` lines of the
            corpus are used; any other value (default `None`) uses the whole file.
        token_type: `'char'` for BPE with NFKC normalization and
            punctuation/digit/Metaspace pre-tokenization, or `'byte'` for
            byte-level BPE.

    Raises:
        ValueError: if `token_type` is neither `'char'` nor `'byte'`.
    '''
    import os
    from typing import Iterator, List

    # Validate up front so a typo fails immediately instead of after setup.
    if token_type not in ('char', 'byte'):
        raise ValueError('Token type must be `char` or `byte`')

    def get_training_corpus(buffer_size: int=1000, chunk_len: int=2048) -> Iterator[List[str]]:
        '''
        Yield batches (lists) of at most `buffer_size` text chunks.

        A chunk is consecutive corpus lines joined together until it holds at
        least `chunk_len` characters; only the very last chunk may be shorter.
        '''
        line_limit = max_train_line if isinstance(max_train_line, int) else None

        buffer = []
        cur_chunk_txt, txt_len = [], 0
        with open(cropus_file, 'r', encoding='utf-8') as f_read:
            for line_idx, line in enumerate(f_read):
                # Stop before consuming more than `line_limit` lines.
                if line_limit is not None and line_idx >= line_limit:
                    break

                cur_chunk_txt.append(line)
                txt_len += len(line)

                if txt_len >= chunk_len:
                    buffer.append(''.join(cur_chunk_txt))
                    cur_chunk_txt, txt_len = [], 0

                    if len(buffer) >= buffer_size:
                        yield buffer
                        buffer = []

        # Keep the tail of the corpus: a final chunk shorter than `chunk_len`
        # would otherwise be silently discarded.
        if cur_chunk_txt:
            buffer.append(''.join(cur_chunk_txt))

        # Yield the last, possibly partial, batch.
        if buffer:
            yield buffer

    special_tokens = ["[PAD]","[EOS]","[SEP]","[BOS]", "[CLS]", "[MASK]", "[UNK]"]
    trainer_kwargs = dict(
        vocab_size=40960,
        min_frequency=100,
        show_progress=True,
        special_tokens=special_tokens,
    )

    if token_type == 'char':
        model = BPE(unk_token="[UNK]")
        tokenizer = Tokenizer(model)

        # Use compatible equivalent decomposition and recombination to process utf encoding, e.g., converting full-width A to half-width A
        tokenizer.normalizer = tokenizers.normalizers.Sequence([NFKC()])

        # Pre-tokenization for punctuation, digits, and Metaspace (otherwise, decoded text will not have spaces)
        tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.Sequence(
            [Punctuation(), Digits(individual_digits=True), Metaspace()]
        )

        tokenizer.add_special_tokens(special_tokens)
        tokenizer.decoder = decoders.Metaspace()
    else:
        # token_type == 'byte'. Byte BPE does not need unk_token
        model = BPE()
        tokenizer = Tokenizer(model)
        tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.ByteLevel(add_prefix_space=False, use_regex=True)

        tokenizer.add_special_tokens(special_tokens)
        tokenizer.decoder = decoders.ByteLevel(add_prefix_space=False, use_regex=True)
        tokenizer.post_processor = tokenizers.processors.ByteLevel(trim_offsets=False)

        # Seed the vocabulary with every byte symbol; with no unk_token, a byte
        # that never occurs in the training corpus could not be encoded otherwise.
        trainer_kwargs['initial_alphabet'] = ByteLevel.alphabet()

    trainer = BpeTrainer(**trainer_kwargs)
    tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)

    # add \t \n
    vocab = tokenizer.get_vocab()
    missing_tokens = [tok for tok in ('\t', '\n') if tok not in vocab]
    if missing_tokens:
        tokenizer.add_tokens(missing_tokens)

    # Create the output directory first so a long training run is not lost to
    # a missing folder when saving.
    save_dir = os.path.dirname(tokenizer_save_path)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
    tokenizer.save(tokenizer_save_path)

# 4. Start training the tokenizer
Training on 100 million characters needs at least `32GB` of memory (in practice `32GB` is still not quite enough, and frequent swapping may occur); on a `13600K` CPU, training takes about an hour.

In [None]:
# Train a byte-level BPE tokenizer on the whole corpus (no line limit).
train_my_huggingface_wiki_tokenizer(
    max_train_line=None,
    token_type='byte',
)

# 5. Convert the trained tokenizer to PreTrainedTokenizerFast and save
Conversion is for ease of use as `AutoTokenizer` in other `huggingface` components.

During conversion, `pad_token`, `eos_token`, etc. must be specified manually, because `PreTrainedTokenizerFast` does not automatically detect which tokens in the original tokenizer play these special roles.

In [None]:
# Reload the raw `Tokenizer` from the JSON file at `tokenizer_save_path`.
slow_tokenizer = Tokenizer.from_file(tokenizer_save_path)

# Role -> token mapping handed to the wrapper; the tokens are the same strings
# that were registered as special tokens during training.
special_token_roles = {
    'unk_token': "[UNK]",
    'pad_token': "[PAD]",
    'cls_token': "[CLS]",
    'sep_token': "[SEP]",
    'mask_token': "[MASK]",
    'bos_token': '[BOS]',
    'eos_token': '[EOS]',
}

# Wrap the raw tokenizer and write it out in the `save_pretrained` layout.
tokenizer = PreTrainedTokenizerFast(tokenizer_object=slow_tokenizer, **special_token_roles)
tokenizer.save_pretrained('./model_save/fast_tokenizer/')