In [1]:
from pathlib import Path

from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, trainers, decoders
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.normalizers import NFKC

In [4]:
# Train a byte-level BPE tokenizer on the prepared corpus and save it to disk.
#
# Pipeline: NFKC normalization -> byte-level pre-tokenization -> BPE merges,
# with a matching byte-level decoder so that decode() can map the byte
# symbols (e.g. 'Ġ') back to plain text.

# Tunable settings, collected in one place.
CORPUS_FILES = ["prepared/lm_corpus.txt"]
TOKENIZER_PATH = Path("tokens/tokenizer.json")
VOCAB_SIZE = 16000
MIN_FREQUENCY = 2
SPECIAL_TOKENS = ["<pad>", "<unk>", "<s>", "</s>"]

# Create the output directory *before* training: saving into a missing
# directory would otherwise fail only after the whole training run is done.
TOKENIZER_PATH.parent.mkdir(parents=True, exist_ok=True)

tokenizer = Tokenizer(BPE(unk_token="<unk>"))

tokenizer.normalizer = normalizers.Sequence([
    NFKC()
])

# add_prefix_space=False: the first word of a text is encoded without a
# leading-space marker, so it tokenizes differently from the same word
# appearing mid-sentence.
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)

tokenizer.decoder = decoders.ByteLevel()

trainer = BpeTrainer(
    vocab_size=VOCAB_SIZE,
    min_frequency=MIN_FREQUENCY,
    special_tokens=SPECIAL_TOKENS,
    # Seed the vocabulary with every byte-level symbol. Without this, byte
    # symbols that never occur in the training corpus are missing from the
    # vocabulary and unseen characters fall back to "<unk>" at inference
    # time instead of being byte-encoded.
    initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
)


print("Training tokenizer on corpus...")

tokenizer.train(
    files=CORPUS_FILES,
    trainer=trainer
)


tokenizer.save(str(TOKENIZER_PATH))

print("Tokenizer training completed.")
print(f"Vocab size: {tokenizer.get_vocab_size()}")


Training tokenizer on corpus...
Tokenizer training completed.
Vocab size: 16000


In [5]:
# Smoke test: encode one short sentence with the tokenizer from the training
# cell and print the resulting token strings for a visual check.
test_str = "Wifi is not working"
encoding = tokenizer.encode(test_str)
encoded = encoding.tokens  # token strings only, not the ids
print(f"Test Tokenization ('{test_str}'):")
print(encoded)


Test Tokenization ('Wifi is not working'):
['W', 'ifi', 'Ġis', 'Ġnot', 'Ġworking']
