In [1]:
from tokenizers import BertWordPieceTokenizer
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
import json

In [2]:
model_path = '../data/embeddings/wordpiece2.txt'
filename = '../data/text.txt'
vocab_size = 10000

In [3]:
# Initialize an empty tokenizer
tokenizer = BertWordPieceTokenizer(
    clean_text=True,
    strip_accents=True,
    lowercase=True,
)

### train

In [4]:
# And then train
tokenizer.train(
    filename,
    vocab_size=vocab_size,
    min_frequency=2,
    show_progress=True,
    special_tokens=["<start>", "<end>"],
    limit_alphabet=1000,
    wordpieces_prefix="##",
)

### save

In [5]:
# Save the files
tokenizer.save(model_path, pretty=True)

### load

In [6]:
tokenizer = WordPiece.from_file(model_path)

In [7]:
tokenizer = Tokenizer.from_file(model_path)

In [8]:
tokenizer.get_vocab()

{'##inct': 9096,
 'make': 701,
 'age': 3731,
 'honest': 1669,
 'excuses': 8287,
 '##apolog': 9339,
 'viewing': 9741,
 'significant': 9910,
 'group': 1518,
 '##ified': 2464,
 'clay': 7149,
 '##ahn': 9349,
 '##😆': 299,
 '##eth': 2398,
 '##coffeetoo': 8564,
 'nero': 8811,
 'flirting': 9585,
 '##ks': 553,
 '⃣': 90,
 '##wy': 2554,
 'blue': 2762,
 'retweet': 3701,
 'land': 3755,
 'true': 1972,
 'hoping': 2870,
 'chicks': 4226,
 'enter': 2571,
 'roy': 8841,
 'dem': 2194,
 'attacking': 5974,
 'tw': 535,
 'tone': 6854,
 '##asm': 3361,
 '##ci': 753,
 'anyways': 5019,
 'familiar': 8650,
 'county': 9697,
 'everyone': 1113,
 'ftr': 7624,
 'dras': 1474,
 '##4u': 5279,
 'adela': 9426,
 'explains': 8326,
 'fired': 3594,
 'corporate': 8599,
 'soci': 2836,
 'bug': 3445,
 'vocab': 9937,
 'rights': 1500,
 '##girl': 835,
 '##82': 7858,
 '##ries': 2635,
 'gewt': 8609,
 'fucking': 928,
 '1': 18,
 '##ign': 1389,
 'wishing': 6552,
 '##irub': 7036,
 '##ball': 963,
 '##irst': 9191,
 '##putout': 7451,
 'atro': 80

### test

In [9]:
encoding = tokenizer.encode("<start>This is a simple input to be tokenized <end>")

print("Encoded string: {}".format(encoding.tokens))

print(encoding.ids)

decoded = tokenizer.decode(encoding.ids)
print("Decoded string: {}".format(decoded))

Encoded string: ['<start>', 'this', 'is', 'a', 'simple', 'input', 'to', 'be', 'token', '##ized', '<end>']
[0, 459, 409, 38, 3561, 9150, 392, 416, 7898, 1825, 1]
Decoded string: this is a simple input to be tokenized
