In [7]:
from tiktoken_educational import SimpleBytePairEncoding

# Tiny training corpus for a quick encode/decode round-trip check.
corpus = "The LLM is trained to predict the next word."

# GPT-2 style pre-tokenisation pattern, written on a single line on purpose.
# Wrapping it in an indented triple-quoted block puts a leading newline +
# indent in front of the first alternative ('s) and a trailing newline after
# the last one (\s+). Unless the pattern is compiled in verbose mode (which
# would in turn break the " ?" alternatives), those stray characters become
# part of the regex, so contractions and a lone whitespace character before a
# word would no longer match as intended.
re_pattern = r"'s|'t|'re|'ve|'m|'ll|'d| ?[\p{L}]+| ?[\p{N}]+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"

tokenizer = SimpleBytePairEncoding.train(
    corpus, vocab_size=500, pat_str=re_pattern, visualise=None
)

# Encode a sample sentence and decode it again; the printed text should match
# the input.
input_text = "The next word is"
tokens = tokenizer.encode(input_text, visualise=None)
output_text = tokenizer.decode(tokens)
print(output_text)

# Inspect the tokenizer's internal state (uncomment to dump the merge table):
from pprint import pprint
# pprint(tokenizer.mergeable_ranks)

The next word is


In [8]:
from pprint import pprint

from bpe import BPETokenizer

# GPT-2 style pre-tokenisation pattern, written on a single line on purpose.
# An indented triple-quoted block would add a leading newline + indent to the
# first alternative ('s) and a trailing newline to the last one (\s+); unless
# the tokenizer strips or verbose-compiles the pattern, those characters become
# part of the regex and change what the alternatives match.
re_pattern = r"'s|'t|'re|'ve|'m|'ll|'d| ?[\p{L}]+| ?[\p{N}]+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"

bpe = BPETokenizer(re_pattern=re_pattern)
corpus = "The LLM is trained to predict the next word."
vocab_size = 500
# NOTE(review): assigned but never passed to the tokenizer in this cell —
# confirm whether it is still needed.
end_word_token = ""
bpe.train(corpus, vocab_size)

# pprint(bpe.token_to_id)
# Parity check against `tokenizer`, the reference tokenizer trained on the same
# corpus in the previous cell: both vocabularies must be identical.
assert bpe.token_to_id == tokenizer.mergeable_ranks, (
    "BPETokenizer vocabulary differs from the reference tokenizer's mergeable_ranks"
)

# Round trip: encode a sample sentence, then decode it back to text.
inputs = "The next word is"
tokens = bpe.encode(inputs)
print(f"encoded tokens: {tokens}")
outputs = bpe.decode(tokens)
print(f"decoded tokens: {outputs}")


encoded tokens: [259, 281, 285, 264]
decoded tokens: The next word is


In [9]:
from tiktoken_educational import SimpleBytePairEncoding

# Train on the full text file instead of the one-sentence toy corpus.
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    corpus = f.read()

# GPT-2 style pre-tokenisation pattern, written on a single line on purpose.
# Wrapping it in an indented triple-quoted block puts a leading newline +
# indent in front of the first alternative ('s) and a trailing newline after
# the last one (\s+). Unless the pattern is compiled in verbose mode (which
# would in turn break the " ?" alternatives), those stray characters become
# part of the regex, so contractions and a lone whitespace character before a
# word would no longer match as intended.
re_pattern = r"'s|'t|'re|'ve|'m|'ll|'d| ?[\p{L}]+| ?[\p{N}]+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"

tokenizer = SimpleBytePairEncoding.train(
    corpus, vocab_size=500, pat_str=re_pattern, visualise=None
)

# Round trip a sentence through encode/decode; the printed text should match
# the input exactly.
inputs = "Hello, do you like tea? <|endoftext|> In the sunlit terraces of someunknownPlace."
encoded = tokenizer.encode(inputs, visualise=None)
decoded = tokenizer.decode(encoded)
print(decoded)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of someunknownPlace.


In [10]:
from pprint import pprint

from bpe import BPETokenizer

# GPT-2 style pre-tokenisation pattern, written on a single line on purpose.
# An indented triple-quoted block would add a leading newline + indent to the
# first alternative ('s) and a trailing newline to the last one (\s+); unless
# the tokenizer strips or verbose-compiles the pattern, those characters become
# part of the regex and change what the alternatives match.
re_pattern = r"'s|'t|'re|'ve|'m|'ll|'d| ?[\p{L}]+| ?[\p{N}]+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"

bpe = BPETokenizer(re_pattern=re_pattern)

# Train on the full text file instead of the one-sentence toy corpus.
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    corpus = f.read()

# NOTE(review): assigned but never passed to the tokenizer in this cell —
# confirm whether it is still needed.
word_end_token = "<|endoftext|>"
bpe.train(corpus, vocab_size=500)

# Round trip a sentence through encode/decode; the printed text should match
# the input exactly.
inputs = "Hello, do you like tea? <|endoftext|> In the sunlit terraces of someunknownPlace."
encoded = bpe.encode(inputs)
decoded = bpe.decode(encoded)
print(decoded)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of someunknownPlace.
