In [1]:
%%capture

!uv pip install tokenizers polars tiktoken

In [2]:
from pathlib import Path

import polars as pl

In [3]:
# Sentinel strings wrapped around each game's movetext to mark where it begins and ends.
START_GAME, END_GAME = "<|g_start|>", "<|g_end|>"

In [4]:
# The game-boundary markers gathered into one list (start marker first, end marker second).
SPECIAL_TOKENS = [START_GAME, END_GAME]

In [5]:
# Load the games CSV. `null_values=["None"]` makes polars read the literal string "None" as a
# null value instead of text.
df = pl.read_csv("../.data/chess_games_2025-01-15.csv", null_values=["None"])

In [6]:
# Draw the 500 games used to train the tokenizer. The seed is fixed so that the training
# sample -- and therefore the learned vocabulary -- is the same on every Restart & Run All.
sample = df.select("PGN").sample(n=500, seed=42)

In [7]:
# One training string per sampled game: the stripped PGN wrapped in the start/end markers.
# Rows whose PGN is null or empty are skipped.
training_text = [
    START_GAME + pgn.strip() + END_GAME
    for (pgn,) in sample.iter_rows()
    if pgn
]

In [8]:
import re

# Chunking regex for PGN movetext, so that move numbers and the actual moves come out as
# separate entries. Alternatives, tried in order:
#   ` ?\d+\.`   a move number such as `1.` or ` 2.` (optional leading space)
#   `\. ?`      a lone dot, optionally followed by a space
#   ` ?[-\w]+`  a run of word characters / hyphens such as `Nf3`, ` d5` or `O-O`
#   `[#+]`      a single `#` or `+` suffix
#   `\s+`       any remaining whitespace
# Note that `=` (as in `d1=Q`) is not covered by any alternative.
chunk_pattern = re.compile(r""" ?\d+\.|\. ?| ?[-\w]+|[#+]|\s+""")

In [9]:
from tokenizers import Tokenizer, Regex
from tokenizers.models import BPE
from tokenizers.normalizers import NFD
from tokenizers.pre_tokenizers import Split
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from tokenizers.processors import ByteLevel as ByteLevelProcessor
from tokenizers.trainers import BpeTrainer

In [10]:
# Token the BPE model emits for symbols it cannot represent. It is also handed to the trainer
# below so that it actually exists in the trained vocabulary.
UNK_TOKEN = "[UNK]"

# BPE model with an empty continuing-subword prefix; consecutive unknowns are fused into a
# single UNK_TOKEN.
tokenizer = Tokenizer(
    BPE(unk_token=UNK_TOKEN, fuse_unk=True, continuing_subword_prefix="")
)

# Canonical decomposition before any splitting.
tokenizer.normalizer = NFD()

# Split movetext into move numbers (` ?\d+\.`), lone dots, moves (` ?[-\w]+`) and `#`/`+`
# suffixes. "isolated" keeps every match as its own piece, so BPE merges never cross them.
tokenizer.pre_tokenizer = Split(
    pattern=Regex(r""" ?\d+\.|\. ?| ?[-\w]+|[#+]"""), behavior="isolated"
)

# NOTE(review): the ByteLevel post-processor/decoder are paired with a regex Split
# pre-tokenizer rather than a ByteLevel one. The round-trip holds for the ASCII PGN shown in
# this notebook -- confirm before feeding non-ASCII text.
tokenizer.post_processor = ByteLevelProcessor(trim_offsets=True)
tokenizer.decoder = ByteLevelDecoder()

# Register the unknown token and the game-boundary markers as special tokens. Without this
# the markers are cut into fragments by the pre-tokenizer and UNK_TOKEN never enters the
# vocabulary.
trainer = BpeTrainer(
    vocab_size=3072,
    show_progress=True,
    special_tokens=[UNK_TOKEN, *SPECIAL_TOKENS],
)

In [11]:
# Train the BPE model on the prepared game strings. `training_text` is wrapped in an outer
# list, so the iterable handed to the trainer yields the whole list as a single batch.
tokenizer.train_from_iterator([training_text], trainer=trainer)






In [12]:
# Pick one random game to demo the trained tokenizer on. Null PGNs are dropped before
# sampling: a null would come back as None, which `tokenizer.encode` cannot take.
# NOTE: this rebinds `sample` from the training DataFrame to a single PGN string; the cells
# below read it as a string.
sample = df.select("PGN").drop_nulls().sample(1).item()

output = tokenizer.encode(sample)

print(sample)

1.d4 d5 2.Nf3 Bf5 3.Bf4 Nc6 4.h3 e6 5.c3 Bd6 6.e3 Bxf4 7.exf4 Qd6 8.Qd2 Nf6 9.Bb5 a6 10.Bxc6+ bxc6 11.O-O Ne4 12.Qe3 Rb8 13.Nbd2 Rxb2 14.Rfd1 O-O 15.c4 Nxd2 16.Nxd2 Qb4 17.a3 Qb7 18.Qc3 dxc4 19.Qxc4 Qb5 20.Qc3 Rc2 21.Qe3 Rd8 22.Rdb1 Qd3 23.Qxd3 Bxd3 24.Nf3 f6 25.Rd1 Be2 26.Re1 Bxf3 27.gxf3 Kf7 28.Red1 Rc4 29.Rac1 Rxc1 30.Rxc1 Rd6 31.Rc4 h6 32.h4 Kg6 33.Kg2 h5 34.Kg3 f5 35.Ra4 a5 36.Rc4 Kf7 37.Kg2 Ke7 38.Kf1 Kd7 39.Ke2 Rd5 40.a4 c5 41.Kd3 cxd4 42.Rxd4 c5 43.Rxd5+ exd5 44.Kd2 c4 45.Kc3 Ke6 46.Kd4 Kd6 47.Ke3 Kc5 48.Kd2 d4 49.Kc2 Kb4 50.Kb2 Kxa4 51.Ka2 c3 52.Ka1 Kb3 53.Kb1 a4 54.Ka1 d3 55.Kb1 d2 56.Ka1 d1=Q# 


In [13]:
# adapted from: https://github.com/openai/tiktoken/blob/main/tiktoken/_educational.py#L186
def visualise_kn1ght_tokens(token_values: list[str | bytes]) -> None:
    background = [f"\u001b[48;5;{i}m" for i in [167, 179, 185, 77, 80, 68, 134]]
    # If token boundaries do not occur at unicode character boundaries, it's unclear how best to
    # visualise the token. Here, we'll just use the unicode replacement character to represent some
    # fraction of a character.
    if not all(isinstance(x, str) for x in token_values):
        unicode_token_values = [x.decode("utf-8") for x in token_values]
    else:
        unicode_token_values = token_values

    running_length = 0
    last_color = None
    for token in unicode_token_values:
        color = background[running_length % len(background)]
        if color == last_color:
            color = background[(running_length + 1) % len(background)]
            assert color != last_color
        last_color = color
        running_length += len(token)
        print(color + token, end="")
    print("\u001b[0m")

In [14]:
import tiktoken

# Encode the same sample game with two tiktoken encodings, as baselines to compare against
# the tokenizer trained above. The variable names label `cl100k_base` as the GPT-4 encoding...
tiktoken_gpt4_encoding = tiktoken.get_encoding("cl100k_base")
tiktoken_gpt4_tokens = tiktoken_gpt4_encoding.encode(sample)

# ...and `o200k_base` as the GPT-4o encoding.
tiktoken_gpt4o_encoding = tiktoken.get_encoding("o200k_base")
tiktoken_gpt4o_tokens = tiktoken_gpt4o_encoding.encode(sample)

In [15]:
# Show the same game as split by each tokenizer, one coloured rendering per tokenizer with a
# "---" rule between them.
comparisons = [
    (
        "tiktoken gpt-4o Tokenizer:",
        tiktoken_gpt4o_encoding.decode_tokens_bytes(tiktoken_gpt4o_tokens),
    ),
    (
        "tiktoken gpt-4 Tokenizer:",
        tiktoken_gpt4_encoding.decode_tokens_bytes(tiktoken_gpt4_tokens),
    ),
    ("kn1ght Tokenizer:", output.tokens),
]

for position, (heading, pieces) in enumerate(comparisons):
    if position:
        print("---")
    print(heading)
    visualise_kn1ght_tokens(pieces)

tiktoken gpt-4o Tokenizer:
[48;5;167m1[48;5;179m.d[48;5;77m4[48;5;80m d[48;5;134m5[48;5;167m [48;5;179m2[48;5;185m.N[48;5;80mf[48;5;68m3[48;5;134m B[48;5;179mf[48;5;185m5[48;5;77m [48;5;80m3[48;5;68m.B[48;5;167mf[48;5;179m4[48;5;185m Nc[48;5;68m6[48;5;134m [48;5;167m4[48;5;179m.h[48;5;77m3[48;5;80m e[48;5;134m6[48;5;167m [48;5;179m5[48;5;185m.c[48;5;80m3[48;5;68m Bd[48;5;179m6[48;5;185m [48;5;77m6[48;5;80m.e[48;5;134m3[48;5;167m B[48;5;185mxf[48;5;80m4[48;5;68m [48;5;134m7[48;5;167m.ex[48;5;77mf[48;5;80m4[48;5;68m Q[48;5;167md[48;5;179m6[48;5;185m [48;5;77m8[48;5;80m.Q[48;5;134md[48;5;167m2[48;5;179m N[48;5;77mf[48;5;80m6[48;5;68m [48;5;134m9[48;5;167m.B[48;5;185mb[48;5;77m5[48;5;80m a[48;5;134m6[48;5;167m [48;5;179m10[48;5;77m.B[48;5;68mxc[48;5;167m6[48;5;179m+[48;5;185m b[48;5;80mxc[48;5;134m6[48;5;167m [48;5;179m11[48;5;77m.O[48;5;68m-O[48;5;167m Ne[48;5;77m4[48;5;80m [48;5;68m12[48;5;167m.Q[48;5;185me

In [16]:
# Round-trip check: decode the ids back to text and print the encoding summary, the decoded
# text, and whether it equals the original game string -- one item per line.
test = tokenizer.decode(output.ids)
print(output, test, test == sample, sep="\n")

Encoding(num_tokens=177, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])
1.d4 d5 2.Nf3 Bf5 3.Bf4 Nc6 4.h3 e6 5.c3 Bd6 6.e3 Bxf4 7.exf4 Qd6 8.Qd2 Nf6 9.Bb5 a6 10.Bxc6+ bxc6 11.O-O Ne4 12.Qe3 Rb8 13.Nbd2 Rxb2 14.Rfd1 O-O 15.c4 Nxd2 16.Nxd2 Qb4 17.a3 Qb7 18.Qc3 dxc4 19.Qxc4 Qb5 20.Qc3 Rc2 21.Qe3 Rd8 22.Rdb1 Qd3 23.Qxd3 Bxd3 24.Nf3 f6 25.Rd1 Be2 26.Re1 Bxf3 27.gxf3 Kf7 28.Red1 Rc4 29.Rac1 Rxc1 30.Rxc1 Rd6 31.Rc4 h6 32.h4 Kg6 33.Kg2 h5 34.Kg3 f5 35.Ra4 a5 36.Rc4 Kf7 37.Kg2 Ke7 38.Kf1 Kd7 39.Ke2 Rd5 40.a4 c5 41.Kd3 cxd4 42.Rxd4 c5 43.Rxd5+ exd5 44.Kd2 c4 45.Kc3 Ke6 46.Kd4 Kd6 47.Ke3 Kc5 48.Kd2 d4 49.Kc2 Kb4 50.Kb2 Kxa4 51.Ka2 c3 52.Ka1 Kb3 53.Kb1 a4 54.Ka1 d3 55.Kb1 d2 56.Ka1 d1=Q# 
True


In [17]:
# Encode a short opening, then decode its ids again; the decoded text is the cell's output.
opening_ids = tokenizer.encode("1.d4 d5 2.Nf3 Bf5").ids
tokenizer.decode(opening_ids)

'1.d4 d5 2.Nf3 Bf5'

In [20]:
# Make sure the output directory exists so saving does not fail on a fresh checkout.
TOKENIZER_DIR = Path("../.data/tokenizer/")
TOKENIZER_DIR.mkdir(parents=True, exist_ok=True)

# The vocab/merges files written below hold only the BPE model. Also persist the complete
# pipeline (normalizer, Split regex, post-processor, decoder) so the tokenizer can be
# reloaded exactly as configured.
tokenizer.save(str(TOKENIZER_DIR / "kn1ght.json"))

# Last expression: returns the list of written vocab/merges paths, shown as the cell output.
tokenizer.model.save("../.data/tokenizer/", "kn1ght")

['../.data/tokenizer/kn1ght-vocab.json',
 '../.data/tokenizer/kn1ght-merges.txt']