In [1]:
%%capture

# Install the third-party packages this notebook imports. The %%capture cell
# magic above silences the installer's output.
!uv pip install tokenizers polars tiktoken

In [2]:
import polars as pl

In [31]:
from kn1ght.constants import SPECIAL_TOKENS

['[g_start]', '[g_end]', '[unknown]']

In [5]:
# Load the exported games CSV. Cells holding the literal text "None" are listed
# as null values, which is why later cells have to cope with missing PGNs.
df = pl.read_csv("../.data/chess_games_2025-01-15.csv", null_values=["None"])

In [6]:
# Draw the games used to train the tokenizer. The seed is fixed so that a
# "Restart & Run All" trains on the same games (and so learns the same
# vocabulary) every time; `n` is clamped so a CSV with fewer than 500 rows
# does not make the sampling call fail.
sample = df.select("PGN").sample(n=min(500, df.height), seed=42)

In [7]:
# Wrap every non-empty PGN in the game start/end markers. `sample` holds a
# single "PGN" column, so each row unpacks to exactly one value; null or empty
# entries are skipped.
start_marker = SPECIAL_TOKENS["START"]
end_marker = SPECIAL_TOKENS["END"]

training_text = [
    start_marker + pgn.strip() + end_marker
    for (pgn,) in sample.iter_rows()
    if pgn
]

print(training_text[0])

[g_start]1.d4 Nf6 2.c4 g6 3.Nc3 Bg7 4.e4 d6 5.Bd3 O-O 6.Nge2 e5 7.d5 Nbd7 8.Bg5 h6 9.Be3 a5 10.f3 Nc5 11.Bc2 Ne8 12.O-O f5 13.Bxc5 dxc5 14.exf5 gxf5 15.Ng3 Nd6 16.Qd3 Qh4 17.b3 Bd7 18.Rae1 Rae8 19.Nce2 Qg5 20.Kh1 h5 21.Ng1 h4 22.Nh3 Qh6 23.Ne2 Kh8 24.Nf2 Bf6 25.Ng1 Rg8 26.Nfh3 Rg7 27.Re2 Reg8 28.Qd2 f4 29.Qxa5 b6 30.Qd2 Bf5 31.Rff2 Qg6 32.a4 Qf7 33.Qe1 Qd7 34.Rf1 Bxc2 35.Rxc2 Nf5 36.Re2 Nd4 37.Rxe5 Kh7 38.Nxf4 Bxe5 39.Qxe5 Re8 40.Ne6 Nxe6 41.dxe6 Rxe6 42.Qb2 Qe7 43.Qc3 Kg8 44.Nh3 Re3 45.Qd2 Re2 46.Qd5+ Kh8 47.Nf4 Re5 48.Qc6 Qf7 49.Nd5 Re6[g_end]


In [8]:
import re

# Splits PGN text into chunks. Move numbers ("1.", " 2.") and whitespace become
# chunks of their own, so each actual move ends up as a separate entry.
# Alternatives are tried left to right:
chunk_pattern = re.compile(
    r" ?\d+\."  # move number with its dot, optionally preceded by a space
    r"|\. ?"  # a dot, optionally followed by a space
    r"| ?[-\w]+"  # a run of word characters / hyphens, e.g. "d4", " Nf6", " O-O"
    r"|[#+]"  # a trailing "#" or "+" marker, e.g. the "+" in "Qd5+"
    r"|\s+"  # any remaining whitespace
)

In [9]:
from tokenizers import Tokenizer, Regex
from tokenizers.models import BPE
from tokenizers.normalizers import NFD
from tokenizers.pre_tokenizers import Split
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from tokenizers.processors import ByteLevel as ByteLevelProcessor
from tokenizers.trainers import BpeTrainer

In [21]:
# BPE model whose unknown token is the project's UNKNOWN special token.
bpe_model = BPE(unk_token=SPECIAL_TOKENS["UNKNOWN"], fuse_unk=True)
tokenizer = Tokenizer(bpe_model)

tokenizer.normalizer = NFD()

# Pre-tokenise on the PGN chunking regex.
# NOTE(review): unlike `chunk_pattern`, this pattern ends with an empty
# alternative (the trailing `|`) - confirm that is intentional and not a typo.
pgn_split_regex = Regex(r""" ?\d+\.|\. ?| ?[-\w]+|[#+]|\s+|""")
tokenizer.pre_tokenizer = Split(pattern=pgn_split_regex, behavior="isolated")

tokenizer.post_processor = ByteLevelProcessor(trim_offsets=True)
tokenizer.decoder = ByteLevelDecoder()

# Trainer: 4096-entry vocabulary, with every project special token registered.
special_tokens = [*SPECIAL_TOKENS.values()]
trainer = BpeTrainer(
    vocab_size=4096,
    show_progress=True,
    special_tokens=special_tokens,
)

In [22]:
# Train the BPE model on the wrapped games. `training_text` is itself placed
# inside a list, so the iterator yields one item: the full list of games.
tokenizer.train_from_iterator([training_text], trainer=trainer)






In [23]:
# Pick one game to compare the tokenizers on. Rows with a null PGN are dropped
# before sampling, because `tokenizer.encode` needs a string and the CSV is
# loaded with "None" mapped to null. The seed keeps the chosen game (and every
# output below) identical across runs.
sample = df.select("PGN").drop_nulls().sample(n=1, seed=7).item()

output = tokenizer.encode(sample)

print(sample)

1.e4 c5 2.Nf3 Nc6 3.d4 cxd4 4.Nxd4 Nf6 5.Nc3 e5 6.Ndb5 d6 7.Bg5 a6 8.Na3 b5 9.Nd5 Qa5+ 10.Bd2 Qd8 11.Nxf6+ Qxf6 12.Bd3 Be7 13.O-O Qg6 14.f4 exf4 15.Bxf4 Ne5 16.Kh1 O-O 17.Qe2 Bf6 18.c3 Nxd3 19.Qxd3 Bb7 20.Qxd6 Qxe4 21.Qd2 Rad8 22.Qc2 Rfe8 23.Kg1 b4 24.cxb4 Bd4+ 25.Kh1 Qe2 26.Rg1 Bxb2 27.Qxe2 Rxe2 28.Rad1 Bxg2+ 


In [24]:
# adapted from: https://github.com/openai/tiktoken/blob/main/tiktoken/_educational.py#L186
def visualise_kn1ght_tokens(token_values: list[str | bytes]) -> None:
    background = [f"\u001b[48;5;{i}m" for i in [167, 179, 185, 77, 80, 68, 134]]
    # If token boundaries do not occur at unicode character boundaries, it's unclear how best to
    # visualise the token. Here, we'll just use the unicode replacement character to represent some
    # fraction of a character.
    if not all(isinstance(x, str) for x in token_values):
        unicode_token_values = [x.decode("utf-8") for x in token_values]
    else:
        unicode_token_values = token_values

    running_length = 0
    last_color = None
    for token in unicode_token_values:
        color = background[running_length % len(background)]
        if color == last_color:
            color = background[(running_length + 1) % len(background)]
            assert color != last_color
        last_color = color
        running_length += len(token)
        print(color + token, end="")
    print("\u001b[0m")

In [25]:
import tiktoken

# Encode the same sampled game with tiktoken's "cl100k_base" encoding
# (labelled "gpt-4" in the comparison output).
tiktoken_gpt4_encoding = tiktoken.get_encoding("cl100k_base")
tiktoken_gpt4_tokens = tiktoken_gpt4_encoding.encode(sample)

# ...and with the "o200k_base" encoding (labelled "gpt-4o" in the comparison output).
tiktoken_gpt4o_encoding = tiktoken.get_encoding("o200k_base")
tiktoken_gpt4o_tokens = tiktoken_gpt4o_encoding.encode(sample)

In [26]:
# Show how each tokenizer segments the same game: one heading per tokenizer,
# the colour-coded tokens underneath, and a "---" line between entries.
comparisons = (
    (
        "tiktoken gpt-4o Tokenizer:",
        tiktoken_gpt4o_encoding.decode_tokens_bytes(tiktoken_gpt4o_tokens),
    ),
    (
        "tiktoken gpt-4 Tokenizer:",
        tiktoken_gpt4_encoding.decode_tokens_bytes(tiktoken_gpt4_tokens),
    ),
    ("kn1ght Tokenizer:", output.tokens),
)

for position, (heading, pieces) in enumerate(comparisons):
    if position > 0:
        print("---")
    print(heading)
    visualise_kn1ght_tokens(pieces)

tiktoken gpt-4o Tokenizer:
[48;5;167m1[48;5;179m.e[48;5;77m4[48;5;80m c[48;5;134m5[48;5;167m [48;5;179m2[48;5;185m.N[48;5;80mf[48;5;68m3[48;5;134m Nc[48;5;185m6[48;5;77m [48;5;80m3[48;5;68m.d[48;5;167m4[48;5;179m c[48;5;77mxd[48;5;68m4[48;5;134m [48;5;167m4[48;5;179m.N[48;5;77mxd[48;5;68m4[48;5;134m N[48;5;179mf[48;5;185m6[48;5;77m [48;5;80m5[48;5;68m.N[48;5;167mc[48;5;179m3[48;5;185m e[48;5;80m5[48;5;68m [48;5;134m6[48;5;167m.N[48;5;185mdb[48;5;80m5[48;5;68m d[48;5;167m6[48;5;179m [48;5;185m7[48;5;77m.B[48;5;68mg[48;5;134m5[48;5;167m a[48;5;185m6[48;5;77m [48;5;80m8[48;5;68m.Na[48;5;179m3[48;5;185m b[48;5;80m5[48;5;68m [48;5;134m9[48;5;167m.N[48;5;185md[48;5;77m5[48;5;80m Qa[48;5;167m5[48;5;179m+[48;5;185m [48;5;77m10[48;5;68m.B[48;5;167md[48;5;179m2[48;5;185m Q[48;5;80md[48;5;68m8[48;5;134m [48;5;167m11[48;5;185m.N[48;5;80mxf[48;5;134m6[48;5;167m+[48;5;179m Q[48;5;77mxf[48;5;68m6[48;5;134m [48;5;167m12[

In [27]:
# Round-trip check: decode the ids back to text, then print the encoding
# summary, the decoded text, and whether it equals the original game.
test = tokenizer.decode(output.ids)
print(output, test, test == sample, sep="\n")

Encoding(num_tokens=89, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])
1.e4 c5 2.Nf3 Nc6 3.d4 cxd4 4.Nxd4 Nf6 5.Nc3 e5 6.Ndb5 d6 7.Bg5 a6 8.Na3 b5 9.Nd5 Qa5+ 10.Bd2 Qd8 11.Nxf6+ Qxf6 12.Bd3 Be7 13.O-O Qg6 14.f4 exf4 15.Bxf4 Ne5 16.Kh1 O-O 17.Qe2 Bf6 18.c3 Nxd3 19.Qxd3 Bb7 20.Qxd6 Qxe4 21.Qd2 Rad8 22.Qc2 Rfe8 23.Kg1 b4 24.cxb4 Bd4+ 25.Kh1 Qe2 26.Rg1 Bxb2 27.Qxe2 Rxe2 28.Rad1 Bxg2+ 
True


In [28]:
# Encode a short hand-written opening, then display the decoded text.
opening_encoding = tokenizer.encode("1.d4 d5 2.Nf3 Bf5")
tokenizer.decode(opening_encoding.ids)

'1.d4 d5 2.Nf3 Bf5'

In [18]:
# tokenizer.model.save("../.data/tokenizer/", "kn1ght")

In [29]:
# The game-start marker should be encoded as one token. The marker is read from
# SPECIAL_TOKENS because no `START_GAME` name is defined anywhere in this
# notebook, so the previous form failed on a fresh kernel.
tokenizer.encode(SPECIAL_TOKENS["START"]).tokens

['[g_start]']