In [9]:
%%capture

!uv pip install tokenizers polars

In [10]:
import polars as pl

In [11]:
START_GAME = "<|g_start|>"
END_GAME = "<|g_end|>"

In [12]:
SPECIAL_TOKENS = [START_GAME, END_GAME]

In [13]:
df = pl.read_csv("../.data/chess_games_2025-01-15.csv", null_values=["None"])

In [14]:
sample = df.select("PGN").sample(n=500)

In [20]:
training_text = []

for game in sample.iter_rows():
    if game[0]:
        training_text.append(START_GAME + game[0].strip() + END_GAME)

In [21]:
import re

# ignore `1.`, ` 2.`, ` `, etc. and get the actual moves as separate entries
chunk_pattern = re.compile(r""" ?\d+\.|\. ?| ?[-\w]+|[#+]|\s+""")

In [134]:
from tokenizers import Tokenizer, Regex
from tokenizers.models import BPE
from tokenizers.normalizers import NFD
from tokenizers.pre_tokenizers import Split, ByteLevel, Sequence
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from tokenizers.processors import ByteLevel as ByteLevelProcessor
from tokenizers.trainers import BpeTrainer

In [145]:
tokenizer = Tokenizer(
    BPE(unk_token="[UNK]", fuse_unk=True, continuing_subword_prefix="")
)

tokenizer.normalizer = NFD()

tokenizer.pre_tokenizer = Sequence(
    [
        Split(
            pattern=Regex(r""" ?\d+\.|\. ?| ?[-\w]+|[#+]|\s+"""), behavior="isolated"
        ),
        # TODO: figure why this adds random Ġ characters
        # everywhere when we just want to avoid spaces
        # ByteLevel(add_prefix_space=False),
    ]
)

# tokenizer.post_processor = ByteLevelProcessor(trim_offsets=True)
# tokenizer.decoder = ByteLevelDecoder()

trainer = BpeTrainer(vocab_size=3072, show_progress=True)

In [146]:
tokenizer.train_from_iterator([training_text], trainer=trainer)






In [147]:
sample = df.sample(1).select("PGN").item()

print(sample)

1.e4 c5 2.Nf3 d6 3.d4 cxd4 4.Nxd4 Nf6 5.Nc3 a6 6.Be3 e5 7.Nb3 Be7 8.f3 Be6 9.Qd2 Nbd7 10.g4 Qc7 11.O-O-O O-O 12.Kb1 b5 13.g5 Nh5 14.Ne2 Rac8 15.Ng3 Nf4 16.h4 d5 17.Bxf4 exf4 18.Nf5 Bb4 19.c3 Bc5 20.Nfd4 Bxd4 21.Nxd4 Nb6 22.Qc2 Kh8 23.h5 Qe5 24.h6 g6 25.Re1 Na4 26.Rg1 Nc5 27.Bd3 Rfd8 28.Qe2 Re8 29.a3 Kg8 30.Qd2 Red8 31.Qc2 Re8 32.Qe2 Red8 33.Bc2 dxe4 34.Bxe4 Bc4 35.Qc2 Re8 36.Qd2 Qd6 37.Bc2 Rxe1+ 38.Rxe1 Ne6 39.b3 Nxd4 40.bxc4 Nf5 41.Re2 Qxd2 42.Rxd2 Ne3 43.cxb5 Nc4 44.Rd4 Nxa3+ 45.Kb2 Nxb5 46.Rxf4 Rxc3 47.Rf6 Kf8 48.Bb3 Rc7 49.Rxa6 Ke7 50.Ra4 Rd7 51.Rb4 Nd6 52.f4 f6 53.Kc3 fxg5 54.fxg5 Rc7+ 55.Kd4 Rc1 56.Rb8 Rg1 57.Rh8 Rxg5 58.Rxh7+ Kf6 59.Bd5 Rh5 60.Kc5 Ne4+ 61.Kc6 Nc3 


In [148]:
output = tokenizer.encode(sample)

print(output.ids)
print(output.tokens)

[45, 70, 165, 106, 97, 186, 107, 69, 228, 108, 242, 96, 109, 132, 193, 110, 195, 184, 111, 392, 192, 112, 61, 233, 113, 226, 274, 114, 85, 225, 115, 372, 134, 116, 450, 214, 118, 77, 421, 119, 283, 423, 120, 420, 647, 123, 129, 172, 124, 640, 521, 126, 428, 335, 127, 74, 305, 128, 2045, 558, 130, 242, 375, 133, 278, 321, 135, 117, 476, 139, 121, 175, 142, 261, 1083, 146, 434, 330, 152, 200, 386, 158, 244, 240, 161, 181, 336, 162, 226, 1272, 164, 278, 240, 168, 244, 1272, 170, 449, 369, 173, 466, 746, 177, 278, 240, 179, 226, 398, 185, 449, 854, 2, 190, 1300, 348, 198, 140, 425, 202, 646, 390, 208, 389, 933, 216, 991, 720, 219, 812, 433, 227, 606, 1455, 2, 232, 694, 1536, 238, 663, 822, 247, 761, 286, 255, 381, 436, 262, 1299, 294, 272, 975, 353, 280, 737, 503, 285, 72, 260, 289, 470, 1060, 301, 904, 436, 2, 309, 563, 596, 316, 542, 997, 324, 785, 1151, 337, 1051, 2, 327, 351, 489, 887, 355, 1165, 297, 2, 373, 1023, 1041, 0]
['1.', 'e4', ' c5', ' 2.', 'Nf3', ' d6', ' 3.', 'd4', ' cxd4',

In [149]:
test = tokenizer.decode(output.ids)
print(output)
print(test)
print(test == sample)

Encoding(num_tokens=189, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])
1. e4  c5  2. Nf3  d6  3. d4  cxd4  4. Nxd4  Nf6  5. Nc3  a6  6. Be3  e5  7. Nb3  Be7  8. f3  Be6  9. Qd2  Nbd7  10. g4  Qc7  11. O-O-O  O-O  12. Kb1  b5  13. g5  Nh5  14. Ne2  Rac8  15. Ng3  Nf4  16. h4  d5  17. Bxf4  exf4  18. Nf5  Bb4  19. c3  Bc5  20. Nfd4  Bxd4  21. Nxd4  Nb6  22. Qc2  Kh8  23. h5  Qe5  24. h6  g6  25. Re1  Na4  26. Rg1  Nc5  27. Bd3  Rfd8  28. Qe2  Re8  29. a3  Kg8  30. Qd2  Red8  31. Qc2  Re8  32. Qe2  Red8  33. Bc2  dxe4  34. Bxe4  Bc4  35. Qc2  Re8  36. Qd2  Qd6  37. Bc2  Rxe1 +  38. Rxe1  Ne6  39. b3  Nxd4  40. bxc4  Nf5  41. Re2  Qxd2  42. Rxd2  Ne3  43. cxb5  Nc4  44. Rd4  Nxa3 +  45. Kb2  Nxb5  46. Rxf4  Rxc3  47. Rf6  Kf8  48. Bb3  Rc7  49. Rxa6  Ke7  50. Ra4  Rd7  51. Rb4  Nd6  52. f4  f6  53. Kc3  fxg5  54. fxg5  Rc7 +  55. Kd4  Rc1  56. Rb8  Rg1  57. Rh8  Rxg5  58. Rxh7 +  Kf6  59. Bd5  Rh5  60. Kc5  Ne4 +  61. Kc6  Nc3  
False
