In [1]:
%%capture
!uv pip install datasets

In [2]:
import datetime

from datasets import load_dataset, DatasetDict, load_from_disk

In [3]:
from kn1ght.constants import SPECIAL_TOKENS

In [4]:
date = datetime.datetime.now().strftime("%Y-%m-%d")

CURRENT_DATASET = "chess_games_2025-01-17.csv"
DATA_DIRECTORY = "../.data"
DATASET_PATH = DATA_DIRECTORY + "/datasets"
SAMPLE_OUTPUT_PATH = DATASET_PATH + "/sample/" + date
DATASET_OUTPUT_PATH = DATASET_PATH + "/" + date
CURRENT_DATASET_PATH = DATASET_PATH + "/" + CURRENT_DATASET

In [5]:
dataset = load_dataset("csv", data_files=CURRENT_DATASET_PATH)

In [6]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['Date', 'Result', 'WhiteElo', 'BlackElo', 'PGN'],
        num_rows: 3561470
    })
})


In [7]:
def add_start_and_end_tokens(items):
    PGNs = items["PGN"]

    for i, pgn in enumerate(PGNs):
        if pgn is not None:
            items["PGN"][i] = SPECIAL_TOKENS["START"] + pgn + SPECIAL_TOKENS["END"]

    return items


dataset["train"] = (
    dataset["train"]
    .filter(lambda x: x["PGN"] is not None)
    .map(lambda x: add_start_and_end_tokens(x), batched=True)
)

In [8]:
dataset["train"][4]

{'Date': '2000.02.20',
 'Result': '1/2-1/2',
 'WhiteElo': 2851,
 'BlackElo': 2633.0,
 'PGN': '[g_start]1.e4 e5 2.Nf3 Nc6 3.Bb5 a6 4.Ba4 Nf6 5.O-O Be7 6.Re1 b5 7.Bb3 d6 8.c3 O-O 9.h3 Na5 10.Bc2 c5 11.d4 Qc7 12.Nbd2 Bd7 13.Nf1 cxd4 14.cxd4 Rac8 15.Ne3 Nc6 16.d5 Nb4 17.Bb1 a5 18.a3 Na6 19.b4 Ra8 20.Bd2 Rfc8 21.Bd3 Qb7 22.g4 g6 23.Nf1 axb4 24.axb4 Bd8 25.Ng3 Nc7 26.Qe2 Rxa1 27.Rxa1 Ra8 28.Qe1 Nfe8 29.Qc1 Ng7 30.Rxa8 Qxa8 31.Bh6 Nce8 32.Qb2 Qa4 33.Kg2 Bb6 34.Bc2 Qa7 35.Bd3 Qa4 36.Ne2 Nc7 37.Nxe5 dxe5 38.Qxe5 Nce8 39.Bxg7 Qd1 40.Bh6 Qxd3 41.Qe7 Ng7 42.Ng3 Qc2 43.Qf6 Nf5 44.Qxb6 Nh4+ 45.Kh2 Nf3+ 46.Kg2 Nh4+ 47.Kh2 Nf3+ 48.Kg2 Nh4+ 49.Kh2[g_end]'}

In [9]:
sample = dataset["train"].train_test_split(
    test_size=1000, train_size=10000, shuffle=True, seed=1997
)

In [10]:
sample_temp_dataset = sample["test"].train_test_split(test_size=0.5, seed=1997)

sample_dataset_dict = {
    "train": sample["train"],
    "test": sample_temp_dataset["train"],
    "validation": sample_temp_dataset["test"],
}

In [11]:
sample_dataset_dict = DatasetDict(sample_dataset_dict)

sample_dataset_dict.save_to_disk(SAMPLE_OUTPUT_PATH)

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/500 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/500 [00:00<?, ? examples/s]

In [12]:
import random

test = load_from_disk(dataset_path=SAMPLE_OUTPUT_PATH)

pgns = []

for x in test["train"].select_columns("PGN").to_list():
    pgns.append(x["PGN"])

random_items = random.sample(pgns, 5)

for item in random_items:
    print(item)
    print("---")

[g_start]1.e4 e5 2.Nf3 Nc6 3.Nc3 Nf6 4.d4 exd4 5.Nxd4 Bb4 6.Nxc6 bxc6 7.Bd3 d5 8.exd5 cxd5 9.Qe2+ Qe7 10.Qxe7+ Bxe7 11.O-O c6 12.Na4 Be6[g_end]
---
[g_start]1.e4 c5 2.c3 e6 3.d4 d5 4.e5 Nc6 5.Nf3 Qb6 6.a3 c4 7.g3 Na5 8.Nbd2 Bd7 9.Bh3 Be7 10.O-O h5 11.Ne1 g6 12.f4 Nh6 13.Nef3 O-O-O 14.Ng5 Nb3 15.Nxb3 Ba4 16.Be3 Bxb3 17.Qe2 Rdf8 18.Nf3 a5[g_end]
---
[g_start]1.e4 e5 2.Nf3 Nc6 3.Bb5 a6 4.Bxc6 dxc6 5.Nc3 Bd6 6.d4 Bg4 7.dxe5 Bxf3 8.Qxf3 Bxe5 9.O-O Qf6 10.Qxf6 Nxf6 11.f3 O-O-O 12.Be3 Bd4 13.Bxd4 Rxd4 14.Rfd1 Rhd8 15.Rxd4 Rxd4 16.Rd1 c5 17.Kf2 b5 18.Ke2 Nd7 19.Rd3 Ne5 20.Rxd4 cxd4 21.Nd5 c5 22.Ne7+ Kd7 23.Nf5 g6 24.Nh6 f6 25.f4 Nc4 26.b3 Na3 27.Kd2 Nb1+ 28.Kd3 Nc3 29.a3 Nd1 30.Ng4 Ke6 31.e5 f5 32.Nf6 h6 33.Ne8 Kd7 34.Nd6 Nf2+ 35.Ke2 Ne4 36.Nf7 h5 37.Nh8 g5 38.Ng6 g4 39.Nh4 Ke6 40.g3 a5 41.b4 a4 42.Kd1 c4 43.Ng6 Kf7 44.Nh4 Ke6 45.Ng2 Nc3+ 46.Kc1 Ne4 47.Kb2 c3+ 48.Kc1 Nd2 49.Ne1 Nf3 50.Nd3 Nxh2 51.Nc5+ Ke7 52.Nxa4 h4 53.Kd1 h3 54.Nc5 Nf1 55.Nd3 h2 56.Nf2 Nxg3 57.Ke1 h1=Q+ 58.Nxh1 Nxh1 59.Ke2 g3

In [13]:
train = dataset["train"].train_test_split(test_size=0.9, seed=1997)
temp_dataset = train["test"].train_test_split(test_size=0.5, seed=1997)

dataset_dict = {
    "train": train,
    "test": temp_dataset["train"],
    "validation": temp_dataset["test"],
}

In [14]:
dataset_dict = DatasetDict(sample_dataset_dict)

dataset_dict.save_to_disk(DATASET_OUTPUT_PATH)

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/500 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/500 [00:00<?, ? examples/s]