In [1]:
%%capture
!uv pip install datasets

In [2]:
import datetime

from datasets import load_dataset, DatasetDict, load_from_disk

In [3]:
from kn1ght.constants import SPECIAL_TOKENS

In [4]:
date = datetime.datetime.now().strftime("%Y-%m-%d")

CURRENT_DATASET = "chess_games_2025-01-17.csv"
DATA_DIRECTORY = "../.data"
DATASET_PATH = DATA_DIRECTORY + "/datasets"
SAMPLE_OUTPUT_PATH = DATASET_PATH + "/sample/" + date
DATASET_OUTPUT_PATH = DATASET_PATH + "/full/" + date
CURRENT_DATASET_PATH = DATASET_PATH + "/" + CURRENT_DATASET

In [5]:
dataset = load_dataset("csv", data_files=CURRENT_DATASET_PATH)

In [6]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['Date', 'Result', 'WhiteElo', 'BlackElo', 'PGN'],
        num_rows: 3561470
    })
})


In [7]:
def add_start_and_end_tokens(items):
    PGNs = items["PGN"]

    for i, pgn in enumerate(PGNs):
        if pgn is not None:
            items["PGN"][i] = SPECIAL_TOKENS["START"] + pgn + SPECIAL_TOKENS["END"]

    return items


dataset["train"] = (
    dataset["train"]
    .filter(lambda x: x["PGN"] is not None)
    .map(lambda x: add_start_and_end_tokens(x), batched=True)
)

In [8]:
dataset

DatasetDict({
    train: Dataset({
        features: ['Date', 'Result', 'WhiteElo', 'BlackElo', 'PGN'],
        num_rows: 3523492
    })
})

In [9]:
sample = dataset["train"].train_test_split(
    test_size=1000, train_size=10000, shuffle=True, seed=1997
)

In [10]:
sample_temp_dataset = sample["test"].train_test_split(test_size=0.5, seed=1997)

sample_dataset_dict = {
    "train": sample["train"],
    "test": sample_temp_dataset["train"],
    "validation": sample_temp_dataset["test"],
}

In [11]:
sample_dataset_dict = DatasetDict(sample_dataset_dict)

sample_dataset_dict.save_to_disk(SAMPLE_OUTPUT_PATH)

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/500 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/500 [00:00<?, ? examples/s]

In [12]:
import random

test = load_from_disk(dataset_path=SAMPLE_OUTPUT_PATH)

pgns = []

for x in test["train"].select_columns("PGN").to_list():
    pgns.append(x["PGN"])

random_items = random.sample(pgns, 5)

for item in random_items:
    print(item)
    print("---")

[g_start]1.e4 e5 2.Nf3 Qe7 3.Nc3 Nf6 4.Bc4 Nc6 5.d3 d6 6.O-O Bg4 7.h3 Bh5 8.Bb5 Qd7 9.Qe2 a6 10.Bxc6 Qxc6 11.Qe3 d5 12.exd5 Qd7 13.Nxe5 Qe7 14.Nc4 a5 15.Re1 g6 16.Qd2 Be2 17.Rxe2 Ne4 18.Rxe4 Rd8 19.Rxe7+ Bxe7 20.Qe2 b6 21.Bg5 f6 22.Bxf6 Ra8 23.Qxe7#[g_end]
---
[g_start]1.e4 c5 2.Nf3 Nc6 3.d4 cxd4 4.Nxd4 Nf6 5.Nc3 e5 6.Ndb5 d6 7.a4 Be6 8.Be2 a6 9.Na3 d5 10.exd5 Nxd5 11.Nxd5 Bxd5 12.O-O Bc5 13.Nc4 O-O 14.Bg5 f6 15.Be3 Nd4 16.a5 Qc7 17.f4 Kh8 18.Kh1 Qc6 19.Rf3 Nxf3 20.Bxf3 e4 21.Bxc5 Qxc5 22.Nb6 exf3 23.Qxd5 Qxd5 24.Nxd5 fxg2+ 25.Kxg2 Rac8 26.Ne3 g6 27.Kf3 Kg7 28.Ra3 Rc5 29.c4 Rd8 30.b4 Rh5 31.Kg3 g5 32.Nd5 gxf4+ 33.Nxf4 Rg5+ 34.Kf3 Re5 35.Re3 Rd2 36.Ne6+ Kf7 37.Ng5+ Rxg5 38.h4 Rf5+[g_end]
---
[g_start]1.Nf3 Nf6 2.c4 g6 3.b3 Bg7 4.Bb2 d6 5.e3 O-O 6.Be2 e5 7.O-O e4 8.Ne1 c6 9.d3 d5 10.Nd2 Qc7 11.cxd5 Ng4 12.d6 Qxd6 13.Bxg4 Bxb2 14.Bxc8 Rxc8 15.Nxe4 Qe7 16.Rb1 Bg7 17.b4 Nd7 18.Nf3 Re8 19.Qc2 Rac8 20.Rfc1 Nb6 21.Qc5 Rc7 22.Qxe7 Rexe7 23.Nc5 Nd5 24.d4 Bh6 25.Kf1 a6 26.Na4 Kf8 27.Nc3 Nxc3 28.R

In [13]:
train = dataset["train"].train_test_split(test_size=0.9, seed=1997)
temp_dataset = train["test"].train_test_split(test_size=0.5, seed=1997)

dataset_dict = {
    "train": train["train"],
    "test": temp_dataset["train"],
    "validation": temp_dataset["test"],
}

In [14]:
dataset_dict

{'train': Dataset({
     features: ['Date', 'Result', 'WhiteElo', 'BlackElo', 'PGN'],
     num_rows: 352349
 }),
 'test': Dataset({
     features: ['Date', 'Result', 'WhiteElo', 'BlackElo', 'PGN'],
     num_rows: 1585571
 }),
 'validation': Dataset({
     features: ['Date', 'Result', 'WhiteElo', 'BlackElo', 'PGN'],
     num_rows: 1585572
 })}

In [15]:
dataset_dict = DatasetDict(dataset_dict)

dataset_dict.save_to_disk(DATASET_OUTPUT_PATH)

Saving the dataset (0/1 shards):   0%|          | 0/352349 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/1585571 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/1585572 [00:00<?, ? examples/s]