In [1]:
%%capture
# Install the third-party packages this notebook imports (polars, kagglehub).
# The %%capture cell magic above keeps the installer output out of the notebook.
!uv pip install polars kagglehub

In [2]:
import datetime
from pathlib import Path

import kagglehub
import polars as pl

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# name of the games file inside the downloaded Kaggle dataset
file = "all_with_filtered_anotations_since1998.txt"
# dataset_download returns the local directory that holds the dataset files
path = kagglehub.dataset_download("milesh1/35-million-chess-games")

filepath = path + "/" + file

# read the whole file into memory; the context manager closes the file
# handle as soon as the read finishes instead of leaking it until GC
with open(filepath) as gamesFile:
    contents = gamesFile.read()

In [4]:
# split the raw file contents into individual lines; nothing is removed here.
# lines that do not contain exactly one " ### " separator (e.g. header lines)
# are skipped later, when the games are formatted
unformattedGames = contents.split("\n")

# this will store our formatted dataset
formattedGames = []

In [5]:
# iterate through the games and format them properly
def _strip_move_prefix(token):
    """Remove the side marker from a single, non-empty move token.

    A token starting with ``W`` loses only that letter, so ``W12.Nf3``
    becomes ``12.Nf3``.  A token of the form ``B<digits>.<move>`` loses the
    ``B`` and the move number, so ``B12.Nf6`` becomes ``Nf6``.  Any other
    token is returned unchanged.
    """
    if token[0] == "W":
        return token[1:]
    if token[0] == "B":
        number, dot, rest = token[1:].partition(".")
        # only strip when the token really is "B<digits>.<move>", so an
        # unexpected token is kept intact instead of being truncated blindly
        if dot and number.isdigit():
            return rest
    return token


def _format_game(game):
    """Turn ``W1.d4 B1.d5 W2.c4 ...`` into ``1.d4 d5 2.c4 ...``.

    Tokens are split on any run of whitespace, so stray or trailing spaces
    in the raw text never end up in the result.
    """
    return " ".join(_strip_move_prefix(token) for token in game.split())


def _parse_line(line):
    """Build one record dict from a raw dataset line.

    A usable line is ``<metadata> ### <moves>`` where the space-separated
    metadata carries date, result, white Elo and black Elo at positions
    1 to 4.  Returns ``None`` for lines that do not have that shape, so the
    caller can skip them.
    """
    data = line.split(" ### ")
    if len(data) != 2:
        return None

    metadata = data[0].split(" ")
    # a header with too few fields is skipped like any other malformed
    # line, rather than aborting the whole run with an IndexError
    if len(metadata) < 5:
        return None

    return {
        "Date": metadata[1],
        "Result": metadata[2],
        "WhiteElo": metadata[3],
        "BlackElo": metadata[4],
        "PGN": _format_game(data[1]),
    }


# start from an empty list so that re-running this cell does not append
# every game a second time
formattedGames = []
for line in unformattedGames:
    record = _parse_line(line)
    if record is not None:
        formattedGames.append(record)

In [6]:
# build a DataFrame from the list of per-game dicts; each dict key
# (Date, Result, WhiteElo, BlackElo, PGN) becomes a column
df = pl.DataFrame(formattedGames)

In [7]:
# preview the first rows to sanity-check the formatting
df.head()

Date,Result,WhiteElo,BlackElo,PGN
str,str,str,str,str
"""2000.03.14""","""1-0""","""2851""","""None""","""1.d4 d5 2.c4 e6 3.Nc3 Nf6 4.cx…"
"""2000.03.14""","""1-0""","""2851""","""None""","""1.e4 d5 2.exd5 Qxd5 3.Nc3 Qa5 …"
"""1999.11.20""","""1-0""","""2851""","""None""","""1.e4 e5 2.Nf3 Nc6 3.Bc4 Bc5 4.…"
"""1999.11.20""","""1-0""","""2851""","""None""","""1.e4 d5 2.exd5 Qxd5 3.Nc3 Qa5 …"
"""2000.02.20""","""1/2-1/2""","""2851""","""2633""","""1.e4 e5 2.Nf3 Nc6 3.Bb5 a6 4.B…"


In [8]:
# save the formatted dataset to a csv file in the data directory
# make sure the target directory exists, otherwise write_csv fails on a
# fresh checkout
outputDir = Path("../.data")
outputDir.mkdir(parents=True, exist_ok=True)

# current date in yyyy-mm-dd format, used to stamp the file name
date = datetime.datetime.now().strftime("%Y-%m-%d")

# NOTE(review): PGN is always built as a string, so this filter only drops
# real nulls; games with an empty move list ("") are still written
df.filter(pl.col("PGN").is_not_null()).write_csv(
    outputDir / f"chess_games_{date}.csv"
)