In [1]:
# %pip install pandas python-chess torch
import pandas as pd
import chess
import chess.pgn
import numpy as np
import io
import json
import sys
import os

# Make the parent of the current working directory importable so the
# project-level `constants` module can be found. os.getcwd() returns the
# kernel's working directory, which is only the notebook's folder when the
# kernel was started from there.
notebook_dir = os.getcwd()
parent_dir = os.path.abspath(os.path.join(notebook_dir, ".."))
# Appended at the end of sys.path, so any `constants` module found earlier on
# the path takes precedence. Re-running this cell appends the entry again.
sys.path.append(parent_dir)
# NOTE(review): star import; presumably the source of PIECE_TO_INT, RAW_DIR
# and PREPROCESSED_DIR used by later cells -- confirm against constants.py.
from constants import *

In [3]:
# Load the ECO reference table and give each distinct code an integer index,
# numbered in the order the codes first appear in the "code" column.
eco_mapping_df = pd.read_csv("../data/ECO_codes.csv")
unique_codes = eco_mapping_df["code"].unique()
eco_mapping = dict(zip(unique_codes, range(len(unique_codes))))

In [4]:
# Bare expression as the cell's last line: the notebook renders the whole mapping.
eco_mapping

{'A01': 0,
 'A02': 1,
 'A03': 2,
 'A04': 3,
 'A05': 4,
 'A06': 5,
 'A07': 6,
 'A08': 7,
 'A09': 8,
 'A10': 9,
 'A11': 10,
 'A12': 11,
 'A13': 12,
 'A14': 13,
 'A15': 14,
 'A16': 15,
 'A17': 16,
 'A18': 17,
 'A19': 18,
 'A20': 19,
 'A21': 20,
 'A22': 21,
 'A23': 22,
 'A24': 23,
 'A25': 24,
 'A26': 25,
 'A27': 26,
 'A28': 27,
 'A29': 28,
 'A30': 29,
 'A31': 30,
 'A32': 31,
 'A33': 32,
 'A34': 33,
 'A35': 34,
 'A36': 35,
 'A37': 36,
 'A38': 37,
 'A39': 38,
 'A40': 39,
 'A41': 40,
 'A42': 41,
 'A43': 42,
 'A44': 43,
 'A45': 44,
 'A46': 45,
 'A47': 46,
 'A48': 47,
 'A49': 48,
 'A50': 49,
 'A51': 50,
 'A52': 51,
 'A53': 52,
 'A54': 53,
 'A55': 54,
 'A56': 55,
 'A57': 56,
 'A58': 57,
 'A59': 58,
 'A60': 59,
 'A61': 60,
 'A62': 61,
 'A63': 62,
 'A64': 63,
 'A65': 64,
 'A66': 65,
 'A67': 66,
 'A68': 67,
 'A69': 68,
 'A70': 69,
 'A71': 70,
 'A72': 71,
 'A73': 72,
 'A74': 73,
 'A75': 74,
 'A76': 75,
 'A77': 76,
 'A78': 77,
 'A79': 78,
 'A80': 79,
 'A81': 80,
 'A82': 81,
 'A83': 82,
 'A84': 83,
 '

In [2]:
def format_moves(moves_str):
    """Turn a whitespace-separated move list into numbered PGN movetext.

    Moves are grouped two at a time (White's move, then Black's) and each
    group is prefixed with its 1-based move number, e.g. ``"e4 e5 Nf3"``
    becomes ``"1. e4 e5 2. Nf3"``. An empty or blank string yields ``""``.
    """
    tokens = moves_str.split()
    # Consecutive slices of two plies; the last slice holds a single ply when
    # the total number of plies is odd.
    turns = (tokens[start:start + 2] for start in range(0, len(tokens), 2))
    return " ".join(
        f"{number}. {' '.join(turn)}" for number, turn in enumerate(turns, start=1)
    )

def convert_to_pgn(row):
    """Convert a CSV row into a properly formatted PGN string.

    Args:
        row: Mapping-like record (e.g. a pandas Series) providing the keys
            ``winner``, ``white_id``, ``black_id``, ``white_rating``,
            ``black_rating``, ``opening_name``, ``opening_eco`` and ``moves``
            (a whitespace-separated move list).

    Returns:
        A PGN document as a string: the tag-pair header, a blank line, then
        the numbered movetext terminated by the result marker.

    The result marker is ``1-0`` / ``0-1`` / ``1/2-1/2`` for a ``winner`` of
    ``"white"`` / ``"black"`` / ``"draw"``. Any other value maps to ``*``,
    the PGN marker for an unknown or unfinished game.

    Header values are interpolated as-is; no PGN escaping is applied.
    """
    winner = row["winner"]
    if winner == "white":
        result = "1-0"
    elif winner == "black":
        result = "0-1"
    elif winner == "draw":
        # A drawn game is a finished game; "*" would mislabel it as unknown.
        result = "1/2-1/2"
    else:
        result = "*"

    pgn = f"""[Event "?"]
[Site "Lichess.org"]
[Date "?"]
[Round "?"]
[White "{row['white_id']}"]
[Black "{row['black_id']}"]
[WhiteElo "{row['white_rating']}"]
[BlackElo "{row['black_rating']}"]
[Result "{result}"]
[Opening "{row['opening_name']}"]
[ECO "{row['opening_eco']}"]

{format_moves(row['moves'])} {result}
"""
    return pgn

def board_to_matrix(board):
    """Encode a chess.Board as an 8x8 integer numpy array.

    Row 0 holds the highest rank and column 0 the first file, so the array
    reads like a board diagram seen from White's side. Each occupied square
    stores ``PIECE_TO_INT[piece_type]``, positive for White and negated for
    Black; empty squares stay 0.
    """
    grid = np.zeros((8, 8), dtype=int)
    for sq in chess.SQUARES:
        occupant = board.piece_at(sq)
        if occupant is None:
            continue
        # Rank index is flipped so that rank 8 ends up in the top row.
        r = 7 - chess.square_rank(sq)
        c = chess.square_file(sq)
        sign = 1 if occupant.color == chess.WHITE else -1
        grid[r, c] = sign * PIECE_TO_INT[occupant.piece_type]
    return grid

def get_game_matrices(pgn_string):
    """Replay a PGN game and return one board matrix per mainline move.

    The first game found in ``pgn_string`` is parsed, its mainline is played
    out from the game's starting position, and the position after every move
    is encoded with ``board_to_matrix``. The starting position itself is not
    included in the returned list.
    """
    game = chess.pgn.read_game(io.StringIO(pgn_string))
    position = game.board()

    snapshots = []
    for ply in game.mainline_moves():
        # Advance the shared board, then capture the resulting position.
        position.push(ply)
        snapshots.append(board_to_matrix(position))
    return snapshots

In [5]:
# Make sure the output folder exists before anything is written into it.
os.makedirs(PREPROCESSED_DIR, exist_ok=True)

# Read the raw games table.
raw_csv_path = os.path.join(RAW_DIR, "lichess_games.csv")
df = pd.read_csv(raw_csv_path)

# Build one record per game: its PGN text, the per-move board matrices
# serialised as JSON, and the opening columns carried over unchanged.
data = []
for _, row in df.iterrows():
    pgn = convert_to_pgn(row)
    matrices = get_game_matrices(pgn)
    # NumPy arrays are not JSON-serialisable, so turn them into nested lists first.
    matrices_json = json.dumps([matrix.tolist() for matrix in matrices])
    record = {
        "pgn": pgn,
        "matrices": matrices_json,
        "opening_eco": row["opening_eco"],
        "opening_ply": row["opening_ply"],
    }
    data.append(record)

# Assemble all records into a single table.
processed_df = pd.DataFrame(data)

# Write the table out, without the index column.
preprocessed_csv_path = os.path.join(PREPROCESSED_DIR, "lichess_games.csv")
processed_df.to_csv(preprocessed_csv_path, index=False)

print("Preprocessed data saved successfully!")

Preprocessed data saved successfully!
