In [1]:
# %pip install pandas python-chess torch
import pandas as pd
import chess
import chess.pgn
import numpy as np
import io
import json
import os
from constants import *

In [2]:
def format_moves(moves_str):
    """Number a whitespace-separated move list the way PGN movetext expects.

    Args:
        moves_str: Moves separated by whitespace, alternating White and
            Black, starting with White's first move.

    Returns:
        str: The moves grouped per full move and prefixed with "<n>.",
        joined by single spaces (e.g. "1. e4 e5 2. Nf3"). An empty or
        whitespace-only input yields an empty string.
    """
    plies = moves_str.split()
    white_plies = plies[0::2]
    black_plies = plies[1::2]

    tokens = []
    for turn, white_ply in enumerate(white_plies, start=1):
        tokens.append(f"{turn}.")
        tokens.append(white_ply)
        # The final full move has no Black reply when the ply count is odd.
        if turn <= len(black_plies):
            tokens.append(black_plies[turn - 1])

    return " ".join(tokens)

def convert_to_pgn(row):
    """Convert a CSV row into a properly formatted PGN string.

    Args:
        row: Mapping-like record (for example a pandas Series) that provides
            the keys "winner", "white_id", "black_id", "white_rating",
            "black_rating", "opening_name", "opening_eco" and "moves".

    Returns:
        str: The PGN tag section, a blank line, then the numbered movetext
        followed by the result token and a trailing newline.
    """
    # PGN has a dedicated token for drawn games; "*" means the result is
    # unknown or the game is unfinished, so it is kept only as the fallback
    # for winner values that are not recognized.
    winner = row["winner"]
    if winner == "white":
        result = "1-0"
    elif winner == "black":
        result = "0-1"
    elif winner == "draw":
        result = "1/2-1/2"
    else:
        result = "*"
    
    # The same result token appears in the Result tag and terminates the movetext.
    pgn = f"""[Event "?"]
[Site "Lichess.org"]
[Date "?"]
[Round "?"]
[White "{row['white_id']}"]
[Black "{row['black_id']}"]
[WhiteElo "{row['white_rating']}"]
[BlackElo "{row['black_rating']}"]
[Result "{result}"]
[Opening "{row['opening_name']}"]
[ECO "{row['opening_eco']}"]

{format_moves(row['moves'])} {result}
"""
    return pgn

def board_to_matrix(board):
    """Encode a chess.Board as an 8x8 integer numpy matrix.

    Empty squares are 0. An occupied square holds the PIECE_TO_INT value for
    its piece type, positive for White and negated for Black. The row index
    is 7 minus the square's rank index and the column index is the square's
    file index.

    Args:
        board: The chess.Board position to encode.

    Returns:
        numpy.ndarray: Array of shape (8, 8) with dtype int.
    """
    grid = np.zeros((8, 8), dtype=int)
    for sq in chess.SQUARES:
        occupant = board.piece_at(sq)
        if occupant is None:
            continue  # Nothing on this square; leave the zero in place.

        magnitude = PIECE_TO_INT[occupant.piece_type]
        if occupant.color == chess.WHITE:
            signed = magnitude
        else:
            signed = -magnitude

        # Flip the rank so the matrix rows follow the board visualization.
        grid[7 - chess.square_rank(sq), chess.square_file(sq)] = signed
    return grid

def get_game_matrices(pgn_string):
    """Convert a PGN game into a list of board matrices for each move.

    Only the first game found in ``pgn_string`` is read. The starting
    position is not included: each matrix is taken after a mainline move
    has been pushed onto the board.

    Args:
        pgn_string: PGN text for a single game.

    Returns:
        list: One board_to_matrix() result per mainline move, in play order.
        Empty when the text contains no game or the game has no moves.
    """
    game = chess.pgn.read_game(io.StringIO(pgn_string))
    if game is None:
        # read_game returns None when the input holds no game (e.g. an empty
        # string); report that as "no positions" instead of failing on
        # game.board().
        return []

    board = game.board()
    matrices = []
    
    for move in game.mainline_moves():
        board.push(move)
        matrices.append(board_to_matrix(board))
    
    return matrices

In [5]:
# Ensure the output directory is present; this is a no-op when it already exists.
os.makedirs(PREPROCESSED_DIR, exist_ok=True)

# Read the raw game export.
raw_csv_path = os.path.join(RAW_DIR, "lichess_games.csv")
df = pd.read_csv(raw_csv_path)

# Build one record per game: the PGN text, the JSON-encoded board matrices
# after every move, and the opening columns carried over from the raw row.
data = []
for _, row in df.iterrows():
    pgn = convert_to_pgn(row)
    matrices = get_game_matrices(pgn)
    # Turn each NumPy array into nested lists before serializing to JSON.
    matrices_json = json.dumps([m.tolist() for m in matrices])
    data.append(
        {
            "pgn": pgn,
            "matrices": matrices_json,
            "opening_eco": row["opening_eco"],
            "opening_ply": row["opening_ply"],
        }
    )

# Assemble the records into a single DataFrame.
processed_df = pd.DataFrame(data)

# Write the result into PREPROCESSED_DIR, leaving out the index column.
preprocessed_csv_path = os.path.join(PREPROCESSED_DIR, "lichess_games.csv")
processed_df.to_csv(preprocessed_csv_path, index=False)

print("Preprocessed data saved successfully!")

Preprocessed data saved successfully!
