In [1]:
# %pip install pandas python-chess torch
import pandas as pd
import chess
import chess.pgn
import numpy as np
import io
import json
import os
from constants import *

In [2]:
def format_moves(moves_str):
    """Turn a whitespace-separated move list into numbered PGN movetext.

    Moves are grouped two at a time (White's move, then Black's reply) and
    each group is prefixed with its 1-based move number, e.g.
    ``"d4 d5 c4"`` becomes ``"1. d4 d5 2. c4"``. An empty or all-whitespace
    input yields an empty string.
    """
    tokens = moves_str.split()
    numbered = []

    for turn, start in enumerate(range(0, len(tokens), 2), start=1):
        # The slice holds two plies, or just one when the game ends on White's move.
        plies = tokens[start:start + 2]
        numbered.append(f"{turn}. " + " ".join(plies))

    return " ".join(numbered)

def convert_to_pgn(row):
    """Convert a CSV row into a properly formatted PGN string.

    Args:
        row: Mapping-like record (for example a pandas Series produced by
            ``df.iterrows()``) that provides the keys ``winner``,
            ``white_id``, ``black_id``, ``white_rating``, ``black_rating``,
            ``opening_name``, ``opening_eco`` and ``moves`` (a
            whitespace-separated move list).

    Returns:
        str: PGN text made of the tag section, a blank line, and the numbered
        movetext terminated by the result token and a trailing newline.
    """
    # PGN uses "1-0", "0-1" and "1/2-1/2" for finished games and reserves "*"
    # for unknown/unfinished ones, so a drawn game must not fall through to "*".
    # NOTE(review): assumes drawn games carry winner == "draw" in this CSV -
    # confirm against the dataset.
    results_by_winner = {"white": "1-0", "black": "0-1", "draw": "1/2-1/2"}
    result = results_by_winner.get(row["winner"], "*")

    # The same result token is written to the Result tag and to the end of the
    # movetext so the two stay consistent.
    pgn = f"""[Event "?"]
[Site "Lichess.org"]
[Date "?"]
[Round "?"]
[White "{row['white_id']}"]
[Black "{row['black_id']}"]
[WhiteElo "{row['white_rating']}"]
[BlackElo "{row['black_rating']}"]
[Result "{result}"]
[Opening "{row['opening_name']}"]
[ECO "{row['opening_eco']}"]

{format_moves(row['moves'])} {result}
"""
    return pgn

def board_to_matrix(board):
    """Encode the pieces on a chess.Board as a signed 8x8 integer grid.

    Every occupied square stores the PIECE_TO_INT code of its piece type,
    positive for White and negated for Black; empty squares stay 0. Row 0 is
    the top of the board as usually drawn (rank 8) and column 0 is the a-file.
    """
    grid = np.zeros((8, 8), dtype=int)
    for sq in chess.SQUARES:
        occupant = board.piece_at(sq)
        if occupant is None:
            continue  # empty square keeps its 0

        code = PIECE_TO_INT[occupant.piece_type]
        if occupant.color != chess.WHITE:
            code = -code

        # Rank indices count up from White's side, so invert them to put
        # rank 8 in the first row.
        grid[7 - chess.square_rank(sq), chess.square_file(sq)] = code
    return grid

def get_game_matrices(pgn_string):
    """Convert a PGN game into a list of board matrices, one per move.

    Args:
        pgn_string: PGN text; only the first game in it is read.

    Returns:
        list: One ``board_to_matrix`` result per mainline move, each taken
        after that move has been played (the starting position is not
        included). Empty when the text contains no game or no moves.
    """
    game = chess.pgn.read_game(io.StringIO(pgn_string))
    if game is None:
        # read_game returns None when the stream holds no game (e.g. an empty
        # string); without this guard game.board() raises AttributeError.
        return []

    board = game.board()
    matrices = []

    for move in game.mainline_moves():
        # Snapshot the position after the move is applied.
        board.push(move)
        matrices.append(board_to_matrix(board))

    return matrices

In [3]:
# Read the raw Lichess games table
df = pd.read_csv("data/raw/lichess_games.csv")

# Build one PGN string per game, preserving row order
pgn_games = [
    convert_to_pgn(game_row)
    for _, game_row in df.iterrows()
]

# Sanity check: replay the first game and dump the board after every move
board_matrices = get_game_matrices(pgn_games[0])

print(f"Total positions: {len(board_matrices)}")
for move_no, matrix in enumerate(board_matrices, start=1):
    # Header line, the 8x8 grid, then a blank separator line
    print(f"Move {move_no}:", matrix, "", sep="\n")

Total positions: 13
Move 1:
[[-4 -2 -3 -5 -6 -3 -2 -4]
 [-1 -1 -1 -1 -1 -1 -1 -1]
 [ 0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0]
 [ 0  0  0  1  0  0  0  0]
 [ 0  0  0  0  0  0  0  0]
 [ 1  1  1  0  1  1  1  1]
 [ 4  2  3  5  6  3  2  4]]

Move 2:
[[-4 -2 -3 -5 -6 -3 -2 -4]
 [-1 -1 -1  0 -1 -1 -1 -1]
 [ 0  0  0  0  0  0  0  0]
 [ 0  0  0 -1  0  0  0  0]
 [ 0  0  0  1  0  0  0  0]
 [ 0  0  0  0  0  0  0  0]
 [ 1  1  1  0  1  1  1  1]
 [ 4  2  3  5  6  3  2  4]]

Move 3:
[[-4 -2 -3 -5 -6 -3 -2 -4]
 [-1 -1 -1  0 -1 -1 -1 -1]
 [ 0  0  0  0  0  0  0  0]
 [ 0  0  0 -1  0  0  0  0]
 [ 0  0  1  1  0  0  0  0]
 [ 0  0  0  0  0  0  0  0]
 [ 1  1  0  0  1  1  1  1]
 [ 4  2  3  5  6  3  2  4]]

Move 4:
[[-4 -2 -3 -5 -6 -3 -2 -4]
 [-1 -1  0  0 -1 -1 -1 -1]
 [ 0  0 -1  0  0  0  0  0]
 [ 0  0  0 -1  0  0  0  0]
 [ 0  0  1  1  0  0  0  0]
 [ 0  0  0  0  0  0  0  0]
 [ 1  1  0  0  1  1  1  1]
 [ 4  2  3  5  6  3  2  4]]

Move 5:
[[-4 -2 -3 -5 -6 -3 -2 -4]
 [-1 -1  0  0 -1 -1 -1 -1]
 [ 0  0 -1  0 

In [4]:
# Preview the first rows of the raw games table to check the column layout
df.head()

Unnamed: 0,id,rated,created_at,last_move_at,turns,victory_status,winner,increment_code,white_id,white_rating,black_id,black_rating,moves,opening_eco,opening_name,opening_ply
0,TZJHLljE,False,1504210000000.0,1504210000000.0,13,outoftime,white,15+2,bourgris,1500,a-00,1191,d4 d5 c4 c6 cxd5 e6 dxe6 fxe6 Nf3 Bb4+ Nc3 Ba5...,D10,Slav Defense: Exchange Variation,5
1,l1NXvwaE,True,1504130000000.0,1504130000000.0,16,resign,black,5+10,a-00,1322,skinnerua,1261,d4 Nc6 e4 e5 f4 f6 dxe5 fxe5 fxe5 Nxe5 Qd4 Nc6...,B00,Nimzowitsch Defense: Kennedy Variation,4
2,mIICvQHh,True,1504130000000.0,1504130000000.0,61,mate,white,5+10,ischia,1496,a-00,1500,e4 e5 d3 d6 Be3 c6 Be2 b5 Nd2 a5 a4 c5 axb5 Nc...,C20,King's Pawn Game: Leonardis Variation,3
3,kWKvrqYL,True,1504110000000.0,1504110000000.0,61,mate,white,20+0,daniamurashov,1439,adivanov2009,1454,d4 d5 Nf3 Bf5 Nc3 Nf6 Bf4 Ng4 e3 Nc6 Be2 Qd7 O...,D02,Queen's Pawn Game: Zukertort Variation,3
4,9tXo1AUZ,True,1504030000000.0,1504030000000.0,95,mate,white,30+3,nik221107,1523,adivanov2009,1469,e4 e5 Nf3 d6 d4 Nc6 d5 Nb4 a3 Na6 Nc3 Be7 b4 N...,C41,Philidor Defense,5


In [5]:
# Re-read the raw games so this cell does not depend on earlier cell state
df = pd.read_csv("data/raw/lichess_games.csv")

# Make sure the destination folder exists before doing any work
output_dir = "data/preprocessed"
os.makedirs(output_dir, exist_ok=True)

# Collect one record per game: PGN text, per-move boards, opening labels
data = []
for _, row in df.iterrows():
    pgn = convert_to_pgn(row)
    matrices = get_game_matrices(pgn)
    # NumPy arrays are not JSON-serialisable, so turn each board into nested lists
    boards_as_lists = [board.tolist() for board in matrices]
    data.append(dict(
        pgn=pgn,
        matrices=json.dumps(boards_as_lists),
        opening_eco=row["opening_eco"],
        opening_ply=row["opening_ply"],
    ))

# Assemble the records into a table
processed_df = pd.DataFrame(data)

# Write the table next to the raw data, without the index column
output_path = os.path.join(output_dir, "lichess_games.csv")
processed_df.to_csv(output_path, index=False)

print("Preprocessed data saved successfully!")

Preprocessed data saved successfully!
