In [1]:
# Imports
from chess import WHITE, BLACK, PAWN, KNIGHT, BISHOP, ROOK, QUEEN, Board, \
    Color, Piece
from chess.pgn import read_game, GameNode
import pandas as pd
from pandas import DataFrame

In [3]:
# Setup empty dataframe
data_headers = [
    "White Pawns", "White Knights", "White Bishops", "White Rooks", "White Queens",
    "Black Pawns", "Black Knights", "Black Bishops", "Black Rooks", "Black Queens",
    "Result_W", "Result_B", "Result_S"]
data = pd.DataFrame(columns=data_headers, dtype=int)
data.head()

Unnamed: 0,White Pawns,White Knights,White Bishops,White Rooks,White Queens,Black Pawns,Black Knights,Black Bishops,Black Rooks,Black Queens,Result_W,Result_B,Result_S


In [16]:
# Logic Setup
colors = [WHITE, BLACK]
pieces = [PAWN, KNIGHT, BISHOP, ROOK, QUEEN]
piece_names = {
    PAWN: "Pawn",
    KNIGHT: "Knight",
    BISHOP: "Bishop",
    ROOK: "Rook",
    QUEEN: "Queen"
}

def is_stalemate(node: GameNode) -> bool:
    return node.game().headers["Result"] == "1/2-1/2"

def num_pieces(board: Board, color: Color, piece: int) -> int:
    return len(board.pieces(piece, color))

def column_name(color: Color, piece: int) -> str:
    c = "White" if color == WHITE else "Black"
    p = (piece_names[piece] + "s")
    return c + " " + p

def get_game_result(node: GameNode) -> str:
    return node.game().headers["Result"]

def load_game_into_df(df: DataFrame, game_node: GameNode) -> DataFrame:
    while not game_node.is_end():
        series = pd.Series(index=data_headers, dtype=int)
        
        board = game_node.board()
        for c in colors:
            for p in pieces:
                series[column_name(c, p)] = num_pieces(board, c, p)
        
        res = get_game_result(game_node)
        if res == "1-0":
            series["Result_W"] = 1
            series["Result_B"] = 0
            series["Result_S"] = 0
        elif res == "0-1":
            series["Result_W"] = 0
            series["Result_B"] = 1
            series["Result_S"] = 0
        elif res == "1/2-1/2":
            series["Result_W"] = 0
            series["Result_B"] = 0
            series["Result_S"] = 1
            
        df = pd.concat([df, pd.DataFrame(series).T], ignore_index=True).astype(int)
        
        game_node = game_node.next()
        
    return df

In [17]:
# Read the PGN DB
chess_db = open("data/AJ-OTB-PGN-001.pgn", encoding="utf-8")

# Set data limits
WINS = 700 # Number of wins for each side
STALEMATES = 700 # Number of stalemates

# Load data into dataframe
num_w_wins = 0
num_b_wins = 0
num_stalemates = 0
while num_w_wins < WINS or num_b_wins < WINS or num_stalemates < STALEMATES:
    game = read_game(chess_db)
    
    res = get_game_result(game)
    if res == "1-0":
        if num_w_wins > WINS:
            continue
        num_w_wins += 1
    elif res == "0-1":
        if num_b_wins > WINS:
            continue
        num_b_wins += 1
    elif res == "1/2-1/2":
        if num_stalemates > STALEMATES:
            continue
        num_stalemates += 1
    else:
        continue
        
    data = load_game_into_df(data, game)
data

Unnamed: 0,White Pawns,White Knights,White Bishops,White Rooks,White Queens,Black Pawns,Black Knights,Black Bishops,Black Rooks,Black Queens,Result_W,Result_B,Result_S
0,8,2,2,2,1,8,2,2,2,1,0,0,1
1,8,2,2,2,1,8,2,2,2,1,0,0,1
2,8,2,2,2,1,8,2,2,2,1,0,0,1
3,8,2,2,2,1,8,2,2,2,1,0,0,1
4,7,2,2,2,1,8,2,2,2,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
177334,6,1,1,1,0,7,1,1,0,1,0,1,0
177335,6,1,1,1,0,7,1,1,0,1,0,1,0
177336,6,1,1,1,0,7,1,1,0,1,0,1,0
177337,6,1,1,1,0,7,1,1,0,1,0,1,0


In [18]:
path = "data/chess_data_output-110524.csv"
data.to_csv(path, index=False)

In [3]:
path = "data/chess_data_output-110524.csv"
data = pd.read_csv(path)

In [14]:
# Checking all the game states
states = []
for i in range(10000):
    game = read_game(chess_db)
    r = game.headers["Result"]
    
    if not r in states:
        states.append(r)
    if r == "*":
        print(game.headers)
states

Headers(Event='Serbian Championship 2023', Site='Senta', Date='2023.06.12', Round='7.4', White='Ljepic, Andrej', Black='Indjic, Aleksandar', Result='*', ECO='A87', WhiteElo='2207', BlackElo='2613', PlyCount='45', EventDate='2023.06.04', EventType='tourn', EventCountry='SRB', SourceTitle='playchess.com', Source='ChessBase', SourceQuality='1')
Headers(Event='Serbian Championship 2023', Site='Senta', Date='2023.06.12', Round='7.5', White='Markus, Robert', Black='Lajthajm, Borko', Result='*', ECO='B51', WhiteElo='2582', BlackElo='2341', PlyCount='52', EventDate='2023.06.04', EventType='tourn', EventCountry='SRB', SourceTitle='playchess.com', Source='ChessBase', SourceQuality='1')
Headers(Event='VII SESC Caioba Open 2023', Site='Matinhos', Date='2023.06.19', Round='1.8', White='Luz, Jhonatan Rodrigues D', Black='Santiago, Yago De Moura', Result='*', ECO='D02', WhiteElo='1775', BlackElo='2481', PlyCount='111', EventDate='2023.06.17', EventType='swiss', EventCountry='BRA', SourceTitle='playch

['0-1', '1/2-1/2', '1-0', '*']