# With one-hot encoding

In [None]:
import chess
import chess.pgn
import io
import pandas as pd
import numpy as np

def pgn_to_dataframe(pgn_string):
    """Convert one PGN game into a one-hot encoded DataFrame of board states.

    Each row is the position after one half-move (ply) of the main line.
    Every square contributes 12 indicator columns named ``<square>_<piece>``
    (for example ``e2_wp``), in the order wp, wn, wb, wr, wq, wk, bp, bn, bb,
    br, bq, bk. The PGN headers are repeated on every row and ``move_number``
    counts plies starting at 1.

    Args:
        pgn_string: Text containing one PGN game.

    Returns:
        A pandas DataFrame with 64 * 12 board columns, one column per PGN
        header and a ``move_number`` column.

    Raises:
        ValueError: If no game can be read from ``pgn_string``.
    """
    piece_codes = ["wp", "wn", "wb", "wr", "wq", "wk",
                   "bp", "bn", "bb", "br", "bq", "bk"]

    game = chess.pgn.read_game(io.StringIO(pgn_string))
    if game is None:
        # read_game returns None when the input holds no game at all.
        raise ValueError("pgn_string does not contain a PGN game")

    metadata = dict(game.headers)

    # Start from the position described by the headers (FEN/SetUp) instead of
    # always assuming the standard initial position.
    board = game.board()
    states = []

    for move in game.mainline_moves():
        board.push(move)
        row = [0] * (len(chess.SQUARES) * len(piece_codes))
        for square in chess.SQUARES:
            piece = board.piece_at(square)
            if piece is None:
                continue  # Empty square: all 12 indicators stay 0
            # chess.WHITE is True, so int(piece.color) must not be used as the
            # block index: white pieces belong to the first six columns.
            color_offset = 0 if piece.color == chess.WHITE else 6
            row[square * len(piece_codes) + color_offset + piece.piece_type - 1] = 1
        states.append(row)

    columns = [
        f"{chess.square_name(square)}_{code}"
        for square in chess.SQUARES
        for code in piece_codes
    ]

    df = pd.DataFrame(states, columns=columns)

    for key, value in metadata.items():
        df[key] = value

    df["move_number"] = range(1, len(df) + 1)

    return df

# Sample input: a single game (Donald Byrne vs. Robert James Fischer, 1956,
# per the headers below) given as PGN text with headers and movetext.
pgn_string = """
[Event "Third Rosenwald Trophy"]
[Site "New York, NY USA"]
[Date "1956.10.17"]
[EventDate "1956.10.07"]
[Round "8"]
[Result "0-1"]
[White "Donald Byrne"]
[Black "Robert James Fischer"]
[ECO "D92"]
[WhiteElo "?"]
[BlackElo "?"]
[PlyCount "82"]

1. Nf3 Nf6 2. c4 g6 3. Nc3 Bg7 4. d4 O-O 5. Bf4 d5 6. Qb3 dxc4 7. Qxc4 c6 8. e4 Nbd7 9. Rd1 Nb6 10. Qc5 Bg4 11. Bg5 Na4 12. Qa3 Nxc3 13. bxc3 Nxe4 14. Bxe7 Qb6 15. Bc4 Nxc3 16. Bc5 Rfe8+ 17. Kf1 Be6 18. Bxb6 Bxc4+ 19. Kg1 Ne2+ 20. Kf1 Nxd4+ 21. Kg1 Ne2+ 22. Kf1 Nc3+ 23. Kg1 axb6 24. Qb4 Ra4 25. Qxb6 Nxd1 26. h3 Rxa2 27. Kh2 Nxf2 28. Re1 Rxe1 29. Qd8+ Bf8 30. Nxe1 Bd5 31. Nf3 Ne4 32. Qb8 b5 33. h4 h5 34. Ne5 Kg7 35. Kg1 Bc5+ 36. Kf1 Ng3+ 37. Ke1 Bb4+ 38. Kd1 Bb3+ 39. Kc1 Ne2+ 40. Kb1 Nc3+ 41. Kc1 Rc2# 0-1
"""

# Build the one-hot table (one row per half-move), preview the first rows,
# then write the full table to game.csv without the index column.
df = pgn_to_dataframe(pgn_string)
print(df.head())
df.to_csv("game.csv", index=False)


# Without one-hot encoding
## using capital versus lower case to differentiate white from black
P = white pawn
p = black pawn

In [None]:
import chess
import chess.pgn
import io
import pandas as pd

def pgn_to_dataframe(pgn_string):
    """Turn one PGN game into a DataFrame of per-ply board snapshots.

    Every row is the position after one half-move of the main line, with one
    column per square (``a1`` ... ``h8``). Occupied squares hold the piece
    letter, upper case for white and lower case for black; empty squares hold
    the string ``'none'``. The PGN headers are copied onto every row and
    ``move_number`` counts the rows starting at 1.

    Args:
        pgn_string: Text containing one PGN game.

    Returns:
        A pandas DataFrame with 64 square columns, one column per PGN header
        and a ``move_number`` column.
    """
    game = chess.pgn.read_game(io.StringIO(pgn_string))
    header_values = dict(game.headers)

    def symbol_at(position, square):
        # Text stored for a single square of the given position.
        occupant = position.piece_at(square)
        if occupant is None:
            return 'none'
        letter = occupant.symbol()
        return letter.upper() if occupant.color == chess.WHITE else letter.lower()

    position = chess.Board()
    snapshots = []
    for ply in game.mainline_moves():
        position.push(ply)
        snapshots.append([symbol_at(position, square) for square in chess.SQUARES])

    square_names = [chess.square_name(square) for square in chess.SQUARES]
    frame = pd.DataFrame(snapshots, columns=square_names)

    # Broadcast each header value down its own column, then number the rows.
    for header, text in header_values.items():
        frame[header] = text
    frame["move_number"] = range(1, len(frame) + 1)

    return frame

# Sample input: a single game (Donald Byrne vs. Robert James Fischer, 1956,
# per the headers below). The movetext follows the headers directly.
pgn_string = """
[Event "Third Rosenwald Trophy"]
[Site "New York, NY USA"]
[Date "1956.10.17"]
[EventDate "1956.10.07"]
[Round "8"]
[Result "0-1"]
[White "Donald Byrne"]
[Black "Robert James Fischer"]
[ECO "D92"]
[WhiteElo "?"]
[BlackElo "?"]
[PlyCount "82"]
1. Nf3 Nf6 2. c4 g6 3. Nc3 Bg7 4. d4 O-O 5. Bf4 d5 6. Qb3 dxc4 7. Qxc4 c6 8. e4 Nbd7 9. Rd1 Nb6 10. Qc5 Bg4 11. Bg5 Na4 12. Qa3 Nxc3 13. bxc3 Nxe4 14. Bxe7 Qb6 15. Bc4 Nxc3 16. Bc5 Rfe8+ 17. Kf1 Be6 18. Bxb6 Bxc4+ 19. Kg1 Ne2+ 20. Kf1 Nxd4+ 21. Kg1 Ne2+ 22. Kf1 Nc3+ 23. Kg1 axb6 24. Qb4 Ra4 25. Qxb6 Nxd1 26. h3 Rxa2 27. Kh2 Nxf2 28. Re1 Rxe1 29. Qd8+ Bf8 30. Nxe1 Bd5 31. Nf3 Ne4 32. Qb8 b5 33. h4 h5 34. Ne5 Kg7 35. Kg1 Bc5+ 36. Kf1 Ng3+ 37. Ke1 Bb4+ 38. Kd1 Bb3+ 39. Kc1 Ne2+ 40. Kb1 Nc3+ 41. Kc1 Rc2# 0-1
"""

# Build the per-ply table: one row per half-move, one column per square
df = pgn_to_dataframe(pgn_string)

# Preview the first rows of the table
print(df.head())

# Write the full table to game.csv, leaving out the index column
df.to_csv("game.csv", index=False)


# Without one-hot encoding
## using prefix "w" and prefix "b" to differentiate white from black
wp = white pawn
bp = black pawn

In [None]:
import chess
import chess.pgn
import io
import pandas as pd

def pgn_to_dataframe(pgn_string):
    """Turn one PGN game into a DataFrame of per-ply board snapshots.

    Every row is the position after one half-move of the main line, with one
    column per square (``a1`` ... ``h8``). Occupied squares hold a two-letter
    code: ``w`` or ``b`` for the colour followed by the piece letter
    (``wp`` = white pawn, ``bk`` = black king). Empty squares hold the string
    ``'none'``. The PGN headers are copied onto every row and ``move_number``
    counts the rows starting at 1.

    Args:
        pgn_string: Text containing one PGN game.

    Returns:
        A pandas DataFrame with 64 square columns, one column per PGN header
        and a ``move_number`` column.
    """
    # (piece type, colour) -> two-letter code, white codes first.
    piece_kinds = (chess.PAWN, chess.KNIGHT, chess.BISHOP,
                   chess.ROOK, chess.QUEEN, chess.KING)
    codes = {
        (kind, side): prefix + letter
        for side, prefix in ((chess.WHITE, 'w'), (chess.BLACK, 'b'))
        for kind, letter in zip(piece_kinds, "pnbrqk")
    }

    game = chess.pgn.read_game(io.StringIO(pgn_string))
    header_values = dict(game.headers)

    position = chess.Board()
    snapshots = []
    for ply in game.mainline_moves():
        position.push(ply)
        occupants = (position.piece_at(square) for square in chess.SQUARES)
        snapshots.append([
            'none' if occupant is None else codes[(occupant.piece_type, occupant.color)]
            for occupant in occupants
        ])

    frame = pd.DataFrame(
        snapshots,
        columns=[chess.square_name(square) for square in chess.SQUARES],
    )

    # Broadcast each header value down its own column, then number the rows.
    for header, text in header_values.items():
        frame[header] = text
    frame["move_number"] = range(1, len(frame) + 1)

    return frame

# Sample input: a single game (Donald Byrne vs. Robert James Fischer, 1956,
# per the headers below). The movetext follows the headers directly.
pgn_string = """
[Event "Third Rosenwald Trophy"]
[Site "New York, NY USA"]
[Date "1956.10.17"]
[EventDate "1956.10.07"]
[Round "8"]
[Result "0-1"]
[White "Donald Byrne"]
[Black "Robert James Fischer"]
[ECO "D92"]
[WhiteElo "?"]
[BlackElo "?"]
[PlyCount "82"]
1. Nf3 Nf6 2. c4 g6 3. Nc3 Bg7 4. d4 O-O 5. Bf4 d5 6. Qb3 dxc4 7. Qxc4 c6 8. e4 Nbd7 9. Rd1 Nb6 10. Qc5 Bg4 11. Bg5 Na4 12. Qa3 Nxc3 13. bxc3 Nxe4 14. Bxe7 Qb6 15. Bc4 Nxc3 16. Bc5 Rfe8+ 17. Kf1 Be6 18. Bxb6 Bxc4+ 19. Kg1 Ne2+ 20. Kf1 Nxd4+ 21. Kg1 Ne2+ 22. Kf1 Nc3+ 23. Kg1 axb6 24. Qb4 Ra4 25. Qxb6 Nxd1 26. h3 Rxa2 27. Kh2 Nxf2 28. Re1 Rxe1 29. Qd8+ Bf8 30. Nxe1 Bd5 31. Nf3 Ne4 32. Qb8 b5 33. h4 h5 34. Ne5 Kg7 35. Kg1 Bc5+ 36. Kf1 Ng3+ 37. Ke1 Bb4+ 38. Kd1 Bb3+ 39. Kc1 Ne2+ 40. Kb1 Nc3+ 41. Kc1 Rc2# 0-1
"""

# Build the per-ply table: one row per half-move, one column per square
df = pgn_to_dataframe(pgn_string)

# Preview the first rows of the table
print(df.head())

# Write the full table to game.csv, leaving out the index column
df.to_csv("game.csv", index=False)


# For creating a dataset of chess openings, and encoding important additional metadata!
Assumes that there is a ./chess-openings folder containing the data from https://github.com/lichess-org/chess-openings

Combines the ./chess-openings files a.tsv, b.tsv, c.tsv, d.tsv, and e.tsv into one large CSV file containing the board state before the first move and after every move of each opening in all five files.

Additionally encodes the move that led to each board state (the last move played, which is usually highlighted when playing chess online).

In [None]:
import pandas as pd
import io
import chess
import chess.pgn

def pgn_to_dataframe(pgn_string, eco, name, opening_type, game_number):
    """Expand one opening line into a DataFrame of board snapshots.

    The first row is the standard starting position; each following row is
    the position after one half-move of the main line. Squares hold two-letter
    piece codes (``wp`` ... ``bk``) or an empty string. ``last_move``,
    ``from_square`` and ``to_square`` describe the move that produced the
    row; on the first row all three hold the string ``'None'``.

    Args:
        pgn_string: PGN text of the opening line.
        eco: Value stored in the ``ECO`` column (replaces any ECO header).
        name: Value stored in the ``opening_name`` column.
        opening_type: Value stored in the ``opening_type`` column.
        game_number: Value stored in the ``game_number`` column of every row.

    Returns:
        A pandas DataFrame with the square columns, the three move columns,
        the PGN headers plus the extra metadata, ``move_number`` (row count
        starting at 1, so the starting position is 1) and ``game_number``.
    """
    game = chess.pgn.read_game(io.StringIO(pgn_string))

    # PGN headers first, then the caller-supplied opening metadata.
    row_metadata = dict(game.headers)
    row_metadata.update({"ECO": eco, "opening_name": name, "opening_type": opening_type})

    codes = {
        (chess.PAWN, chess.WHITE): 'wp', (chess.KNIGHT, chess.WHITE): 'wn',
        (chess.BISHOP, chess.WHITE): 'wb', (chess.ROOK, chess.WHITE): 'wr',
        (chess.QUEEN, chess.WHITE): 'wq', (chess.KING, chess.WHITE): 'wk',
        (chess.PAWN, chess.BLACK): 'bp', (chess.KNIGHT, chess.BLACK): 'bn',
        (chess.BISHOP, chess.BLACK): 'bb', (chess.ROOK, chess.BLACK): 'br',
        (chess.QUEEN, chess.BLACK): 'bq', (chess.KING, chess.BLACK): 'bk',
    }

    def snapshot(position, san='None', origin='None', target='None'):
        # One output row: 64 square codes followed by the move description.
        cells = []
        for square in chess.SQUARES:
            occupant = position.piece_at(square)
            cells.append('' if occupant is None else codes[(occupant.piece_type, occupant.color)])
        return cells + [san, origin, target]

    position = chess.Board()
    rows = [snapshot(position)]
    for ply in game.mainline_moves():
        san = position.san(ply)  # SAN depends on the position before the move
        position.push(ply)
        rows.append(snapshot(
            position,
            san,
            chess.square_name(ply.from_square),
            chess.square_name(ply.to_square),
        ))

    square_names = [chess.square_name(square) for square in chess.SQUARES]
    frame = pd.DataFrame(rows, columns=square_names + ['last_move', 'from_square', 'to_square'])

    for field, text in row_metadata.items():
        frame[field] = text
    frame["move_number"] = range(1, len(frame) + 1)
    frame["game_number"] = game_number

    return frame



def process_openings(files, openings_dir="./chess-openings", output_path="all_openings.csv"):
    """Expand every opening in the given TSV files into one combined CSV.

    Each TSV file is read with tab separators and must provide the columns
    ``pgn``, ``eco`` and ``name``. Every row is converted with
    ``pgn_to_dataframe``; the file name without ``.tsv`` is passed as the
    opening type, and openings are numbered consecutively across all files
    starting at 1.

    Args:
        files: File names (for example ``"a.tsv"``) located in ``openings_dir``.
        openings_dir: Folder containing the TSV files.
        output_path: Destination CSV file; written without the index column.

    Raises:
        ValueError: If the files contain no openings, so there is nothing to
            write.
    """
    dfs = []
    game_number = 0
    for file in files:
        openings = pd.read_csv(f"{openings_dir}/{file}", sep='\t')
        opening_type = file.replace(".tsv", "")
        for _, row in openings.iterrows():
            game_number += 1
            dfs.append(pgn_to_dataframe(row['pgn'], row['eco'], row['name'], opening_type, game_number))

    if not dfs:
        # pd.concat would fail with an opaque "No objects to concatenate".
        raise ValueError("No openings found in the given files; nothing to write")

    final_df = pd.concat(dfs, ignore_index=True)
    final_df.to_csv(output_path, index=False)

# Opening tables to process; process_openings reads each of them from the
# ./chess-openings folder
files = ["a.tsv", "b.tsv", "c.tsv", "d.tsv", "e.tsv"]

# Expand every opening in these files and write the combined result to
# all_openings.csv
process_openings(files)


# Turn one large .pgn of multiple PGN games into a CSV

In [None]:
import pandas as pd
import io
import chess
import chess.pgn

def parse_pgn(pgn_text):
    """Yield each game contained in a multi-game PGN string, in order.

    Games are read lazily: the next game is parsed only when the generator is
    advanced. Iteration ends when ``chess.pgn.read_game`` returns ``None``.
    """
    stream = io.StringIO(pgn_text)
    game = chess.pgn.read_game(stream)
    while game is not None:
        yield game
        game = chess.pgn.read_game(stream)

def capture_board_state(board, piece_map, last_move_san='None', move_from='None', move_to='None'):
    """Flatten a board into one row of square codes plus the last move.

    Args:
        board: Position to read; queried with ``piece_at`` for every square
            in ``chess.SQUARES``.
        piece_map: Mapping from ``(piece_type, color)`` to the code stored
            for an occupied square.
        last_move_san: Notation of the move that led to this position.
        move_from: Name of the square the move started on.
        move_to: Name of the square the move ended on.

    Returns:
        A list with one entry per square (empty string for empty squares)
        followed by ``last_move_san``, ``move_from`` and ``move_to``.
    """
    occupants = (board.piece_at(square) for square in chess.SQUARES)
    row = [
        '' if occupant is None else piece_map[(occupant.piece_type, occupant.color)]
        for occupant in occupants
    ]
    return row + [last_move_san, move_from, move_to]

def game_to_dataframe(game, game_number):
    """Convert a single game into a DataFrame with board states and metadata.

    The first row is the game's starting position; each following row is the
    position after one half-move of the main line. Squares hold two-letter
    piece codes (``wp`` ... ``bk``) or an empty string, and ``last_move``,
    ``from_square`` and ``to_square`` describe the move that produced the
    row (the string ``'None'`` on the first row).

    Args:
        game: Parsed game as returned by ``chess.pgn.read_game``.
        game_number: Value stored in the ``game_number`` column of every row.

    Returns:
        A pandas DataFrame with the square columns, the three move columns,
        one column per PGN header, ``move_number`` (row count starting at 1,
        so the starting position is 1) and ``game_number``.
    """
    # Define a mapping from piece types and colors to strings
    piece_map = {
        (chess.PAWN, chess.WHITE): 'wp',
        (chess.KNIGHT, chess.WHITE): 'wn',
        (chess.BISHOP, chess.WHITE): 'wb',
        (chess.ROOK, chess.WHITE): 'wr',
        (chess.QUEEN, chess.WHITE): 'wq',
        (chess.KING, chess.WHITE): 'wk',
        (chess.PAWN, chess.BLACK): 'bp',
        (chess.KNIGHT, chess.BLACK): 'bn',
        (chess.BISHOP, chess.BLACK): 'bb',
        (chess.ROOK, chess.BLACK): 'br',
        (chess.QUEEN, chess.BLACK): 'bq',
        (chess.KING, chess.BLACK): 'bk',
    }

    metadata = dict(game.headers)

    # game.board() honours FEN/SetUp headers, so games that do not start from
    # the standard position are replayed from their real starting position.
    board = game.board()
    states = [capture_board_state(board, piece_map)]

    for move in game.mainline_moves():
        # SAN has to be computed before the move is pushed onto the board.
        last_move_san = board.san(move)
        move_from = chess.square_name(move.from_square)
        move_to = chess.square_name(move.to_square)
        board.push(move)
        states.append(capture_board_state(board, piece_map, last_move_san, move_from, move_to))

    columns = [chess.square_name(square) for square in chess.SQUARES] + ['last_move', 'from_square', 'to_square']
    df = pd.DataFrame(states, columns=columns)

    for key, value in metadata.items():
        df[key] = value
    df["move_number"] = range(1, len(df) + 1)
    df["game_number"] = game_number

    return df

def process_pgn_file(file_path, output_path="h0lter-white-rated-rapid-daily.csv"):
    """Read a PGN file and write all of its games to a single CSV file.

    Games are numbered consecutively starting at 1 in the order they appear
    in the file, converted with ``game_to_dataframe`` and concatenated.

    Args:
        file_path: Path of the PGN file to read (decoded as UTF-8).
        output_path: Destination CSV file; written without the index column.

    Raises:
        ValueError: If the file contains no games.
    """
    # Explicit encoding: the platform default is not UTF-8 everywhere.
    with open(file_path, 'r', encoding="utf-8") as file:
        pgn_text = file.read()

    dfs = [
        game_to_dataframe(game, game_number)
        for game_number, game in enumerate(parse_pgn(pgn_text), start=1)
    ]
    if not dfs:
        # pd.concat would fail with an opaque "No objects to concatenate".
        raise ValueError(f"No games found in {file_path!r}")

    final_df = pd.concat(dfs, ignore_index=True)
    final_df.to_csv(output_path, index=False)

# Convert the games in h0lter-white-rated-rapid-daily.pgn (read from the
# current working directory) and write them to h0lter-white-rated-rapid-daily.csv
process_pgn_file("h0lter-white-rated-rapid-daily.pgn")


# Turn one large .pgn of multiple PGN games into a CSV
## and also encode move frequency and win rate

In [3]:
import pandas as pd
import io
import chess
import chess.pgn

def capture_board_state(board, piece_map, last_move_san='None', move_from='None', move_to='None'):
    """Flatten a board into one row of square codes plus the last move.

    Args:
        board: Position to read; queried with ``piece_at`` for every square
            in ``chess.SQUARES``.
        piece_map: Mapping from ``(piece_type, color)`` to the code stored
            for an occupied square.
        last_move_san: Notation of the move that led to this position.
        move_from: Name of the square the move started on.
        move_to: Name of the square the move ended on.

    Returns:
        A list with one entry per square (empty string for empty squares)
        followed by ``last_move_san``, ``move_from`` and ``move_to``.
    """
    def code_for(square):
        # Code stored for one square of the board being captured.
        occupant = board.piece_at(square)
        if occupant is None:
            return ''
        return piece_map[(occupant.piece_type, occupant.color)]

    row = [code_for(square) for square in chess.SQUARES]
    row += [last_move_san, move_from, move_to]
    return row

def collect_statistics(pgn_text):
    """Collect statistics for every move sequence seen across all games.

    A move sequence is the space-separated SAN notation of all moves played
    from the start of a game up to and including the current move. For each
    sequence the result holds:

    * ``frequency``: number of games that played exactly this sequence.
    * ``parent_frequency``: frequency of the sequence without its last move,
      or the total number of games for a first move.
    * ``white_wins`` / ``black_wins`` / ``draws``: games with the Result
      header ``1-0`` / ``0-1`` / ``1/2-1/2``; other results are not counted.
    * ``relative_frequency``: ``frequency / parent_frequency``.
    * ``white_win_rate`` / ``black_win_rate`` / ``draw_rate``: the result
      counters divided by ``frequency``.

    Args:
        pgn_text: Text containing any number of PGN games.

    Returns:
        A tuple ``(stats, total_games)`` where ``stats`` maps each move
        sequence to its statistics dict and ``total_games`` is the number of
        games read.
    """
    result_counters = {"1-0": 'white_wins', "0-1": 'black_wins', "1/2-1/2": 'draws'}

    pgn_io = io.StringIO(pgn_text)
    stats = {}
    total_games = 0
    while True:
        game = chess.pgn.read_game(pgn_io)
        if game is None:
            break
        total_games += 1
        counter = result_counters.get(game.headers.get("Result", "*"))

        # game.board() honours FEN/SetUp headers of games that do not start
        # from the standard position.
        board = game.board()
        move_sequence = ""
        for move in game.mainline_moves():
            move_san = board.san(move)  # SAN must be taken before the push
            board.push(move)
            move_sequence = f"{move_sequence} {move_san}" if move_sequence else move_san

            entry = stats.setdefault(
                move_sequence,
                {'frequency': 0, 'parent_frequency': 0, 'white_wins': 0, 'black_wins': 0, 'draws': 0},
            )
            entry['frequency'] += 1
            if counter is not None:
                entry[counter] += 1

    # Parent frequencies are only final once every game has been counted;
    # reading them during the scan would freeze a stale, too-small value
    # whenever a later game passes through the parent sequence.
    for move_sequence, entry in stats.items():
        # SAN moves contain no spaces, so the parent is everything before
        # the last space (empty for a first move).
        parent_sequence = move_sequence.rpartition(" ")[0]
        parent_freq = stats[parent_sequence]['frequency'] if parent_sequence else total_games
        freq = entry['frequency']
        entry['parent_frequency'] = parent_freq
        entry['relative_frequency'] = freq / parent_freq if parent_freq > 0 else 0
        entry['white_win_rate'] = entry['white_wins'] / freq if freq > 0 else 0
        entry['black_win_rate'] = entry['black_wins'] / freq if freq > 0 else 0
        entry['draw_rate'] = entry['draws'] / freq if freq > 0 else 0
    return stats, total_games

def parse_pgn_and_create_dataframe(pgn_text, stats):
    """Convert every game in a PGN string and stack the results.

    Games are read one after another until ``chess.pgn.read_game`` returns
    ``None``. Each game is converted with ``game_to_dataframe`` and receives
    a consecutive game number starting at 1.

    Args:
        pgn_text: Text containing the PGN games.
        stats: Pre-collected move statistics, passed through to
            ``game_to_dataframe``.

    Returns:
        One pandas DataFrame holding the rows of all games, re-indexed from 0.
    """
    stream = io.StringIO(pgn_text)
    frames = []
    game = chess.pgn.read_game(stream)
    while game is not None:
        # The next game number is always one more than the games converted so far.
        frames.append(game_to_dataframe(game, len(frames) + 1, stats))
        game = chess.pgn.read_game(stream)
    return pd.concat(frames, ignore_index=True)

# Opening tables already loaded, keyed by lower-case ECO letter, so each TSV
# file is read once instead of once per game.
_OPENINGS_BY_LETTER = {}


def get_opening_from_ECO(eco):
    """Look up the opening name for an ECO code in the chess-openings tables.

    The first character of the code selects the table
    ``./chess-openings/<letter>.tsv``. The letter is lower-cased because the
    tables are named ``a.tsv`` ... ``e.tsv``, which matters on case-sensitive
    filesystems. The table must provide the columns ``eco`` and ``name``.

    Args:
        eco: ECO code such as ``"D92"``; may be empty.

    Returns:
        The ``name`` of the first row whose ``eco`` equals the code, or
        ``"Unknown"`` if the code is empty, does not start with a letter, or
        has no matching row.

    Raises:
        FileNotFoundError: If the table for the code's letter does not exist.
    """
    if not eco or not eco[0].isalpha():
        # Missing or placeholder ECO header (for example "" or "?").
        return "Unknown"

    letter = eco[0].lower()
    if letter not in _OPENINGS_BY_LETTER:
        _OPENINGS_BY_LETTER[letter] = pd.read_csv(f"./chess-openings/{letter}.tsv", sep='\t')
    openings = _OPENINGS_BY_LETTER[letter]

    matches = openings.loc[openings['eco'] == eco, 'name']
    return matches.iloc[0] if not matches.empty else "Unknown"

def game_to_dataframe(game, game_number, stats):
    """Convert one game into a DataFrame of board states with move statistics.

    Each row is the position after one half-move of the main line. Squares
    hold two-letter piece codes (``wp`` ... ``bk``) or an empty string;
    ``last_move``, ``from_square`` and ``to_square`` describe the move that
    produced the row. The statistics columns are looked up in ``stats`` by
    the space-separated SAN sequence of all moves played so far and are 0
    when the sequence (or an individual value) is missing.

    Note that ``current_fen`` is the FEN of the position *before* the row's
    move was played, while the square columns show the position after it.

    Args:
        game: Parsed game as returned by ``chess.pgn.read_game``.
        game_number: Value stored in the ``game_number`` column of every row.
        stats: Mapping from move sequence to its statistics dict, as
            produced by ``collect_statistics``.

    Returns:
        A pandas DataFrame with the square, move, statistics and
        ``current_fen`` columns, one column per PGN header, ``move_number``
        (ply count starting at 1), ``game_number``, ``id`` (row index within
        this game) and ``opening_name``.
    """
    piece_map = {
        (chess.PAWN, chess.WHITE): 'wp', (chess.KNIGHT, chess.WHITE): 'wn',
        (chess.BISHOP, chess.WHITE): 'wb', (chess.ROOK, chess.WHITE): 'wr',
        (chess.QUEEN, chess.WHITE): 'wq', (chess.KING, chess.WHITE): 'wk',
        (chess.PAWN, chess.BLACK): 'bp', (chess.KNIGHT, chess.BLACK): 'bn',
        (chess.BISHOP, chess.BLACK): 'bb', (chess.ROOK, chess.BLACK): 'br',
        (chess.QUEEN, chess.BLACK): 'bq', (chess.KING, chess.BLACK): 'bk',
    }
    stat_columns = [
        'frequency', 'white_wins', 'black_wins', 'draws',
        'relative_frequency', 'white_win_rate', 'black_win_rate', 'draw_rate',
    ]
    # Fallback for unknown sequences; it must contain every statistic that is
    # read below, otherwise the lookup raises KeyError.
    empty_stats = dict.fromkeys(stat_columns, 0)

    metadata = dict(game.headers)
    eco = game.headers.get("ECO", "")

    # game.board() honours FEN/SetUp headers of games that do not start from
    # the standard position.
    board = game.board()
    states = []
    move_sequence = ""
    columns = (
        [chess.square_name(square) for square in chess.SQUARES]
        + ['last_move', 'from_square', 'to_square']
        + stat_columns
        + ['current_fen']
    )
    for move in game.mainline_moves():
        move_san = board.san(move)  # SAN must be taken before the push
        move_from = chess.square_name(move.from_square)
        move_to = chess.square_name(move.to_square)
        current_fen = board.fen()  # position before the move is played
        board.push(move)
        move_sequence = f"{move_sequence} {move_san}" if move_sequence else move_san
        state_stats = stats.get(move_sequence, empty_stats)
        states.append(
            capture_board_state(board, piece_map, move_san, move_from, move_to)
            + [state_stats.get(column, 0) for column in stat_columns]
            + [current_fen]
        )
    df = pd.DataFrame(states, columns=columns)
    for key, value in metadata.items():
        df[key] = value
    df["move_number"] = range(1, len(df) + 1)
    df["game_number"] = game_number
    df["id"] = df.index
    df["opening_name"] = get_opening_from_ECO(eco)

    return df

def process_pgn_file(file_path, output_path="h0lter-white-rated-rapid-daily.csv"):
    """Read a PGN file and write its games, with move statistics, to a CSV.

    The file is scanned twice: first ``collect_statistics`` gathers the
    frequencies and results of every move sequence, then
    ``parse_pgn_and_create_dataframe`` builds the board-state rows using
    those statistics.

    Args:
        file_path: Path of the PGN file to read (decoded as UTF-8).
        output_path: Destination CSV file; written without the index column.
    """
    # Explicit encoding: the platform default is not UTF-8 everywhere.
    with open(file_path, 'r', encoding="utf-8") as file:
        pgn_text = file.read()
    # The total game count returned alongside the statistics is not needed here.
    stats, _ = collect_statistics(pgn_text)
    final_df = parse_pgn_and_create_dataframe(pgn_text, stats)
    final_df.to_csv(output_path, index=False)

# Convert the games in h0lter-white-rated-rapid-daily.pgn (read from the
# current working directory) and write them, with move statistics, to
# h0lter-white-rated-rapid-daily.csv
process_pgn_file("h0lter-white-rated-rapid-daily.pgn")
