# With one-hot encoding

In [1]:
import chess
import chess.pgn
import io
import pandas as pd
import numpy as np

def pgn_to_dataframe(pgn_string):
    pgn = chess.pgn.read_game(io.StringIO(pgn_string))
    
    metadata = {}
    for key in pgn.headers:
        metadata[key] = pgn.headers[key]
    
    board = chess.Board()
    states = []
    
    for move in pgn.mainline_moves():
        board.push(move)
        square_states = []
        for square in chess.SQUARES:
            piece = board.piece_at(square)
            if piece is None:
                square_states.extend([0] * 12)  # Empty square
            else:
                piece_type = piece.piece_type
                color = int(piece.color)
                square_states.extend([1 if i == piece_type - 1 + color * 6 else 0 for i in range(12)])
        states.append(square_states)
    
    columns = []
    for square in chess.SQUARES:
        columns.extend([f"{chess.square_name(square)}_{piece}" for piece in ["wp", "wn", "wb", "wr", "wq", "wk", "bp", "bn", "bb", "br", "bq", "bk"]])
    
    df = pd.DataFrame(states, columns=columns)
    
    for key, value in metadata.items():
        df[key] = value
    
    df["move_number"] = range(1, len(df) + 1)
    
    return df

pgn_string = """
[Event "Third Rosenwald Trophy"]
[Site "New York, NY USA"]
[Date "1956.10.17"]
[EventDate "1956.10.07"]
[Round "8"]
[Result "0-1"]
[White "Donald Byrne"]
[Black "Robert James Fischer"]
[ECO "D92"]
[WhiteElo "?"]
[BlackElo "?"]
[PlyCount "82"]

1. Nf3 Nf6 2. c4 g6 3. Nc3 Bg7 4. d4 O-O 5. Bf4 d5 6. Qb3 dxc4 7. Qxc4 c6 8. e4 Nbd7 9. Rd1 Nb6 10. Qc5 Bg4 11. Bg5 Na4 12. Qa3 Nxc3 13. bxc3 Nxe4 14. Bxe7 Qb6 15. Bc4 Nxc3 16. Bc5 Rfe8+ 17. Kf1 Be6 18. Bxb6 Bxc4+ 19. Kg1 Ne2+ 20. Kf1 Nxd4+ 21. Kg1 Ne2+ 22. Kf1 Nc3+ 23. Kg1 axb6 24. Qb4 Ra4 25. Qxb6 Nxd1 26. h3 Rxa2 27. Kh2 Nxf2 28. Re1 Rxe1 29. Qd8+ Bf8 30. Nxe1 Bd5 31. Nf3 Ne4 32. Qb8 b5 33. h4 h5 34. Ne5 Kg7 35. Kg1 Bc5+ 36. Kf1 Ng3+ 37. Ke1 Bb4+ 38. Kd1 Bb3+ 39. Kc1 Ne2+ 40. Kb1 Nc3+ 41. Kc1 Rc2# 0-1
"""

df = pgn_to_dataframe(pgn_string)
print(df.head())
df.to_csv("game.csv", index=False)


   a1_wp  a1_wn  a1_wb  a1_wr  a1_wq  a1_wk  a1_bp  a1_bn  a1_bb  a1_br  ...  \
0      0      0      0      0      0      0      0      0      0      1  ...   
1      0      0      0      0      0      0      0      0      0      1  ...   
2      0      0      0      0      0      0      0      0      0      1  ...   
3      0      0      0      0      0      0      0      0      0      1  ...   
4      0      0      0      0      0      0      0      0      0      1  ...   

   Round         White                 Black  Result   EventDate  ECO  \
0      8  Donald Byrne  Robert James Fischer     0-1  1956.10.07  D92   
1      8  Donald Byrne  Robert James Fischer     0-1  1956.10.07  D92   
2      8  Donald Byrne  Robert James Fischer     0-1  1956.10.07  D92   
3      8  Donald Byrne  Robert James Fischer     0-1  1956.10.07  D92   
4      8  Donald Byrne  Robert James Fischer     0-1  1956.10.07  D92   

   WhiteElo  BlackElo  PlyCount  move_number  
0         ?         ?        82  

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


# Without one-hot encoding
## using capital versus lower case to differentiate white from black
P = white pawn
p = black pawn

In [2]:
import chess
import chess.pgn
import io
import pandas as pd

def pgn_to_dataframe(pgn_string):
    pgn = chess.pgn.read_game(io.StringIO(pgn_string))
    
    # Extract metadata from the PGN headers
    metadata = {key: pgn.headers[key] for key in pgn.headers}
    
    board = chess.Board()
    states = []
    
    # Iterate through the moves of the game
    for move in pgn.mainline_moves():
        board.push(move)
        square_states = []
        
        # Iterate through all squares on the board
        for square in chess.SQUARES:
            piece = board.piece_at(square)
            if piece is None:
                square_states.append('none')  # Empty square
            else:
                # Use piece symbol to represent the piece on the square
                piece_symbol = piece.symbol()
                if piece.color == chess.WHITE:
                    square_states.append(piece_symbol.upper())  # White pieces
                else:
                    square_states.append(piece_symbol.lower())  # Black pieces
        
        states.append(square_states)
    
    # Create column names based on square names
    columns = [chess.square_name(square) for square in chess.SQUARES]
    
    # Create the DataFrame
    df = pd.DataFrame(states, columns=columns)
    
    # Add metadata and move number to the DataFrame
    for key, value in metadata.items():
        df[key] = value
    df["move_number"] = range(1, len(df) + 1)
    
    return df

# PGN string of the game
pgn_string = """
[Event "Third Rosenwald Trophy"]
[Site "New York, NY USA"]
[Date "1956.10.17"]
[EventDate "1956.10.07"]
[Round "8"]
[Result "0-1"]
[White "Donald Byrne"]
[Black "Robert James Fischer"]
[ECO "D92"]
[WhiteElo "?"]
[BlackElo "?"]
[PlyCount "82"]
1. Nf3 Nf6 2. c4 g6 3. Nc3 Bg7 4. d4 O-O 5. Bf4 d5 6. Qb3 dxc4 7. Qxc4 c6 8. e4 Nbd7 9. Rd1 Nb6 10. Qc5 Bg4 11. Bg5 Na4 12. Qa3 Nxc3 13. bxc3 Nxe4 14. Bxe7 Qb6 15. Bc4 Nxc3 16. Bc5 Rfe8+ 17. Kf1 Be6 18. Bxb6 Bxc4+ 19. Kg1 Ne2+ 20. Kf1 Nxd4+ 21. Kg1 Ne2+ 22. Kf1 Nc3+ 23. Kg1 axb6 24. Qb4 Ra4 25. Qxb6 Nxd1 26. h3 Rxa2 27. Kh2 Nxf2 28. Re1 Rxe1 29. Qd8+ Bf8 30. Nxe1 Bd5 31. Nf3 Ne4 32. Qb8 b5 33. h4 h5 34. Ne5 Kg7 35. Kg1 Bc5+ 36. Kf1 Ng3+ 37. Ke1 Bb4+ 38. Kd1 Bb3+ 39. Kc1 Ne2+ 40. Kb1 Nc3+ 41. Kc1 Rc2# 0-1
"""

# Convert PGN to DataFrame
df = pgn_to_dataframe(pgn_string)

# Display the first few rows of the DataFrame
print(df.head())

# Save the DataFrame to a CSV file
df.to_csv("game.csv", index=False)


  a1    b1 c1 d1 e1 f1    g1 h1 a2 b2  ... Round         White  \
0  R     N  B  Q  K  B  none  R  P  P  ...     8  Donald Byrne   
1  R     N  B  Q  K  B  none  R  P  P  ...     8  Donald Byrne   
2  R     N  B  Q  K  B  none  R  P  P  ...     8  Donald Byrne   
3  R     N  B  Q  K  B  none  R  P  P  ...     8  Donald Byrne   
4  R  none  B  Q  K  B  none  R  P  P  ...     8  Donald Byrne   

                  Black Result   EventDate  ECO WhiteElo BlackElo PlyCount  \
0  Robert James Fischer    0-1  1956.10.07  D92        ?        ?       82   
1  Robert James Fischer    0-1  1956.10.07  D92        ?        ?       82   
2  Robert James Fischer    0-1  1956.10.07  D92        ?        ?       82   
3  Robert James Fischer    0-1  1956.10.07  D92        ?        ?       82   
4  Robert James Fischer    0-1  1956.10.07  D92        ?        ?       82   

  move_number  
0           1  
1           2  
2           3  
3           4  
4           5  

[5 rows x 77 columns]


# Without one-hot encoding
## using prefix "w" and prefix "b" to differentiate white from black
wp = white pawn
bp = black pawn

In [3]:
import chess
import chess.pgn
import io
import pandas as pd

def pgn_to_dataframe(pgn_string):
    pgn = chess.pgn.read_game(io.StringIO(pgn_string))
    
    # Extract metadata from the PGN headers
    metadata = {key: pgn.headers[key] for key in pgn.headers}
    
    board = chess.Board()
    states = []
    
    # Define a mapping from piece types and colors to strings
    piece_map = {
        (chess.PAWN, chess.WHITE): 'wp',
        (chess.KNIGHT, chess.WHITE): 'wn',
        (chess.BISHOP, chess.WHITE): 'wb',
        (chess.ROOK, chess.WHITE): 'wr',
        (chess.QUEEN, chess.WHITE): 'wq',
        (chess.KING, chess.WHITE): 'wk',
        (chess.PAWN, chess.BLACK): 'bp',
        (chess.KNIGHT, chess.BLACK): 'bn',
        (chess.BISHOP, chess.BLACK): 'bb',
        (chess.ROOK, chess.BLACK): 'br',
        (chess.QUEEN, chess.BLACK): 'bq',
        (chess.KING, chess.BLACK): 'bk',
    }
    
    # Iterate through the moves of the game
    for move in pgn.mainline_moves():
        board.push(move)
        square_states = []
        
        # Iterate through all squares on the board
        for square in chess.SQUARES:
            piece = board.piece_at(square)
            if piece is None:
                square_states.append('none')  # Empty square
            else:
                # Map the piece type and color to the corresponding string
                square_states.append(piece_map[(piece.piece_type, piece.color)])
        
        states.append(square_states)
    
    # Create column names based on square names
    columns = [chess.square_name(square) for square in chess.SQUARES]
    
    # Create the DataFrame
    df = pd.DataFrame(states, columns=columns)
    
    # Add metadata and move number to the DataFrame
    for key, value in metadata.items():
        df[key] = value
    df["move_number"] = range(1, len(df) + 1)
    
    return df

# PGN string of the game
pgn_string = """
[Event "Third Rosenwald Trophy"]
[Site "New York, NY USA"]
[Date "1956.10.17"]
[EventDate "1956.10.07"]
[Round "8"]
[Result "0-1"]
[White "Donald Byrne"]
[Black "Robert James Fischer"]
[ECO "D92"]
[WhiteElo "?"]
[BlackElo "?"]
[PlyCount "82"]
1. Nf3 Nf6 2. c4 g6 3. Nc3 Bg7 4. d4 O-O 5. Bf4 d5 6. Qb3 dxc4 7. Qxc4 c6 8. e4 Nbd7 9. Rd1 Nb6 10. Qc5 Bg4 11. Bg5 Na4 12. Qa3 Nxc3 13. bxc3 Nxe4 14. Bxe7 Qb6 15. Bc4 Nxc3 16. Bc5 Rfe8+ 17. Kf1 Be6 18. Bxb6 Bxc4+ 19. Kg1 Ne2+ 20. Kf1 Nxd4+ 21. Kg1 Ne2+ 22. Kf1 Nc3+ 23. Kg1 axb6 24. Qb4 Ra4 25. Qxb6 Nxd1 26. h3 Rxa2 27. Kh2 Nxf2 28. Re1 Rxe1 29. Qd8+ Bf8 30. Nxe1 Bd5 31. Nf3 Ne4 32. Qb8 b5 33. h4 h5 34. Ne5 Kg7 35. Kg1 Bc5+ 36. Kf1 Ng3+ 37. Ke1 Bb4+ 38. Kd1 Bb3+ 39. Kc1 Ne2+ 40. Kb1 Nc3+ 41. Kc1 Rc2# 0-1
"""

# Convert PGN to DataFrame
df = pgn_to_dataframe(pgn_string)

# Display the first few rows of the DataFrame
print(df.head())

# Save the DataFrame to a CSV file
df.to_csv("game.csv", index=False)


   a1    b1  c1  d1  e1  f1    g1  h1  a2  b2  ... Round         White  \
0  wr    wn  wb  wq  wk  wb  none  wr  wp  wp  ...     8  Donald Byrne   
1  wr    wn  wb  wq  wk  wb  none  wr  wp  wp  ...     8  Donald Byrne   
2  wr    wn  wb  wq  wk  wb  none  wr  wp  wp  ...     8  Donald Byrne   
3  wr    wn  wb  wq  wk  wb  none  wr  wp  wp  ...     8  Donald Byrne   
4  wr  none  wb  wq  wk  wb  none  wr  wp  wp  ...     8  Donald Byrne   

                  Black Result   EventDate  ECO WhiteElo BlackElo PlyCount  \
0  Robert James Fischer    0-1  1956.10.07  D92        ?        ?       82   
1  Robert James Fischer    0-1  1956.10.07  D92        ?        ?       82   
2  Robert James Fischer    0-1  1956.10.07  D92        ?        ?       82   
3  Robert James Fischer    0-1  1956.10.07  D92        ?        ?       82   
4  Robert James Fischer    0-1  1956.10.07  D92        ?        ?       82   

  move_number  
0           1  
1           2  
2           3  
3           4  
4     

In [1]:
import pandas as pd
import io
import chess
import chess.pgn

def pgn_to_dataframe(pgn_string, eco, name, opening_type, game_number):
    pgn = chess.pgn.read_game(io.StringIO(pgn_string))
    
    # Extract metadata from the PGN headers
    metadata = {key: pgn.headers[key] for key in pgn.headers}
    
    # Include additional metadata
    metadata["ECO"] = eco
    metadata["OpeningName"] = name
    metadata["OpeningType"] = opening_type
    
    board = chess.Board()
    states = []
    
    # Define a mapping from piece types and colors to strings
    piece_map = {
        (chess.PAWN, chess.WHITE): 'wp',
        (chess.KNIGHT, chess.WHITE): 'wn',
        (chess.BISHOP, chess.WHITE): 'wb',
        (chess.ROOK, chess.WHITE): 'wr',
        (chess.QUEEN, chess.WHITE): 'wq',
        (chess.KING, chess.WHITE): 'wk',
        (chess.PAWN, chess.BLACK): 'bp',
        (chess.KNIGHT, chess.BLACK): 'bn',
        (chess.BISHOP, chess.BLACK): 'bb',
        (chess.ROOK, chess.BLACK): 'br',
        (chess.QUEEN, chess.BLACK): 'bq',
        (chess.KING, chess.BLACK): 'bk',
    }

    # New variables for tracking last move
    last_move = None
    
    # Iterate through the moves of the game
    for move in pgn.mainline_moves():
        board.push(move)
        last_move_san = board.san(move)  # Get the standard algebraic notation of the last move
        square_states = []
        
        # Iterate through all squares on the board
        for square in chess.SQUARES:
            piece = board.piece_at(square)
            if piece is None:
                square_states.append('none')  # Empty square
            else:
                # Map the piece type and color to the corresponding string
                square_states.append(piece_map[(piece.piece_type, piece.color)])
        
        square_states.append(last_move_san)
        states.append(square_states)
    
    columns = [chess.square_name(square) for square in chess.SQUARES] + ['last_move']
    df = pd.DataFrame(states, columns=columns)
    
    # Add metadata and move number to the DataFrame
    for key, value in metadata.items():
        df[key] = value
    df["move_number"] = range(1, len(df) + 1)
    df["game_number"] = game_number  # Add game number to each row
    
    return df

def process_openings(files):
    dfs = []
    game_number = 0  # Initialize game_number
    for file in files:
        openings = pd.read_csv(f"./chess-openings/{file}", sep='\t')
        for _, row in openings.iterrows():
            game_number += 1  # Increment game_number for each game processed
            df = pgn_to_dataframe(row['pgn'], row['eco'], row['name'], file.replace(".tsv", ""), game_number)
            dfs.append(df)
    
    # Concatenate all DataFrames
    final_df = pd.concat(dfs, ignore_index=True)
    
    # Save to CSV
    final_df.to_csv("all_openings.csv", index=False)

# List of files to process
files = ["a.tsv", "b.tsv", "c.tsv", "d.tsv", "e.tsv"]

# Process the openings and save to a single large CSV
process_openings(files)


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
