In [1]:
# Destructive: recursively deletes every non-hidden entry under /kaggle/working
# (presumably to clear leftovers from earlier runs before downloading again).
!rm -rf /kaggle/working/*

In [2]:
!pip install chess
import chess.pgn
import csv
import requests
from bs4 import BeautifulSoup
import zstandard as zstd
import os
from pathlib import Path

Collecting chess
  Downloading chess-1.10.0-py3-none-any.whl.metadata (19 kB)
Downloading chess-1.10.0-py3-none-any.whl (154 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 154.4/154.4 kB 5.5 MB/s eta 0:00:00
Installing collected packages: chess
Successfully installed chess-1.10.0


In [3]:

# Root of the Lichess database site. fetch_urls() scrapes this page for archive
# links, and download_and_decompress() joins "standard/<file name>" onto it.
BASE_URL = "https://database.lichess.org/"

# Upper bound on how many archive names fetch_urls() returns; it keeps the
# first entries of the alphabetically sorted list.
NUM_FILES_TO_PROCESS = 1

def fetch_urls(timeout=30):
    """Scrape the Lichess database index page for PGN archive file names.

    Every ``<a>`` tag on ``BASE_URL`` is inspected; a link is kept when its
    ``href`` contains ``"standard/"`` and ``"pgn.zst"`` and does not contain
    ``".torrent"``. The leading ``"standard/"`` is stripped so that only the
    file name remains.

    Args:
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``; a timeout surfaces as a
            ``requests.RequestException`` and is handled like any other
            request failure.

    Returns:
        The first ``NUM_FILES_TO_PROCESS`` file names of the alphabetically
        sorted list, or an empty list if the request failed.
    """
    urls_with_standard = []
    try:
        # Without an explicit timeout, requests waits indefinitely on an
        # unresponsive server, which would freeze the whole notebook run.
        response = requests.get(BASE_URL, timeout=timeout)
        response.raise_for_status()  # Check for HTTP errors
        soup = BeautifulSoup(response.text, 'html.parser')

        for a_tag in soup.find_all("a"):
            href = a_tag.get('href')
            if href and "standard/" in href and "pgn.zst" in href and ".torrent" not in href:
                urls_with_standard.append(href.removeprefix("standard/"))

        # Plain string sort; the slice below therefore keeps the names that
        # come first alphabetically.
        urls_with_standard.sort()
        print(f"Found {len(urls_with_standard)} URLs")
        return urls_with_standard[:NUM_FILES_TO_PROCESS]  # Limit to desired number
    except requests.RequestException as e:
        print(f"Error fetching URLs: {e}")
        return []

def download_and_decompress(url, timeout=60, chunk_size=1024 * 1024):
    """Download one ``.pgn.zst`` archive and decompress it next to the notebook.

    The archive is streamed to disk in chunks instead of being held in memory
    as a single bytes object, so its size is not limited by available RAM.
    After a successful decompression the compressed file is deleted.

    Args:
        url: File name of the archive as returned by ``fetch_urls`` (it is
            joined onto ``BASE_URL`` under ``"standard/"``).
        timeout: Seconds ``requests`` may wait for the connection and for each
            read before raising; the error is reported like any other request
            failure.
        chunk_size: Number of bytes requested per chunk while streaming the
            response body to disk.

    Returns:
        ``pathlib.Path`` of the decompressed PGN file, or ``None`` if the
        download or a file operation failed.
    """
    file_name = url
    full_url = requests.compat.urljoin(BASE_URL, "standard/" + file_name)

    input_file_path = Path(file_name)
    # Strip only the trailing ".zst"; str.replace would also remove any
    # earlier occurrence inside the name.
    output_file_path = Path(file_name.removesuffix(".zst"))

    try:
        # Download the .pgn.zst file, writing each chunk as it arrives.
        with requests.get(full_url, stream=True, timeout=timeout) as response:
            response.raise_for_status()  # Check for HTTP errors
            with input_file_path.open('wb') as file:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    file.write(chunk)

        # Decompress the .pgn.zst file
        with input_file_path.open('rb') as compressed_file, output_file_path.open('wb') as output_file:
            dctx = zstd.ZstdDecompressor()
            dctx.copy_stream(compressed_file, output_file)

        # Delete the .pgn.zst file
        os.remove(input_file_path)
        print(f"Downloaded and decompressed {file_name}")
        return output_file_path  # Return path of the decompressed file
    except requests.RequestException as e:
        # Must stay ahead of OSError: requests' exceptions derive from it.
        print(f"Error downloading {file_name}: {e}")
    except OSError as e:
        print(f"File operation error for {file_name}: {e}")

    # Only reached after a failure: a streamed download may have left a
    # truncated archive behind, so remove it rather than leave a corrupt file.
    try:
        input_file_path.unlink(missing_ok=True)
    except OSError as e:
        print(f"File operation error for {file_name}: {e}")
    return None

if __name__ == "__main__":
    # Download each selected archive in turn. `pgn_file_path` is rebound on
    # every iteration, so only the last result survives the loop; it is None
    # when that download failed and is never assigned when `urls` is empty.
    # Code further down reads `pgn_file_path` at module level.
    urls = fetch_urls()
    for url in urls:
        pgn_file_path = download_and_decompress(url)

Found 139 URLs
Downloaded and decompressed lichess_db_standard_rated_2013-01.pgn.zst


In [4]:
import chess
import chess.pgn
import csv
import re

# Header row of the output CSV; the order must match the row layout built in
# extract_move_data.
csv_headers = [
    # Where the move sits in the file
    "Game Number",
    "Move Number",
    "Player",
    # The move itself and the positions around it
    "Move (SAN)",
    "FEN Before",
    "FEN After",
    # Move properties
    "Is Capture",
    "Is Check",
    "Is Checkmate",
    "Piece Moved",
    "Piece Captured",
    "Is Castling",
    "Is En Passant",
    "Is Promotion",
    "Promotion Type",
    # Game-level metadata repeated on every row
    "Result",
    "Termination",
    "Event",
    "URL",
    "White Elo",
    "Black Elo",
    "Time Control",
    "Opening",
]

def split_url_and_event(text):
    """Separate an http(s) link from the rest of an Event header value.

    Returns a ``(url, event_name)`` tuple. ``url`` is the first link found in
    ``text`` (empty string when there is none); ``event_name`` is ``text`` with
    every link removed and surrounding whitespace stripped.
    """
    link_regex = r'(https?://\S+)'
    found = re.search(link_regex, text)

    # No link present: the whole text is the event name.
    if found is None:
        return '', text.strip()

    remainder = re.sub(link_regex, '', text)
    return found.group(0), remainder.strip()

def extract_move_data(pgn_file_path, csv_file_path, games_per_batch=1000):
    """Write one CSV row per half-move for every game in a PGN file.

    Games are read sequentially with ``chess.pgn.read_game``. For each move on
    a game's mainline a row is built in the column order of ``csv_headers`` and
    buffered; the buffer is flushed to disk every ``games_per_batch`` games and
    once more at the end.

    Notes on individual columns:
        * "Move Number" counts half-moves (plies) within a game, starting at 1.
        * "Piece Moved" / "Piece Captured" / "Promotion Type" are upper-case
          piece letters regardless of colour; values that do not apply are
          ``None``, which the csv module writes as an empty field.
        * "URL" and "Event" both come from the PGN ``Event`` header, split by
          ``split_url_and_event``.

    Args:
        pgn_file_path: Path of the PGN file to read.
        csv_file_path: Path of the CSV file to create (overwritten if present).
        games_per_batch: Number of games to accumulate between writes.
    """
    game_count = 0
    game_batch_data = []

    # Both files are opened as UTF-8 explicitly. The built-in default encoding
    # depends on the platform locale, and python-chess leaves the choice of
    # encoding to the caller; header values can contain non-ASCII text.
    with open(csv_file_path, 'w', newline='', encoding='utf-8') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(csv_headers)

        with open(pgn_file_path, 'r', encoding='utf-8') as pgn_file:
            while True:
                game = chess.pgn.read_game(pgn_file)
                if game is None:  # End of file
                    break
                game_count += 1

                # Game-level metadata, repeated on every row of this game.
                headers = game.headers
                result = headers.get("Result", "Unknown")
                event_with_url = headers.get("Event", "")
                url, event_name = split_url_and_event(event_with_url)
                termination = headers.get("Termination", "Unknown")
                white_elo = headers.get("WhiteElo", "Unknown")
                black_elo = headers.get("BlackElo", "Unknown")
                time_control = headers.get("TimeControl", "Unknown")
                opening = headers.get("Opening", "")

                board = game.board()

                for move_number, move in enumerate(game.mainline_moves(), start=1):
                    # Everything in this section must be read BEFORE the move
                    # is pushed, while the board still shows the old position.
                    fen_before = board.fen()
                    player = "White" if board.turn else "Black"

                    move_san = board.san(move)
                    is_capture = board.is_capture(move)
                    piece_moved = board.piece_at(move.from_square).symbol().upper()
                    is_castling = board.is_castling(move)
                    is_en_passant = board.is_en_passant(move)

                    piece_captured = None
                    if is_en_passant:
                        # The destination square is empty for en passant, so
                        # the captured pawn cannot be looked up there.
                        piece_captured = "P"
                    elif is_capture:
                        piece_captured = board.piece_at(move.to_square).symbol().upper()

                    # From here on the board reflects the position after the move.
                    board.push(move)
                    fen_after = board.fen()
                    is_check = board.is_check()
                    is_checkmate = board.is_checkmate()
                    is_promotion = move.promotion is not None
                    promotion_type = chess.PIECE_SYMBOLS[move.promotion].upper() if is_promotion else None

                    game_batch_data.append([
                        game_count, move_number, player, move_san, fen_before, fen_after,
                        is_capture, is_check, is_checkmate, piece_moved, piece_captured,
                        is_castling, is_en_passant, is_promotion, promotion_type,
                        result, termination, event_name, url, white_elo, black_elo, time_control, opening
                    ])

                # Flush periodically so memory use is bounded by the batch size.
                if game_count % games_per_batch == 0:
                    csv_writer.writerows(game_batch_data)
                    game_batch_data.clear()
                    print(f"Processed {game_count} games and wrote to CSV...")

        # Rows of the games read since the last full batch.
        if game_batch_data:
            csv_writer.writerows(game_batch_data)
            print("Wrote the final batch of games to CSV.")

        print(f"Total number of games in the PGN file: {game_count}")
        print(f"Move-specific data saved to {csv_file_path}")


# `pgn_file_path` is the module-level name bound by the download loop in an
# earlier cell (a pathlib.Path on success). The CSV is written alongside it,
# with the final suffix swapped for ".csv".
csv_file_path = pgn_file_path.with_suffix('.csv')
extract_move_data(pgn_file_path, csv_file_path)


Processed 1000 games and wrote to CSV...
Processed 2000 games and wrote to CSV...
Processed 3000 games and wrote to CSV...
Processed 4000 games and wrote to CSV...
Processed 5000 games and wrote to CSV...
Processed 6000 games and wrote to CSV...
Processed 7000 games and wrote to CSV...
Processed 8000 games and wrote to CSV...
Processed 9000 games and wrote to CSV...
Processed 10000 games and wrote to CSV...
Processed 11000 games and wrote to CSV...
Processed 12000 games and wrote to CSV...
Processed 13000 games and wrote to CSV...
Processed 14000 games and wrote to CSV...
Processed 15000 games and wrote to CSV...
Processed 16000 games and wrote to CSV...
Processed 17000 games and wrote to CSV...
Processed 18000 games and wrote to CSV...
Processed 19000 games and wrote to CSV...
Processed 20000 games and wrote to CSV...
Processed 21000 games and wrote to CSV...
Processed 22000 games and wrote to CSV...
Processed 23000 games and wrote to CSV...
Processed 24000 games and wrote to CSV...
P