In [1]:
!pip install chess
import chess
import chess.pgn
import chess.svg
from IPython.display import SVG, display
import requests
from bs4 import BeautifulSoup
import zstandard as zstd
import os
from pathlib import Path

Collecting chess
  Downloading chess-1.10.0-py3-none-any.whl.metadata (19 kB)
Downloading chess-1.10.0-py3-none-any.whl (154 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: chess
Successfully installed chess-1.10.0


In [2]:
# Maximum number of archives to download: only the first `url_to_test`
# entries of the sorted name list are fetched.
url_to_test = 2
# Archive file names collected by fetch_urls(), with the leading "standard/"
# prefix removed.
urls_with_standard = []

def fetch_urls():
    """Refresh ``urls_with_standard`` from the Lichess database index page.

    Every ``<a href>`` containing both ``"standard/"`` and ``"pgn.zst"`` (and
    not ``".torrent"``) is kept, with a leading ``"standard/"`` stripped.  The
    module-level list is replaced in place with the sorted result, so calling
    this function repeatedly does not accumulate duplicate entries.

    Returns:
        None.  On a network/HTTP error the error is printed and
        ``urls_with_standard`` is left unchanged.
    """
    try:
        # A timeout keeps the kernel from hanging forever on a stalled connection.
        response = requests.get("https://database.lichess.org/", timeout=30)
        response.raise_for_status()  # Check for HTTP errors
    except requests.RequestException as e:
        print(f"Error fetching URLs: {e}")
        return

    soup = BeautifulSoup(response.text, 'html.parser')

    found = []
    for a_tag in soup.find_all("a"):
        href = a_tag.get('href')
        if href and "standard/" in href and "pgn.zst" in href and ".torrent" not in href:
            found.append(href.removeprefix("standard/"))

    # Slice-assign so the shared list object is updated (not rebound) and any
    # result of a previous call is discarded instead of being appended to.
    urls_with_standard[:] = sorted(found)
    print(f"Found {len(urls_with_standard)} URLs")

def download_and_process_files():
    """Download and decompress the first ``url_to_test`` archives.

    For each selected name in ``urls_with_standard`` the ``.pgn.zst`` archive
    is streamed to the current directory, decompressed next to it (file name
    with ``".zst"`` removed) and the compressed file is then deleted.

    Failures are reported per file and do not stop the loop: request errors,
    file-system errors and zstd decompression errors are printed and the next
    archive is attempted.

    Returns:
        None.
    """
    dctx = zstd.ZstdDecompressor()

    # max(..., 0) keeps a negative limit meaning "download nothing".
    for name in urls_with_standard[:max(url_to_test, 0)]:
        full_url = requests.compat.urljoin("https://database.lichess.org/standard/", name)
        input_file_path = Path(name)
        output_file_path = Path(name.replace(".zst", ""))

        try:
            # Stream the archive to disk chunk by chunk so the whole file is
            # never held in memory; the timeout bounds connect/read stalls.
            with requests.get(full_url, stream=True, timeout=30) as response:
                response.raise_for_status()  # Check for HTTP errors
                with input_file_path.open('wb') as file:
                    for chunk in response.iter_content(chunk_size=1 << 20):
                        if chunk:
                            file.write(chunk)

            # Decompress the .pgn.zst file
            with input_file_path.open('rb') as compressed_file, output_file_path.open('wb') as output_file:
                dctx.copy_stream(compressed_file, output_file)

            # Delete the .pgn.zst file
            os.remove(input_file_path)
            print(f"Downloaded {name}")

        except requests.RequestException as e:
            # Must precede OSError: requests' exceptions derive from IOError.
            print(f"Error downloading {name}: {e}")
        except OSError as e:
            print(f"File operation error for {name}: {e}")
        except zstd.ZstdError as e:
            # A corrupt/truncated archive should not abort the remaining downloads.
            print(f"Decompression error for {name}: {e}")

    print("Downloading complete. Only .pgn files remain.")

# Entry point: collect the archive names first, then download and decompress
# the first few of them (the download step reads the list the fetch step fills).
if __name__ == "__main__":
    fetch_urls()
    download_and_process_files()


Found 139 URLs
Downloaded lichess_db_standard_rated_2013-01.pgn.zst
Downloaded lichess_db_standard_rated_2013-02.pgn.zst
Downloading complete. Only .pgn files remain.


In [None]:
import chess.pgn
import csv

def count_and_extract_moves_from_pgn(pgn_file_path, csv_file_path):
    """Write one CSV row per mainline move of every game in a PGN file.

    Each row holds the game index, the ply number within the game, the side
    to move, the SAN of the move, the FEN before and after it, flags
    describing the move (capture, check, checkmate, castling, en passant,
    promotion), the moved/captured/promoted piece letters, and the game's
    ``Result`` and ``Termination`` headers.

    Args:
        pgn_file_path: Path of the PGN file to read.
        csv_file_path: Path of the CSV file to write (overwritten if present).

    Returns:
        The number of games read.  If the PGN file does not exist or any
        other exception occurs, the error is printed and 0 is returned.
    """
    try:
        game_count = 0

        # Open the input first: if the PGN is missing we fail before the CSV
        # is opened for writing, so an existing CSV is not truncated.
        with open(pgn_file_path, 'r', encoding='utf-8') as pgn_file, \
                open(csv_file_path, 'w', newline='', encoding='utf-8') as csv_file:
            csv_writer = csv.writer(csv_file)

            csv_writer.writerow([
                "Game_ID", "Move_Number", "Player", "Move", "FEN_Before", "FEN_After",
                "Is_Capture", "Is_Check", "Is_Checkmate", "Piece_Moved", "Piece_Captured",
                "Is_Castling", "Is_EnPassant", "Is_Promotion", "Promotion_Type",
                "Result", "Termination"
            ])

            while True:
                game = chess.pgn.read_game(pgn_file)
                if game is None:
                    break
                game_count += 1

                result = game.headers.get("Result", "Unknown")
                termination = game.headers.get("Termination", "Unknown")

                # Replay the game move by move on a fresh board.
                board = game.board()
                for move_number, move in enumerate(game.mainline_moves(), start=1):
                    fen_before = board.fen()
                    player = "White" if board.turn else "Black"
                    move_san = board.san(move)

                    # The Board predicates below interpret `move` relative to
                    # the position it is played FROM, so they must all be
                    # evaluated before push(); afterwards the origin square
                    # is empty and they would always report False.
                    is_capture = board.is_capture(move)
                    is_castling = board.is_castling(move)
                    is_en_passant = board.is_en_passant(move)

                    mover = board.piece_at(move.from_square)
                    piece_moved = mover.symbol().upper() if mover else None

                    if is_en_passant:
                        # The captured pawn is not on the destination square.
                        piece_captured = "P"
                    elif is_capture:
                        # Read the victim before push() replaces it with the mover.
                        piece_captured = board.piece_at(move.to_square).symbol().upper()
                    else:
                        piece_captured = None

                    is_promotion = move.promotion is not None
                    promotion_type = chess.PIECE_SYMBOLS[move.promotion].upper() if is_promotion else None

                    board.push(move)

                    # Check / checkmate describe the position the move
                    # produces, so they are read after push().
                    is_check = board.is_check()
                    is_checkmate = board.is_checkmate()
                    fen_after = board.fen()

                    csv_writer.writerow([
                        game_count, move_number, player, move_san, fen_before, fen_after,
                        is_capture, is_check, is_checkmate, piece_moved, piece_captured,
                        is_castling, is_en_passant, is_promotion, promotion_type,
                        result, termination
                    ])

                if game_count % 1000 == 0:
                    print(f"Processed {game_count} games...")

        print(f"Total number of games in the PGN file: {game_count}")
        print(f"Move-specific data saved to {csv_file_path}")
        return game_count

    except FileNotFoundError:
        print(f"Error: The file {pgn_file_path} does not exist.")
        return 0
    except Exception as e:
        # Best-effort by design: report the problem and signal failure with 0.
        print(f"Error: {e}")
        return 0

# PGN file to read and the CSV file to generate from it.
pgn_file_path = 'lichess_db_standard_rated_2013-01.pgn'  
csv_file_path = 'lichess_db_standard_rated_2013-01.csv'   

# Number of games read; 0 is also what the function returns after printing an
# error, so a zero here may mean the extraction failed.
num_games = count_and_extract_moves_from_pgn(pgn_file_path, csv_file_path)

Processed 1000 games...
Processed 2000 games...
Processed 3000 games...
Processed 4000 games...
Processed 5000 games...
Processed 6000 games...
Processed 7000 games...
Processed 8000 games...
Processed 9000 games...
Processed 10000 games...
