In [5]:
import numpy as np, pandas as pd, sys
# Record the interpreter plus the NumPy/Pandas versions and install locations
# so the environment this notebook ran in can be identified later.
print("Python:", sys.version)
for lib_label, lib in (("NumPy:", np), ("Pandas:", pd)):
    print(lib_label, lib.__version__, lib.__file__)


Python: 3.11.13 | packaged by conda-forge | (main, Jun  4 2025, 14:39:58) [MSC v.1943 64 bit (AMD64)]
NumPy: 1.26.4 c:\Users\ppava\anaconda3\envs\chess_env\Lib\site-packages\numpy\__init__.py
Pandas: 2.2.2 c:\Users\ppava\anaconda3\envs\chess_env\Lib\site-packages\pandas\__init__.py


In [6]:
import os
import sys
from itertools import islice

In [None]:
# --- Setup: point this to your giant PGN file ---
# The dump location is machine-specific. To run the notebook elsewhere without
# editing this cell, set the DEEPCHESSIQ_PGN_PATH environment variable; when it
# is unset or empty, the hard-coded location below is used.
import os

PGN_PATH = os.environ.get("DEEPCHESSIQ_PGN_PATH") or r"C:\Users\ppava\Desktop\Projects\AIP\DeepChessIQ\Data\lichess_db_standard_rated_2025-08.pgn"  # <-- change me

from itertools import islice
import os, io, sys, re

In [8]:
def peek_raw_lines(path, n_lines=60):
    """Print the first ``n_lines`` lines of a text file with 3-digit line numbers.

    Only the requested prefix is read, so this is safe on very large files.
    Bytes that are not valid UTF-8 are replaced instead of raising.

    Raises:
        FileNotFoundError: if ``path`` does not exist (the path is the message).
    """
    if not os.path.exists(path):
        raise FileNotFoundError(path)
    with open(path, "r", encoding="utf-8", errors="replace") as handle:
        numbered_head = enumerate(islice(handle, n_lines), start=1)
        for number, text in numbered_head:
            # rstrip() removes the newline and any other trailing whitespace
            shown = text.rstrip()
            print(f"{number:03d}: {shown}")

# Print the first 60 raw lines of the PGN dump to eyeball its layout.
peek_raw_lines(PGN_PATH, n_lines=60)

FileNotFoundError: DeepChessIQ\Data\lichess_db_standard_rated_2025-08.pgn

In [None]:
# One PGN tag pair per line, e.g. [Event "Rated Blitz game"].
# Group 1 is the tag name, group 2 is everything between the outer quotes.
HEADER_LINE_RE = re.compile(r'^\[([A-Za-z0-9_]+)\s+"(.*)"\]\s*$')

def read_first_game_headers(path):
    """Read only the tag section of the first game in a PGN file.

    Lines before the first one that starts with '[' are skipped. From there,
    lines are collected until the first blank line or end of file.

    Args:
        path: location of the PGN file. It is decoded as UTF-8 with
            undecodable bytes replaced.

    Returns:
        (header_lines, headers): the collected lines without their trailing
        newline, and a dict of tag name -> value for the lines that match
        HEADER_LINE_RE. Collected lines that do not match stay in
        ``header_lines`` but are left out of ``headers``.
    """
    header_lines = []
    # "utf-8-sig" drops a leading byte-order mark if there is one. With plain
    # "utf-8" the BOM stays glued to the first tag line, which then fails the
    # startswith('[') test and is silently skipped.
    with open(path, "r", encoding="utf-8-sig", errors="replace") as f:
        # Skip any preface until we reach the first header line starting with '['
        for line in f:
            if line.startswith('['):
                header_lines.append(line.rstrip("\n"))
                break
        # Collect the rest of the header block until a blank line
        for line in f:
            if not line.strip():
                break
            header_lines.append(line.rstrip("\n"))
    # Parse the collected lines into a name -> value mapping
    headers = {}
    for ln in header_lines:
        m = HEADER_LINE_RE.match(ln)
        if m:
            headers[m.group(1)] = m.group(2)
    return header_lines, headers

# Parse the first game's tag section once, then show it raw and parsed.
raw_header_lines, headers = read_first_game_headers(PGN_PATH)

banner = "="*80

print(banner)
print("FIRST GAME HEADER LINES (raw):")
print(banner)
for lineno, raw_line in enumerate(raw_header_lines, 1):
    print(f"{lineno:02d}: {raw_line}")

print("\n" + banner)
print("PARSED HEADERS (key → value):")
print(banner)
# Keys are unique, so sorting the items orders them by key alone.
for tag_name, tag_value in sorted(headers.items()):
    print(f"{tag_name:18} {tag_value}")


FIRST GAME HEADER LINES (raw):
01: [Event "Rated Blitz game"]
02: [Site "https://lichess.org/GBNcycCw"]
03: [Date "2025.08.01"]
04: [Round "-"]
05: [White "JessieLM"]
06: [Black "Trip_Team2022"]
07: [Result "0-1"]
08: [UTCDate "2025.08.01"]
09: [UTCTime "00:00:23"]
10: [WhiteElo "2253"]
11: [BlackElo "2297"]
12: [WhiteRatingDiff "-5"]
13: [BlackRatingDiff "+7"]
14: [ECO "A14"]
15: [Opening "Réti Opening: Anglo-Slav Variation, Bogoljubow Variation, Stonewall Line"]
16: [TimeControl "180+2"]
17: [Termination "Normal"]

PARSED HEADERS (key → value):
Black              Trip_Team2022
BlackElo           2297
BlackRatingDiff    +7
Date               2025.08.01
ECO                A14
Event              Rated Blitz game
Opening            Réti Opening: Anglo-Slav Variation, Bogoljubow Variation, Stonewall Line
Result             0-1
Round              -
Site               https://lichess.org/GBNcycCw
Termination        Normal
TimeControl        180+2
UTCDate            2025.08.01
UTCTime       

In [None]:
# --- Scan header keys across the first N games (streaming; no big RAM) ---
try:
    import chess.pgn
except Exception as e:
    raise RuntimeError("python-chess not installed. Run: pip install python-chess") from e

from collections import Counter, defaultdict

def scan_header_keys(path, n_games=3000, sample_per_key=4, progress_every=250,
                     max_distinct_per_key=1000):
    """Stream the first ``n_games`` games of a PGN file and summarise their headers.

    Games are read one at a time with ``chess.pgn.read_game``, so only the
    counters are held in memory, never the whole file.

    Args:
        path: PGN file to scan (decoded as UTF-8, undecodable bytes replaced).
        n_games: maximum number of games to read; scanning stops earlier at EOF.
        sample_per_key: how many of the most frequent values to list per key.
        progress_every: print a progress line every this many games. A falsy
            value (0 or None) disables progress output.
        max_distinct_per_key: upper bound on the number of distinct values
            tracked per header key. Values first seen after the bound is
            reached are ignored, but values already tracked keep being counted.

    Returns:
        (df, dists). ``df`` has one row per header key with columns ``key``,
        ``count``, ``coverage_%`` (share of scanned games carrying the key,
        rounded to 2 decimals) and ``example_values``, ordered by descending
        count. ``dists`` is a dict holding the number of games scanned, the
        number of unique keys, and the Result / Termination / Variant / ECO /
        TimeControl distributions as ``(value, count)`` lists. A missing
        Variant is counted as "Standard"; other missing tags are counted
        under the empty string.
    """
    import pandas as pd

    counts = Counter()
    samples = defaultdict(Counter)
    result_counts = Counter()
    term_counts = Counter()
    variant_counts = Counter()
    eco_counts = Counter()
    tc_counts = Counter()

    games = 0
    with open(path, "r", encoding="utf-8", errors="replace") as fh:
        while games < n_games:
            game = chess.pgn.read_game(fh)
            if game is None:
                break
            games += 1
            hdr = game.headers

            # count keys + collect example values
            for k, v in hdr.items():
                counts[k] += 1
                if v:
                    seen = samples[k]
                    # The cap only limits how many *distinct* values are kept.
                    # Already-tracked values must still be counted, otherwise
                    # the "most common" examples freeze once the cap is hit.
                    if v in seen or len(seen) < max_distinct_per_key:
                        seen[v] += 1

            # quick distributions
            result_counts[hdr.get("Result","")] += 1
            term_counts[hdr.get("Termination","")] += 1
            variant_counts[hdr.get("Variant","Standard")] += 1
            eco_counts[hdr.get("ECO","")] += 1
            tc_counts[hdr.get("TimeControl","")] += 1

            # guard against progress_every=0/None (would otherwise divide by zero)
            if progress_every and games % progress_every == 0:
                print(f"… scanned {games} games")

    # build a summary table
    rows = []
    for k, c in counts.most_common():
        cov = (c / games) * 100 if games else 0.0
        ex = [val for val, _ in samples[k].most_common(sample_per_key)]
        rows.append((k, c, round(cov, 2), ex))
    df = pd.DataFrame(rows, columns=["key", "count", "coverage_%", "example_values"])

    dists = {
        "games_scanned": games,
        "unique_header_keys": len(counts),
        "results": result_counts.most_common(),
        "terminations": term_counts.most_common(),
        "variants": variant_counts.most_common(),
        "eco_top20": eco_counts.most_common(20),
        "timecontrol_top20": tc_counts.most_common(20),
    }
    return df, dists

# Scan the first 3000 games, then display the 20 most frequent header keys.
DF_KEYS, DISTS = scan_header_keys(PGN_PATH, n_games=3000, sample_per_key=4, progress_every=250)
DF_KEYS.head(20)


… scanned 250 games
… scanned 500 games
… scanned 750 games
… scanned 1000 games
… scanned 1250 games
… scanned 1500 games
… scanned 1750 games
… scanned 2000 games
… scanned 2250 games
… scanned 2500 games
… scanned 2750 games
… scanned 3000 games


Unnamed: 0,key,count,coverage_%,example_values
0,Event,3000,100.0,"[Rated Blitz game, Rated Bullet game, Rated Ra..."
1,Site,3000,100.0,"[https://lichess.org/GBNcycCw, https://lichess..."
2,Date,3000,100.0,[2025.08.01]
3,Round,3000,100.0,[-]
4,White,3000,100.0,"[Starryknights, Grant14, Hyperopic, JessieLM]"
5,Black,3000,100.0,"[Zakiman, Trip_Team2022, ABS1983, Fil-z-Lip]"
6,Result,3000,100.0,"[1-0, 0-1, 1/2-1/2]"
7,UTCDate,3000,100.0,[2025.08.01]
8,UTCTime,3000,100.0,"[00:00:06, 00:00:44, 00:00:37, 00:01:16]"
9,WhiteElo,3000,100.0,"[1500, 1516, 1285, 2167]"


In [None]:
# Headline numbers first, then each distribution returned by scan_header_keys.
print(f"Games scanned: {DISTS['games_scanned']}")
print(f"Unique header keys: {DISTS['unique_header_keys']}\n")

# Short distributions are printed whole, as (value, count) lists.
for dist_label, dist_key in (
    ("Results:", "results"),
    ("\nTerminations:", "terminations"),
    ("\nVariants:", "variants"),
):
    print(dist_label, DISTS[dist_key])

# Top-20 distributions are printed one entry per line; an empty value means
# the tag was absent from the game.
for dist_heading, dist_key in (
    ("\nTop ECO codes:", "eco_top20"),
    ("\nTop TimeControls:", "timecontrol_top20"),
):
    print(dist_heading)
    for tag_value, tally in DISTS[dist_key]:
        print(f"  {tag_value or '(missing)'}: {tally}")


Games scanned: 3000
Unique header keys: 19

Results: [('1-0', 1516), ('0-1', 1368), ('1/2-1/2', 116)]

Terminations: [('Normal', 1981), ('Time forfeit', 991), ('Abandoned', 27), ('Rules infraction', 1)]

Variants: [('Standard', 3000)]

Top ECO codes:
  A00: 180
  B01: 172
  A40: 159
  B00: 154
  D00: 119
  C00: 111
  D02: 106
  B10: 80
  C44: 69
  C41: 65
  C50: 64
  C20: 60
  A01: 58
  B06: 57
  A45: 51
  B20: 50
  C42: 48
  B30: 47
  A04: 42
  C40: 40

Top TimeControls:
  60+0: 875
  180+0: 590
  300+0: 398
  600+0: 329
  180+2: 264
  120+1: 185
  300+3: 141
  600+5: 64
  300+2: 17
  30+0: 17
  1800+0: 15
  15+0: 12
  900+10: 11
  480+2: 11
  120+3: 7
  120+0: 7
  600+3: 4
  900+0: 4
  360+0: 4
  0+1: 4


In [None]:
# Keep only header keys present in at least THRESH percent of the scanned games.
THRESH = 80.0  # % coverage cutoff
candidates = DF_KEYS.loc[DF_KEYS["coverage_%"] >= THRESH, "key"].tolist()
candidates


['Event',
 'Site',
 'Date',
 'Round',
 'White',
 'Black',
 'Result',
 'UTCDate',
 'UTCTime',
 'WhiteElo',
 'BlackElo',
 'ECO',
 'Opening',
 'TimeControl',
 'Termination',
 'WhiteRatingDiff',
 'BlackRatingDiff']

In [None]:
import io
import re
from typing import Optional
import chess.pgn

def game_id_from_site(hdr_site: str) -> Optional[str]:
    """Return the last path segment of a Site header value, or None.

    For lichess links such as https://lichess.org/GBNcycCw this is the game id.
    Trailing slashes are ignored. An empty/None value, or one made up only of
    slashes, yields None.
    """
    if hdr_site:
        last_segment = hdr_site.rstrip("/").rpartition("/")[2]
        if last_segment:
            return last_segment
    return None

def get_movetext(game: chess.pgn.Game, include_comments: bool = True) -> str:
    """
    Render a game's movetext, without headers or variations, as one line.

    With include_comments=True, move comments are exported too, e.g. the
    { [%clk 0:03:00] } clock annotations found in lichess dumps. With False,
    only the moves and the result remain.
    """
    rendered = game.accept(
        chess.pgn.StringExporter(
            headers=False,
            variations=False,
            comments=include_comments,
        )
    )
    # Collapse every run of whitespace (newlines included) to a single space;
    # the result token stays at the end.
    return " ".join(rendered.split())

def preview_movetexts(path, n_games=5, max_chars=400):
    """Print and tabulate the movetext of the first ``n_games`` games in a PGN file.

    For every game read, a separator and three lines are printed: a summary
    (index, id, players, result, time control) and the movetext with and
    without comments, each cut to ``max_chars`` characters and suffixed with
    "…" when longer.

    Returns:
        A DataFrame with one row per game. It holds the untruncated movetexts
        and their lengths next to the summary fields. Fewer than ``n_games``
        rows are returned when the file ends first.
    """
    def _clip(text):
        # Truncate for on-screen display only; the DataFrame keeps full text.
        return (text[:max_chars] + "…") if len(text) > max_chars else text

    records = []
    with open(path, "r", encoding="utf-8", errors="replace") as pgn_file:
        for number in range(1, n_games + 1):
            game = chess.pgn.read_game(pgn_file)
            if game is None:
                break
            tags = game.headers
            game_id = game_id_from_site(tags.get("Site", ""))

            with_comments = get_movetext(game, include_comments=True)
            moves_only = get_movetext(game, include_comments=False)

            white, black = tags.get("White"), tags.get("Black")
            result, time_control = tags.get("Result"), tags.get("TimeControl")

            records.append({
                "index": number,
                "game_id": game_id,
                "white": white,
                "black": black,
                "result": result,
                "timecontrol": time_control,
                "pgn_movetext_full": with_comments,
                "pgn_movetext_san": moves_only,
                "len_full": len(with_comments),
                "len_san": len(moves_only),
            })

            # quick on-screen peek (truncated)
            print("="*100)
            print(f"Game #{number} | id={game_id} | {white} vs {black} | {result} | TC={time_control}")
            print("- movetext FULL:", _clip(with_comments))
            print("- movetext SAN :", _clip(moves_only))

    import pandas as pd
    return pd.DataFrame(records)

# Preview the first 3 games; printed movetext is cut at 300 characters, the
# returned frame keeps the full strings.
# NOTE: df_preview is rebound by the later preview_rows(...) cell.
df_preview = preview_movetexts(PGN_PATH, n_games=3, max_chars=300)
df_preview


Game #1 | id=GBNcycCw | JessieLM vs Trip_Team2022 | 0-1 | TC=180+2
- movetext FULL: 1. g3 { [%clk 0:03:00] } 1... d5 { [%clk 0:03:00] } 2. Nf3 { [%clk 0:03:01] } 2... Nf6 { [%clk 0:03:02] } 3. Bg2 { [%clk 0:03:01] } 3... e6 { [%clk 0:03:04] } 4. O-O { [%clk 0:03:01] } 4... Be7 { [%clk 0:03:06] } 5. c4 { [%clk 0:03:02] } 5... O-O { [%clk 0:03:08] } 6. b3 { [%clk 0:03:04] } 6... c6 {…
- movetext SAN : 1. g3 d5 2. Nf3 Nf6 3. Bg2 e6 4. O-O Be7 5. c4 O-O 6. b3 c6 7. Bb2 Nbd7 8. d3 a5 9. Nbd2 a4 10. Qc2 a3 11. Bc3 c5 12. cxd5 Nxd5 13. Nc4 Nxc3 14. Qxc3 Bf6 15. Nfe5 Rb8 16. f4 b5 17. Ne3 b4 18. Qc2 Nxe5 19. fxe5 Bxe5 20. Rac1 Bd4 21. Qd2 Qg5 22. Rf3 Bb7 0-1
Game #2 | id=tT5omaaN | Haytroy vs ABS1983 | 1-0 | TC=180+2
- movetext FULL: 1. e4 { [%clk 0:03:00] } 1... e5 { [%clk 0:03:00] } 2. Qh5 { [%clk 0:03:01] } 2... Nc6 { [%clk 0:03:00] } 3. Bc4 { [%clk 0:02:55] } 3... g6 { [%clk 0:02:59] } 4. Qf3 { [%clk 0:02:49] } 4... Nf6 { [%clk 0:03:00] } 5. d3 { [%clk 0:02:49] } 5... Bg7 { [%clk 0:02:59] 

Unnamed: 0,index,game_id,white,black,result,timecontrol,pgn_movetext_full,pgn_movetext_san,len_full,len_san
0,1,GBNcycCw,JessieLM,Trip_Team2022,0-1,180+2,1. g3 { [%clk 0:03:00] } 1... d5 { [%clk 0:03:...,1. g3 d5 2. Nf3 Nf6 3. Bg2 e6 4. O-O Be7 5. c4...,1214,255
1,2,tT5omaaN,Haytroy,ABS1983,1-0,180+2,1. e4 { [%clk 0:03:00] } 1... e5 { [%clk 0:03:...,1. e4 e5 2. Qh5 Nc6 3. Bc4 g6 4. Qf3 Nf6 5. d3...,1307,285
2,3,RNqZylfV,Xafir,Fil-z-Lip,0-1,180+2,1. d4 { [%clk 0:03:00] } 1... d5 { [%clk 0:03:...,1. d4 d5 2. Nc3 Nf6 3. e4 dxe4 4. Bc4 Bg4 5. N...,1168,253


In [None]:
from typing import Optional, Dict, Any, List
import chess.pgn

# --- helpers reused later ---
def game_id_from_site(hdr_site: str) -> Optional[str]:
    """Return the last path segment of a Site header value, or None.

    Trailing slashes are ignored. An empty/None value, or one made up only of
    slashes, yields None.
    """
    if hdr_site:
        last_segment = hdr_site.rstrip("/").rpartition("/")[2]
        if last_segment:
            return last_segment
    return None

def get_movetext(game: chess.pgn.Game, include_comments: bool = True) -> str:
    """Render a game's movetext, without headers or variations, as one line.

    Move comments are exported only when ``include_comments`` is true.
    """
    rendered = game.accept(
        chess.pgn.StringExporter(
            headers=False,
            variations=False,
            comments=include_comments,
        )
    )
    # Collapse every run of whitespace (newlines included) to a single space.
    return " ".join(rendered.split())

def count_plies(game: chess.pgn.Game) -> int:
    """Return the number of moves yielded by ``game.mainline_moves()``."""
    return sum(1 for _ in game.mainline_moves())

# Default header columns copied into every row, in output-column order.
# NOTE(review): "Variant" is kept even though it looks absent from the sampled
# games (its preview column is empty) — confirm whether it is wanted.
HEADER_KEYS = [
    # game identity
    "Event",
    "Site",
    "Date",
    "Round",
    # players and outcome
    "White",
    "Black",
    "Result",
    # timestamps
    "UTCDate",
    "UTCTime",
    # ratings
    "WhiteElo",
    "BlackElo",
    "WhiteRatingDiff",
    "BlackRatingDiff",
    # opening / game metadata
    "ECO",
    "Opening",
    "TimeControl",
    "Termination",
    "Variant",
]

def row_from_game(game: chess.pgn.Game) -> Dict[str, Any]:
    """Flatten one parsed game into a dict suitable for a DataFrame row.

    The row holds every key in HEADER_KEYS (None when the game lacks that
    tag), followed by ``game_id``, ``ply_count`` and the movetext with
    (``pgn_movetext_full``) and without (``pgn_movetext_san``) comments.
    """
    tags = game.headers
    row: Dict[str, Any] = {}
    for key in HEADER_KEYS:
        row[key] = tags.get(key)
    # Derived columns are appended after the header columns, in this order.
    row.update(
        game_id=game_id_from_site(tags.get("Site", "")),
        ply_count=count_plies(game),
        pgn_movetext_full=get_movetext(game, include_comments=True),
        pgn_movetext_san=get_movetext(game, include_comments=False),
    )
    return row

def preview_rows(path: str, n_games: int = 200) -> "pd.DataFrame":
    """Build a DataFrame from the first ``n_games`` games of a PGN file.

    Each game is converted with ``row_from_game``. If that conversion raises,
    a ``[warn]`` line with the 1-based game number is printed and the game is
    skipped. A one-line summary (row count and column names) is printed
    before the frame is returned.
    """
    import pandas as pd

    collected: List[Dict[str, Any]] = []
    with open(path, "r", encoding="utf-8", errors="replace") as pgn_file:
        for game_number in range(1, n_games + 1):
            game = chess.pgn.read_game(pgn_file)
            if game is None:
                # end of file reached before n_games
                break
            try:
                record = row_from_game(game)
            except Exception as err:
                # keep streaming even if this game cannot be converted
                print(f"[warn] game {game_number}: {err}")
            else:
                collected.append(record)
    frame = pd.DataFrame(collected)
    print(f"Previewed {len(frame)} games. Columns: {list(frame.columns)}")
    return frame

# --- run the preview (adjust n_games if you want) ---
# Rebinds df_preview (previously the preview_movetexts result) to the
# 200-game row table, then shows its first 5 rows.
df_preview = preview_rows(PGN_PATH, n_games=200)
df_preview.head(5)


Previewed 200 games. Columns: ['Event', 'Site', 'Date', 'Round', 'White', 'Black', 'Result', 'UTCDate', 'UTCTime', 'WhiteElo', 'BlackElo', 'WhiteRatingDiff', 'BlackRatingDiff', 'ECO', 'Opening', 'TimeControl', 'Termination', 'Variant', 'game_id', 'ply_count', 'pgn_movetext_full', 'pgn_movetext_san']


Unnamed: 0,Event,Site,Date,Round,White,Black,Result,UTCDate,UTCTime,WhiteElo,...,BlackRatingDiff,ECO,Opening,TimeControl,Termination,Variant,game_id,ply_count,pgn_movetext_full,pgn_movetext_san
0,Rated Blitz game,https://lichess.org/GBNcycCw,2025.08.01,-,JessieLM,Trip_Team2022,0-1,2025.08.01,00:00:23,2253,...,7,A14,"Réti Opening: Anglo-Slav Variation, Bogoljubow...",180+2,Normal,,GBNcycCw,44,1. g3 { [%clk 0:03:00] } 1... d5 { [%clk 0:03:...,1. g3 d5 2. Nf3 Nf6 3. Bg2 e6 4. O-O Be7 5. c4...
1,Rated Blitz game,https://lichess.org/tT5omaaN,2025.08.01,-,Haytroy,ABS1983,1-0,2025.08.01,00:00:23,1111,...,-6,C20,King's Pawn Game: Wayward Queen Attack,180+2,Time forfeit,,tT5omaaN,47,1. e4 { [%clk 0:03:00] } 1... e5 { [%clk 0:03:...,1. e4 e5 2. Qh5 Nc6 3. Bc4 g6 4. Qf3 Nf6 5. d3...
2,Rated Blitz game,https://lichess.org/RNqZylfV,2025.08.01,-,Xafir,Fil-z-Lip,0-1,2025.08.01,00:00:23,1468,...,5,D00,Blackmar-Diemer Gambit,180+2,Normal,,RNqZylfV,42,1. d4 { [%clk 0:03:00] } 1... d5 { [%clk 0:03:...,1. d4 d5 2. Nc3 Nf6 3. e4 dxe4 4. Bc4 Bg4 5. N...
3,Rated Blitz game,https://lichess.org/lAJM1UrE,2025.08.01,-,Nikolai_Petrov,XadrezTotalAP,0-1,2025.08.01,00:00:23,1976,...,6,A11,English Opening: Caro-Kann Defensive System,180+2,Time forfeit,,lAJM1UrE,88,1. c4 { [%clk 0:03:00] } 1... c6 { [%clk 0:03:...,1. c4 c6 2. Nc3 d5 3. cxd5 cxd5 4. d4 Nf6 5. B...
4,Rated Blitz game,https://lichess.org/ev97hGeo,2025.08.01,-,SANSONSAMARIO,Meruemchess,0-1,2025.08.01,00:00:23,1287,...,7,C50,Italian Game: Giuoco Piano,180+2,Normal,,ev97hGeo,30,1. e4 { [%clk 0:03:00] } 1... e5 { [%clk 0:03:...,1. e4 e5 2. Nf3 Nc6 3. Bc4 Bc5 4. O-O Nf6 5. d...
