In [26]:
# ==========================================
# 1. Imports & Paths
# ==========================================

from pathlib import Path
import json
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
pd.set_option("display.max_columns", 200)

# --- Ensure python-chess is available (works in VS Code & Colab) ---
try:
    import chess
except ModuleNotFoundError:
    import sys, subprocess
    print("python-chess not found. Installing...")
    try:
        subprocess.check_call([sys.executable, "-m", "pip", "install", "python-chess", "--quiet"])
    except Exception:
        # fallback to user install if needed
        subprocess.check_call([sys.executable, "-m", "pip", "install", "--user", "python-chess", "--quiet"])
    import chess
print("python-chess version OK.")

# --- Paths ---
NB_DIR   = Path.cwd()
DATA_DIR = (NB_DIR / "../data").resolve()
OUT_DIR  = DATA_DIR  # save cleaned files back to data/
OUT_DIR.mkdir(parents=True, exist_ok=True)

RAW_SUBSET = DATA_DIR / "chess_games_subset.csv"   # from 01_downsampling.ipynb
CLEAN_CSV  = DATA_DIR / "clean_chess_games.csv"
META_JSON  = DATA_DIR / "clean_metadata.json"

print("Data dir:", DATA_DIR)
print("Input:", RAW_SUBSET)

python-chess version OK.
Data dir: E:\Github Projects\chess-outcome-prediction\data
Input: E:\Github Projects\chess-outcome-prediction\data\chess_games_subset.csv


In [27]:
# ==========================================
# 2. Load Subset, Define Target, Verify Moves Column
# ==========================================

assert RAW_SUBSET.exists(), f"{RAW_SUBSET} not found. Run 01_down_sampling.ipynb first."

# Load subset CSV
df_raw = pd.read_csv(RAW_SUBSET)
print("Raw shape:", df_raw.shape)
display(df_raw.head(3))

# ---- Target mapping ----
target = None
for cand in ["target", "Result", "result"]:
    if cand in df_raw.columns:
        if cand.lower() == "result":
            target = df_raw[cand].map({"1-0": "white", "0-1": "black", "1/2-1/2": "draw"})
        else:
            target = df_raw[cand]
        break
assert target is not None, "Could not locate a result/target column in subset."

df_raw["target"] = target.astype(str)
print("Target distribution:")
display(df_raw["target"].value_counts())

# ---- Find moves column & quick stats ----
moves_col = None
for cand in ["Moves", "moves", "PGN", "pgn", "AN"]:
    if cand in df_raw.columns:
        moves_col = cand
        break
assert moves_col is not None, "Could not find a moves column (e.g., 'Moves')."

print(f"Using moves column: '{moves_col}'")

# Rough ply-count estimate from SAN: count tokens that are not move numbers like '1.' '2.' etc.
def approx_plies(s):
    if not isinstance(s, str):
        return 0
    toks = [t for t in s.replace("\n", " ").split(" ") if t and not t[0].isdigit()]
    return len(toks)

ply_counts = df_raw[moves_col].fillna("").map(approx_plies)
total_games = len(ply_counts)
short_games = int((ply_counts < 30).sum())  # < 30 plies == < 15 moves total
print(f"Games with < 15 moves (kept): {short_games} / {total_games} ({short_games/total_games:.1%})")
print(f"Median plies: {int(ply_counts.median())}, Mean plies: {ply_counts.mean():.1f}")

Raw shape: (119868, 15)


Unnamed: 0,Event,White,Black,Result,UTCDate,UTCTime,WhiteElo,BlackElo,WhiteRatingDiff,BlackRatingDiff,ECO,Opening,TimeControl,Termination,AN
0,Bullet,MM2010,behrang71,1-0,2016.07.01,10:30:21,1812,1803,10.0,-10.0,A03,Bird Opening: Dutch Variation,120+0,Normal,1. f4 d5 2. g3 c5 3. Bg2 Nc6 4. Nf3 Bf5 5. O-O...
1,Blitz,tgnoo,janbubba,1-0,2016.07.01,11:01:23,1840,1791,10.0,-9.0,B00,Owen Defense,180+0,Normal,1. e4 b6 2. d4 Bb7 3. Bd3 e6 4. f4 d6 5. Nf3 h...
2,Bullet,Chaliko,TheGameHen,0-1,2016.07.01,00:27:23,2193,1782,-25.0,20.0,E20,Nimzo-Indian Defense #2,60+2,Time forfeit,1. d4 Nf6 2. c4 e6 3. Nc3 c5 4. d5 d6 5. e3 e5...


Target distribution:


target
white    59875
black    55329
draw      4640
nan         24
Name: count, dtype: int64

Using moves column: 'AN'
Games with < 15 moves (kept): 10672 / 119868 (8.9%)
Median plies: 70, Mean plies: 94.8


In [28]:
# ==========================================
# 3. First-15-Moves Feature Extraction (ply = 30)
# ==========================================

# If the raw dataset uses 'AN' for the moves, rename it now for consistency
if "AN" in df_raw.columns:
    df_raw = df_raw.rename(columns={"AN": "Moves"})
    print("Renamed 'AN' -> 'Moves'")

# Confirm moves column exists
assert "Moves" in df_raw.columns, "Expected a 'Moves' column after renaming from 'AN'."

import chess

def features_from_ply30(moves_san: str) -> dict:
    """
    Build a compact, leakage-safe feature vector from the position after ply 30
    (i.e., 15 full moves), or earlier if the game is shorter.
    """
    # Guard against missing/NaN
    if not isinstance(moves_san, str) or not moves_san.strip():
        moves_san = ""

    board = chess.Board()
    captures = 0
    checks = 0

    # Tokenise SAN and filter out move numbers like '1.' '2.' etc.
    tokens = [t for t in moves_san.replace("\n", " ").split(" ") if t and not t[0].isdigit()]
    ply_limit = min(30, len(tokens))  # 30 ply == 15 moves total

    for i in range(ply_limit):
        san = tokens[i]
        try:
            move = board.parse_san(san)
        except Exception:
            # If a token can’t be parsed, stop at the last valid position
            break
        if board.is_capture(move):
            captures += 1
        board.push(move)
        if board.is_check():
            checks += 1

    # --- Feature block from board state at ply_limit ---
    def count(piece, colour):
        return len(board.pieces(piece, colour))

    feat = {}
    # Piece counts
    for piece, name in [
        (chess.PAWN, "P"), (chess.KNIGHT, "N"), (chess.BISHOP, "B"),
        (chess.ROOK, "R"), (chess.QUEEN, "Q")
    ]:
        feat[f"w_{name}"] = count(piece, chess.WHITE)
        feat[f"b_{name}"] = count(piece, chess.BLACK)

    # Material (simple values) & balance
    val = {"P": 1, "N": 3, "B": 3, "R": 5, "Q": 9}
    w_mat = sum(feat[f"w_{k}"] * v for k, v in val.items())
    b_mat = sum(feat[f"b_{k}"] * v for k, v in val.items())
    feat["material_white"]   = w_mat
    feat["material_black"]   = b_mat
    feat["material_balance"] = w_mat - b_mat

    # Side to move & castling rights
    feat["stm_white"] = int(board.turn == chess.WHITE)
    feat["wk_castle"] = int(board.has_kingside_castling_rights(chess.WHITE))
    feat["wq_castle"] = int(board.has_queenside_castling_rights(chess.WHITE))
    feat["bk_castle"] = int(board.has_kingside_castling_rights(chess.BLACK))
    feat["bq_castle"] = int(board.has_queenside_castling_rights(chess.BLACK))

    # Centre control & mobility proxies
    centres = [chess.D4, chess.E4, chess.D5, chess.E5]
    feat["w_centre_ctrl"] = sum(board.is_attacked_by(chess.WHITE, sq) for sq in centres)
    feat["b_centre_ctrl"] = sum(board.is_attacked_by(chess.BLACK, sq) for sq in centres)

    # Mobility measured at current position
    # (Count legal moves for side to move, and approximate for the other by toggling turn)
    if board.turn == chess.WHITE:
        feat["w_mobility"] = board.legal_moves.count()
        board.turn = chess.BLACK
        feat["b_mobility"] = board.legal_moves.count()
        board.turn = chess.WHITE
    else:
        feat["b_mobility"] = board.legal_moves.count()
        board.turn = chess.WHITE
        feat["w_mobility"] = board.legal_moves.count()
        board.turn = chess.BLACK

    # Cumulative dynamics up to ply 30
    feat["captures_upto30"] = captures
    feat["checks_upto30"]   = checks
    feat["ply_processed"]   = ply_limit
    return feat

# Apply feature extraction
feats = df_raw["Moves"].fillna("").apply(features_from_ply30)
feat_df = pd.DataFrame(list(feats))

print("First-15-moves feature shape:", feat_df.shape)
display(feat_df.head(3))


Renamed 'AN' -> 'Moves'
First-15-moves feature shape: (119868, 25)


Unnamed: 0,w_P,b_P,w_N,b_N,w_B,b_B,w_R,b_R,w_Q,b_Q,material_white,material_black,material_balance,stm_white,wk_castle,wq_castle,bk_castle,bq_castle,w_centre_ctrl,b_centre_ctrl,w_mobility,b_mobility,captures_upto30,checks_upto30,ply_processed
0,5,6,2,1,2,2,2,2,1,1,36,34,2,1,0,0,1,0,4,4,48,46,6,0,30
1,6,7,2,1,1,2,2,2,1,1,34,35,-1,1,0,0,0,0,1,3,46,29,5,2,28
2,8,7,1,1,1,2,2,2,1,1,33,35,-2,1,0,0,0,0,4,3,37,31,4,0,30


In [29]:
# ==========================================
# 3A. Rename Engineered Columns (make human-readable)
# ==========================================

rename_map = {
    "w_P": "white_pawns", "b_P": "black_pawns",
    "w_N": "white_knights", "b_N": "black_knights",
    "w_B": "white_bishops", "b_B": "black_bishops",
    "w_R": "white_rooks", "b_R": "black_rooks",
    "w_Q": "white_queens", "b_Q": "black_queens",
    "material_white": "white_material_value",
    "material_black": "black_material_value",
    "material_balance": "material_balance_white_minus_black",
    "stm_white": "side_to_move_is_white",
    "wk_castle": "white_can_castle_kingside",
    "wq_castle": "white_can_castle_queenside",
    "bk_castle": "black_can_castle_kingside",
    "bq_castle": "black_can_castle_queenside",
    "w_centre_ctrl": "white_centre_control",
    "b_centre_ctrl": "black_centre_control",
    "w_mobility": "white_mobility_legal_moves",
    "b_mobility": "black_mobility_legal_moves",
    "captures_upto30": "captures_in_first_15_moves",
    "checks_upto30": "checks_in_first_15_moves",
    "ply_processed": "plies_processed"
}

feat_df = feat_df.rename(columns=rename_map)
print("Renamed feature columns.")
display(feat_df.head(3))

Renamed feature columns.


Unnamed: 0,white_pawns,black_pawns,white_knights,black_knights,white_bishops,black_bishops,white_rooks,black_rooks,white_queens,black_queens,white_material_value,black_material_value,material_balance_white_minus_black,side_to_move_is_white,white_can_castle_kingside,white_can_castle_queenside,black_can_castle_kingside,black_can_castle_queenside,white_centre_control,black_centre_control,white_mobility_legal_moves,black_mobility_legal_moves,captures_in_first_15_moves,checks_in_first_15_moves,plies_processed
0,5,6,2,1,2,2,2,2,1,1,36,34,2,1,0,0,1,0,4,4,48,46,6,0,30
1,6,7,2,1,1,2,2,2,1,1,34,35,-1,1,0,0,0,0,1,3,46,29,5,2,28
2,8,7,1,1,1,2,2,2,1,1,33,35,-2,1,0,0,0,0,4,3,37,31,4,0,30


In [30]:
# ==========================================
# 3b. Store first-15 SAN moves (for interpretability / optional sequence models)
# ==========================================

def first15_san(s: str) -> str:
    if not isinstance(s, str):
        return ""
    toks = [t for t in s.replace("\n", " ").split(" ") if t and not t[0].isdigit()]
    return " ".join(toks[:30])  # 30 plies = 15 full moves

# Ensure we have the unified 'Moves' column already (AN → Moves done earlier)
assert "Moves" in df_raw.columns, "Expected 'Moves' column."
df_raw["moves_first15_san"] = df_raw["Moves"].fillna("").map(first15_san)

print("Example first-15 SAN:", df_raw["moves_first15_san"].iloc[0][:120], "...")


Example first-15 SAN: f4 d5 g3 c5 Bg2 Nc6 Nf3 Bf5 O-O e6 d3 g6 Be3 h5 Bf2 d4 e3 dxe3 Bxe3 Bg7 c3 b6 Ng5 Rc8 d4 cxd4 cxd4 Nxd4 Bxd4 Rc4 ...


In [31]:
# ==========================================
# 4. Add Pre-Game Features & Assemble Working Frame (no leakage)
# ==========================================

# Build X from safe, pre-game info + engineered first-15-move features
X = pd.DataFrame(index=df_raw.index)

# Ratings (pre-game)
for w_elo, b_elo in [("WhiteElo", "BlackElo"), ("white_rating", "black_rating")]:
    if {w_elo, b_elo} <= set(df_raw.columns):
        X["white_rating"] = pd.to_numeric(df_raw[w_elo], errors="coerce")
        X["black_rating"] = pd.to_numeric(df_raw[b_elo], errors="coerce")
        X["rating_diff"]  = X["white_rating"] - X["black_rating"]
        X["avg_rating"]   = (X["white_rating"] + X["black_rating"]) / 2
        break

# Date → calendar (pre-game)
for cand in ["Date", "date"]:
    if cand in df_raw.columns:
        dt = pd.to_datetime(df_raw[cand], errors="coerce")
        X["year"]      = dt.dt.year
        X["month"]     = dt.dt.month
        X["dayofweek"] = dt.dt.dayofweek
        break

# Rated flag normalisation (pre-game)
for cand in ["rated", "Rated"]:
    if cand in df_raw.columns:
        tmp = df_raw[cand].astype(str).str.strip().str.lower().map(
            {"true": 1, "1": 1, "yes": 1, "false": 0, "0": 0, "no": 0}
        )
        X["rated"] = tmp.fillna(0).astype("int8")
        break

# Merge engineered features from first 15 moves
X = pd.concat([X, feat_df], axis=1)

# 💡 Keep the truncated move sequence for interpretability / optional sequence models
# (not used in numeric features by default)
if "moves_first15_san" in df_raw.columns:
    X["moves_first15_san"] = df_raw["moves_first15_san"]

# Attach target label (do NOT include raw 'Result' in features)
X["target"] = df_raw["target"].astype(str)

print("Working frame shape:", X.shape)
display(X.head(3))

# For transparency, list typical leakage fields present in raw data (not used as features)
possible_leak = [
    "Result", "result", "winner", "Winner", "termination", "Termination",
    "num_moves", "NumMoves", "Opening", "opening", "ECO", "eco", "Moves", "AN"
]
present_leak = [c for c in possible_leak if c in df_raw.columns]
print("Leakage-related columns found in raw (excluded from features):", present_leak)

Working frame shape: (119868, 31)


Unnamed: 0,white_rating,black_rating,rating_diff,avg_rating,white_pawns,black_pawns,white_knights,black_knights,white_bishops,black_bishops,white_rooks,black_rooks,white_queens,black_queens,white_material_value,black_material_value,material_balance_white_minus_black,side_to_move_is_white,white_can_castle_kingside,white_can_castle_queenside,black_can_castle_kingside,black_can_castle_queenside,white_centre_control,black_centre_control,white_mobility_legal_moves,black_mobility_legal_moves,captures_in_first_15_moves,checks_in_first_15_moves,plies_processed,moves_first15_san,target
0,1812,1803,9,1807.5,5,6,2,1,2,2,2,2,1,1,36,34,2,1,0,0,1,0,4,4,48,46,6,0,30,f4 d5 g3 c5 Bg2 Nc6 Nf3 Bf5 O-O e6 d3 g6 Be3 h...,white
1,1840,1791,49,1815.5,6,7,2,1,1,2,2,2,1,1,34,35,-1,1,0,0,0,0,1,3,46,29,5,2,28,e4 b6 d4 Bb7 Bd3 e6 f4 d6 Nf3 h6 O-O a6 Qe2 Ne...,white
2,2193,1782,411,1987.5,8,7,1,1,1,2,2,2,1,1,33,35,-2,1,0,0,0,0,4,3,37,31,4,0,30,d4 Nf6 c4 e6 Nc3 c5 d5 d6 e3 e5 Nf3 Be7 Bd3 Nb...,black


Leakage-related columns found in raw (excluded from features): ['Result', 'Termination', 'Opening', 'ECO', 'Moves']


In [32]:
# ==========================================
# 5. Assemble Working Frame, Impute, Downcast
# ==========================================

# Build X from safe pre-game info you already created + engineered move features
X = pd.DataFrame(index=df_raw.index)

# Ratings
if {"WhiteElo", "BlackElo"} <= set(df_raw.columns):
    X["white_rating"] = pd.to_numeric(df_raw["WhiteElo"], errors="coerce")
    X["black_rating"] = pd.to_numeric(df_raw["BlackElo"], errors="coerce")
    X["rating_diff"]  = X["white_rating"] - X["black_rating"]
    X["avg_rating"]   = (X["white_rating"] + X["black_rating"]) / 2

# Date → calendar
for cand in ["Date", "date"]:
    if cand in df_raw.columns:
        dt = pd.to_datetime(df_raw[cand], errors="coerce")
        X["year"]      = dt.dt.year
        X["month"]     = dt.dt.month
        X["dayofweek"] = dt.dt.dayofweek
        break

# Rated → clean 0/1
for cand in ["rated", "Rated"]:
    if cand in df_raw.columns:
        tmp = df_raw[cand].astype(str).str.strip().str.lower().map(
            {"true":1,"1":1,"yes":1,"false":0,"0":0,"no":0}
        )
        X["rated"] = tmp.fillna(0).astype("int8")
        break

# Merge engineered features from first 15 moves
X = pd.concat([X, feat_df], axis=1)

# Attach target last
X["target"] = df_raw["target"].astype(str)

# --- Impute numerics (median) ---
num_cols = [c for c in X.columns if c != "target" and pd.api.types.is_numeric_dtype(X[c])]
for c in num_cols:
    if X[c].isna().any():
        X[c] = X[c].fillna(X[c].median())

# --- Downcast numeric types to keep file size small ---
for c in num_cols:
    if pd.api.types.is_float_dtype(X[c]):
        X[c] = pd.to_numeric(X[c], downcast="float")
    elif pd.api.types.is_integer_dtype(X[c]):
        X[c] = pd.to_numeric(X[c], downcast="integer")

print("Working frame (post-impute/downcast) shape:", X.shape)
display(X.head(3))

Working frame (post-impute/downcast) shape: (119868, 30)


Unnamed: 0,white_rating,black_rating,rating_diff,avg_rating,white_pawns,black_pawns,white_knights,black_knights,white_bishops,black_bishops,white_rooks,black_rooks,white_queens,black_queens,white_material_value,black_material_value,material_balance_white_minus_black,side_to_move_is_white,white_can_castle_kingside,white_can_castle_queenside,black_can_castle_kingside,black_can_castle_queenside,white_centre_control,black_centre_control,white_mobility_legal_moves,black_mobility_legal_moves,captures_in_first_15_moves,checks_in_first_15_moves,plies_processed,target
0,1812,1803,9,1807.5,5,6,2,1,2,2,2,2,1,1,36,34,2,1,0,0,1,0,4,4,48,46,6,0,30,white
1,1840,1791,49,1815.5,6,7,2,1,1,2,2,2,1,1,34,35,-1,1,0,0,0,0,1,3,46,29,5,2,28,white
2,2193,1782,411,1987.5,8,7,1,1,1,2,2,2,1,1,33,35,-2,1,0,0,0,0,4,3,37,31,4,0,30,black


In [34]:
# ==========================================
# 6. Sanity Checks (Leakage / Balance / NAs)
# ==========================================

# 1) Ensure no leaky columns snuck into features
#    (We allow 'moves_first15_san' because it's truncated at 15 moves and kept mainly for interpretability.)
banned = {
    "Result", "result", "winner", "Winner", "termination", "Termination",
    "num_moves", "NumMoves", "Opening", "opening", "ECO", "eco", "Moves", "AN"
}
present_banned = [c for c in X.columns if c in banned]
assert not present_banned, f"Leaky columns present in features: {present_banned}"

print("Has moves_first15_san:", "moves_first15_san" in X.columns)

# 2) Class balance
print("\nTarget distribution (counts):")
display(X["target"].value_counts())
print("Target distribution (percent):")
display((X["target"].value_counts(normalize=True) * 100).round(2))

# 3) NA audit
na_counts = X.isna().sum()
na_counts = na_counts[na_counts > 0]
if not na_counts.empty:
    print("\nColumns with missing values after imputation:")
    display(na_counts)
else:
    print("\nNo missing values remaining")

Has moves_first15_san: False

Target distribution (counts):


target
white    59875
black    55329
draw      4640
nan         24
Name: count, dtype: int64

Target distribution (percent):


target
white    49.95
black    46.16
draw      3.87
nan       0.02
Name: proportion, dtype: float64


No missing values remaining


In [36]:
# ==========================================
# 7. Save Clean Dataset + Metadata (to data/)
# ==========================================

CLEAN_CSV  = DATA_DIR / "clean_chess_dataset.csv"   # name makes purpose explicit
META_JSON  = DATA_DIR / "clean_chess_dataset_meta.json"

X.to_csv(CLEAN_CSV, index=False)
size_mb = CLEAN_CSV.stat().st_size / (1024**2)
print(f"Saved: {CLEAN_CSV}")
print(f"File size: {size_mb:.2f} MB (target < 100 MB)")

meta = {
    "target_col": "target",
    "n_rows": int(X.shape[0]),
    "n_cols": int(X.shape[1]),
    "feature_names": list(X.columns),
    "notes": "Features include pre-game ratings/date/rated + board/state features from first 15 moves (ply=30). No ECO/opening raw text or post-game fields included."
}
with open(META_JSON, "w", encoding="utf-8") as f:
    json.dump(meta, f, indent=2)
print(f"Metadata saved: {META_JSON}")

Saved: E:\Github Projects\chess-outcome-prediction\data\clean_chess_dataset.csv
File size: 9.52 MB (target < 100 MB)
Metadata saved: E:\Github Projects\chess-outcome-prediction\data\clean_chess_dataset_meta.json
