In [3]:
# ==========================================
# CHESS OUTCOME PREDICTION — PREPROCESSING
# ==========================================

# --- Imports (everything used in this notebook lives here) ---
from pathlib import Path
import os
import json
import numpy as np
import pandas as pd
import warnings

warnings.filterwarnings("ignore")
pd.set_option("display.max_columns", 200)

# --- Paths ---
NB_DIR = Path.cwd()
CANDIDATES = [
    NB_DIR / "../data/raw_chess_games.csv",
    NB_DIR / "../../data/raw_chess_games.csv",
    NB_DIR / "data/raw_chess_games.csv",
]

RAW_CSV = next((p.resolve() for p in CANDIDATES if p.exists()), None)
if RAW_CSV is None:
    raise FileNotFoundError(
        "Could not find raw_chess_games.csv. Expected it under ../data/ relative to this notebook."
    )

RESULTS_DIR = (NB_DIR / "../results").resolve()
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

# --- Load ---
df = pd.read_csv(RAW_CSV)
print(f"Loaded: {RAW_CSV}")
print("Initial shape:", df.shape)
df.head()

Loaded: E:\Github Projects\chess-outcome-prediction\data\raw_chess_games.csv
Initial shape: (214980, 23)


Unnamed: 0,game_id,white_username,black_username,white_rating,black_rating,white_country,black_country,result,termination,time_control,time_class,initial_time,increment,date,eco,opening,num_moves,rated,event,source,avg_rating,rating_diff,time_category
0,f57a1313-0f25-11f0-8731-6cfe544c0428,nihalsarin,Hikaru,3222,3307,nihalsarin,hikaru,1-0,checkmate,180,blitz,3.0,0.0,2025-04-01,https://www.chess.com/openings/Modern-Defense-...,,312.0,True,,chess.com,3264.5,-85.0,blitz
1,d2b8e5d3-0f23-11f0-8731-6cfe544c0428,Hikaru,nihalsarin,3330,3199,hikaru,nihalsarin,1-0,checkmate,180,blitz,3.0,0.0,2025-04-01,https://www.chess.com/openings/Nimzowitsch-Lar...,,320.0,True,,chess.com,3264.5,131.0,blitz
2,70b20004-0f27-11f0-8731-6cfe544c0428,nihalsarin,Hikaru,3226,3303,nihalsarin,hikaru,1-0,checkmate,180,blitz,3.0,0.0,2025-04-01,https://www.chess.com/openings/Torre-Attack-Fi...,,334.0,True,,chess.com,3264.5,-77.0,blitz
3,35b89065-0f25-11f0-8731-6cfe544c0428,Hikaru,nihalsarin,3317,3212,hikaru,nihalsarin,1/2-1/2,repetition,180,blitz,3.0,0.0,2025-04-01,https://www.chess.com/openings/Bishops-Opening...,,274.0,True,,chess.com,3264.5,105.0,blitz
4,23a8b502-0f29-11f0-8731-6cfe544c0428,nihalsarin,Hikaru,3214,3315,nihalsarin,hikaru,1/2-1/2,resignation,180,blitz,3.0,0.0,2025-04-01,https://www.chess.com/openings/Torre-Attack......,,496.0,True,,chess.com,3264.5,-101.0,blitz


In [4]:
# ==========================================
# 2. Inspect Dataset Info
# ==========================================

# If this cell is run before Cell 1, reload the dataset for safety
if "df" not in globals():
    df = pd.read_csv(RAW_CSV)
    print(f"(Re)loaded: {RAW_CSV}")

print("Shape:", df.shape)
df.info()
display(df.head(10))
display(df.describe(include="all").transpose().head(50))

Shape: (214980, 23)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 214980 entries, 0 to 214979
Data columns (total 23 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   game_id         214980 non-null  object 
 1   white_username  214980 non-null  object 
 2   black_username  214980 non-null  object 
 3   white_rating    214980 non-null  int64  
 4   black_rating    214980 non-null  int64  
 5   white_country   200047 non-null  object 
 6   black_country   200047 non-null  object 
 7   result          214980 non-null  object 
 8   termination     200047 non-null  object 
 9   time_control    200047 non-null  object 
 10  time_class      200047 non-null  object 
 11  initial_time    200047 non-null  float64
 12  increment       200047 non-null  float64
 13  date            200047 non-null  object 
 14  eco             197327 non-null  object 
 15  opening         0 non-null       float64
 16  num_moves       200047 non-null  flo

Unnamed: 0,game_id,white_username,black_username,white_rating,black_rating,white_country,black_country,result,termination,time_control,time_class,initial_time,increment,date,eco,opening,num_moves,rated,event,source,avg_rating,rating_diff,time_category
0,f57a1313-0f25-11f0-8731-6cfe544c0428,nihalsarin,Hikaru,3222,3307,nihalsarin,hikaru,1-0,checkmate,180,blitz,3.0,0.0,2025-04-01,https://www.chess.com/openings/Modern-Defense-...,,312.0,True,,chess.com,3264.5,-85.0,blitz
1,d2b8e5d3-0f23-11f0-8731-6cfe544c0428,Hikaru,nihalsarin,3330,3199,hikaru,nihalsarin,1-0,checkmate,180,blitz,3.0,0.0,2025-04-01,https://www.chess.com/openings/Nimzowitsch-Lar...,,320.0,True,,chess.com,3264.5,131.0,blitz
2,70b20004-0f27-11f0-8731-6cfe544c0428,nihalsarin,Hikaru,3226,3303,nihalsarin,hikaru,1-0,checkmate,180,blitz,3.0,0.0,2025-04-01,https://www.chess.com/openings/Torre-Attack-Fi...,,334.0,True,,chess.com,3264.5,-77.0,blitz
3,35b89065-0f25-11f0-8731-6cfe544c0428,Hikaru,nihalsarin,3317,3212,hikaru,nihalsarin,1/2-1/2,repetition,180,blitz,3.0,0.0,2025-04-01,https://www.chess.com/openings/Bishops-Opening...,,274.0,True,,chess.com,3264.5,105.0,blitz
4,23a8b502-0f29-11f0-8731-6cfe544c0428,nihalsarin,Hikaru,3214,3315,nihalsarin,hikaru,1/2-1/2,resignation,180,blitz,3.0,0.0,2025-04-01,https://www.chess.com/openings/Torre-Attack......,,496.0,True,,chess.com,3264.5,-101.0,blitz
5,d1939b12-0f2a-11f0-8731-6cfe544c0428,nihalsarin,Hikaru,3211,3318,nihalsarin,hikaru,1/2-1/2,unknown,180,blitz,3.0,0.0,2025-04-01,https://www.chess.com/openings/Caro-Kann-Defen...,,572.0,True,,chess.com,3264.5,-107.0,blitz
6,bb63d0f6-0f26-11f0-8731-6cfe544c0428,Hikaru,nihalsarin,3313,3216,hikaru,nihalsarin,1-0,checkmate,180,blitz,3.0,0.0,2025-04-01,https://www.chess.com/openings/Nimzowitsch-Lar...,,358.0,True,,chess.com,3264.5,97.0,blitz
7,050015c6-0f2a-11f0-8731-6cfe544c0428,Hikaru,nihalsarin,3321,3208,hikaru,nihalsarin,1-0,checkmate,180,blitz,3.0,0.0,2025-04-01,https://www.chess.com/openings/Trompowsky-Atta...,,486.0,True,,chess.com,3264.5,113.0,blitz
8,38496dee-0f28-11f0-8731-6cfe544c0428,Hikaru,nihalsarin,3309,3220,hikaru,nihalsarin,1-0,checkmate,180,blitz,3.0,0.0,2025-04-01,https://www.chess.com/openings/Reti-Opening-Ni...,,776.0,True,,chess.com,3264.5,89.0,blitz
9,97019d9d-0f24-11f0-8731-6cfe544c0428,nihalsarin,Hikaru,3210,3319,nihalsarin,hikaru,1-0,checkmate,180,blitz,3.0,0.0,2025-04-01,https://www.chess.com/openings/Indian-Game-Sla...,,264.0,True,,chess.com,3264.5,-109.0,blitz


Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
game_id,214980.0,214980.0,f57a1313-0f25-11f0-8731-6cfe544c0428,1.0,,,,,,,
white_username,214980.0,41650.0,francyIM,263.0,,,,,,,
black_username,214980.0,41611.0,francyIM,269.0,,,,,,,
white_rating,214980.0,,,,2409.71556,335.325708,348.0,2209.0,2445.0,2637.0,3517.0
black_rating,214980.0,,,,2408.870955,335.117792,112.0,2209.0,2444.0,2636.0,3400.0
white_country,200047.0,40686.0,dpopadic,248.0,,,,,,,
black_country,200047.0,40612.0,francyim,252.0,,,,,,,
result,214980.0,15.0,1-0,96830.0,,,,,,,
termination,200047.0,8.0,checkmate,112899.0,,,,,,,
time_control,200047.0,89.0,180,86198.0,,,,,,,


In [5]:
# ==========================================
# 3. Define Target + Drop IDs/Leakage Columns
# ==========================================

# Map results to target labels (adjust to match your CSV’s result column)
if "result" in df.columns:
    result_map = {"1-0": "white", "0-1": "black", "1/2-1/2": "draw"}
    df["target"] = df["result"].map(result_map)

drop_cols = [
    "game_id", "white_username", "black_username",
    "white_country", "black_country",
    "num_moves", "termination", "opening", "eco"
]

df = df.drop(columns=[c for c in drop_cols if c in df.columns], errors="ignore")

print("After dropping ID/leakage cols:", df.shape)
df.head(3)

After dropping ID/leakage cols: (214980, 15)


Unnamed: 0,white_rating,black_rating,result,time_control,time_class,initial_time,increment,date,rated,event,source,avg_rating,rating_diff,time_category,target
0,3222,3307,1-0,180,blitz,3.0,0.0,2025-04-01,True,,chess.com,3264.5,-85.0,blitz,white
1,3330,3199,1-0,180,blitz,3.0,0.0,2025-04-01,True,,chess.com,3264.5,131.0,blitz,white
2,3226,3303,1-0,180,blitz,3.0,0.0,2025-04-01,True,,chess.com,3264.5,-77.0,blitz,white


In [6]:
# ==========================================
# 4. Add Calendar Features + Ratings
# ==========================================

# Convert date → calendar fields
if "date" in df.columns:
    df["date"] = pd.to_datetime(df["date"], errors="coerce")
    df["year"] = df["date"].dt.year
    df["month"] = df["date"].dt.month
    df["dayofweek"] = df["date"].dt.dayofweek
    df = df.drop(columns=["date"])

# Rating-based features
if {"white_rating", "black_rating"} <= set(df.columns):
    df["rating_diff"] = df["white_rating"] - df["black_rating"]
    df["avg_rating"] = (df["white_rating"] + df["black_rating"]) / 2

# Convert booleans to integers
if "rated" in df.columns and df["rated"].dtype == bool:
    df["rated"] = df["rated"].astype(int)

print("Columns now:", list(df.columns))
df.head(3)

Columns now: ['white_rating', 'black_rating', 'result', 'time_control', 'time_class', 'initial_time', 'increment', 'rated', 'event', 'source', 'avg_rating', 'rating_diff', 'time_category', 'target', 'year', 'month', 'dayofweek']


Unnamed: 0,white_rating,black_rating,result,time_control,time_class,initial_time,increment,rated,event,source,avg_rating,rating_diff,time_category,target,year,month,dayofweek
0,3222,3307,1-0,180,blitz,3.0,0.0,True,,chess.com,3264.5,-85,blitz,white,2025.0,4.0,1.0
1,3330,3199,1-0,180,blitz,3.0,0.0,True,,chess.com,3264.5,131,blitz,white,2025.0,4.0,1.0
2,3226,3303,1-0,180,blitz,3.0,0.0,True,,chess.com,3264.5,-77,blitz,white,2025.0,4.0,1.0


In [7]:
# ==========================================
# 5. Handle Missing Values + Encode Categoricals
# ==========================================

target_col = "target"
feature_cols = [c for c in df.columns if c not in {target_col, "result"}]

cat_cols = [c for c in feature_cols if df[c].dtype == "object"]
num_cols = [c for c in feature_cols if c not in cat_cols]

# Impute numerics
for c in num_cols:
    if df[c].isna().any():
        df[c] = df[c].fillna(df[c].median())

# Impute categoricals with "Unknown"
for c in cat_cols:
    df[c] = df[c].fillna("Unknown")

# One-hot encode categoricals
df_encoded = pd.get_dummies(df, columns=cat_cols, drop_first=True)

print("Encoded shape:", df_encoded.shape)
print("Target distribution:")
display(df_encoded[target_col].value_counts(dropna=False))
df_encoded.head(3)

Encoded shape: (214980, 1894)
Target distribution:


target
white    96830
draw     60817
black    42400
NaN      14933
Name: count, dtype: int64

Unnamed: 0,white_rating,black_rating,result,initial_time,increment,avg_rating,rating_diff,target,year,month,dayofweek,time_control_1/172800,time_control_1/259200,time_control_1/432000,time_control_1/604800,time_control_1/86400,time_control_1/864000,time_control_10,time_control_10+0.1,time_control_10+1,time_control_1080+2,time_control_120,time_control_120+1,time_control_120+2,time_control_120+3,time_control_1200,time_control_1200+10,time_control_1200+15,time_control_1200+2,time_control_1200+20,time_control_1200+5,time_control_121,time_control_1220+10,time_control_1220+5,time_control_15,time_control_15+5,time_control_1500+10,time_control_1500+5,time_control_1510,time_control_1560+10,time_control_180,time_control_180+1,time_control_180+2,time_control_180+3,time_control_1800,time_control_182,time_control_185,time_control_20,time_control_20+1,time_control_240,time_control_240+1,time_control_240+2,time_control_2400,time_control_2400+5,time_control_2700,time_control_2700+15,time_control_2700+45,time_control_30,time_control_300,time_control_300+1,time_control_300+2,time_control_300+3,time_control_300+5,time_control_303,time_control_3600,time_control_3600+10,time_control_3600+15,time_control_3600+5,time_control_40,time_control_420+2,time_control_420+3,time_control_420+5,time_control_421+2,time_control_4500+5,time_control_480+5,time_control_5400,time_control_5400+30,time_control_546+4,time_control_6,time_control_60,time_control_60+1,time_control_60+2,time_control_60+3,time_control_60+6,time_control_600,time_control_600+10,time_control_600+2,time_control_600+3,time_control_600+5,time_control_602,time_control_61,time_control_62,time_control_7200,time_control_7200+60,time_control_900,time_control_900+10,time_control_900+2,time_control_900+3,time_control_900+5,time_control_Unknown,...,event_https://api.chess.com/pub/tournament/the-4th-flaming-tiger-online-blitz-event-1180933,event_https://api.chess.com/pub/tournament/the-accused-5283897,event_https://api.chess.com/pub/tournament/the-after-party-4443909,event_https://api.chess.com/pub/tournament/the-battle-of-helms-deep-1,event_https://api.chess.com/pub/tournament/the-best-1-2021,event_https://api.chess.com/pub/tournament/the-biggest-world-chess-championship-2025,event_https://api.chess.com/pub/tournament/the-new-tournament-welcome,event_https://api.chess.com/pub/tournament/the-qualifying-stage-1825362,event_https://api.chess.com/pub/tournament/the-queens-gambit-5-1,event_https://api.chess.com/pub/tournament/the-warsaw-rising-tournament-2024-1851-2150,event_https://api.chess.com/pub/tournament/this-sunday-we-have-a-tournament-with-prizes-439035,event_https://api.chess.com/pub/tournament/this-will-go-into-2022-and-beyond,event_https://api.chess.com/pub/tournament/titeld-tuesday--4140845,event_https://api.chess.com/pub/tournament/titled-tuesday-blitz-august-03-2021-2500049,event_https://api.chess.com/pub/tournament/titled-tuesday-blitz-august-17-2021-2528831,event_https://api.chess.com/pub/tournament/titled-tuesday-blitz-august-31-2021-2546688,event_https://api.chess.com/pub/tournament/titled-tuesday-blitz-december-07-2021-2779114,event_https://api.chess.com/pub/tournament/titled-tuesday-blitz-december-14-2021-2793478,event_https://api.chess.com/pub/tournament/titled-tuesday-blitz-december-21-2021-2825101,event_https://api.chess.com/pub/tournament/titled-tuesday-blitz-december-28-2021-2825102,event_https://api.chess.com/pub/tournament/titled-tuesday-blitz-february-01-2022-a-2930234,event_https://api.chess.com/pub/tournament/titled-tuesday-blitz-february-01-2022-b-2930235,event_https://api.chess.com/pub/tournament/titled-tuesday-blitz-january-04-2022-2854823,event_https://api.chess.com/pub/tournament/titled-tuesday-blitz-january-11-2022-2868439,event_https://api.chess.com/pub/tournament/titled-tuesday-blitz-january-18-2022-2892377,event_https://api.chess.com/pub/tournament/titled-tuesday-blitz-january-25-2022-2906225,event_https://api.chess.com/pub/tournament/titled-tuesday-blitz-july-06-2021-2443189,event_https://api.chess.com/pub/tournament/titled-tuesday-blitz-july-13-2021-2457070,event_https://api.chess.com/pub/tournament/titled-tuesday-blitz-july-20-2021-2471236,event_https://api.chess.com/pub/tournament/titled-tuesday-blitz-june-01-2021-2359358,event_https://api.chess.com/pub/tournament/titled-tuesday-blitz-june-08-2021-2384337,event_https://api.chess.com/pub/tournament/titled-tuesday-blitz-june-15-2021-2399019,event_https://api.chess.com/pub/tournament/titled-tuesday-blitz-june-22-2021-2413621,event_https://api.chess.com/pub/tournament/titled-tuesday-blitz-june-29-2021-2428655,event_https://api.chess.com/pub/tournament/titled-tuesday-blitz-may-11-2021-2314015,event_https://api.chess.com/pub/tournament/titled-tuesday-blitz-may-18-2021-2319376,event_https://api.chess.com/pub/tournament/titled-tuesday-blitz-may-25-2021-2333682,event_https://api.chess.com/pub/tournament/titled-tuesday-blitz-may-4-2021-2288851,event_https://api.chess.com/pub/tournament/titled-tuesday-blitz-november-02-2021-2688099,event_https://api.chess.com/pub/tournament/titled-tuesday-blitz-november-09-2021-2710127,event_https://api.chess.com/pub/tournament/titled-tuesday-blitz-november-16-2021-2736618,event_https://api.chess.com/pub/tournament/titled-tuesday-blitz-november-23-2021-2750236,event_https://api.chess.com/pub/tournament/titled-tuesday-blitz-november-30-2021-2764222,event_https://api.chess.com/pub/tournament/titled-tuesday-blitz-october-05-2021-2640054,event_https://api.chess.com/pub/tournament/titled-tuesday-blitz-october-12-2021-2654644,event_https://api.chess.com/pub/tournament/titled-tuesday-blitz-october-19-2021-2669205,event_https://api.chess.com/pub/tournament/titled-tuesday-blitz-october-26-2021-2673117,event_https://api.chess.com/pub/tournament/titled-tuesday-blitz-september-02-2025-5905663,event_https://api.chess.com/pub/tournament/titled-tuesday-blitz-september-07-2021-2562438,event_https://api.chess.com/pub/tournament/titled-tuesday-blitz-september-09-2025-5905665,event_https://api.chess.com/pub/tournament/titled-tuesday-blitz-september-16-2025-5925403,event_https://api.chess.com/pub/tournament/torneios-200k-gm-evandro-4409363,event_https://api.chess.com/pub/tournament/torneo-1-ajedrez-503-1246781,event_https://api.chess.com/pub/tournament/torneo-abierto-47-aniversario-universidad-nacional-de-entre-ros-1222837,event_https://api.chess.com/pub/tournament/torneo-blitz-v-san-salvador--3247207,event_https://api.chess.com/pub/tournament/torneo-chesschocomath-blitz-5909687,event_https://api.chess.com/pub/tournament/torneo-con-los-maestros-1538828,event_https://api.chess.com/pub/tournament/torneo-con-premios-3070984,event_https://api.chess.com/pub/tournament/torneo-de-ajedrez-en-lnea-jden-2024-etapa-regional---regin-central-masculino--4816110,event_https://api.chess.com/pub/tournament/torneo-de-invierno-9,event_https://api.chess.com/pub/tournament/torneo-fenamac-1596399,event_https://api.chess.com/pub/tournament/torneo-matecitos--3562545,event_https://api.chess.com/pub/tournament/torneo-online-pergamino-3807923,event_https://api.chess.com/pub/tournament/torneo-por-diversion-214186,event_https://api.chess.com/pub/tournament/torneo-prueba-5,event_https://api.chess.com/pub/tournament/torneo-solidario-jorge-fernndez--3979237,event_https://api.chess.com/pub/tournament/torneo-solidario-jorge-fernndez-4053995,event_https://api.chess.com/pub/tournament/torneo-torre-blanca-domingo-22-3-18-hs-1163961,event_https://api.chess.com/pub/tournament/torre-blanca-blitz-del-lunes-23-de-marzo-18-hs-1164834,event_https://api.chess.com/pub/tournament/torre-blanca-de-madrugada-1163742,event_https://api.chess.com/pub/tournament/tournament-00000000385bc3ca0000000000125ca4,event_https://api.chess.com/pub/tournament/tournament-for-honest-players,event_https://api.chess.com/pub/tournament/tournoi-1b,event_https://api.chess.com/pub/tournament/tsct-2022-amateur-blitz-1659334,event_https://api.chess.com/pub/tournament/ttb-open-2020-1,event_https://api.chess.com/pub/tournament/turnaj-pratelstvi,event_https://api.chess.com/pub/tournament/uefa-nations-league-2022-23-germany-1,event_https://api.chess.com/pub/tournament/uji-coba-xclusive-challenge-2320265,event_https://api.chess.com/pub/tournament/upasya-039-s-tournament,event_https://api.chess.com/pub/tournament/us-chess-regular-open-5176709,event_https://api.chess.com/pub/tournament/v-torneo-krakens-1311346,event_https://api.chess.com/pub/tournament/vidovdanski-kup-2023-vidovdan-cup-2023,event_https://api.chess.com/pub/tournament/women-in-chess-foundation---charity-stream-3920711,event_https://api.chess.com/pub/tournament/womens-scc-qualifier-11-1273144,event_https://api.chess.com/pub/tournament/womens-scc-qualifier-31-1273143,event_https://api.chess.com/pub/tournament/womens-speed-chess-qualifier-1272484,event_https://api.chess.com/pub/tournament/world-chess-league-summer-practice-arena-august-31st-4399903,event_https://api.chess.com/pub/tournament/wscc-play-in-1-4977157,event_https://api.chess.com/pub/tournament/wscc-play-in-2-4977159,event_https://api.chess.com/pub/tournament/wscc-play-in-3-4977161,event_https://api.chess.com/pub/tournament/wscc-play-in-4-4977163,event_https://api.chess.com/pub/tournament/wscc-qualifier-2-3165382,event_https://api.chess.com/pub/tournament/wscc-qualifier-4361393,event_https://api.chess.com/pub/tournament/wscc-qualifier-7-2356652,event_https://api.chess.com/pub/tournament/xclusive-challenge-seri-2-u12-2381310,event_https://api.chess.com/pub/tournament/zinzino-internetowe-mistrzostwa-polski-1173898,time_category_blitz,time_category_bullet,time_category_daily,time_category_rapid
0,3222,3307,1-0,3.0,0.0,3264.5,-85,white,2025.0,4.0,1.0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False
1,3330,3199,1-0,3.0,0.0,3264.5,131,white,2025.0,4.0,1.0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False
2,3226,3303,1-0,3.0,0.0,3264.5,-77,white,2025.0,4.0,1.0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False


In [8]:
# ==========================================
# 6. Leakage Sanity Check
# ==========================================

banlist = {"num_moves", "termination", "opening", "eco"}
present_banned = [c for c in df_encoded.columns if c in banlist]
if present_banned:
    df_encoded = df_encoded.drop(columns=present_banned, errors="ignore")
    print("Removed banned columns:", present_banned)

print("Final feature count (incl. target):", df_encoded.shape[1])

Final feature count (incl. target): 1894


In [None]:
# ==========================================
# 7. Save Clean Dataset + Metadata (to data/)
# ==========================================

DATA_DIR = (NB_DIR / "../data").resolve()
DATA_DIR.mkdir(parents=True, exist_ok=True)

clean_csv = DATA_DIR / "clean_chess_games.csv"
df_encoded.to_csv(clean_csv, index=False)

meta = {
    "target_col": target_col,
    "n_rows": int(df_encoded.shape[0]),
    "n_cols": int(df_encoded.shape[1]),
}
with open(DATA_DIR / "clean_metadata.json", "w", encoding="utf-8") as f:
    json.dump(meta, f, indent=2)

print(f"✅ Saved cleaned dataset to: {clean_csv}")
print(f"📝 Saved metadata to: {DATA_DIR / 'clean_metadata.json'}")
df_encoded.head()

✅ Saved cleaned dataset to: E:\Github Projects\chess-outcome-prediction\results\clean_chess_games.csv
📝 Saved metadata to: E:\Github Projects\chess-outcome-prediction\results\clean_metadata.json


Unnamed: 0,white_rating,black_rating,result,initial_time,increment,avg_rating,rating_diff,target,year,month,dayofweek,time_control_1/172800,time_control_1/259200,time_control_1/432000,time_control_1/604800,time_control_1/86400,time_control_1/864000,time_control_10,time_control_10+0.1,time_control_10+1,time_control_1080+2,time_control_120,time_control_120+1,time_control_120+2,time_control_120+3,time_control_1200,time_control_1200+10,time_control_1200+15,time_control_1200+2,time_control_1200+20,time_control_1200+5,time_control_121,time_control_1220+10,time_control_1220+5,time_control_15,time_control_15+5,time_control_1500+10,time_control_1500+5,time_control_1510,time_control_1560+10,time_control_180,time_control_180+1,time_control_180+2,time_control_180+3,time_control_1800,time_control_182,time_control_185,time_control_20,time_control_20+1,time_control_240,time_control_240+1,time_control_240+2,time_control_2400,time_control_2400+5,time_control_2700,time_control_2700+15,time_control_2700+45,time_control_30,time_control_300,time_control_300+1,time_control_300+2,time_control_300+3,time_control_300+5,time_control_303,time_control_3600,time_control_3600+10,time_control_3600+15,time_control_3600+5,time_control_40,time_control_420+2,time_control_420+3,time_control_420+5,time_control_421+2,time_control_4500+5,time_control_480+5,time_control_5400,time_control_5400+30,time_control_546+4,time_control_6,time_control_60,time_control_60+1,time_control_60+2,time_control_60+3,time_control_60+6,time_control_600,time_control_600+10,time_control_600+2,time_control_600+3,time_control_600+5,time_control_602,time_control_61,time_control_62,time_control_7200,time_control_7200+60,time_control_900,time_control_900+10,time_control_900+2,time_control_900+3,time_control_900+5,time_control_Unknown,...,event_https://api.chess.com/pub/tournament/the-4th-flaming-tiger-online-blitz-event-1180933,event_https://api.chess.com/pub/tournament/the-accused-5283897,event_https://api.chess.com/pub/tournament/the-after-party-4443909,event_https://api.chess.com/pub/tournament/the-battle-of-helms-deep-1,event_https://api.chess.com/pub/tournament/the-best-1-2021,event_https://api.chess.com/pub/tournament/the-biggest-world-chess-championship-2025,event_https://api.chess.com/pub/tournament/the-new-tournament-welcome,event_https://api.chess.com/pub/tournament/the-qualifying-stage-1825362,event_https://api.chess.com/pub/tournament/the-queens-gambit-5-1,event_https://api.chess.com/pub/tournament/the-warsaw-rising-tournament-2024-1851-2150,event_https://api.chess.com/pub/tournament/this-sunday-we-have-a-tournament-with-prizes-439035,event_https://api.chess.com/pub/tournament/this-will-go-into-2022-and-beyond,event_https://api.chess.com/pub/tournament/titeld-tuesday--4140845,event_https://api.chess.com/pub/tournament/titled-tuesday-blitz-august-03-2021-2500049,event_https://api.chess.com/pub/tournament/titled-tuesday-blitz-august-17-2021-2528831,event_https://api.chess.com/pub/tournament/titled-tuesday-blitz-august-31-2021-2546688,event_https://api.chess.com/pub/tournament/titled-tuesday-blitz-december-07-2021-2779114,event_https://api.chess.com/pub/tournament/titled-tuesday-blitz-december-14-2021-2793478,event_https://api.chess.com/pub/tournament/titled-tuesday-blitz-december-21-2021-2825101,event_https://api.chess.com/pub/tournament/titled-tuesday-blitz-december-28-2021-2825102,event_https://api.chess.com/pub/tournament/titled-tuesday-blitz-february-01-2022-a-2930234,event_https://api.chess.com/pub/tournament/titled-tuesday-blitz-february-01-2022-b-2930235,event_https://api.chess.com/pub/tournament/titled-tuesday-blitz-january-04-2022-2854823,event_https://api.chess.com/pub/tournament/titled-tuesday-blitz-january-11-2022-2868439,event_https://api.chess.com/pub/tournament/titled-tuesday-blitz-january-18-2022-2892377,event_https://api.chess.com/pub/tournament/titled-tuesday-blitz-january-25-2022-2906225,event_https://api.chess.com/pub/tournament/titled-tuesday-blitz-july-06-2021-2443189,event_https://api.chess.com/pub/tournament/titled-tuesday-blitz-july-13-2021-2457070,event_https://api.chess.com/pub/tournament/titled-tuesday-blitz-july-20-2021-2471236,event_https://api.chess.com/pub/tournament/titled-tuesday-blitz-june-01-2021-2359358,event_https://api.chess.com/pub/tournament/titled-tuesday-blitz-june-08-2021-2384337,event_https://api.chess.com/pub/tournament/titled-tuesday-blitz-june-15-2021-2399019,event_https://api.chess.com/pub/tournament/titled-tuesday-blitz-june-22-2021-2413621,event_https://api.chess.com/pub/tournament/titled-tuesday-blitz-june-29-2021-2428655,event_https://api.chess.com/pub/tournament/titled-tuesday-blitz-may-11-2021-2314015,event_https://api.chess.com/pub/tournament/titled-tuesday-blitz-may-18-2021-2319376,event_https://api.chess.com/pub/tournament/titled-tuesday-blitz-may-25-2021-2333682,event_https://api.chess.com/pub/tournament/titled-tuesday-blitz-may-4-2021-2288851,event_https://api.chess.com/pub/tournament/titled-tuesday-blitz-november-02-2021-2688099,event_https://api.chess.com/pub/tournament/titled-tuesday-blitz-november-09-2021-2710127,event_https://api.chess.com/pub/tournament/titled-tuesday-blitz-november-16-2021-2736618,event_https://api.chess.com/pub/tournament/titled-tuesday-blitz-november-23-2021-2750236,event_https://api.chess.com/pub/tournament/titled-tuesday-blitz-november-30-2021-2764222,event_https://api.chess.com/pub/tournament/titled-tuesday-blitz-october-05-2021-2640054,event_https://api.chess.com/pub/tournament/titled-tuesday-blitz-october-12-2021-2654644,event_https://api.chess.com/pub/tournament/titled-tuesday-blitz-october-19-2021-2669205,event_https://api.chess.com/pub/tournament/titled-tuesday-blitz-october-26-2021-2673117,event_https://api.chess.com/pub/tournament/titled-tuesday-blitz-september-02-2025-5905663,event_https://api.chess.com/pub/tournament/titled-tuesday-blitz-september-07-2021-2562438,event_https://api.chess.com/pub/tournament/titled-tuesday-blitz-september-09-2025-5905665,event_https://api.chess.com/pub/tournament/titled-tuesday-blitz-september-16-2025-5925403,event_https://api.chess.com/pub/tournament/torneios-200k-gm-evandro-4409363,event_https://api.chess.com/pub/tournament/torneo-1-ajedrez-503-1246781,event_https://api.chess.com/pub/tournament/torneo-abierto-47-aniversario-universidad-nacional-de-entre-ros-1222837,event_https://api.chess.com/pub/tournament/torneo-blitz-v-san-salvador--3247207,event_https://api.chess.com/pub/tournament/torneo-chesschocomath-blitz-5909687,event_https://api.chess.com/pub/tournament/torneo-con-los-maestros-1538828,event_https://api.chess.com/pub/tournament/torneo-con-premios-3070984,event_https://api.chess.com/pub/tournament/torneo-de-ajedrez-en-lnea-jden-2024-etapa-regional---regin-central-masculino--4816110,event_https://api.chess.com/pub/tournament/torneo-de-invierno-9,event_https://api.chess.com/pub/tournament/torneo-fenamac-1596399,event_https://api.chess.com/pub/tournament/torneo-matecitos--3562545,event_https://api.chess.com/pub/tournament/torneo-online-pergamino-3807923,event_https://api.chess.com/pub/tournament/torneo-por-diversion-214186,event_https://api.chess.com/pub/tournament/torneo-prueba-5,event_https://api.chess.com/pub/tournament/torneo-solidario-jorge-fernndez--3979237,event_https://api.chess.com/pub/tournament/torneo-solidario-jorge-fernndez-4053995,event_https://api.chess.com/pub/tournament/torneo-torre-blanca-domingo-22-3-18-hs-1163961,event_https://api.chess.com/pub/tournament/torre-blanca-blitz-del-lunes-23-de-marzo-18-hs-1164834,event_https://api.chess.com/pub/tournament/torre-blanca-de-madrugada-1163742,event_https://api.chess.com/pub/tournament/tournament-00000000385bc3ca0000000000125ca4,event_https://api.chess.com/pub/tournament/tournament-for-honest-players,event_https://api.chess.com/pub/tournament/tournoi-1b,event_https://api.chess.com/pub/tournament/tsct-2022-amateur-blitz-1659334,event_https://api.chess.com/pub/tournament/ttb-open-2020-1,event_https://api.chess.com/pub/tournament/turnaj-pratelstvi,event_https://api.chess.com/pub/tournament/uefa-nations-league-2022-23-germany-1,event_https://api.chess.com/pub/tournament/uji-coba-xclusive-challenge-2320265,event_https://api.chess.com/pub/tournament/upasya-039-s-tournament,event_https://api.chess.com/pub/tournament/us-chess-regular-open-5176709,event_https://api.chess.com/pub/tournament/v-torneo-krakens-1311346,event_https://api.chess.com/pub/tournament/vidovdanski-kup-2023-vidovdan-cup-2023,event_https://api.chess.com/pub/tournament/women-in-chess-foundation---charity-stream-3920711,event_https://api.chess.com/pub/tournament/womens-scc-qualifier-11-1273144,event_https://api.chess.com/pub/tournament/womens-scc-qualifier-31-1273143,event_https://api.chess.com/pub/tournament/womens-speed-chess-qualifier-1272484,event_https://api.chess.com/pub/tournament/world-chess-league-summer-practice-arena-august-31st-4399903,event_https://api.chess.com/pub/tournament/wscc-play-in-1-4977157,event_https://api.chess.com/pub/tournament/wscc-play-in-2-4977159,event_https://api.chess.com/pub/tournament/wscc-play-in-3-4977161,event_https://api.chess.com/pub/tournament/wscc-play-in-4-4977163,event_https://api.chess.com/pub/tournament/wscc-qualifier-2-3165382,event_https://api.chess.com/pub/tournament/wscc-qualifier-4361393,event_https://api.chess.com/pub/tournament/wscc-qualifier-7-2356652,event_https://api.chess.com/pub/tournament/xclusive-challenge-seri-2-u12-2381310,event_https://api.chess.com/pub/tournament/zinzino-internetowe-mistrzostwa-polski-1173898,time_category_blitz,time_category_bullet,time_category_daily,time_category_rapid
0,3222,3307,1-0,3.0,0.0,3264.5,-85,white,2025.0,4.0,1.0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False
1,3330,3199,1-0,3.0,0.0,3264.5,131,white,2025.0,4.0,1.0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False
2,3226,3303,1-0,3.0,0.0,3264.5,-77,white,2025.0,4.0,1.0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False
3,3317,3212,1/2-1/2,3.0,0.0,3264.5,105,draw,2025.0,4.0,1.0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False
4,3214,3315,1/2-1/2,3.0,0.0,3264.5,-101,draw,2025.0,4.0,1.0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False
