In [3]:
# ==========================================
# CHESS OUTCOME PREDICTION — PREPROCESSING
# ==========================================

# --- Imports (everything used in this notebook lives here) ---
from pathlib import Path
import os
import json
import numpy as np
import pandas as pd
import warnings

# --- Notebook-wide configuration ---
# NOTE(review): this silences every warning for the rest of the session, including
# any pandas emits while parsing the CSV — consider narrowing the filter.
warnings.filterwarnings("ignore")
pd.set_option("display.max_columns", 200)  # display up to 200 columns of wide frames

# --- Paths ---
# Everything is anchored on the kernel's current working directory, so the raw CSV
# is probed in several locations relative to it.
NB_DIR = Path.cwd()
CANDIDATES = [
    NB_DIR / "../data/raw_chess_games.csv",
    NB_DIR / "../../data/raw_chess_games.csv",
    NB_DIR / "data/raw_chess_games.csv",
]

# The first candidate that exists wins; None when no candidate exists.
RAW_CSV = next((p.resolve() for p in CANDIDATES if p.exists()), None)
if RAW_CSV is None:
    # Report every location that was actually probed, not just the first one.
    searched = "\n".join(f"  - {p.resolve()}" for p in CANDIDATES)
    raise FileNotFoundError(
        "Could not find raw_chess_games.csv. Searched (in order):\n" + searched
    )

# Output directory for artifacts produced by this notebook; created if missing.
RESULTS_DIR = (NB_DIR / "../results").resolve()
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

# --- Load ---
df = pd.read_csv(RAW_CSV)
print(f"Loaded: {RAW_CSV}")
print("Initial shape:", df.shape)
df.head()

Loaded: E:\Github Projects\chess-outcome-prediction\data\raw_chess_games.csv
Initial shape: (214980, 23)


Unnamed: 0,game_id,white_username,black_username,white_rating,black_rating,white_country,black_country,result,termination,time_control,time_class,initial_time,increment,date,eco,opening,num_moves,rated,event,source,avg_rating,rating_diff,time_category
0,f57a1313-0f25-11f0-8731-6cfe544c0428,nihalsarin,Hikaru,3222,3307,nihalsarin,hikaru,1-0,checkmate,180,blitz,3.0,0.0,2025-04-01,https://www.chess.com/openings/Modern-Defense-...,,312.0,True,,chess.com,3264.5,-85.0,blitz
1,d2b8e5d3-0f23-11f0-8731-6cfe544c0428,Hikaru,nihalsarin,3330,3199,hikaru,nihalsarin,1-0,checkmate,180,blitz,3.0,0.0,2025-04-01,https://www.chess.com/openings/Nimzowitsch-Lar...,,320.0,True,,chess.com,3264.5,131.0,blitz
2,70b20004-0f27-11f0-8731-6cfe544c0428,nihalsarin,Hikaru,3226,3303,nihalsarin,hikaru,1-0,checkmate,180,blitz,3.0,0.0,2025-04-01,https://www.chess.com/openings/Torre-Attack-Fi...,,334.0,True,,chess.com,3264.5,-77.0,blitz
3,35b89065-0f25-11f0-8731-6cfe544c0428,Hikaru,nihalsarin,3317,3212,hikaru,nihalsarin,1/2-1/2,repetition,180,blitz,3.0,0.0,2025-04-01,https://www.chess.com/openings/Bishops-Opening...,,274.0,True,,chess.com,3264.5,105.0,blitz
4,23a8b502-0f29-11f0-8731-6cfe544c0428,nihalsarin,Hikaru,3214,3315,nihalsarin,hikaru,1/2-1/2,resignation,180,blitz,3.0,0.0,2025-04-01,https://www.chess.com/openings/Torre-Attack......,,496.0,True,,chess.com,3264.5,-101.0,blitz


In [1]:
# -------------------------------
# 2. Inspect dataset info
# -------------------------------
# Column dtypes, non-null counts and memory usage of the loaded frame.
df.info()

# Summary statistics for every column (numeric and non-numeric alike),
# flipped so that each source column becomes one row of the summary.
df.describe(include="all").T

NameError: name 'df' is not defined

In [None]:
# -------------------------------
# 3. Drop unnecessary/redundant columns
# -------------------------------
# Which columns go depends on the CSV, but typically:
# - identifiers such as game ID or URL → not useful for prediction
# - fields that duplicate or give away the outcome → potential leakage
#
# Placeholder list: adjust after inspecting the CSV. Names that are not
# present in the frame are skipped instead of raising.
drop_cols = ["game_id", "url", "moves", "opening_code"]
df = df.drop(columns=drop_cols, errors="ignore")

print("After dropping redundant cols:", df.shape)

In [None]:
# -------------------------------
# 5. Handle missing values
# -------------------------------
# Strategy:
# - Numeric columns (any integer/float dtype, not only int64/float64): fill with median
# - Everything else (booleans, categoricals already encoded): fill with 0
#
# NOTE(review): `df_encoded` is not created by any earlier cell visible in this
# notebook (there is no section 4) — confirm the encoding step exists and runs
# before this cell.

for col in df_encoded.columns:
    series = df_encoded[col]
    # Detect numeric columns by dtype family rather than by exact dtype name, so
    # float32 / int32 / nullable numeric columns also get the median. Booleans
    # count as "numeric" for pandas, so exclude them explicitly: they keep the 0 fill.
    is_numeric = pd.api.types.is_numeric_dtype(series) and not pd.api.types.is_bool_dtype(series)
    fill_value = series.median() if is_numeric else 0
    df_encoded[col] = series.fillna(fill_value)

print("Missing values remaining:", df_encoded.isnull().sum().sum())

In [None]:
# -------------------------------
# 6. Prevent data leakage
# -------------------------------
# Anything that directly reveals the game result (e.g. 'winner', 'termination')
# must go; only features known BEFORE the game starts (ratings, opening, etc.)
# should remain.
#
# NOTE(review): the loaded CSV shows no 'winner' column, and 'num_moves' looks like
# a post-game quantity — confirm whether it belongs in this list. If 'termination'
# was encoded into derived columns upstream, an exact-name match will not remove
# them — verify against the encoding step.
leakage_cols = ["winner", "termination"]

# Listed names that are absent from the frame are skipped rather than raising.
df_encoded = df_encoded.drop(columns=leakage_cols, errors="ignore")

print("After removing leakage columns:", df_encoded.shape)

In [None]:
# -------------------------------
# 7. Save cleaned dataset
# -------------------------------
# Write into RESULTS_DIR (resolved and created in the setup cell) instead of a
# second hard-coded "../results" string: the target then cannot drift from the
# directory that was actually created, and does not depend on the working
# directory at the time this cell runs.
output_path = RESULTS_DIR / "clean_chess_games.csv"
df_encoded.to_csv(output_path, index=False)  # index=False: do not write the row index as a column

print(f"✅ Cleaned dataset saved to {output_path}")