# Dataset Info
The full dataset (`chess_games.csv`) is too large to include in this repo.
You can download it from Kaggle:
https://www.kaggle.com/datasets/arevel/chess-games

After downloading, place it in the `data/` folder one level above this notebook, so that the file is available at `../data/chess_games.csv`.
Run this notebook (`01_downsampling.ipynb`) to create `chess_games_subset.csv`
(~80 MB), which is used for all subsequent notebooks.

In [1]:
# ==========================================
# 1. Imports & Paths
# ==========================================

import pandas as pd
from pathlib import Path

# Input/output locations, resolved relative to the notebook's working directory.
DATA_DIR = Path("../data")
# Full dataset downloaded from Kaggle (input to the downsampling step).
RAW_FILE = DATA_DIR.joinpath("chess_games.csv")
# Reduced copy written by this notebook.
OUT_FILE = DATA_DIR.joinpath("chess_games_subset.csv")

print("Raw file:", RAW_FILE)

Raw file: ..\data\chess_games.csv


In [2]:
# ==========================================
# 2. Estimate Size & Choose Fraction
# ==========================================

# Fail early with an actionable message when the dataset has not been
# downloaded yet (same exception type that .stat() would raise on its own).
if not RAW_FILE.exists():
    raise FileNotFoundError(
        f"{RAW_FILE} not found. Download chess_games.csv from "
        "https://www.kaggle.com/datasets/arevel/chess-games "
        f"and place it in {RAW_FILE.parent}."
    )

# Original file size in MB
size_mb = RAW_FILE.stat().st_size / (1024**2)
if size_mb == 0:
    # A zero-byte file would otherwise surface as a ZeroDivisionError below.
    raise ValueError(f"{RAW_FILE} is empty; re-download the dataset.")
print(f"Original file size: {size_mb:.2f} MB")

# Target subset size (change this if you want a different cap)
target_mb = 80

# Approx fraction of rows to sample so the output lands near target_mb.
# Capped at 1.0 so a raw file already smaller than the target is kept whole.
frac = min(1.0, target_mb / size_mb)
print(f"Target subset size ≈ {target_mb} MB")
print(f"Sampling fraction ≈ {frac:.3f} ({frac*100:.1f}%)")

Original file size: 4176.04 MB
Target subset size ≈ 80 MB
Sampling fraction ≈ 0.019 (1.9%)


In [3]:
# ==========================================
# 3. Downsample & Save
# ==========================================

# Rows read per chunk, so the raw CSV is streamed rather than loaded at once.
chunksize = 100_000

# Base seed for reproducibility. Each chunk is sampled with its own seed
# (base seed + chunk position): reusing one fixed seed for every chunk would
# draw the same positional row offsets inside every equally sized chunk,
# making the subset a periodic pattern instead of a random sample.
seed = 42

subset_chunks = [
    chunk.sample(frac=frac, random_state=seed + chunk_idx)
    for chunk_idx, chunk in enumerate(pd.read_csv(RAW_FILE, chunksize=chunksize))
]

# Combine the per-chunk samples into a single frame.
df_small = pd.concat(subset_chunks)

# Save
df_small.to_csv(OUT_FILE, index=False)

# Report
out_size_mb = OUT_FILE.stat().st_size / (1024**2)
print(f"Subset saved to: {OUT_FILE}")
print(f"Subset shape: {df_small.shape}")
print(f"Subset size: {out_size_mb:.2f} MB")

Subset saved to: ..\data\chess_games_subset.csv
Subset shape: (119868, 15)
Subset size: 80.02 MB
