# Chess Outcome — Model Training

**Purpose:** Train deep-learning models to predict the game result from early-game move sequences (first *N* full moves, as configured).

**Inputs:** Clean dataset from Notebook 2 (move-sequence column, target, and the Elo features `elo_diff` / `elo_avg`).

**Process:** Load & tokenise → stratified split (≈70/15/15) → build vocab from train only → pad/truncate to the configured length → standardise Elo features on train only → handle class imbalance (balanced batches or class weights) → train model(s) with early stopping and LR scheduling → evaluate and save artefacts.

**Metrics:** Accuracy, Macro-F1, and confusion matrix.

**Outputs:** Best model (`results/*.keras`) and run report (`results/seq_report.json`). Seed fixed; GPU used if available (mixed precision optional).

In [10]:
# ==============================================
# 1. Imports & Paths
# ==============================================
from pathlib import Path
import json
import os
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
pd.set_option("display.max_columns", 200)

import tensorflow as tf

# ----------------------------------------------
# 1.1 Config
# ----------------------------------------------
# Single seed reused for NumPy, TensorFlow, the data splits and the shuffle.
SEED = 42

# Size of the early-game window, in full moves.
CUTOFF_FULL_MOVES = 30

# The same window in plies (half-moves): two per full move.
CUTOFF_PLIES = 2 * CUTOFF_FULL_MOVES

# Dataset column that holds the move string for that window.
MOVE_COL = f"moves_first{CUTOFF_FULL_MOVES}_san"

# ----------------------------------------------
# 1.2 Reproducibility
# ----------------------------------------------
# Seed both random number generators used by this notebook.
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Ask TensorFlow to allocate GPU memory on demand. This stays best-effort
# (the notebook still runs without it), but a failure is now reported
# instead of being swallowed silently.
gpus = tf.config.list_physical_devices("GPU")
for g in gpus:
    try:
        tf.config.experimental.set_memory_growth(g, True)
    except (RuntimeError, ValueError) as exc:
        # RuntimeError: the runtime was already initialised (e.g. the cell
        # was re-run); ValueError: the device handle was rejected.
        print(f"Could not enable memory growth on {g.name}: {exc}")

# ----------------------------------------------
# 1.3 Paths
# ----------------------------------------------
# Everything is resolved relative to the current working directory:
# data/ and results/ live one level above it.
NB_DIR = Path.cwd()
DATA_DIR = NB_DIR.joinpath("../data").resolve()
RESULTS_DIR = NB_DIR.joinpath("../results").resolve()

# Numeric side features used alongside the move sequence.
ELO_FEATS = ["elo_diff", "elo_avg"]

# Input files inside the data folder.
CLEAN_CSV = DATA_DIR.joinpath("chess_games_clean.csv")
META_JSON = DATA_DIR.joinpath("chess_games_clean_meta.json")

# Make sure the output folder exists before any later cell writes to it.
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

print("Data:", CLEAN_CSV)
print("Meta:", META_JSON)
print(f"Cutoff: {CUTOFF_FULL_MOVES} moves ({CUTOFF_PLIES} plies)")
print("Results dir:", RESULTS_DIR)

Data: E:\Github Projects\chess-outcome-prediction\data\chess_games_clean.csv
Meta: E:\Github Projects\chess-outcome-prediction\data\chess_games_clean_meta.json
Cutoff: 30 moves (60 plies)
Results dir: E:\Github Projects\chess-outcome-prediction\results


In [2]:
# ==============================================
# 2. Load, Tokenise, Split
# ==============================================
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from collections import Counter
import json

# ----------------------------------------------
# 2.1 Load
# ----------------------------------------------
df = pd.read_csv(CLEAN_CSV)

# Fail fast if the cleaned dataset does not have the expected schema.
expected = {MOVE_COL, "plies_processed", "cutoff_reached", "target"}
missing = sorted(expected - set(df.columns))
assert not missing, f"Missing columns: {missing}"

# Keep only rows that have both a move string and a target.
df = df.dropna(subset=[MOVE_COL, "target"]).reset_index(drop=True)

# Strip before measuring the length: a whitespace-only string is "non-empty"
# by length but tokenises to an empty list, which would become an all-padding
# training example.
df = df[df[MOVE_COL].str.strip().str.len() > 0].reset_index(drop=True)

# ----------------------------------------------
# 2.2 Labels
# ----------------------------------------------
# The position in CLASS_ORDER is the integer id of the class.
CLASS_ORDER = ["black", "draw", "white"]
label_to_id = dict(zip(CLASS_ORDER, range(len(CLASS_ORDER))))
id_to_label = dict(enumerate(CLASS_ORDER))

# Targets outside CLASS_ORDER map to NaN; those rows are dropped from both
# the frame and the label vector, and both are re-indexed from zero.
y = df["target"].map(label_to_id)
mask = y.notna()
df = df.loc[mask].reset_index(drop=True)
y = y.loc[mask].astype(int).reset_index(drop=True)

# ----------------------------------------------
# 2.3 Tokenise
# ----------------------------------------------
def to_tokens(s):
    """Split a move string into tokens on runs of whitespace.

    The argument is converted with ``str`` first, so a non-string value is
    tokenised by its string form.
    """
    text = str(s)
    return text.split()

# One token list per game.
tokens = df[MOVE_COL].map(to_tokens)

# ----------------------------------------------
# 2.4 Split
# ----------------------------------------------
# Stage 1: hold out 15% of all games as the test set (stratified by label).
X_tmp, X_test_tok, y_tmp, y_test = train_test_split(
    tokens,
    y,
    test_size=0.15,
    stratify=y,
    random_state=SEED,
)

# Stage 2: carve validation out of the remaining 85%.
# 0.85 * 0.1765 ≈ 0.15 → 70/15/15 overall.
X_train_tok, X_val_tok, y_train, y_val = train_test_split(
    X_tmp,
    y_tmp,
    test_size=0.1765,
    stratify=y_tmp,
    random_state=SEED,
)

# ----------------------------------------------
# 2.5 Vocab (train only)
# ----------------------------------------------
# Ids 0 and 1 are reserved for padding and unknown tokens.
PAD_ID, UNK_ID = 0, 1

# Count tokens in the training split only, limited to the first
# CUTOFF_PLIES tokens of each game.
counter = Counter()
for seq in X_train_tok:
    counter.update(seq[:CUTOFF_PLIES])

# Special tokens first, then moves from most to least frequent.
vocab = ["<PAD>", "<UNK>"]
vocab.extend(tok for tok, _ in counter.most_common())

# Token -> integer id lookup.
stoi = {tok: idx for idx, tok in enumerate(vocab)}

# ----------------------------------------------
# 2.6 Numericalise
# ----------------------------------------------
def to_ids(seq, max_len=CUTOFF_PLIES):
    """Encode a token list as an int32 id array.

    Only the first ``max_len`` tokens are used. Tokens missing from ``stoi``
    become ``UNK_ID``; a shorter sequence is right-padded with ``PAD_ID``
    up to ``max_len``.
    """
    clipped = seq[:max_len]
    ids = [stoi.get(tok, UNK_ID) for tok in clipped]
    # A non-positive multiplier yields an empty list, so nothing is appended
    # when the sequence already fills the window.
    ids.extend([PAD_ID] * (max_len - len(ids)))
    return np.array(ids, dtype=np.int32)

# Encode every split with the vocabulary built from the training split.
X_train_seq, X_val_seq, X_test_seq = (
    np.stack([to_ids(s) for s in split])
    for split in (X_train_tok, X_val_tok, X_test_tok)
)

# ----------------------------------------------
# 2.7 Class Weights
# ----------------------------------------------
# "balanced" weights computed from the training labels only.
classes = np.array(sorted(label_to_id.values()))
cw = compute_class_weight(class_weight="balanced", classes=classes, y=y_train)

# Plain Python ints/floats so the mapping can be dumped to JSON.
class_weight = dict(zip(map(int, classes), map(float, cw)))

# ----------------------------------------------
# 2.8 Persist Artifacts
# ----------------------------------------------
# Write with an explicit UTF-8 encoding. vocab.json is dumped with
# ensure_ascii=False, so without it the file's encoding (and whether the
# write succeeds at all) would depend on the platform's default codec.
(RESULTS_DIR / "vocab.json").write_text(
    json.dumps({"vocab": vocab}, ensure_ascii=False),
    encoding="utf-8",
)
(RESULTS_DIR / "label_mapping.json").write_text(
    json.dumps({"label_to_id": label_to_id, "id_to_label": id_to_label}, indent=2),
    encoding="utf-8",
)
(RESULTS_DIR / "class_weights.json").write_text(
    json.dumps(class_weight, indent=2),
    encoding="utf-8",
)
(RESULTS_DIR / "dl_config.json").write_text(
    json.dumps(
        {
            "cutoff_full_moves": CUTOFF_FULL_MOVES,
            "cutoff_plies": CUTOFF_PLIES,
            "pad_id": PAD_ID,
            "unk_id": UNK_ID,
        },
        indent=2,
    ),
    encoding="utf-8",
)

# ----------------------------------------------
# 2.9 Summary
# ----------------------------------------------
# Quick recap of what the cell produced.
print("Shapes:", *(arr.shape for arr in (X_train_seq, X_val_seq, X_test_seq)))
print("Vocab size:", len(vocab))
print("Label map:", label_to_id)
print("Class weights:", class_weight)
print("Samples (train/val/test):", *map(len, (X_train_seq, X_val_seq, X_test_seq)))

Shapes: (99608, 60) (21349, 60) (21346, 60)
Vocab size: 6809
Label map: {'black': 0, 'draw': 1, 'white': 2}
Class weights: {0: 0.7208725041070511, 1: 8.601727115716754, 2: 0.6682095970268402}
Samples (train/val/test): 99608 21349 21346


In [3]:
# ==============================================
# 2.10 Numeric features (Elo) — standardise on train only
# ==============================================

from sklearn.preprocessing import StandardScaler

# Reuse ELO_FEATS when an earlier cell defined it; otherwise fall back to
# the default pair.
ELO_FEATS = ["elo_diff", "elo_avg"] if "ELO_FEATS" not in globals() else ELO_FEATS
assert all(f in df.columns for f in ELO_FEATS), f"Missing columns: {set(ELO_FEATS)-set(df.columns)}"

# Select the Elo rows of each split through the index labels carried by the
# token Series produced by the split.
X_train_num = df.loc[X_train_tok.index, ELO_FEATS].astype("float32").values
X_val_num   = df.loc[X_val_tok.index,   ELO_FEATS].astype("float32").values
X_test_num  = df.loc[X_test_tok.index,  ELO_FEATS].astype("float32").values

# StandardScaler ignores NaN while fitting and passes it through on
# transform, so a missing Elo value would otherwise reach the network
# unnoticed. Stop here with a clear message instead.
bad_splits = [
    split_name
    for split_name, split_values in (
        ("train", X_train_num),
        ("val", X_val_num),
        ("test", X_test_num),
    )
    if not np.isfinite(split_values).all()
]
if bad_splits:
    raise ValueError(f"Non-finite values in {ELO_FEATS} for split(s): {bad_splits}")

# Fit on the training split only, then apply the same statistics everywhere.
num_scaler = StandardScaler().fit(X_train_num)
X_train_num = num_scaler.transform(X_train_num)
X_val_num   = num_scaler.transform(X_val_num)
X_test_num  = num_scaler.transform(X_test_num)

# Save the fitted statistics next to the other artefacts so the same
# standardisation can be applied to new data later.
(RESULTS_DIR / "num_scaler.json").write_text(
    json.dumps(
        {
            "features": list(ELO_FEATS),
            "mean": num_scaler.mean_.tolist(),
            "scale": num_scaler.scale_.tolist(),
        },
        indent=2,
    ),
    encoding="utf-8",
)

print("Seq shapes:", X_train_seq.shape, X_val_seq.shape, X_test_seq.shape)
print("Num shapes:", X_train_num.shape, X_val_num.shape, X_test_num.shape)

Seq shapes: (99608, 60) (21349, 60) (21346, 60)
Num shapes: (99608, 2) (21349, 2) (21346, 2)


In [4]:
# ==============================================
# 2.11 Label sanity & class weights (robust)
# ==============================================
import numpy as np
from sklearn.utils.class_weight import compute_class_weight

# Number of target classes, i.e. the number of entries in CLASS_ORDER.
NUM_CLASSES = len(CLASS_ORDER)

def to_np_int(a):
    """Return ``a`` as a NumPy array with dtype int32."""
    as_int32 = np.asarray(a, dtype=np.int32)
    return as_int32

# Convert all three label vectors to plain int32 arrays.
y_train, y_val, y_test = (to_np_int(v) for v in (y_train, y_val, y_test))

# Any label outside 0..NUM_CLASSES-1 is a data error.
valid_classes = np.arange(NUM_CLASSES, dtype=np.int32)
extra_train, extra_val, extra_test = (
    np.setdiff1d(np.unique(v), valid_classes) for v in (y_train, y_val, y_test)
)
assert not (extra_train.size or extra_val.size or extra_test.size), \
    f"Unexpected labels found. Train:{extra_train}, Val:{extra_val}, Test:{extra_test}"

# Recompute the "balanced" weights over the full set of valid class ids.
cw = compute_class_weight(class_weight="balanced", classes=valid_classes, y=y_train)
class_weight = dict(zip(valid_classes.tolist(), cw.tolist()))

label_uniques = {
    split_name: sorted(np.unique(labels).tolist())
    for split_name, labels in zip(("train", "val", "test"), (y_train, y_val, y_test))
}
print("Label uniques:", label_uniques)
print("class_weight keys:", list(class_weight))

Label uniques: {'train': [0, 1, 2], 'val': [0, 1, 2], 'test': [0, 1, 2]}
class_weight keys: [0, 1, 2]


In [5]:
# ==============================================
# 2.12 Build tf.data datasets (balanced training)
# ==============================================

import tensorflow as tf

# Keep any value already defined in the session; otherwise use the default.
BATCH = globals().get("BATCH", 512)
OVERSAMPLE_DRAWS = globals().get("OVERSAMPLE_DRAWS", 6)

# Training arrays as tensors so they can be gathered by index.
train_seq = tf.constant(X_train_seq)
train_num = tf.constant(X_train_num)
train_lbl = tf.constant(y_train)

# Integer id of each outcome class.
blk_id = label_to_id["black"]
wht_id = label_to_id["white"]
draw_id = label_to_id["draw"]

# Positions of the training examples belonging to each class.
blk_idx = tf.where(tf.equal(train_lbl, blk_id))[:, 0]
wht_idx = tf.where(tf.equal(train_lbl, wht_id))[:, 0]
draw_idx = tf.where(tf.equal(train_lbl, draw_id))[:, 0]

def ds_from_indices(idx):
    """Build a dataset of ``((seq, num), label)`` for the given row indices.

    Rows are gathered from the module-level ``train_seq``, ``train_num`` and
    ``train_lbl`` tensors.
    """
    seq_rows = tf.gather(train_seq, idx)
    num_rows = tf.gather(train_num, idx)
    lbl_rows = tf.gather(train_lbl, idx)
    return tf.data.Dataset.from_tensor_slices(((seq_rows, num_rows), lbl_rows))

# The training stream is concatenated class by class (black, white, then the
# repeated draws), so only a shuffle buffer that holds the whole stream gives
# a uniform mix. Size it from the data rather than a fixed 200_000, which
# would leave draws clustered at the end of each epoch on a larger dataset.
n_train_examples = (
    int(tf.size(blk_idx))
    + int(tf.size(wht_idx))
    + OVERSAMPLE_DRAWS * int(tf.size(draw_idx))
)

ds_train = (
    ds_from_indices(blk_idx)
    .concatenate(ds_from_indices(wht_idx))
    # Draws are the minority class: each draw example appears OVERSAMPLE_DRAWS times.
    .concatenate(ds_from_indices(draw_idx).repeat(OVERSAMPLE_DRAWS))
    # The buffer size must be positive even if the stream is empty.
    .shuffle(max(n_train_examples, 1), seed=SEED)
    .batch(BATCH)
    .prefetch(2)
)

# Validation and test keep their natural class distribution and order.
ds_val  = tf.data.Dataset.from_tensor_slices(((X_val_seq,  X_val_num),  y_val )).batch(BATCH).prefetch(2)
ds_test = tf.data.Dataset.from_tensor_slices(((X_test_seq, X_test_num), y_test)).batch(BATCH)

print("Balanced two-input training dataset ready.")


Balanced two-input training dataset ready.


In [6]:
# ==============================================
# 3. Model · Train (balanced) · Evaluate · Save
# ==============================================

import json, numpy as np, tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

# Sizes derived from the artefacts built in the earlier cells.
VOCAB_SIZE = len(vocab)
NUM_CLASSES = len(CLASS_ORDER)
EMB_DIM = 128

# --- Sequence branch: token ids -> embedding -> two stacked BiLSTMs -> dense ---
seq_in = layers.Input(shape=(CUTOFF_PLIES,), dtype="int32", name="seq_in")
# mask_zero=True makes id 0 a masked value; PAD_ID is 0, so padding is masked.
x = layers.Embedding(VOCAB_SIZE, EMB_DIM, mask_zero=True)(seq_in)
# First BiLSTM returns the full sequence so the second one can consume it.
x = layers.Bidirectional(layers.LSTM(128, return_sequences=True))(x)
x = layers.Dropout(0.20)(x)
# Second BiLSTM returns a single vector for the sequence.
x = layers.Bidirectional(layers.LSTM(128))(x)
x = layers.Dropout(0.20)(x)
seq_repr = layers.Dense(128, activation="relu")(x)

# --- Numeric branch: one input value per entry in ELO_FEATS ---
num_in = layers.Input(shape=(len(ELO_FEATS),), dtype="float32", name="num_in")
n = layers.LayerNormalization()(num_in)
n = layers.Dense(32, activation="relu")(n)

# --- Fusion head: concatenate both branches and classify ---
h = layers.Concatenate()([seq_repr, n])
h = layers.Dropout(0.15)(h)
h = layers.Dense(128, activation="relu")(h)
# Output dtype is pinned to float32.
# NOTE(review): presumably so the softmax stays in float32 if a mixed-precision policy is enabled — confirm.
out = layers.Dense(NUM_CLASSES, activation="softmax", dtype="float32")(h)

# Two-input model; the input order is (sequence ids, numeric features).
model = models.Model([seq_in, num_in], out)
model.summary()

# Use AdamW when it can be constructed; otherwise fall back to plain Adam
# with the same learning rate and gradient clipping.
try:
    opt = tf.keras.optimizers.AdamW(learning_rate=1e-3, weight_decay=1e-5, clipnorm=1.0)
except Exception:
    opt = tf.keras.optimizers.Adam(learning_rate=1e-3, clipnorm=1.0)

# Integer labels + softmax output -> sparse categorical cross-entropy.
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer=opt,
    metrics=["accuracy"],
)

# Keep any EPOCHS already defined in the session; otherwise use the default.
EPOCHS = globals().get("EPOCHS", 40)

# Where the best full model (by validation loss) is written.
ckpt_path = RESULTS_DIR.joinpath("best_seq_model.keras")

checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(
    str(ckpt_path),
    monitor="val_loss",
    save_best_only=True,
    save_weights_only=False,
)
# Halve the learning rate after 2 epochs without val_loss improvement.
reduce_lr_cb = tf.keras.callbacks.ReduceLROnPlateau(
    monitor="val_loss",
    factor=0.5,
    patience=2,
    min_lr=1e-6,
    verbose=1,
)
# Stop after 8 epochs without val_loss improvement.
early_stop_cb = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=8,
    restore_best_weights=True,
    verbose=1,
)
callbacks = [checkpoint_cb, reduce_lr_cb, early_stop_cb]

history = model.fit(
    ds_train,
    epochs=EPOCHS,
    validation_data=ds_val,
    callbacks=callbacks,
    verbose=1,
)

def eval_ds(m, ds, y, name):
    """Evaluate model ``m`` on dataset ``ds`` against the true labels ``y``.

    Parameters
    ----------
    m : model exposing ``predict``; its output is arg-maxed over axis 1.
    ds : batched dataset passed to ``m.predict``.
    y : true integer labels, in the same order as the examples in ``ds``.
    name : label used in the printed summary line.

    Returns
    -------
    tuple ``(accuracy, macro_f1, report_dict, confusion_matrix_as_lists)``.

    The full label set ``0..len(CLASS_ORDER)-1`` is passed to every metric.
    Without it, ``classification_report`` raises when a class is absent from
    both ``y`` and the predictions (its ``target_names`` no longer match),
    and the confusion matrix changes shape. Passing it to ``f1_score`` as
    well keeps the returned macro-F1 equal to the report's macro average.
    """
    labels = list(range(len(CLASS_ORDER)))
    p = m.predict(ds, verbose=0).argmax(1)
    acc = accuracy_score(y, p)
    f1m = f1_score(y, p, labels=labels, average="macro", zero_division=0)
    rep = classification_report(
        y, p, labels=labels, target_names=CLASS_ORDER, output_dict=True, zero_division=0
    )
    cm = confusion_matrix(y, p, labels=labels).tolist()
    print(f"[{name}] acc={acc:.4f}  f1_macro={f1m:.4f}")
    return acc, f1m, rep, cm

# Score the in-memory model on the validation and test splits.
val_acc, val_f1, val_rep, val_cm = eval_ds(model, ds_val,  y_val,  "Validation")
tst_acc, tst_f1, tst_rep, tst_cm = eval_ds(model, ds_test, y_test, "Test")

# Take the optimizer name from the instance's class name rather than
# isinstance(opt, tf.keras.optimizers.AdamW): on a TensorFlow build without
# AdamW (one case where the Adam fallback is taken) that attribute lookup
# would raise here, after training has already finished.
optimizer_name = "AdamW" if type(opt).__name__ == "AdamW" else "Adam"

report = {
    "config": {
        "cutoff_full_moves": int(CUTOFF_FULL_MOVES),
        "cutoff_plies": int(CUTOFF_PLIES),
        "vocab_size": int(VOCAB_SIZE),
        "embedding_dim": int(EMB_DIM),
        "dropout": {"after_lstm": 0.20, "after_concat": 0.15},
        "optimizer": optimizer_name,
        "clipnorm": 1.0,
        # Configured maximum; early stopping may end training sooner.
        "epochs": int(EPOCHS),
        "batch_size": int(BATCH),
        "balanced_oversample_draws": int(OVERSAMPLE_DRAWS),
        "elo_features": ELO_FEATS
    },
    "val":  {"acc": float(val_acc),  "f1_macro": float(val_f1),  "confusion_matrix": val_cm},
    "test": {"acc": float(tst_acc), "f1_macro": float(tst_f1), "report": tst_rep, "confusion_matrix": tst_cm},
    "artifacts": {"best_model": str(ckpt_path), "vocab": str(RESULTS_DIR / "vocab.json")}
}
# Explicit encoding so the report file does not depend on the platform default.
(RESULTS_DIR / "seq_report.json").write_text(json.dumps(report, indent=2), encoding="utf-8")
print("Saved:", RESULTS_DIR / "seq_report.json")
print("Best model:", ckpt_path)

Epoch 1/40
[1m233/233[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m139s[0m 572ms/step - accuracy: 0.4903 - loss: 0.9821 - val_accuracy: 0.5759 - val_loss: 0.8010 - learning_rate: 0.0010
Epoch 2/40
[1m233/233[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 314ms/step - accuracy: 0.5961 - loss: 0.8163 - val_accuracy: 0.5548 - val_loss: 0.8309 - learning_rate: 0.0010
Epoch 3/40
[1m233/233[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 350ms/step - accuracy: 0.6393 - loss: 0.7387
Epoch 3: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
[1m233/233[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m88s[0m 376ms/step - accuracy: 0.6393 - loss: 0.7387 - val_accuracy: 0.5369 - val_loss: 0.8531 - learning_rate: 0.0010
Epoch 4/40
[1m233/233[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 338ms/step - accuracy: 0.6782 - loss: 0.6577 - val_accuracy: 0.5798 - val_loss: 0.9195 - learning_rate: 5.0000e-04
Epoch 5/40
[1m233/233[0m [32m━━━━━━━━━━━━━━━━━━━━