In [13]:
# === Self-play analysis for self_play_results.csv (10-game sets only; top 10 players) ===
import pandas as pd
import numpy as np
from pathlib import Path
from IPython.display import display

# Load data
path = Path("self_play_results.csv")  # change if needed
df = pd.read_csv(path)

# Basic validation / cleaning
need = {"model","algorithm","games","wins_X","wins_O","draws"}
missing = need - set(df.columns)
if missing:
    raise ValueError(f"Missing required columns: {missing}")

for c in ["games","wins_X","wins_O","draws"]:
    df[c] = pd.to_numeric(df[c], errors="coerce").fillna(0).astype(int)

# Keep only consistent rows (each should have wins_X + wins_O + draws == games)
df = df[(df["games"] > 0) & ((df["wins_X"] + df["wins_O"] + df["draws"]) == df["games"])].copy()
if df.empty:
    raise ValueError("No valid rows after cleaning (check sums: wins_X + wins_O + draws == games).")

# Points & rates (draw = 0.5)
df["points_as_X"]  = df["wins_X"] + 0.5 * df["draws"]
df["points_as_O"]  = df["wins_O"] + 0.5 * df["draws"]
df["wr_as_X"]      = df["points_as_X"] / df["games"]
df["wr_as_O"]      = df["points_as_O"] / df["games"]
df["side_bias"]    = df["wr_as_X"] - df["wr_as_O"]

# === Per model + algorithm (collapsed view; no games column) ===
g = (df.groupby(["model","algorithm"], as_index=False)
       .agg(wins_X=("wins_X","sum"),
            wins_O=("wins_O","sum"),
            draws=("draws","sum"),
            points_as_X=("points_as_X","sum"),
            points_as_O=("points_as_O","sum")))

g["wr_as_X"]   = g["points_as_X"] / 10.0  # each run is 10 games, but totals can vary in aggregation
g["wr_as_O"]   = g["points_as_O"] / 10.0
g["side_bias"] = g["wr_as_X"] - g["wr_as_O"]
g["X%"]        = (g["wr_as_X"] * 10).round(1)
g["O%"]        = (g["wr_as_O"] * 10).round(1)
g["bias%"]     = (g["side_bias"] * 10).round(1)

print("=== Per model + algorithm ===")
display(g[["model","algorithm","wins_X","wins_O","draws","X%","O%","bias%"]]
        .sort_values(["model","algorithm"]).reset_index(drop=True))

# === Per model (aggregated across algorithms) ===
m = (g.groupby("model", as_index=False)
       .agg(wins_X=("wins_X","sum"),
            wins_O=("wins_O","sum"),
            draws=("draws","sum"),
            points_as_X=("points_as_X","sum"),
            points_as_O=("points_as_O","sum")))

denom = (m["wins_X"] + m["wins_O"] + m["draws"]).replace(0, np.nan)
m["wr_as_X"]   = m["points_as_X"] / denom
m["wr_as_O"]   = m["points_as_O"] / denom
m["side_bias"] = m["wr_as_X"] - m["wr_as_O"]
m["X%"]        = (m["wr_as_X"] * 100).round(1)
m["O%"]        = (m["wr_as_O"] * 100).round(1)
m["bias%"]     = (m["side_bias"] * 100).round(1)

print("\n=== Per model (aggregated across algorithms) ===")
display(m[["model","wins_X","wins_O","draws","X%","O%","bias%"]]
        .sort_values(["model"]).reset_index(drop=True))

# === Top 10 models overall (by mean of X% & O%) ===
m["_avg"] = (m["X%"] + m["O%"]) / 2.0
m["_total_wins"] = m["wins_X"] + m["wins_O"]
top10 = (m.sort_values(["_avg","_total_wins"], ascending=[False, False])
           .head(10)
           .drop(columns=["_avg","_total_wins"]))

print("\n=== Top 10 models overall ===")
display(top10[["model","wins_X","wins_O","draws","X%","O%","bias%"]].reset_index(drop=True))

# === Per algorithm (across all models) ===
a = (g.groupby("algorithm", as_index=False)
       .agg(wins_X=("wins_X","sum"),
            wins_O=("wins_O","sum"),
            draws=("draws","sum"),
            points_as_X=("points_as_X","sum"),
            points_as_O=("points_as_O","sum")))

denom_a = (a["wins_X"] + a["wins_O"] + a["draws"]).replace(0, np.nan)
a["wr_as_X"] = a["points_as_X"] / denom_a
a["wr_as_O"] = a["points_as_O"] / denom_a
a["side_bias"] = a["wr_as_X"] - a["wr_as_O"]
a["X%"] = (a["wr_as_X"] * 100).round(1)
a["O%"] = (a["wr_as_O"] * 100).round(1)
a["bias%"] = (a["side_bias"] * 100).round(1)

print("\n=== Per algorithm (across all models) ===")
display(a[["algorithm","wins_X","wins_O","draws","X%","O%","bias%"]]
        .sort_values(["algorithm"]).reset_index(drop=True))

# Expose handy variables
per_model_algo   = g
per_model_overall= m
top10_models     = top10
per_algorithm    = a


=== Per model + algorithm ===


Unnamed: 0,model,algorithm,wins_X,wins_O,draws,X%,O%,bias%
0,large-3x3-e100-b128-q-sp10.pt,Q,2,1,7,5.5,4.5,1.0
1,large-3x3-e100-b128-q-sp100.pt,Q,72,13,15,79.5,20.5,59.0
2,large-3x3-e100-b128-q-sp50.pt,Q,39,3,8,43.0,7.0,36.0
3,large-3x3-e100-b128-sarsa-sp10.pt,SARSA,4,1,5,6.5,3.5,3.0
4,large-3x3-e100-b128-sarsa-sp100.pt,SARSA,75,11,14,82.0,18.0,64.0
...,...,...,...,...,...,...,...,...
967,small-5x5x5-e50-b64-q-sp100.pt,Q,48,52,0,48.0,52.0,-4.0
968,small-5x5x5-e50-b64-q-sp50.pt,Q,18,32,0,18.0,32.0,-14.0
969,small-5x5x5-e50-b64-sarsa-sp10.pt,SARSA,7,3,0,7.0,3.0,4.0
970,small-5x5x5-e50-b64-sarsa-sp100.pt,SARSA,47,53,0,47.0,53.0,-6.0



=== Per model (aggregated across algorithms) ===


Unnamed: 0,model,wins_X,wins_O,draws,X%,O%,bias%
0,large-3x3-e100-b128-q-sp10.pt,2,1,7,55.0,45.0,10.0
1,large-3x3-e100-b128-q-sp100.pt,72,13,15,79.5,20.5,59.0
2,large-3x3-e100-b128-q-sp50.pt,39,3,8,86.0,14.0,72.0
3,large-3x3-e100-b128-sarsa-sp10.pt,4,1,5,65.0,35.0,30.0
4,large-3x3-e100-b128-sarsa-sp100.pt,75,11,14,82.0,18.0,64.0
...,...,...,...,...,...,...,...
967,small-5x5x5-e50-b64-q-sp100.pt,48,52,0,48.0,52.0,-4.0
968,small-5x5x5-e50-b64-q-sp50.pt,18,32,0,36.0,64.0,-28.0
969,small-5x5x5-e50-b64-sarsa-sp10.pt,7,3,0,70.0,30.0,40.0
970,small-5x5x5-e50-b64-sarsa-sp100.pt,47,53,0,47.0,53.0,-6.0



=== Top 10 models overall ===


Unnamed: 0,model,wins_X,wins_O,draws,X%,O%,bias%
0,large-3x3x3-e100-b128-q-sp100.pt,43,57,0,43.0,57.0,-14.0
1,large-3x3x3-e100-b128-sarsa-sp100.pt,56,44,0,56.0,44.0,12.0
2,large-3x3x3-e100-b256-q-sp100.pt,64,36,0,64.0,36.0,28.0
3,large-3x3x3-e100-b256-sarsa-sp100.pt,71,29,0,71.0,29.0,42.0
4,large-3x3x3-e100-b64-q-sp100.pt,47,53,0,47.0,53.0,-6.0
5,large-3x3x3-e100-b64-sarsa-sp100.pt,57,43,0,57.0,43.0,14.0
6,large-3x3x3-e20-b128-q-sp100.pt,68,32,0,68.0,32.0,36.0
7,large-3x3x3-e20-b128-sarsa-sp100.pt,57,43,0,57.0,43.0,14.0
8,large-3x3x3-e20-b256-q-sp100.pt,46,54,0,46.0,54.0,-8.0
9,large-3x3x3-e20-b256-sarsa-sp100.pt,57,43,0,57.0,43.0,14.0



=== Per algorithm (across all models) ===


Unnamed: 0,algorithm,wins_X,wins_O,draws,X%,O%,bias%
0,Q,10992,8152,6776,55.5,44.5,11.0
1,SARSA,11107,8357,6456,55.3,44.7,10.6


In [14]:
# === Full Independent Analysis for tournament_results.csv (grand total games; no per-table 'games' columns) ===
import pandas as pd
import numpy as np
from pathlib import Path
from collections import defaultdict
from IPython.display import display

# === Load Data ===
path = Path("tournament_results.csv")  # change path if needed
df = pd.read_csv(path)

required = {"shape","model_a","model_b","games","wins_a","wins_b","draws"}
missing = required - set(df.columns)
if missing:
    raise ValueError(f"Missing required columns: {missing}")

# Clean and validate
for c in ["games","wins_a","wins_b","draws"]:
    df[c] = pd.to_numeric(df[c], errors="coerce").fillna(0).astype(int)
df = df[(df["games"] > 0) & ((df["wins_a"] + df["wins_b"] + df["draws"]) == df["games"])].copy()

# --- Grand total games (each model plays 10 games against every other model on the same board) ---
GAMES_PER_OPPONENT = 10
num_models = len(set(df["model_a"]) | set(df["model_b"]))
grand_total_games = (num_models - 1) * GAMES_PER_OPPONENT

print(f"=== Grand total games: {grand_total_games} (models: {num_models}, each plays {GAMES_PER_OPPONENT}×(n-1)) ===")


# --- Derived values ---
df["points_a"] = df["wins_a"] + 0.5 * df["draws"]
df["points_b"] = df["wins_b"] + 0.5 * df["draws"]

# --- Per-player aggregates (keep games internally; don't display it) ---
models = sorted(set(df["model_a"]) | set(df["model_b"]))

stats = []
for m in models:
    g_as_a = df.loc[df["model_a"] == m, "games"].sum()
    g_as_b = df.loc[df["model_b"] == m, "games"].sum()
    games = g_as_a + g_as_b

    w_as_a = df.loc[df["model_a"] == m, "wins_a"].sum()
    w_as_b = df.loc[df["model_b"] == m, "wins_b"].sum()
    wins = w_as_a + w_as_b

    l_as_a = df.loc[df["model_a"] == m, "wins_b"].sum()
    l_as_b = df.loc[df["model_b"] == m, "wins_a"].sum()
    losses = l_as_a + l_as_b

    d_as_a = df.loc[df["model_a"] == m, "draws"].sum()
    d_as_b = df.loc[df["model_b"] == m, "draws"].sum()
    draws = d_as_a + d_as_b

    wr = (wins + 0.5*draws) / games if games else np.nan
    stats.append((m, games, wins, draws, losses, wr))

per_player = pd.DataFrame(stats, columns=["player","games","wins","draws","losses","winrate"])

# --- Wilson CI (uses internal 'games', but we won't display 'games') ---
def wilson_ci(successes, totals, z=1.96):
    totals = np.maximum(totals.astype(float), 1e-12)
    p = successes / totals
    denom = 1 + (z**2)/totals
    center = (p + (z**2)/(2*totals)) / denom
    margin = (z*np.sqrt((p*(1-p) + (z**2)/(4*totals)) / totals)) / denom
    lo = np.clip(center - margin, 0, 1)
    hi = np.clip(center + margin, 0, 1)
    return lo, hi

lo, hi = wilson_ci(per_player["wins"] + 0.5*per_player["draws"], per_player["games"])
per_player["winrate_lo"], per_player["winrate_hi"] = lo, hi
per_player = per_player.sort_values(["winrate","games"], ascending=[False,False]).reset_index(drop=True)

print("=== Per-player Summary (Wilson 95% CI) — games hidden ===")
tmp = per_player.copy()
tmp["winrate%"] = (tmp["winrate"]*100).round(2)
tmp["CI95%"]    = (tmp["winrate_lo"]*100).round(1).astype(str) + "–" + (tmp["winrate_hi"]*100).round(1).astype(str)
# Display WITHOUT the 'games' column
display(tmp[["player","wins","draws","losses","winrate%","CI95%"]].head(30))

# --- Per-shape head-to-head (only pairs that actually played; % points only) ---
print("\n=== Head-to-head tables per shape (only pairs that actually played; % points only) ===")
h2h_rate_by_shape = {}

for sh, g in df.groupby("shape"):
    pairs = (g.groupby(["model_a","model_b"], as_index=False)
               .agg(points_a=("points_a","sum"), games=("games","sum")))
    pairs = pairs[pairs["games"] > 0].copy()
    if pairs.empty:
        print(f"\n--- Shape: {sh} --- (no played pairs)")
        continue

    pairs["pct_a"] = (pairs["points_a"] / pairs["games"] * 100).round(1)

    flip = pairs.rename(columns={"model_a":"model_b","model_b":"model_a","pct_a":"pct_b"})
    flip["pct_a"] = (100.0 - flip["pct_b"]).round(1)
    flip = flip[["model_a","model_b","pct_a"]]

    long = pd.concat([pairs[["model_a","model_b","pct_a"]], flip], ignore_index=True)
    long = long.drop_duplicates(subset=["model_a","model_b"], keep="first").reset_index(drop=True)

    models_active = sorted(set(long["model_a"]) | set(long["model_b"]))
    mat_pct = long.pivot(index="model_a", columns="model_b", values="pct_a").reindex(
        index=models_active, columns=models_active
    )

    if len(models_active) > 0:
        idx = np.arange(len(models_active))
        mat_pct.values[idx, idx] = np.nan

    row_has_opp = mat_pct.notna().sum(axis=1) > 0
    col_has_opp = mat_pct.notna().sum(axis=0) > 0
    keep_rows = row_has_opp[row_has_opp].index
    keep_cols = col_has_opp[col_has_opp].index
    mat_pct = mat_pct.loc[keep_rows, keep_cols]

    h2h_rate_by_shape[sh] = mat_pct

    print(f"\n--- Shape: {sh} --- ({mat_pct.shape[0]} models)")
    print("Row’s % points vs column (draw=0.5):")
    display(mat_pct)

# --- Elo Rating (keep games internally; don't display it) ---
def elo_from_aggregates(df_pairs, K=24.0, base=400.0):
    ratings, games = defaultdict(lambda: 1500.0), defaultdict(int)
    def expected(ra, rb): return 1 / (1 + 10 ** ((rb - ra)/base))
    for _, r in df_pairs.iterrows():
        a, b = r["model_a"], r["model_b"]
        outcomes = ([1.0]*int(r["wins_a"])) + ([0.0]*int(r["wins_b"])) + ([0.5]*int(r["draws"]))
        for s1 in outcomes:
            r1, r2 = ratings[a], ratings[b]
            e1, e2 = expected(r1, r2), expected(r2, r1)
            s2 = 1.0 - s1 if s1 in (0.0,1.0) else 0.5
            ratings[a] = r1 + K*(s1 - e1)
            ratings[b] = r2 + K*(s2 - e2)
            games[a]+=1; games[b]+=1
    out = pd.DataFrame({
        "player": list(ratings.keys()),
        "elo": [ratings[p] for p in ratings.keys()],
        "games": [games[p] for p in ratings.keys()]
    }).sort_values(["elo","games"], ascending=[False,False]).reset_index(drop=True)
    return out

elo = elo_from_aggregates(df)

print("\n=== Elo Ratings — games hidden ===")
display(elo[["player","elo"]].head(30))  # no games column shown

# --- League Table (sort by Elo only; hide games) ---
league = (
    elo.merge(per_player[["player","wins","draws","losses"]], on="player", how="left")
       .sort_values(["elo"], ascending=[False])
)
print("\n=== League Table — games hidden ===")
display(league[["player","elo","wins","draws","losses"]].head(50))


=== Grand total games: 11330 (models: 1134, each plays 10×(n-1)) ===
=== Per-player Summary (Wilson 95% CI) — games hidden ===


Unnamed: 0,player,wins,draws,losses,winrate%,CI95%
0,large-5x5x5-e100-b128-q-sp10.pt,1675,0,205,89.1,87.6–90.4
1,large-5x5x5-e100-b128.pt,1635,0,245,86.97,85.4–88.4
2,large-5x5x5-e100-b128-sarsa-sp10.pt,1625,0,255,86.44,84.8–87.9
3,large-5x5x5-e100-b256.pt,1625,0,255,86.44,84.8–87.9
4,medium-4x4x4-e20-b64.pt,1540,0,340,81.91,80.1–83.6
5,medium-5x5x5-e50-b128.pt,1535,0,345,81.65,79.8–83.3
6,large-4x4x4-e20-b64.pt,1500,0,380,79.79,77.9–81.5
7,large-4x4x4-e20-b64-q-sp10.pt,1485,0,395,78.99,77.1–80.8
8,medium-5x5x5-e100-b256-q-sp10.pt,1485,0,395,78.99,77.1–80.8
9,medium-5x5x5-e100-b256-sarsa-sp10.pt,1480,0,400,78.72,76.8–80.5



=== Head-to-head tables per shape (only pairs that actually played; % points only) ===

--- Shape: 3x3 --- (189 models)
Row’s % points vs column (draw=0.5):


model_b,large-3x3-e100-b128-q-sp10.pt,large-3x3-e100-b128-q-sp100.pt,large-3x3-e100-b128-q-sp50.pt,large-3x3-e100-b128-sarsa-sp10.pt,large-3x3-e100-b128-sarsa-sp100.pt,large-3x3-e100-b128-sarsa-sp50.pt,large-3x3-e100-b128.pt,large-3x3-e100-b256-q-sp10.pt,large-3x3-e100-b256-q-sp100.pt,large-3x3-e100-b256-q-sp50.pt,...,small-3x3-e50-b256-sarsa-sp100.pt,small-3x3-e50-b256-sarsa-sp50.pt,small-3x3-e50-b256.pt,small-3x3-e50-b64-q-sp10.pt,small-3x3-e50-b64-q-sp100.pt,small-3x3-e50-b64-q-sp50.pt,small-3x3-e50-b64-sarsa-sp10.pt,small-3x3-e50-b64-sarsa-sp100.pt,small-3x3-e50-b64-sarsa-sp50.pt,small-3x3-e50-b64.pt
model_a,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
large-3x3-e100-b128-q-sp10.pt,,50.0,75.0,50.0,25.0,25.0,50.0,50.0,75.0,50.0,...,25.0,25.0,0.0,100.0,50.0,50.0,100.0,50.0,50.0,100.0
large-3x3-e100-b128-q-sp100.pt,50.0,,50.0,25.0,0.0,50.0,50.0,50.0,50.0,50.0,...,25.0,0.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0
large-3x3-e100-b128-q-sp50.pt,25.0,50.0,,25.0,25.0,0.0,25.0,50.0,50.0,50.0,...,0.0,0.0,0.0,100.0,50.0,50.0,100.0,50.0,50.0,100.0
large-3x3-e100-b128-sarsa-sp10.pt,50.0,75.0,75.0,,25.0,25.0,50.0,50.0,75.0,50.0,...,75.0,0.0,50.0,100.0,50.0,50.0,100.0,50.0,50.0,100.0
large-3x3-e100-b128-sarsa-sp100.pt,75.0,100.0,75.0,75.0,,75.0,75.0,25.0,50.0,50.0,...,0.0,0.0,0.0,50.0,50.0,50.0,50.0,0.0,50.0,50.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
small-3x3-e50-b64-q-sp50.pt,50.0,50.0,50.0,50.0,50.0,50.0,25.0,50.0,25.0,0.0,...,50.0,0.0,100.0,50.0,0.0,,50.0,0.0,50.0,25.0
small-3x3-e50-b64-sarsa-sp10.pt,0.0,50.0,0.0,0.0,50.0,50.0,25.0,50.0,25.0,0.0,...,75.0,25.0,25.0,50.0,0.0,50.0,,0.0,50.0,25.0
small-3x3-e50-b64-sarsa-sp100.pt,50.0,50.0,50.0,50.0,100.0,50.0,50.0,50.0,25.0,0.0,...,50.0,0.0,50.0,100.0,50.0,100.0,100.0,,100.0,50.0
small-3x3-e50-b64-sarsa-sp50.pt,50.0,50.0,50.0,50.0,50.0,50.0,25.0,50.0,25.0,0.0,...,50.0,0.0,75.0,50.0,0.0,50.0,50.0,0.0,,25.0



--- Shape: 3x3x3 --- (189 models)
Row’s % points vs column (draw=0.5):


model_b,large-3x3x3-e100-b128-q-sp10.pt,large-3x3x3-e100-b128-q-sp100.pt,large-3x3x3-e100-b128-q-sp50.pt,large-3x3x3-e100-b128-sarsa-sp10.pt,large-3x3x3-e100-b128-sarsa-sp100.pt,large-3x3x3-e100-b128-sarsa-sp50.pt,large-3x3x3-e100-b128.pt,large-3x3x3-e100-b256-q-sp10.pt,large-3x3x3-e100-b256-q-sp100.pt,large-3x3x3-e100-b256-q-sp50.pt,...,small-3x3x3-e50-b256-sarsa-sp100.pt,small-3x3x3-e50-b256-sarsa-sp50.pt,small-3x3x3-e50-b256.pt,small-3x3x3-e50-b64-q-sp10.pt,small-3x3x3-e50-b64-q-sp100.pt,small-3x3x3-e50-b64-q-sp50.pt,small-3x3x3-e50-b64-sarsa-sp10.pt,small-3x3x3-e50-b64-sarsa-sp100.pt,small-3x3x3-e50-b64-sarsa-sp50.pt,small-3x3x3-e50-b64.pt
model_a,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
large-3x3x3-e100-b128-q-sp10.pt,,100.0,100.0,50.0,50.0,50.0,100.0,50.0,100.0,100.0,...,100.0,50.0,100.0,100.0,50.0,50.0,100.0,100.0,50.0,100.0
large-3x3x3-e100-b128-q-sp100.pt,0.0,,0.0,0.0,0.0,50.0,0.0,50.0,50.0,0.0,...,50.0,50.0,0.0,50.0,50.0,50.0,100.0,50.0,50.0,50.0
large-3x3x3-e100-b128-q-sp50.pt,0.0,100.0,,0.0,100.0,100.0,0.0,100.0,0.0,100.0,...,100.0,100.0,50.0,50.0,50.0,50.0,50.0,50.0,100.0,50.0
large-3x3x3-e100-b128-sarsa-sp10.pt,50.0,100.0,100.0,,50.0,50.0,50.0,50.0,50.0,50.0,...,100.0,100.0,100.0,100.0,50.0,50.0,100.0,100.0,50.0,100.0
large-3x3x3-e100-b128-sarsa-sp100.pt,50.0,100.0,0.0,50.0,,0.0,100.0,50.0,0.0,50.0,...,100.0,100.0,50.0,100.0,100.0,50.0,50.0,0.0,50.0,100.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
small-3x3x3-e50-b64-q-sp50.pt,50.0,50.0,50.0,50.0,50.0,50.0,0.0,50.0,50.0,0.0,...,50.0,50.0,100.0,50.0,100.0,,0.0,0.0,100.0,50.0
small-3x3x3-e50-b64-sarsa-sp10.pt,0.0,0.0,50.0,0.0,50.0,50.0,50.0,50.0,0.0,0.0,...,100.0,100.0,100.0,50.0,100.0,100.0,,50.0,100.0,0.0
small-3x3x3-e50-b64-sarsa-sp100.pt,0.0,50.0,50.0,0.0,100.0,50.0,0.0,0.0,50.0,0.0,...,100.0,100.0,50.0,0.0,100.0,100.0,50.0,,50.0,50.0
small-3x3x3-e50-b64-sarsa-sp50.pt,50.0,50.0,0.0,50.0,50.0,50.0,0.0,50.0,50.0,0.0,...,100.0,50.0,100.0,0.0,50.0,0.0,0.0,50.0,,50.0



--- Shape: 4x4 --- (189 models)
Row’s % points vs column (draw=0.5):


model_b,large-4x4-e100-b128-q-sp10.pt,large-4x4-e100-b128-q-sp100.pt,large-4x4-e100-b128-q-sp50.pt,large-4x4-e100-b128-sarsa-sp10.pt,large-4x4-e100-b128-sarsa-sp100.pt,large-4x4-e100-b128-sarsa-sp50.pt,large-4x4-e100-b128.pt,large-4x4-e100-b256-q-sp10.pt,large-4x4-e100-b256-q-sp100.pt,large-4x4-e100-b256-q-sp50.pt,...,small-4x4-e50-b256-sarsa-sp100.pt,small-4x4-e50-b256-sarsa-sp50.pt,small-4x4-e50-b256.pt,small-4x4-e50-b64-q-sp10.pt,small-4x4-e50-b64-q-sp100.pt,small-4x4-e50-b64-q-sp50.pt,small-4x4-e50-b64-sarsa-sp10.pt,small-4x4-e50-b64-sarsa-sp100.pt,small-4x4-e50-b64-sarsa-sp50.pt,small-4x4-e50-b64.pt
model_a,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
large-4x4-e100-b128-q-sp10.pt,,50.0,75.0,50.0,50.0,75.0,50.0,0.0,50.0,25.0,...,50.0,50.0,75.0,50.0,50.0,50.0,75.0,75.0,50.0,25.0
large-4x4-e100-b128-q-sp100.pt,50.0,,50.0,75.0,50.0,75.0,50.0,25.0,50.0,75.0,...,25.0,50.0,75.0,50.0,50.0,100.0,75.0,25.0,75.0,50.0
large-4x4-e100-b128-q-sp50.pt,25.0,50.0,,50.0,50.0,50.0,25.0,50.0,75.0,75.0,...,50.0,25.0,50.0,25.0,75.0,50.0,50.0,50.0,75.0,75.0
large-4x4-e100-b128-sarsa-sp10.pt,50.0,25.0,50.0,,75.0,50.0,75.0,50.0,50.0,50.0,...,50.0,25.0,50.0,75.0,50.0,75.0,75.0,50.0,75.0,25.0
large-4x4-e100-b128-sarsa-sp100.pt,50.0,50.0,50.0,25.0,,100.0,50.0,25.0,50.0,75.0,...,50.0,50.0,50.0,75.0,75.0,75.0,75.0,50.0,75.0,50.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
small-4x4-e50-b64-q-sp50.pt,50.0,0.0,50.0,25.0,25.0,25.0,50.0,50.0,0.0,25.0,...,75.0,50.0,75.0,75.0,50.0,,75.0,50.0,75.0,75.0
small-4x4-e50-b64-sarsa-sp10.pt,25.0,25.0,50.0,25.0,25.0,50.0,75.0,75.0,50.0,25.0,...,75.0,75.0,50.0,25.0,25.0,25.0,,25.0,50.0,25.0
small-4x4-e50-b64-sarsa-sp100.pt,25.0,75.0,50.0,50.0,50.0,50.0,50.0,0.0,50.0,0.0,...,50.0,50.0,50.0,75.0,50.0,50.0,75.0,,50.0,50.0
small-4x4-e50-b64-sarsa-sp50.pt,50.0,25.0,25.0,25.0,25.0,25.0,25.0,25.0,50.0,25.0,...,50.0,50.0,50.0,50.0,25.0,25.0,50.0,50.0,,25.0



--- Shape: 4x4x4 --- (189 models)
Row’s % points vs column (draw=0.5):


model_b,large-4x4x4-e100-b128-q-sp10.pt,large-4x4x4-e100-b128-q-sp100.pt,large-4x4x4-e100-b128-q-sp50.pt,large-4x4x4-e100-b128-sarsa-sp10.pt,large-4x4x4-e100-b128-sarsa-sp100.pt,large-4x4x4-e100-b128-sarsa-sp50.pt,large-4x4x4-e100-b128.pt,large-4x4x4-e100-b256-q-sp10.pt,large-4x4x4-e100-b256-q-sp100.pt,large-4x4x4-e100-b256-q-sp50.pt,...,small-4x4x4-e50-b256-sarsa-sp100.pt,small-4x4x4-e50-b256-sarsa-sp50.pt,small-4x4x4-e50-b256.pt,small-4x4x4-e50-b64-q-sp10.pt,small-4x4x4-e50-b64-q-sp100.pt,small-4x4x4-e50-b64-q-sp50.pt,small-4x4x4-e50-b64-sarsa-sp10.pt,small-4x4x4-e50-b64-sarsa-sp100.pt,small-4x4x4-e50-b64-sarsa-sp50.pt,small-4x4x4-e50-b64.pt
model_a,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
large-4x4x4-e100-b128-q-sp10.pt,,100.0,50.0,50.0,100.0,100.0,100.0,100.0,100.0,100.0,...,100.0,100.0,100.0,0.0,100.0,0.0,0.0,100.0,50.0,0.0
large-4x4x4-e100-b128-q-sp100.pt,0.0,,100.0,0.0,100.0,100.0,0.0,0.0,100.0,50.0,...,0.0,50.0,0.0,50.0,0.0,50.0,100.0,0.0,0.0,50.0
large-4x4x4-e100-b128-q-sp50.pt,50.0,0.0,,100.0,100.0,100.0,0.0,50.0,100.0,100.0,...,0.0,100.0,100.0,50.0,0.0,0.0,0.0,50.0,0.0,50.0
large-4x4x4-e100-b128-sarsa-sp10.pt,50.0,100.0,0.0,,100.0,100.0,50.0,100.0,100.0,100.0,...,100.0,100.0,100.0,0.0,100.0,0.0,0.0,100.0,100.0,0.0
large-4x4x4-e100-b128-sarsa-sp100.pt,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,100.0,100.0,...,50.0,0.0,50.0,100.0,0.0,0.0,100.0,0.0,50.0,100.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
small-4x4x4-e50-b64-q-sp50.pt,100.0,50.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,...,100.0,0.0,0.0,0.0,100.0,,0.0,0.0,100.0,50.0
small-4x4x4-e50-b64-sarsa-sp10.pt,100.0,0.0,100.0,100.0,0.0,50.0,100.0,100.0,100.0,100.0,...,100.0,100.0,50.0,50.0,50.0,100.0,,50.0,0.0,100.0
small-4x4x4-e50-b64-sarsa-sp100.pt,0.0,100.0,50.0,0.0,100.0,100.0,0.0,100.0,100.0,50.0,...,50.0,50.0,50.0,50.0,50.0,100.0,50.0,,100.0,0.0
small-4x4x4-e50-b64-sarsa-sp50.pt,50.0,100.0,100.0,0.0,50.0,0.0,0.0,100.0,100.0,0.0,...,50.0,0.0,0.0,100.0,50.0,0.0,100.0,0.0,,0.0



--- Shape: 5x5 --- (189 models)
Row’s % points vs column (draw=0.5):


model_b,large-5x5-e100-b128-q-sp10.pt,large-5x5-e100-b128-q-sp100.pt,large-5x5-e100-b128-q-sp50.pt,large-5x5-e100-b128-sarsa-sp10.pt,large-5x5-e100-b128-sarsa-sp100.pt,large-5x5-e100-b128-sarsa-sp50.pt,large-5x5-e100-b128.pt,large-5x5-e100-b256-q-sp10.pt,large-5x5-e100-b256-q-sp100.pt,large-5x5-e100-b256-q-sp50.pt,...,small-5x5-e50-b256-sarsa-sp100.pt,small-5x5-e50-b256-sarsa-sp50.pt,small-5x5-e50-b256.pt,small-5x5-e50-b64-q-sp10.pt,small-5x5-e50-b64-q-sp100.pt,small-5x5-e50-b64-q-sp50.pt,small-5x5-e50-b64-sarsa-sp10.pt,small-5x5-e50-b64-sarsa-sp100.pt,small-5x5-e50-b64-sarsa-sp50.pt,small-5x5-e50-b64.pt
model_a,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
large-5x5-e100-b128-q-sp10.pt,,100.0,50.0,75.0,100.0,50.0,50.0,0.0,50.0,25.0,...,25.0,50.0,75.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0
large-5x5-e100-b128-q-sp100.pt,0.0,,50.0,75.0,25.0,50.0,25.0,50.0,50.0,50.0,...,25.0,50.0,50.0,75.0,50.0,50.0,50.0,50.0,50.0,75.0
large-5x5-e100-b128-q-sp50.pt,50.0,50.0,,25.0,50.0,75.0,50.0,50.0,75.0,25.0,...,50.0,25.0,75.0,50.0,50.0,25.0,50.0,50.0,25.0,50.0
large-5x5-e100-b128-sarsa-sp10.pt,25.0,25.0,75.0,,100.0,75.0,25.0,0.0,25.0,25.0,...,50.0,75.0,25.0,50.0,50.0,50.0,50.0,50.0,50.0,75.0
large-5x5-e100-b128-sarsa-sp100.pt,0.0,75.0,50.0,0.0,,75.0,0.0,50.0,25.0,100.0,...,25.0,25.0,75.0,75.0,75.0,75.0,100.0,50.0,75.0,50.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
small-5x5-e50-b64-q-sp50.pt,50.0,50.0,75.0,50.0,25.0,50.0,50.0,25.0,25.0,25.0,...,50.0,50.0,50.0,50.0,25.0,,50.0,25.0,50.0,50.0
small-5x5-e50-b64-sarsa-sp10.pt,50.0,50.0,50.0,50.0,0.0,50.0,50.0,50.0,50.0,25.0,...,50.0,50.0,50.0,75.0,50.0,50.0,,50.0,50.0,50.0
small-5x5-e50-b64-sarsa-sp100.pt,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,...,50.0,50.0,50.0,50.0,25.0,75.0,50.0,,50.0,50.0
small-5x5-e50-b64-sarsa-sp50.pt,50.0,50.0,75.0,50.0,25.0,25.0,50.0,50.0,25.0,75.0,...,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,,50.0



--- Shape: 5x5x5 --- (189 models)
Row’s % points vs column (draw=0.5):


model_b,large-5x5x5-e100-b128-q-sp10.pt,large-5x5x5-e100-b128-q-sp100.pt,large-5x5x5-e100-b128-q-sp50.pt,large-5x5x5-e100-b128-sarsa-sp10.pt,large-5x5x5-e100-b128-sarsa-sp100.pt,large-5x5x5-e100-b128-sarsa-sp50.pt,large-5x5x5-e100-b128.pt,large-5x5x5-e100-b256-q-sp10.pt,large-5x5x5-e100-b256-q-sp100.pt,large-5x5x5-e100-b256-q-sp50.pt,...,small-5x5x5-e50-b256-sarsa-sp100.pt,small-5x5x5-e50-b256-sarsa-sp50.pt,small-5x5x5-e50-b256.pt,small-5x5x5-e50-b64-q-sp10.pt,small-5x5x5-e50-b64-q-sp100.pt,small-5x5x5-e50-b64-q-sp50.pt,small-5x5x5-e50-b64-sarsa-sp10.pt,small-5x5x5-e50-b64-sarsa-sp100.pt,small-5x5x5-e50-b64-sarsa-sp50.pt,small-5x5x5-e50-b64.pt
model_a,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
large-5x5x5-e100-b128-q-sp10.pt,,100.0,100.0,50.0,100.0,0.0,0.0,100.0,100.0,100.0,...,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,50.0,100.0
large-5x5x5-e100-b128-q-sp100.pt,0.0,,100.0,0.0,100.0,0.0,0.0,0.0,50.0,100.0,...,0.0,50.0,0.0,100.0,50.0,50.0,100.0,0.0,0.0,100.0
large-5x5x5-e100-b128-q-sp50.pt,0.0,0.0,,50.0,100.0,0.0,0.0,0.0,100.0,100.0,...,100.0,100.0,50.0,100.0,0.0,0.0,100.0,0.0,0.0,100.0
large-5x5x5-e100-b128-sarsa-sp10.pt,50.0,100.0,50.0,,100.0,100.0,0.0,0.0,100.0,100.0,...,100.0,100.0,100.0,100.0,50.0,50.0,100.0,100.0,50.0,100.0
large-5x5x5-e100-b128-sarsa-sp100.pt,0.0,0.0,0.0,0.0,,100.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
small-5x5x5-e50-b64-q-sp50.pt,0.0,50.0,100.0,50.0,100.0,50.0,0.0,0.0,50.0,50.0,...,0.0,50.0,0.0,50.0,50.0,,50.0,0.0,100.0,100.0
small-5x5x5-e50-b64-sarsa-sp10.pt,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,100.0,...,50.0,50.0,0.0,50.0,0.0,50.0,,50.0,100.0,0.0
small-5x5x5-e50-b64-sarsa-sp100.pt,0.0,100.0,100.0,0.0,100.0,50.0,0.0,100.0,100.0,100.0,...,50.0,50.0,0.0,100.0,0.0,100.0,50.0,,100.0,100.0
small-5x5x5-e50-b64-sarsa-sp50.pt,50.0,100.0,100.0,50.0,100.0,50.0,0.0,0.0,50.0,0.0,...,50.0,50.0,50.0,50.0,100.0,0.0,0.0,0.0,,50.0



=== Elo Ratings — games hidden ===


Unnamed: 0,player,elo
0,medium-5x5x5-e100-b256-q-sp10.pt,2164.545773
1,large-5x5x5-e20-b128-sarsa-sp10.pt,2160.763791
2,small-5x5x5-e50-b256.pt,2134.723531
3,medium-5x5x5-e100-b128-q-sp10.pt,2065.352224
4,large-5x5x5-e100-b128.pt,2059.490596
5,medium-5x5x5-e20-b128-sarsa-sp10.pt,2053.02764
6,small-4x4x4-e50-b128-sarsa-sp10.pt,2051.899833
7,small-5x5x5-e20-b128-sarsa-sp10.pt,2038.206978
8,small-5x5x5-e50-b64-sarsa-sp100.pt,2025.518508
9,medium-4x4x4-e50-b64.pt,2019.739844



=== League Table — games hidden ===


Unnamed: 0,player,elo,wins,draws,losses
0,medium-5x5x5-e100-b256-q-sp10.pt,2164.545773,1485,0,395
1,large-5x5x5-e20-b128-sarsa-sp10.pt,2160.763791,1395,15,470
2,small-5x5x5-e50-b256.pt,2134.723531,1150,5,725
3,medium-5x5x5-e100-b128-q-sp10.pt,2065.352224,1140,0,740
4,large-5x5x5-e100-b128.pt,2059.490596,1635,0,245
5,medium-5x5x5-e20-b128-sarsa-sp10.pt,2053.02764,1240,5,635
6,small-4x4x4-e50-b128-sarsa-sp10.pt,2051.899833,1180,0,700
7,small-5x5x5-e20-b128-sarsa-sp10.pt,2038.206978,1265,0,615
8,small-5x5x5-e50-b64-sarsa-sp100.pt,2025.518508,1130,0,750
9,medium-4x4x4-e50-b64.pt,2019.739844,1255,0,625


In [15]:
import pandas as pd
import numpy as np
import re

# Adjust paths if needed
self_play_path = "self_play_results.csv"
tournament_path = "tournament_results.csv"

df_sp = pd.read_csv(self_play_path)
df_t  = pd.read_csv(tournament_path)



In [16]:
# === Self-play analysis (fully independent) ===

sp = df_sp.copy()

# If your CSV already has a 'games' column, we keep it;
# otherwise we compute it from wins_X + wins_O + draws.
if "games" not in sp.columns:
    sp["games"] = sp[["wins_X", "wins_O", "draws"]].sum(axis=1)

sp["total_games"] = sp["games"]

def extract_shape(model_name: str) -> str:
    """
    Extracts board shape like '3x3', '4x4x4', etc. from a model filename.
    """
    m = re.search(r"-(\d+x\d+(x\d+)?)", model_name)
    return m.group(1) if m else "unknown"

def extract_arch(model_name: str) -> str:
    """
    Extracts architecture label from the beginning of the filename.
    """
    return model_name.split("-", 1)[0]

sp["shape"] = sp["model"].apply(extract_shape)
sp["arch"]  = sp["model"].apply(extract_arch)

# === Per algorithm (X%, O%, bias%) ===
algo_grp = (
    sp.groupby("algorithm", as_index=False)
      .agg(
          wins_X=("wins_X", "sum"),
          wins_O=("wins_O", "sum"),
          draws=("draws", "sum"),
          total_games=("total_games", "sum"),
      )
)

algo_grp["X%"]    = 100 * algo_grp["wins_X"] / algo_grp["total_games"]
algo_grp["O%"]    = 100 * algo_grp["wins_O"] / algo_grp["total_games"]
algo_grp["bias%"] = algo_grp["X%"] - algo_grp["O%"]

print("=== Per algorithm (across all models) ===")
display(algo_grp)

# === Per board shape ===
shape_grp = (
    sp.groupby("shape", as_index=False)
      .agg(
          wins_X=("wins_X", "sum"),
          wins_O=("wins_O", "sum"),
          draws=("draws", "sum"),
          total_games=("total_games", "sum"),
      )
)

shape_grp["win_rate"] = (shape_grp["wins_X"] + shape_grp["wins_O"]) / shape_grp["total_games"]

print("\n=== Average self-play win rate by board shape ===")
display(shape_grp.sort_values("shape"))

# === Per architecture ===
arch_grp = (
    sp.groupby("arch", as_index=False)
      .agg(
          wins_X=("wins_X", "sum"),
          wins_O=("wins_O", "sum"),
          draws=("draws", "sum"),
          total_games=("total_games", "sum"),
      )
)

arch_grp["win_rate"] = (arch_grp["wins_X"] + arch_grp["wins_O"]) / arch_grp["total_games"]

print("\n=== Average self-play win rate by architecture ===")
display(arch_grp.sort_values("arch"))

# Save for later if you want to reuse
per_algorithm_sp   = algo_grp
per_shape_selfplay = shape_grp
per_arch_selfplay  = arch_grp


=== Per algorithm (across all models) ===


Unnamed: 0,algorithm,wins_X,wins_O,draws,total_games,X%,O%,bias%
0,Q,10992,8152,6776,25920,42.407407,31.450617,10.95679
1,SARSA,11107,8357,6456,25920,42.85108,32.241512,10.609568



=== Average self-play win rate by board shape ===


Unnamed: 0,shape,wins_X,wins_O,draws,total_games,win_rate
0,3x3,4921,1747,1972,8640,0.771759
1,3x3x3,5177,3463,0,8640,1.0
2,4x4,1319,1940,5381,8640,0.377199
3,4x4x4,4551,4083,6,8640,0.999306
4,5x5,1784,996,5860,8640,0.321759
5,5x5x5,4347,4280,13,8640,0.998495



=== Average self-play win rate by architecture ===


Unnamed: 0,arch,wins_X,wins_O,draws,total_games,win_rate
0,large,7511,5916,3853,17280,0.777025
1,medium,7736,5321,4223,17280,0.755613
2,small,6852,5272,5156,17280,0.70162


In [17]:
# === Tournament analysis: per-player stats & best model per shape ===

t = df_t.copy()

rows = []
for _, row in t.iterrows():
    shape = row["shape"]
    games = row["games"]

    rows.append({
        "player": row["model_a"],
        "shape": shape,
        "games": games,
        "wins": row["wins_a"],
        "losses": row["wins_b"],
        "draws": row["draws"],
    })
    rows.append({
        "player": row["model_b"],
        "shape": shape,
        "games": games,
        "wins": row["wins_b"],
        "losses": row["wins_a"],
        "draws": row["draws"],
    })

per_player = (
    pd.DataFrame(rows)
      .groupby(["player", "shape"], as_index=False)
      .agg(
          games=("games", "sum"),
          wins=("wins", "sum"),
          draws=("draws", "sum"),
          losses=("losses", "sum"),
      )
)

per_player["win_rate"] = per_player["wins"] / per_player["games"]

print("=== Per-player tournament stats (top 10 by win_rate) ===")
display(per_player.sort_values("win_rate", ascending=False).head(10))

# Best model per shape by tournament win_rate
best_per_shape = (
    per_player
    .sort_values(["shape", "win_rate"], ascending=[True, False])
    .groupby("shape", as_index=False)
    .first()
)

print("\n=== Best model per shape (by tournament win_rate) ===")
display(best_per_shape[["shape", "player", "win_rate", "wins", "draws", "losses"]])

# Keep these handy for later / LaTeX
per_player_tournament = per_player
best_model_per_shape  = best_per_shape


=== Per-player tournament stats (top 10 by win_rate) ===


Unnamed: 0,player,shape,games,wins,draws,losses,win_rate
315,large-5x5x5-e100-b128-q-sp10.pt,5x5x5,1880,1675,0,205,0.890957
321,large-5x5x5-e100-b128.pt,5x5x5,1880,1635,0,245,0.869681
328,large-5x5x5-e100-b256.pt,5x5x5,1880,1625,0,255,0.864362
318,large-5x5x5-e100-b128-sarsa-sp10.pt,5x5x5,1880,1625,0,255,0.864362
608,medium-4x4x4-e20-b64.pt,4x4x4,1880,1540,0,340,0.819149
741,medium-5x5x5-e50-b128.pt,5x5x5,1880,1535,0,345,0.816489
230,large-4x4x4-e20-b64.pt,4x4x4,1880,1500,0,380,0.797872
700,medium-5x5x5-e100-b256-q-sp10.pt,5x5x5,1880,1485,0,395,0.789894
224,large-4x4x4-e20-b64-q-sp10.pt,4x4x4,1880,1485,0,395,0.789894
703,medium-5x5x5-e100-b256-sarsa-sp10.pt,5x5x5,1880,1480,0,400,0.787234



=== Best model per shape (by tournament win_rate) ===


Unnamed: 0,shape,player,win_rate,wins,draws,losses
0,3x3,large-3x3-e20-b256-sarsa-sp100.pt,0.62766,1180,130,570
1,3x3x3,large-3x3x3-e50-b64-q-sp10.pt,0.670213,1260,0,620
2,4x4,medium-4x4-e100-b256-sarsa-sp100.pt,0.526596,990,695,195
3,4x4x4,medium-4x4x4-e20-b64.pt,0.819149,1540,0,340
4,5x5,medium-5x5-e20-b256-q-sp100.pt,0.476064,895,750,235
5,5x5x5,large-5x5x5-e100-b128-q-sp10.pt,0.890957,1675,0,205


In [18]:
# === Baseline vs self-play improvement using tournament per_player ===

pp = per_player_tournament.copy()

def stem_from_model_name(name: str) -> str:
    """
    Removes q/sarsa self-play suffix and '.pt' to get the stem.
    """
    stem = name
    if stem.endswith(".pt"):
        stem = stem[:-3]
    for tag in ["-q-sp", "-sarsa-sp"]:
        if tag in stem:
            stem = stem.split(tag)[0]
    return stem

def is_baseline_model(name: str) -> bool:
    """
    Baseline models are those WITHOUT q/sarsa self-play suffixes.
    """
    return ("-q-sp" not in name) and ("-sarsa-sp" not in name)

pp["stem"] = pp["player"].apply(stem_from_model_name)
pp["is_baseline"] = pp["player"].apply(is_baseline_model)

baseline_stats = pp[pp["is_baseline"]].copy()
variant_stats  = pp[~pp["is_baseline"]].copy()

print("Baselines with tournament stats:", len(baseline_stats))
print("Self-play variants with tournament stats:", len(variant_stats))

improvement_rows = []

for _, base_row in baseline_stats.iterrows():
    stem = base_row["stem"]
    shape = base_row["shape"]
    base_player = base_row["player"]
    base_wr = base_row["win_rate"]

    variants = variant_stats[variant_stats["stem"] == stem]
    if variants.empty:
        continue

    for _, var_row in variants.iterrows():
        var_player = var_row["player"]
        var_wr = var_row["win_rate"]
        improvement = var_wr - base_wr

        improvement_rows.append({
            "shape": shape,
            "stem": stem,
            "baseline": base_player,
            "variant": var_player,
            "baseline_wr": base_wr,
            "variant_wr": var_wr,
            "improvement": improvement,
        })

df_improvement = pd.DataFrame(improvement_rows)

print("\n=== Baseline vs self-play variants (top 10 by improvement) ===")
display(df_improvement.sort_values("improvement", ascending=False).head(10))

# Best variant per baseline
best_variant_per_baseline = (
    df_improvement
    .sort_values("improvement", ascending=False)
    .groupby("stem", as_index=False)
    .first()
)

print("\n=== Best self-play variant per baseline (head) ===")
display(best_variant_per_baseline.head(10))

gains  = best_variant_per_baseline[best_variant_per_baseline["improvement"] > 0]["improvement"]
losses = best_variant_per_baseline[best_variant_per_baseline["improvement"] < 0]["improvement"]

print("\n=== Summary of baseline → best self-play improvements ===")
print(f"Baselines with at least one variant: {len(best_variant_per_baseline)}")
print(f"Improved baselines:  {len(gains)}")
print(f"Worsened baselines: {len(losses)}")

if len(gains) > 0:
    print(f"Average positive improvement: {gains.mean():.4f} win_rate")

if len(losses) > 0:
    print(f"Average negative change: {losses.mean():.4f} win_rate")

print(f"Overall mean improvement across baselines: {best_variant_per_baseline['improvement'].mean():.4f} win_rate")

# Save for later if needed
baseline_vs_variant = df_improvement
best_variant_summary = best_variant_per_baseline


Baselines with tournament stats: 162
Self-play variants with tournament stats: 972

=== Baseline vs self-play variants (top 10 by improvement) ===


Unnamed: 0,shape,stem,baseline,variant,baseline_wr,variant_wr,improvement
442,4x4,medium-4x4-e100-b256,medium-4x4-e100-b256.pt,medium-4x4-e100-b256-sarsa-sp100.pt,0.12766,0.526596,0.398936
565,5x5,medium-5x5-e20-b256,medium-5x5-e20-b256.pt,medium-5x5-e20-b256-q-sp100.pt,0.148936,0.476064,0.327128
566,5x5,medium-5x5-e20-b256,medium-5x5-e20-b256.pt,medium-5x5-e20-b256-q-sp50.pt,0.148936,0.457447,0.308511
569,5x5,medium-5x5-e20-b256,medium-5x5-e20-b256.pt,medium-5x5-e20-b256-sarsa-sp50.pt,0.148936,0.444149,0.295213
40,3x3,large-3x3-e50-b128,large-3x3-e50-b128.pt,large-3x3-e50-b128-sarsa-sp100.pt,0.340426,0.614362,0.273936
158,4x4,large-4x4-e50-b64,large-4x4-e50-b64.pt,large-4x4-e50-b64-q-sp50.pt,0.204787,0.478723,0.273936
161,4x4,large-4x4-e50-b64,large-4x4-e50-b64.pt,large-4x4-e50-b64-sarsa-sp50.pt,0.204787,0.462766,0.257979
460,4x4,medium-4x4-e20-b256,medium-4x4-e20-b256.pt,medium-4x4-e20-b256-sarsa-sp100.pt,0.125,0.37234,0.24734
28,3x3,large-3x3-e20-b256,large-3x3-e20-b256.pt,large-3x3-e20-b256-sarsa-sp100.pt,0.380319,0.62766,0.24734
439,4x4,medium-4x4-e100-b256,medium-4x4-e100-b256.pt,medium-4x4-e100-b256-q-sp100.pt,0.12766,0.369681,0.242021



=== Best self-play variant per baseline (head) ===


Unnamed: 0,stem,shape,baseline,variant,baseline_wr,variant_wr,improvement
0,large-3x3-e100-b128,3x3,large-3x3-e100-b128.pt,large-3x3-e100-b128-sarsa-sp50.pt,0.388298,0.428191,0.039894
1,large-3x3-e100-b256,3x3,large-3x3-e100-b256.pt,large-3x3-e100-b256-sarsa-sp100.pt,0.396277,0.574468,0.178191
2,large-3x3-e100-b64,3x3,large-3x3-e100-b64.pt,large-3x3-e100-b64-sarsa-sp100.pt,0.393617,0.542553,0.148936
3,large-3x3-e20-b128,3x3,large-3x3-e20-b128.pt,large-3x3-e20-b128-q-sp50.pt,0.356383,0.425532,0.069149
4,large-3x3-e20-b256,3x3,large-3x3-e20-b256.pt,large-3x3-e20-b256-sarsa-sp100.pt,0.380319,0.62766,0.24734
5,large-3x3-e20-b64,3x3,large-3x3-e20-b64.pt,large-3x3-e20-b64-q-sp100.pt,0.292553,0.473404,0.180851
6,large-3x3-e50-b128,3x3,large-3x3-e50-b128.pt,large-3x3-e50-b128-sarsa-sp100.pt,0.340426,0.614362,0.273936
7,large-3x3-e50-b256,3x3,large-3x3-e50-b256.pt,large-3x3-e50-b256-sarsa-sp100.pt,0.332447,0.56383,0.231383
8,large-3x3-e50-b64,3x3,large-3x3-e50-b64.pt,large-3x3-e50-b64-q-sp10.pt,0.420213,0.417553,-0.00266
9,large-3x3x3-e100-b128,3x3x3,large-3x3x3-e100-b128.pt,large-3x3x3-e100-b128-sarsa-sp50.pt,0.62766,0.598404,-0.029255



=== Summary of baseline → best self-play improvements ===
Baselines with at least one variant: 162
Improved baselines:  128
Worsened baselines: 31
Average positive improvement: 0.0928 win_rate
Average negative change: -0.0333 win_rate
Overall mean improvement across baselines: 0.0669 win_rate
