In [None]:
# selfplay_train.py

import os
import time
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.callbacks import CheckpointCallback, BaseCallback
from stable_baselines3.common.vec_env import DummyVecEnv, VecMonitor
from stable_baselines3.common.logger import configure
from sb3_contrib.common.maskable.callbacks import MaskableEvalCallback
from sb3_contrib import MaskablePPO
from sb3_contrib.common.wrappers import ActionMasker
from makruk_env import FairyStockfishMakruk
from stable_baselines3 import PPO


class SelfPlayMakruk(FairyStockfishMakruk):
    """
    A Makruk Env where the 'opponent' is a fixed PPO model.
    We update self (the 'current' PPO) but keep self.opponent frozen.
    """
    def __init__(
        self,
        opponent_path: str,
        device: str = "cpu",
        **kwargs  # everything FairyStockfishMakruk.__init__ accepts
    ):
        # 1) Init base env
        super().__init__(**kwargs)
        # 2) Load frozen opponent
        try:
            # try maskable first
            self.opponent = MaskablePPO.load(opponent_path, device=device)
            self._is_maskable = True
        except ValueError:
            # fallback to vanilla SB3 PPO
            self.opponent = PPO.load(opponent_path, device=device)
            self._is_maskable = False
        # 3) Force self-play mode
        self.play_mode = "selfplay"

    def step(self, action):
        # 1) Agent plays
        obs, reward, done, truncated, info = super().step(action)
        if done:
            return obs, reward, done, truncated, info

        # 2) Opponent plays
        mask = self.get_legal_moves_mask()
        opp_act, _ = self.opponent.predict(obs, action_masks=mask, deterministic=True)
        obs, opp_reward, done, truncated, info = super().step(opp_act)

        # 3) Invert opponent’s reward
        return obs, reward - opp_reward, done, truncated, info

In [15]:
# 1) Paths to your checkpoints
CURRENT_MODEL_PATH = "./ppo_makruk_pvp.zip"
# CURRENT_MODEL_PATH = "./ppo_makruk_self_pvp.zip"
# CURRENT_MODEL_PATH = "./ppo_makruk_notebook.zip"
# BEST_MODEL_PATH    = "./ppo_makruk_pvp.zip"
# BEST_MODEL_PATH    = "./best_model/best_model.zip"
# BEST_MODEL_PATH    = "./ppo_makruk_self_pvp.zip"
BEST_MODEL_PATH = "./ppo_imitation_raw_policy.zip"

In [16]:
from sb3_contrib import MaskablePPO
model = MaskablePPO.load(CURRENT_MODEL_PATH, device="mps")


In [17]:
# 7) Quick headless self-play evaluation vs. the frozen “best” model
eval_env = SelfPlayMakruk(
    opponent_path=BEST_MODEL_PATH,
    device="mps",
    path="./engine/fairy-stockfish-arm",
    max_depth=1,            # opponent difficulty
    engine_timeout=2.0,
    render_mode=None        # headless
)

# 3) Run N episodes, tallying wins vs. losses
n_eval = 100
wins, losses = 0, 0

for _ in range(n_eval):
    obs, info = eval_env.reset()
    done = False
    while not done:
        mask = eval_env.get_legal_moves_mask()
        action, _ = model.predict(obs, action_masks=mask, deterministic=True)
        obs, reward, done, _, info = eval_env.step(action)

    # reward > 0  → our agent delivered mate (win)
    # reward < 0  → opponent delivered mate (loss)
    if reward > 0:
        wins += 1
    elif reward < 0:
        losses += 1

eval_env.close()
print(f"Against frozen best over {n_eval} games → wins={wins}, losses={losses}")




TypeError: BaseAlgorithm.predict() got an unexpected keyword argument 'action_masks'

In [18]:
from sb3_contrib import MaskablePPO
from stable_baselines3 import PPO
import os

class SelfPlayMakruk(FairyStockfishMakruk):
    def __init__(self, opponent_path: str, device: str = "cpu", **kwargs):
        # 1) Init base env
        super().__init__(**kwargs)
        # 2) Load frozen opponent with the right loader
        try:
            # try maskable first
            self.opponent = MaskablePPO.load(opponent_path, device=device)
            self._is_maskable = True
        except ValueError:
            # fallback to vanilla SB3 PPO
            self.opponent = PPO.load(opponent_path, device=device)
            self._is_maskable = False

        # 3) Force self-play mode
        self.play_mode = "selfplay"

    def get_best_move(self, depth=None):
        # 1) Observation + mask
        obs  = self.get_fen_tensor()
        mask = self.get_legal_moves_mask()

        if self._is_maskable:
            # MaskablePPO → supply action_masks
            a2, _ = self.opponent.predict(obs, action_masks=mask, deterministic=True)
        else:
            # vanilla PPO → ignore masks
            a2, _ = self.opponent.predict(obs, deterministic=True)

        return self.uci_moves[a2]


In [20]:
agent1 = MaskablePPO.load("ppo_makruk_self_pvp.zip", device="mps")
agent2_path = "./ppo_imitation_raw_policy.zip"  # could be Maskable or not

env = SelfPlayMakruk(
    opponent_path=agent2_path,
    device="cpu",
    path="./engine/fairy-stockfish-arm",
    max_depth=1,
    engine_timeout=2.0,
    render_mode=None
)

wins = losses = 0
for ep in range(100):
    obs, _ = env.reset()
    done = False
    while not done:
        a1, _ = agent1.predict(obs, deterministic=True)
        obs, reward, done, _, _ = env.step(a1)
    if reward > 0:
        wins += 1
    elif reward < 0:
        losses += 1

print(f"Agent1 wins={wins}, Agent2 wins={losses}")


KeyboardInterrupt: 