### Training PVP


In [6]:
# selfplay_train.py

import os
import time
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.callbacks import CheckpointCallback, BaseCallback
from stable_baselines3.common.vec_env import DummyVecEnv, VecMonitor
from stable_baselines3.common.logger import configure
from sb3_contrib.common.maskable.callbacks import MaskableEvalCallback
from sb3_contrib import MaskablePPO
from sb3_contrib.common.wrappers import ActionMasker
from makruk_env import FairyStockfishMakruk

class SelfPlayMakruk(FairyStockfishMakruk):
    """
    A Makruk Env where the 'opponent' is a fixed PPO model.
    We update self (the 'current' PPO) but keep self.opponent frozen.
    """
    def __init__(
        self,
        opponent_path: str,
        device: str = "cpu",
        **kwargs  # everything FairyStockfishMakruk.__init__ accepts
    ):
        # 1) Init base env
        super().__init__(**kwargs)
        # 2) Load frozen opponent
        self.opponent: MaskablePPO = MaskablePPO.load(opponent_path, device=device)
        # 3) Force self-play mode
        self.play_mode = "selfplay"

    def step(self, action):
        # 1) Agent plays
        obs, reward, done, truncated, info = super().step(action)
        if done:
            return obs, reward, done, truncated, info

        # 2) Opponent plays
        mask = self.get_legal_moves_mask()
        opp_act, _ = self.opponent.predict(obs, action_masks=mask, deterministic=True)
        obs, opp_reward, done, truncated, info = super().step(opp_act)

        # 3) Invert opponent’s reward
        return obs, reward - opp_reward, done, truncated, info

In [7]:


# ————————————————————————————————————————————————

# 1) Paths to your checkpoints
CURRENT_MODEL_PATH = "./ppo_makruk_pvp.zip"
# CURRENT_MODEL_PATH = "./ppo_makruk_self_pvp.zip"
# CURRENT_MODEL_PATH = "./ppo_makruk_notebook.zip"
# BEST_MODEL_PATH    = "./ppo_makruk_pvp.zip"
BEST_MODEL_PATH    = "./ppo_makruk_self_pvp.zip"
# BEST_MODEL_PATH    = "./best_model/best_model.zip"

# 2) Build a vectorized self-play env
def make_env(rank):
    def _init():
        env = SelfPlayMakruk(
            opponent_path=BEST_MODEL_PATH,
            device="mps",                   # only used by SelfPlayMakruk to load its opponent
            path="./engine/fairy-stockfish-arm",
            max_depth=3,                    # opponent difficulty
            engine_timeout=2.0,
            render_mode=None                # headless
        )
        # SB3 logging wrapper
        env = Monitor(env)
        # Masking wrapper — unwrap the Monitor so get_legal_moves_mask() is on the right object
        env = ActionMasker(env, lambda m: m.env.get_legal_moves_mask())
        return env
    return _init

In [8]:
n_envs = 8
vec = DummyVecEnv([make_env(i) for i in range(n_envs)])
env = VecMonitor(vec)

# ————————————————————————————————————————————————

# 3) Load or create your “current” PPO
if os.path.isfile(CURRENT_MODEL_PATH):
    
    print(f"Resuming from {CURRENT_MODEL_PATH}")
    model = MaskablePPO.load(CURRENT_MODEL_PATH, env=env, device="mps")

    # avoid clobbering old logs
    new_logger = configure("./ppo_selfplay_tb/", ["stdout", "tensorboard"])
    model.set_logger(new_logger)

    # —––––––––––––––––––––––––––––––––––––––––––––––––––—
    # **Tweak hyperparams for self-play**  
    # keep exploring
    model.ent_coef = 1e-2
    # lower LR so it absorbs that exploration
    for pg in model.policy.optimizer.param_groups:
        pg["lr"] = 1e-4
else:
    print("Starting fresh model")
    model = MaskablePPO(
        policy="MlpPolicy",
        env=env,
        device="mps",
        verbose=1,
        n_steps=2048,
        batch_size=64,
        gamma=0.99,
        gae_lambda=0.95,
        learning_rate=2.5e-4,
        clip_range=0.2,
        ent_coef=1e-2,
        tensorboard_log="./ppo_selfplay_tb/"
    )

Resuming from ./ppo_makruk_pvp.zip
Logging to ./ppo_selfplay_tb/




In [10]:




# ————————————————————————————————————————————————

# 4) Callbacks
class BestSaver(BaseCallback):
    def __init__(self, save_freq: int, best_path: str, verbose=0):
        super().__init__(verbose)
        self.save_freq = save_freq
        self.best_path = best_path
        self.best_mean_reward = -float("inf")

    def _on_step(self) -> bool:
        if self.n_calls % self.save_freq == 0:
            mean_r = self.model.logger.name_to_value.get("rollout/ep_rew_mean")
            if mean_r is not None and mean_r > self.best_mean_reward:
                self.best_mean_reward = mean_r
                self.model.save(self.best_path)
                print(f"[BestSaver] New best {mean_r:.2f} → {self.best_path}")
        return True

checkpoint_cb = CheckpointCallback(
    save_freq=100_000, save_path="./checkpoints/", name_prefix="selfplay"
)
eval_cb = MaskableEvalCallback(
    env,
    best_model_save_path="./eval_best/",
    log_path="./eval_logs/",
    eval_freq=100_000,
    n_eval_episodes=16,
    deterministic=True
)
best_cb = BestSaver(save_freq=50_000, best_path=BEST_MODEL_PATH)

# ————————————————————————————————————————————————

# 5) Train
model.learn(
    total_timesteps=400_000,
    callback=[checkpoint_cb, eval_cb, best_cb]
)

# 6) Save your “current” policy
model.save("ppo_makruk_pvp")


DRAW TRIGGERED: counting rule draw after 8 moves
DRAW TRIGGERED: counting rule draw after 8 moves
DRAW TRIGGERED: counting rule draw after 8 moves
DRAW TRIGGERED: counting rule draw after 16 moves
DRAW TRIGGERED: counting rule draw after 8 moves
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 31.2     |
|    ep_rew_mean     | 0.508    |
| time/              |          |
|    fps             | 96       |
|    iterations      | 1        |
|    time_elapsed    | 169      |
|    total_timesteps | 16384    |
---------------------------------
DRAW TRIGGERED: counting rule draw after 8 moves
DRAW TRIGGERED: counting rule draw after 16 moves
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 32.9        |
|    ep_rew_mean          | 0.519       |
| time/                   |             |
|    fps                  | 87          |
|    iterations           | 2           |
|    time_elapsed    

### Evaluation against Own


In [7]:
# 1) Paths to your checkpoints
# CURRENT_MODEL_PATH = "./ppo_makruk_pvp.zip"
CURRENT_MODEL_PATH = "./ppo_makruk_self_pvp.zip"
# CURRENT_MODEL_PATH = "./ppo_makruk_notebook.zip"
# BEST_MODEL_PATH    = "./ppo_makruk_pvp.zip"
BEST_MODEL_PATH    = "./best_model/best_model.zip"
# BEST_MODEL_PATH    = "./ppo_makruk_self_pvp.zip"

In [8]:
from sb3_contrib import MaskablePPO
model = MaskablePPO.load(CURRENT_MODEL_PATH, device="mps")


In [9]:
# 7) Quick headless self-play evaluation vs. the frozen “best” model
eval_env = SelfPlayMakruk(
    opponent_path=BEST_MODEL_PATH,
    device="mps",
    path="./engine/fairy-stockfish-arm",
    max_depth=2,            # opponent difficulty
    engine_timeout=2.0,
    render_mode=None        # headless
)

# 3) Run N episodes, tallying wins vs. losses
n_eval = 100
wins, losses = 0, 0

for _ in range(n_eval):
    obs, info = eval_env.reset()
    done = False
    while not done:
        mask = eval_env.get_legal_moves_mask()
        action, _ = model.predict(obs, action_masks=mask, deterministic=True)
        obs, reward, done, _, info = eval_env.step(action)

    # reward > 0  → our agent delivered mate (win)
    # reward < 0  → opponent delivered mate (loss)
    if reward > 0:
        wins += 1
    elif reward < 0:
        losses += 1

eval_env.close()
print(f"Against frozen best over {n_eval} games → wins={wins}, losses={losses}")


Against frozen best over 100 games → wins=60, losses=37


### Evaluation Against Engine


In [11]:
# 1) Paths to your checkpoints
CURRENT_MODEL_PATH = "./ppo_makruk_pvp.zip"
# CURRENT_MODEL_PATH = "./ppo_makruk_self_pvp.zip"
# CURRENT_MODEL_PATH = "./best_model/best_model.zip"
# CURRENT_MODEL_PATH = "./ppo_makruk_notebook.zip"
BEST_MODEL_PATH    = "./ppo_makruk_pvp.zip"
# BEST_MODEL_PATH    = "./best_model/best_model.zip"

In [4]:
from sb3_contrib import MaskablePPO
model = MaskablePPO.load(CURRENT_MODEL_PATH, device="mps")

In [6]:
import time
from sb3_contrib import MaskablePPO
from makruk_env import FairyStockfishMakruk



# 2) Build a fresh headless env at your chosen difficulty
eval_env = FairyStockfishMakruk(
    path="./engine/fairy-stockfish-arm",
    max_depth=2,            # ← set difficulty here
    play_mode="selfplay",
    engine_timeout=2.0,
    render_mode=None        # headless
)

# 3) Run N games, counting only wins vs. losses
n_eval = 100
wins = 0
losses = 0

for _ in range(n_eval):
    obs, info = eval_env.reset()
    done = False
    while not done:
        mask = eval_env.get_legal_moves_mask()
        action, _ = model.predict(obs, action_masks=mask, deterministic=True)
        obs, reward, done, _, info = eval_env.step(action)

    # +10 → agent mate → win; –10 → engine mate → loss; 0 (draw) ignored
    if reward > 0:
        wins += 1
    elif reward < 0:
        losses += 1

eval_env.close()
print(f"Over {n_eval} games at depth=1 → wins={wins}, losses={losses}")


Over 100 games at depth=1 → wins=0, losses=56


### GUI Testing


In [None]:
# %% Testing cell (GUI only, with explicit mask)

import time
import pygame
from sb3_contrib import MaskablePPO
from makruk_env import FairyStockfishMakruk

# 1) Load your trained model
model = MaskablePPO.load(CURRENT_MODEL_PATH, device="mps")
# model = MaskablePPO.load("./best_model/best_model.zip", device="mps")

# 2) Create the GUI env
env = FairyStockfishMakruk(
    path="./engine/fairy-stockfish-arm",
    max_depth=2,
    play_mode="selfplay",
    engine_timeout=2.0,
    render_mode="human"
)

# 3) Reset & initial draw
obs, info = env.reset()
env.render()
time.sleep(1.0)

# 4) Play one self-play game
done = False
while not done:
    # Keep window alive
    for e in pygame.event.get():
        if e.type == pygame.QUIT:
            done = True
            break

    # 4a) Compute the mask of legal moves
    mask = env.get_legal_moves_mask()

    # 4b) Tell the model to only pick from those legal moves  
    #     Note: the keyword is **action_masks=**, not mask=
    action, _ = model.predict(obs, action_masks=mask, deterministic=True)
    
    # 5) Step and render
    obs, reward, done, _, info = env.step(action)
    env.render()
    time.sleep(0.5)

# 6) Done
print(f"Game over: {info.get('end_reason')}, reward={reward}")
env.close()
pygame.quit()


2025-05-07 14:15:55.692 python[96095:28355733] +[IMKClient subclass]: chose IMKClient_Modern
2025-05-07 14:15:55.692 python[96095:28355733] +[IMKInputSession subclass]: chose IMKInputSession_Modern


Game over: checkmate, reward=-1.0


: 

### Tensor Board


In [4]:
# in the very first cell
%load_ext tensorboard

# in any cell
%tensorboard --logdir ./ppo_selfplay_tb --port 6010

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6010 (pid 33929), started 0:00:01 ago. (Use '!kill 33929' to kill it.)

In [10]:
from sb3_contrib import MaskablePPO
import torch
# load your MPS-trained model
model = MaskablePPO.load("./FInalModel/ppo_makruk_pvp.zip", env=env, device="mps")

# move everything to CPU
model.policy.to("cpu")
model.device = torch.device("cpu")

# overwrite the file (or save to a new name)
model.save("ppo_makruk_cpu.zip")
