In [None]:
# %% Training cell (with GUI callback that disables itself on window‐close)

# Run configuration for this training cell
ENGINE_DEPTH = 1         # forwarded as max_depth to FairyStockfishMakruk
TOTAL_STEPS = 300_000    # total_timesteps for model.learn(); adjust for your run
USE_GUI = False          # True → single GUI env + RenderCallback, False → headless VecEnv

import os
import time
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.callbacks import CheckpointCallback, BaseCallback
from stable_baselines3.common.vec_env import DummyVecEnv, VecMonitor
from sb3_contrib.common.maskable.callbacks import MaskableEvalCallback
from sb3_contrib import MaskablePPO
from sb3_contrib.common.wrappers import ActionMasker
from makruk_env import FairyStockfishMakruk
from stable_baselines3.common.utils import get_schedule_fn

class RenderCallback(BaseCallback):
    """
    Renders the GUI each step; if rendering fails (e.g. the window was closed),
    disables further rendering instead of interrupting training.

    The callback never stops training: `_on_step` always returns True.
    """
    def __init__(self, verbose=0):
        super().__init__(verbose)
        # Flipped to False after the first failed render() call.
        self.enabled = True

    def _on_step(self) -> bool:
        if not self.enabled:
            return True

        # 1) Unwrap to the innermost env: first sub-env of a VecEnv, then
        #    follow `.env` links through the wrappers.
        env_obj = self.model.env
        if hasattr(env_obj, "envs"):
            env_obj = env_obj.envs[0]
        while hasattr(env_obj, "env"):
            env_obj = env_obj.env

        # 2) Try rendering; if it fails, disable future renders.
        #    The exception is reported so a genuine rendering bug is not
        #    mistaken for a closed window.
        try:
            env_obj.render()
        except Exception as exc:
            print(f"RenderCallback: render() failed ({exc!r}); disabling further rendering.")
            self.enabled = False
            return True

        # 3) On terminal, pause so you can see the result.
        #    SB3 VecEnvs auto-reset a finished sub-env before callbacks run, so
        #    the env's own `done` flag may already be cleared; also consult the
        #    `dones` / `infos` of the step that was just taken.
        dones = self.locals.get("dones")
        finished = bool(getattr(env_obj, "done", False))
        if not finished and dones is not None and len(dones) > 0:
            finished = bool(dones[0])

        if finished:
            step_infos = self.locals.get("infos") or [{}]
            step_info = step_infos[0] if isinstance(step_infos[0], dict) else {}
            env_info = getattr(env_obj, "info", None)
            if not isinstance(env_info, dict):
                # A missing or non-dict `info` must not crash training.
                env_info = {}
            reason = step_info.get("end_reason") or env_info.get("end_reason", "?")
            print(f"\n→ Episode ended: {reason}\n")
            time.sleep(1.0)

        return True

def make_gui_env():
    """Create the rendering env: FairyStockfishMakruk → Monitor → ActionMasker.

    The env is built in self-play mode with render_mode="human" and the
    engine depth taken from ENGINE_DEPTH.
    """
    def legal_mask(wrapped):
        # Reach one wrapper layer down for the legal-move mask.
        return wrapped.env.get_legal_moves_mask()

    settings = dict(
        path="./engine/fairy-stockfish-arm",
        max_depth=ENGINE_DEPTH,
        play_mode="selfplay",
        engine_timeout=2.0,
        render_mode="human",
    )
    monitored = Monitor(FairyStockfishMakruk(**settings))
    return ActionMasker(monitored, legal_mask)

def make_headless_env(rank):
    """Create one non-rendering env: FairyStockfishMakruk → Monitor → ActionMasker.

    `rank` is accepted so the factory can be used per worker index; it is not
    read inside the function.
    """
    def legal_mask(wrapped):
        # Reach one wrapper layer down for the legal-move mask.
        return wrapped.env.get_legal_moves_mask()

    settings = dict(
        path="./engine/fairy-stockfish-arm",
        max_depth=ENGINE_DEPTH,
        play_mode="selfplay",
        engine_timeout=2.0,
        render_mode=None,
    )
    monitored = Monitor(FairyStockfishMakruk(**settings))
    return ActionMasker(monitored, legal_mask)

# — Build environment & callbacks —
# A dedicated evaluation env is built alongside the training env:
# evaluating on the training env would reset it in the middle of a rollout.
if USE_GUI:
    print(">>> GUI training mode")
    env = make_gui_env()
    n_envs = 1  # a single env is stepped once per callback call
    eval_env = make_headless_env(0)  # headless: avoids a second GUI window
    extra_cbs = [RenderCallback()]
else:
    print(">>> Headless vectorized training")
    n_envs = 8
    vec = DummyVecEnv([lambda i=i: make_headless_env(i) for i in range(n_envs)])
    env = VecMonitor(vec)
    eval_env = VecMonitor(DummyVecEnv([lambda: make_headless_env(0)]))
    extra_cbs = []

# Desired cadence, expressed in environment timesteps.
CHECKPOINT_EVERY_STEPS = 50_000
EVAL_EVERY_STEPS = 50_000

# SB3 callback frequencies count callback calls, and every call advances
# n_envs timesteps. Without the division, 8 envs × 50_000 calls = 400k
# timesteps, i.e. nothing would fire within a 300k-step run.
checkpoint_cb = CheckpointCallback(
    save_freq=max(CHECKPOINT_EVERY_STEPS // n_envs, 1),
    save_path="./checkpoints/",
    name_prefix="ppo_makruk"
)
eval_cb = MaskableEvalCallback(
    eval_env,
    best_model_save_path="./best_model/",
    log_path="./eval_logs/",
    eval_freq=max(EVAL_EVERY_STEPS // n_envs, 1),
    n_eval_episodes=8,
    deterministic=True
)

# — Load existing model or create new —
# Alternative starting points:
#   "./checkpoints/ppo_makruk_400000_steps.zip"
#   "./best_model/best_model.zip"
#   "ppo_makruk_notebook.zip"
#   "ppo_makruk_pvp.zip"
model_path = "ppo_makruk_self_pvp.zip"

RESUME_LR = 1e-4         # learning rate used when continuing from a saved model
RESUME_ENT_COEF = 1e-2   # entropy bonus so the policy keeps exploring (was tried at 5e-2)

if os.path.isfile(model_path):
    print(f"Resuming from {model_path}")
    # 1) Load the checkpoint into the env built above
    model = MaskablePPO.load(model_path, env=env, device="mps")

    # 2) Tweak hyperparameters in place
    model.ent_coef = RESUME_ENT_COEF

    # PPO re-reads model.lr_schedule at the start of every train() call, so
    # changing only the optimizer's param groups would be overwritten by the
    # schedule stored in the checkpoint. Replace the schedule as well.
    model.learning_rate = RESUME_LR
    model.lr_schedule = get_schedule_fn(RESUME_LR)
    for pg in model.policy.optimizer.param_groups:
        pg["lr"] = RESUME_LR

else:
    print("Starting fresh model")
    model = MaskablePPO(
        policy="MlpPolicy",
        env=env,
        device="mps",
        verbose=1,
        n_steps=2048,
        batch_size=64,
        gamma=0.99,
        gae_lambda=0.95,
        learning_rate=2.5e-4,
        clip_range=0.2,
        ent_coef=0.0,           # start with no entropy bonus if you like
        tensorboard_log="./ppo_makruk_tb/"
    )

# — Train —
# Pass reset_num_timesteps=False here to keep counting from the checkpoint's
# timestep counter instead of restarting at 0.
model.learn(
    total_timesteps=TOTAL_STEPS,
    callback=[checkpoint_cb, eval_cb] + extra_cbs
)

# — Save —
model.save("ppo_makruk_self_pvp")



# Bruh

In [None]:
# %% Training cell (with GUI callback that disables itself on window‐close)

# Knobs for this run
ENGINE_DEPTH = 1         # engine search depth (max_depth of FairyStockfishMakruk)
TOTAL_STEPS = 300_000    # training budget handed to model.learn(); adjust for your run
USE_GUI = False          # flip to True to watch the Pygame GUI during training

import os
import time
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.callbacks import CheckpointCallback, BaseCallback
from stable_baselines3.common.vec_env import DummyVecEnv, VecMonitor
from stable_baselines3.common.logger import configure
from stable_baselines3.common.utils import get_schedule_fn
from sb3_contrib.common.maskable.callbacks import MaskableEvalCallback
from sb3_contrib import MaskablePPO
from sb3_contrib.common.wrappers import ActionMasker
from makruk_env import FairyStockfishMakruk

# ──────────────────────────────────────────────────────────────────────────────
# 1) Prepare a linear LR schedule
def linear_schedule(initial_value: float):
    """Return an SB3-style schedule that decays linearly from `initial_value` to 0.

    SB3 calls the schedule with `progress_remaining`, which runs from 1.0 at
    the start of `learn()` down to 0.0 at `total_timesteps`.
    (`get_schedule_fn(1e-4)` would instead give a constant 1e-4.)
    """
    def schedule(progress_remaining: float) -> float:
        return progress_remaining * initial_value
    return schedule

lr_schedule = linear_schedule(1e-4)  # start at 1e-4, linearly decay to 0 over TOTAL_STEPS

# 2) Callback to anneal entropy halfway
class EntropyAnnealer(BaseCallback):
    """Set the model's entropy coefficient to `new_ent` once training reaches `switch_step`.

    Args:
        switch_step: timestep count (as reported by `self.num_timesteps`) at or
            after which the switch happens.
        new_ent: value assigned to `model.ent_coef` at the switch.
        verbose: forwarded to BaseCallback.
    """
    def __init__(self, switch_step: int, new_ent: float, verbose=0):
        super().__init__(verbose)
        self.switch_step = switch_step
        self.new_ent = new_ent
        # Guards against re-applying (and re-printing) on every later step.
        self._switched = False

    def _on_step(self) -> bool:
        # num_timesteps advances by n_envs per call, so an exact `==` test is
        # skipped whenever switch_step is not a multiple of n_envs. Use `>=`
        # plus a one-shot flag instead.
        if not self._switched and self.num_timesteps >= self.switch_step:
            print(f"[EntropyAnnealer] Switching ent_coef → {self.new_ent}")
            self.model.ent_coef = self.new_ent
            self._switched = True
        return True

entropy_cb = EntropyAnnealer(switch_step=TOTAL_STEPS // 2, new_ent=1e-3)
# ──────────────────────────────────────────────────────────────────────────────

class RenderCallback(BaseCallback):
    """
    Per-step GUI renderer that switches itself off after the first failed render.
    """
    def __init__(self, verbose=0):
        super().__init__(verbose)
        self.enabled = True

    def _on_step(self) -> bool:
        # Training is never interrupted from here: every path returns True.
        if self.enabled:
            target = self.model.env
            # pick the first sub-env when an `envs` attribute is present
            target = target.envs[0] if hasattr(target, "envs") else target
            # then follow `.env` links until there are none left
            while hasattr(target, "env"):
                target = target.env
            try:
                target.render()
            except Exception:
                print("RenderCallback: GUI closed; disabling further rendering.")
                self.enabled = False
        return True

def make_gui_env():
    """Return the GUI (render_mode="human") self-play env, wrapped for monitoring and masking."""
    def mask_fn(outer):
        # The mask lives on the env one layer below the one handed in.
        return outer.env.get_legal_moves_mask()

    core = FairyStockfishMakruk(
        render_mode="human",
        engine_timeout=2.0,
        play_mode="selfplay",
        max_depth=ENGINE_DEPTH,
        path="./engine/fairy-stockfish-arm",
    )
    return ActionMasker(Monitor(core), mask_fn)

def make_headless_env(rank):
    """Return a non-rendering self-play env, wrapped for monitoring and masking.

    `rank` is part of the factory signature but is not used in the body.
    """
    def mask_fn(outer):
        # The mask lives on the env one layer below the one handed in.
        return outer.env.get_legal_moves_mask()

    core = FairyStockfishMakruk(
        render_mode=None,
        engine_timeout=2.0,
        play_mode="selfplay",
        max_depth=ENGINE_DEPTH,
        path="./engine/fairy-stockfish-arm",
    )
    return ActionMasker(Monitor(core), mask_fn)

# — Build environment & callbacks —
# A dedicated evaluation env is built alongside the training env:
# evaluating on the training env would reset it in the middle of a rollout.
if USE_GUI:
    print(">>> GUI training mode")
    env = make_gui_env()
    n_envs = 1  # a single env is stepped once per callback call
    eval_env = make_headless_env(0)  # headless: avoids a second GUI window
    extra_cbs = [RenderCallback()]
else:
    print(">>> Headless vectorized training")
    n_envs = 8
    vec = DummyVecEnv([lambda i=i: make_headless_env(i) for i in range(n_envs)])
    env = VecMonitor(vec)
    eval_env = VecMonitor(DummyVecEnv([lambda: make_headless_env(0)]))
    extra_cbs = []

# Desired cadence, expressed in environment timesteps.
CHECKPOINT_EVERY_STEPS = 50_000
EVAL_EVERY_STEPS = 50_000

# SB3 callback frequencies count callback calls, and every call advances
# n_envs timesteps; divide so the cadence above is honoured (otherwise
# 8 envs × 50_000 calls = 400k timesteps > TOTAL_STEPS and nothing fires).
checkpoint_cb = CheckpointCallback(
    save_freq=max(CHECKPOINT_EVERY_STEPS // n_envs, 1),
    save_path="./checkpoints/",
    name_prefix="ppo_makruk"
)
eval_cb = MaskableEvalCallback(
    eval_env,
    best_model_save_path="./best_model/",
    log_path="./eval_logs/",
    eval_freq=max(EVAL_EVERY_STEPS // n_envs, 1),
    n_eval_episodes=8,
    deterministic=True
)

# — Load existing model or create new —
# model_path = "./best_model/best_model.zip"
# model_path = "ppo_makruk_notebook.zip"
# model_path = "ppo_makruk_pvp.zip"
model_path = "ppo_makruk_self_pvp.zip"

if os.path.isfile(model_path):
    print(f"Resuming from {model_path}")
    model = MaskablePPO.load(model_path, env=env, device="mps")
    # override ent_coef & install new LR schedule
    model.ent_coef = 5e-2
    model.learning_rate = lr_schedule
    model.lr_schedule = lr_schedule
    # Sync the optimizer with the schedule's starting value. The schedule is a
    # function of progress_remaining (1.0 at the start of learn()), not of the
    # timestep counter.
    for pg in model.policy.optimizer.param_groups:
        pg["lr"] = lr_schedule(1.0)
    # install an explicitly configured logger (stdout + tensorboard)
    new_logger = configure("./ppo_makruk_tb/", ["stdout", "tensorboard"])
    model.set_logger(new_logger)
else:
    print("Starting fresh model")
    model = MaskablePPO(
        policy="MlpPolicy",
        env=env,
        device="mps",
        verbose=1,
        n_steps=2048,
        batch_size=64,
        gamma=0.99,
        gae_lambda=0.95,
        learning_rate=lr_schedule,
        clip_range=0.2,
        ent_coef=5e-2,
        tensorboard_log="./ppo_makruk_tb/"
    )

# — Train —
model.learn(
    total_timesteps=TOTAL_STEPS,
    callback=[checkpoint_cb, eval_cb, entropy_cb] + extra_cbs
)

# — Save —
model.save("ppo_makruk_self_pvp")


>>> Headless vectorized training
Resuming from ppo_makruk_self_pvp.zip
Logging to ./ppo_makruk_tb/




---------------------------------
| rollout/           |          |
|    ep_len_mean     | 32.1     |
|    ep_rew_mean     | -9.7     |
| time/              |          |
|    fps             | 251      |
|    iterations      | 1        |
|    time_elapsed    | 65       |
|    total_timesteps | 16384    |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 35.1        |
|    ep_rew_mean          | -9.5        |
| time/                   |             |
|    fps                  | 209         |
|    iterations           | 2           |
|    time_elapsed         | 156         |
|    total_timesteps      | 32768       |
| train/                  |             |
|    approx_kl            | 0.011772556 |
|    clip_fraction        | 0.158       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.44       |
|    explained_variance   | -0.0981     |
|    learning_rate        | 0.

### New Env

In [None]:
# %% Training cell (with GUI callback that disables itself on window‐close)

# Run settings (GUI toggle, training budget in timesteps, engine search depth)
ENGINE_DEPTH = 1
TOTAL_STEPS = 300_000
USE_GUI = False

import os, time, torch
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.callbacks import CheckpointCallback, BaseCallback
from stable_baselines3.common.vec_env import DummyVecEnv, VecMonitor
from stable_baselines3.common.logger import configure
from stable_baselines3.common.utils import get_schedule_fn
from sb3_contrib.common.maskable.callbacks import MaskableEvalCallback
from sb3_contrib import MaskablePPO
from sb3_contrib.common.wrappers import ActionMasker
from makruk_env import FairyStockfishMakruk

# ─── build lr schedule + entropy annealer ────────────────────────────────────
# NOTE(review): per the SB3 docs, get_schedule_fn on a plain float yields a
# constant schedule, so the learning rate stays at 1e-4 for the whole run.
lr_schedule = get_schedule_fn(1e-4)

class EntropyAnnealer(BaseCallback):
    """Set the model's entropy coefficient to `new_ent` once training reaches `switch_step`.

    Args:
        switch_step: timestep count (as reported by `self.num_timesteps`) at or
            after which the switch happens.
        new_ent: value assigned to `model.ent_coef` at the switch.
        verbose: forwarded to BaseCallback.
    """
    def __init__(self, switch_step, new_ent, verbose=0):
        super().__init__(verbose)
        self.switch_step = switch_step
        self.new_ent     = new_ent
        # Guards against re-applying (and re-printing) on every later step.
        self._switched   = False

    def _on_step(self) -> bool:
        # num_timesteps advances by n_envs per call, so an exact `==` test is
        # skipped whenever switch_step is not a multiple of n_envs. Use `>=`
        # plus a one-shot flag instead.
        if not self._switched and self.num_timesteps >= self.switch_step:
            print(f"[EntropyAnnealer] Switching ent_coef → {self.new_ent}")
            self.model.ent_coef = self.new_ent
            self._switched = True
        return True

entropy_cb = EntropyAnnealer(TOTAL_STEPS//2, 1e-3)

# ─── env fns ─────────────────────────────────────────────────────────────────
def make_headless_env(rank):
    """Build one headless self-play env and wrap it with Monitor, then ActionMasker.

    `rank` is accepted for per-worker construction but is not read here.
    """
    def legal_moves(outer_env):
        # Step one wrapper layer down to fetch the legal-move mask.
        return outer_env.env.get_legal_moves_mask()

    engine_env = FairyStockfishMakruk(
        path="./engine/fairy-stockfish-arm",
        max_depth=ENGINE_DEPTH,
        play_mode="selfplay",
        engine_timeout=2.0,
        render_mode=None,
    )
    monitored_env = Monitor(engine_env)
    masked_env = ActionMasker(monitored_env, legal_moves)
    return masked_env

# A dedicated evaluation env is built alongside the training env:
# evaluating on the training env would reset it in the middle of a rollout.
if USE_GUI:
    # … your GUI env code …
    env = make_headless_env(0)  # placeholder
    n_envs = 1
    eval_env = make_headless_env(0)
    extra_cbs = []  # + RenderCallback()
else:
    print(">>> Headless vectorized training")
    n_envs = 8
    vec   = DummyVecEnv([lambda i=i: make_headless_env(i) for i in range(n_envs)])
    env   = VecMonitor(vec)
    eval_env = VecMonitor(DummyVecEnv([lambda: make_headless_env(0)]))
    extra_cbs = []

# Desired cadence, expressed in environment timesteps.
CHECKPOINT_EVERY_STEPS = 50_000
EVAL_EVERY_STEPS = 50_000

# SB3 callback frequencies count callback calls, and every call advances
# n_envs timesteps; divide so the cadence above is honoured (otherwise
# 8 envs × 50_000 calls = 400k timesteps > TOTAL_STEPS and nothing fires).
checkpoint_cb = CheckpointCallback(
    save_freq=max(CHECKPOINT_EVERY_STEPS // n_envs, 1),
    save_path="./checkpoints/",
    name_prefix="ppo_makruk"
)
eval_cb = MaskableEvalCallback(
    eval_env,
    best_model_save_path="./best_model/",
    log_path="./eval_logs/",
    eval_freq=max(EVAL_EVERY_STEPS // n_envs, 1),
    n_eval_episodes=8,
    deterministic=True
)


def _new_model():
    """Create an untrained MaskablePPO on `env` with this cell's hyperparameters."""
    return MaskablePPO(
        policy="MlpPolicy",
        env=env,
        device="mps",
        verbose=1,
        n_steps=2048,
        batch_size=64,
        gamma=0.99,
        gae_lambda=0.95,
        learning_rate=lr_schedule,
        clip_range=0.2,
        ent_coef=5e-2,
        tensorboard_log="./ppo_makruk_tb/"
    )


# ─── Load old + reinit action‐head ───────────────────────────────────────────
# — Load existing model or create new —
# model_path = "./best_model/best_model.zip"
# model_path = "ppo_makruk_notebook.zip"
# model_path = "ppo_makruk_pvp.zip"
model_path = "ppo_makruk_self_pvp.zip"

# NOTE(review): this cell saves back to the same file it loads from, so every
# re-run discards the action head again, including one trained on the new
# action space. Point model_path elsewhere once the new head has been trained.
if os.path.isfile(model_path):
    print(f"▶ Re-initializing on new action-space from {model_path}")

    # 1) load old model without env to grab its weights
    old = MaskablePPO.load(model_path, env=None, device="cpu")
    old_state = old.policy.state_dict()

    # 2) instantiate brand-new model on new env
    model = _new_model()
    new_state = model.policy.state_dict()

    # 3) keep every old tensor except the actor head ("action_net"), and only
    #    where the new policy has a parameter of the same name and shape:
    #    load_state_dict(strict=False) tolerates missing keys, not size mismatches.
    filtered = {
        k: v
        for k, v in old_state.items()
        if not k.startswith("action_net")
        and k in new_state
        and v.shape == new_state[k].shape
    }
    skipped = [
        k for k in old_state
        if k not in filtered and not k.startswith("action_net")
    ]
    if skipped:
        print(f"⚠ Skipped incompatible tensors (left at fresh init): {skipped}")

    # 4) load everything _but_ the action head
    model.policy.load_state_dict(filtered, strict=False)
    print("✅ Loaded old weights except action_net → new head is randomly initialized")

    # 5) logger: ent_coef, lr_schedule and the optimizer LR are already set by
    #    the constructor above, so only the logger needs configuring here.
    new_logger = configure("./ppo_makruk_tb/", ["stdout","tensorboard"])
    model.set_logger(new_logger)

else:
    print("▶ Starting fresh model")
    model = _new_model()

# ─── Train & save ────────────────────────────────────────────────────────────
model.learn(
    total_timesteps=TOTAL_STEPS,
    callback=[checkpoint_cb, eval_cb, entropy_cb] + extra_cbs
)
model.save("ppo_makruk_self_pvp")


>>> Headless vectorized training
▶ Re-initializing on new action-space from ppo_makruk_self_pvp.zip
Using mps device
✅ Loaded old weights except action_net → new head is randomly initialized
Logging to ./ppo_makruk_tb/
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 29.5     |
|    ep_rew_mean     | -9.7     |
| time/              |          |
|    fps             | 255      |
|    iterations      | 1        |
|    time_elapsed    | 64       |
|    total_timesteps | 16384    |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 30          |
|    ep_rew_mean          | -9.4        |
| time/                   |             |
|    fps                  | 223         |
|    iterations           | 2           |
|    time_elapsed         | 146         |
|    total_timesteps      | 32768       |
| train/                  |             |
|    approx_kl       

### Testing

In [None]:
# %% Testing cell (GUI only, with explicit mask)

import time
import pygame
from sb3_contrib import MaskablePPO
from makruk_env import FairyStockfishMakruk

# 1) Load your trained model
# model = MaskablePPO.load("ppo_makruk_notebook.zip", device="mps")
model = MaskablePPO.load("./best_model/best_model.zip", device="mps")

# 2) Create the GUI env
env = FairyStockfishMakruk(
    path="./engine/fairy-stockfish-arm",
    max_depth=1,
    play_mode="selfplay",
    engine_timeout=2.0,
    render_mode="human"
)

# Defined up front so the final report works even if no step is ever taken
# (e.g. the window is closed immediately).
reward = None
info = {}

try:
    # 3) Reset & initial draw
    obs, info = env.reset()
    env.render()
    time.sleep(1.0)

    # 4) Play one self-play game
    done = False
    while not done:
        # Keep window alive; leave the game loop right away when it is closed
        # instead of stepping and rendering one more time.
        if any(event.type == pygame.QUIT for event in pygame.event.get()):
            print("Window closed; stopping playback.")
            break

        # 4a) Compute the mask of legal moves
        mask = env.get_legal_moves_mask()

        # 4b) Tell the model to only pick from those legal moves
        #     Note: the keyword is **action_masks=**, not mask=
        action, _ = model.predict(obs, action_masks=mask, deterministic=True)

        # 5) Step and render; the episode ends on termination OR truncation
        obs, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated
        env.render()
        time.sleep(0.5)

    # 6) Done
    print(f"Game over: {info.get('end_reason')}, reward={reward}")
finally:
    # Always release the env and the Pygame window, even after an error.
    env.close()
    pygame.quit()


2025-05-04 13:11:52.587 python[48292:18656827] +[IMKClient subclass]: chose IMKClient_Modern
2025-05-04 13:11:52.587 python[48292:18656827] +[IMKInputSession subclass]: chose IMKInputSession_Modern


Game over: checkmate, reward=-1.0


: 

### TensorBoard


In [None]:
# Load the TensorBoard notebook extension (run this before the %tensorboard
# magic, e.g. in the very first cell)
%load_ext tensorboard

# Open TensorBoard on port 6006, reading ./ppo_makruk_tb — the folder the
# training cells pass as tensorboard_log / logger path. Can be run in any cell.
%tensorboard --logdir ./ppo_makruk_tb --port 6006