In [7]:
# Install the swig system package first, then the Python packages (the box2d-py wheel is built during the pip step).
# apt-get is used instead of apt because apt's command-line interface is not meant for scripted use.
!apt-get -yq install swig >/dev/null
# %pip (not !pip) installs into the environment of the running kernel.
%pip -q install "gymnasium[box2d]" imageio imageio-ffmpeg opencv-python-headless einops



  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for box2d-py (setup.py) ... [?25l[?25hdone


In [5]:
# NOTE(review): earlier install attempt (In [5]); the captured output below shows the box2d-py wheel build failing.
# The explicit %pip magic does not depend on automagic being enabled and targets the running kernel's environment.
%pip install "gymnasium[box2d]" torch numpy imageio imageio-ffmpeg opencv-python-headless

Collecting box2d-py==2.3.5 (from gymnasium[box2d])
  Using cached box2d-py-2.3.5.tar.gz (374 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting swig==4.* (from gymnasium[box2d])
  Using cached swig-4.3.1-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (3.5 kB)
Using cached swig-4.3.1-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.9 MB)
Building wheels for collected packages: box2d-py
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py bdist_wheel[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Building wheel for box2d-py (setup.py) ... [?25lerror
[31m  ERROR: Failed building wheel for box2d-py[0m[31m
[0m[?25h  Running setup.py clean for box2d-py
Failed to build box2d-py
[31mERROR: ERROR: Failed to build installable wheels for some pypro

In [8]:
# === SAC on LunarLanderContinuous-v3 + MP4 recording ===
# Colab tip (one-time): !pip -q install "gymnasium[box2d]" torch numpy imageio imageio-ffmpeg opencv-python-headless

import os, math, random, time
from dataclasses import dataclass
from typing import Tuple
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

import gymnasium as gym
from gymnasium.wrappers import RecordVideo

# ------------- Utils -------------
def set_seed(seed: int):
    """Seed Python's `random`, NumPy's global RNG and PyTorch (CPU and CUDA) with the same value."""
    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all)
    for seeder in seeders:
        seeder(seed)

def to_tensor(x, device):
    """Return `x` as a float32 tensor on `device` (via `torch.as_tensor`)."""
    return torch.as_tensor(x, device=device, dtype=torch.float32)

def mlp(sizes, activation=nn.ReLU, output_activation=nn.Identity):
    """Build a fully connected stack as an `nn.Sequential`.

    One `nn.Linear` is created per consecutive pair in `sizes`; each is followed
    by a fresh `activation()` instance, except the last, which is followed by
    `output_activation()`.
    """
    final_idx = len(sizes) - 2  # index of the last Linear layer
    modules = []
    for idx, (n_in, n_out) in enumerate(zip(sizes[:-1], sizes[1:])):
        nonlinearity = output_activation if idx >= final_idx else activation
        modules.append(nn.Linear(n_in, n_out))
        modules.append(nonlinearity())
    return nn.Sequential(*modules)

# ------------- Replay -------------
class ReplayBuffer:
    """Fixed-capacity ring buffer of transitions stored in preallocated float32 NumPy arrays."""

    def __init__(self, obs_dim, act_dim, size, device):
        """Allocate storage for `size` transitions; `device` is where sampled tensors are placed."""
        self.device = device
        self.max_size = size
        self.ptr = 0    # next slot to write
        self.size = 0   # number of valid transitions currently stored
        self.obs = np.zeros((size, obs_dim), dtype=np.float32)
        self.obs2 = np.zeros((size, obs_dim), dtype=np.float32)
        self.acts = np.zeros((size, act_dim), dtype=np.float32)
        self.rews = np.zeros(size, dtype=np.float32)
        self.done = np.zeros(size, dtype=np.float32)

    def store(self, o, a, r, o2, d):
        """Write one transition at the write pointer, overwriting the oldest entry once full."""
        slot = self.ptr
        self.obs[slot] = o
        self.acts[slot] = a
        self.rews[slot] = r
        self.obs2[slot] = o2
        self.done[slot] = d
        self.ptr = (slot + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)

    def sample(self, batch_size):
        """Draw `batch_size` stored transitions uniformly with replacement.

        Returns a dict of tensors keyed 'obs', 'obs2', 'acts', 'rews', 'done';
        'rews' and 'done' get a trailing singleton dimension.
        """
        picks = np.random.randint(0, self.size, size=batch_size)
        batch = {}
        for key, storage in (("obs", self.obs), ("obs2", self.obs2), ("acts", self.acts)):
            batch[key] = to_tensor(storage[picks], self.device)
        for key, storage in (("rews", self.rews), ("done", self.done)):
            batch[key] = to_tensor(storage[picks], self.device).unsqueeze(-1)
        return batch

# ------------- Networks -------------
class SquashedGaussianActor(nn.Module):
    """Gaussian policy whose samples are squashed by tanh and rescaled to the action bounds.

    A shared MLP trunk feeds two linear heads producing the mean and the
    (clamped) log standard deviation of a diagonal Normal over pre-tanh actions.
    """

    def __init__(self, obs_dim, act_dim, hidden=(256,256), log_std_bounds=(-20,2), action_low=None, action_high=None):
        """Build the trunk, the two heads and the action-rescaling buffers.

        Args:
            obs_dim: size of the observation vector.
            act_dim: size of the action vector.
            hidden: widths of the trunk layers.
            log_std_bounds: (min, max) used to clamp the predicted log-std.
            action_low / action_high: per-dimension action bounds; when omitted
                they default to -1 and +1 for every action dimension.

        Raises:
            ValueError: if the bounds are not finite or `action_high` is not
                strictly greater than `action_low` (the log-prob correction
                takes log of the half-range, which must be positive and finite).
        """
        super().__init__()
        self.net = mlp([obs_dim, *hidden], nn.ReLU, nn.ReLU)
        self.mu = nn.Linear(hidden[-1], act_dim)
        self.log_std = nn.Linear(hidden[-1], act_dim)
        self.lmin, self.lmax = log_std_bounds

        # The None defaults used to crash inside torch.as_tensor; fall back to the tanh range.
        if action_low is None:
            low = torch.full((act_dim,), -1.0)
        else:
            low = torch.as_tensor(action_low, dtype=torch.float32)
        if action_high is None:
            high = torch.full((act_dim,), 1.0)
        else:
            high = torch.as_tensor(action_high, dtype=torch.float32)
        bounds_ok = torch.isfinite(low).all() and torch.isfinite(high).all() and (high > low).all()
        if not bool(bounds_ok):
            raise ValueError("action bounds must be finite with action_high > action_low in every dimension")

        self.register_buffer('a_low', low)
        self.register_buffer('a_high', high)
        self.register_buffer('a_scale', (self.a_high - self.a_low)/2.0)
        self.register_buffer('a_mean',  (self.a_high + self.a_low)/2.0)

    def forward(self, o):
        """Return `(mu, std)` of the pre-tanh Normal for observations `o`."""
        h = self.net(o)
        mu = self.mu(h)
        log_std = torch.clamp(self.log_std(h), self.lmin, self.lmax)
        std = torch.exp(log_std)
        return mu, std

    @torch.no_grad()
    def act_mean(self, o):
        """Deterministic action: tanh of the mean, rescaled to the action bounds (no gradients)."""
        mu, _ = self.forward(o)
        y = torch.tanh(mu)
        return self.a_mean + self.a_scale * y

    def sample(self, o):
        """Draw a reparameterised action and its log-probability.

        Returns:
            `(a, logp)`: the rescaled action and the log-density of `a`, summed
            over action dimensions with the last dimension kept (size 1).
            The `.view(1,1)` correction below assumes `o` is 2-D (batch, obs_dim).
        """
        mu, std = self.forward(o)
        dist = torch.distributions.Normal(mu, std)
        u = dist.rsample()               # pre-tanh sample
        y = torch.tanh(u)                # in [-1, 1]
        a = self.a_mean + self.a_scale * y
        logp = dist.log_prob(u).sum(-1, keepdim=True)
        # Change of variables for tanh; the 1e-6 keeps the log finite when |y| reaches 1.
        logp -= torch.log(1 - y.pow(2) + 1e-6).sum(-1, keepdim=True)
        # Change of variables for the affine rescale to [a_low, a_high].
        logp -= torch.log(self.a_scale).sum().view(1,1)
        return a, logp

class QCritic(nn.Module):
    """Q-network: an MLP over the concatenated (observation, action) vector with a single output."""

    def __init__(self, obs_dim, act_dim, hidden=(256,256)):
        """Create the MLP with input width `obs_dim + act_dim`, hidden widths `hidden` and output width 1."""
        super().__init__()
        layer_sizes = [obs_dim + act_dim] + list(hidden) + [1]
        self.q = mlp(layer_sizes, nn.ReLU, nn.Identity)

    def forward(self, o, a):
        """Concatenate `o` and `a` along the last dimension and return the network output."""
        joint = torch.cat((o, a), dim=-1)
        return self.q(joint)

# ------------- SAC Core -------------
@dataclass
class SACConfig:
    """Hyperparameters and output settings for the SAC training run."""

    # Environment and reproducibility
    env_id: str = "LunarLanderContinuous-v3"   # id passed to gym.make
    seed: int = 0

    # Interaction / update schedule (all counted in environment steps)
    steps: int = 200_000          # total environment steps
    start_steps: int = 5_000      # actions are sampled from the action space before this step
    update_after: int = 5_000     # gradient updates begin at this step
    update_every: int = 1         # an update runs on steps divisible by this value
    batch_size: int = 256

    # Optimisation
    gamma: float = 0.99           # discount factor in the TD backup
    tau: float = 0.005            # Polyak coefficient for the target critics
    actor_lr: float = 3e-4
    critic_lr: float = 3e-4
    alpha_lr: float = 3e-4        # learning rate for the entropy temperature
    hidden: Tuple[int, int] = (256, 256)   # hidden layer widths of actor and critics
    target_entropy_scale: float = 1.0      # target entropy = -scale * action dimension

    # Evaluation and video output
    eval_episodes: int = 5
    video_folder: str = "videos"
    video_name: str = "sac_lander_demo"    # filename prefix for the recorded video

class SACAgent:
    """Soft Actor-Critic agent: a squashed-Gaussian actor, twin Q critics with
    Polyak-averaged target copies, and a learned entropy temperature."""

    def __init__(self, obs_space, act_space, cfg: SACConfig, device):
        """Build networks and optimisers from the spaces' shapes and the action bounds.

        Args:
            obs_space / act_space: spaces exposing `.shape`; `act_space` must also expose `.low` and `.high`.
            cfg: hyperparameters (hidden sizes, learning rates, gamma, tau, target entropy scale).
            device: torch device that holds all networks.
        """
        obs_dim = int(np.prod(obs_space.shape))
        act_dim = int(np.prod(act_space.shape))
        self.cfg, self.device = cfg, device
        a_low, a_high = np.asarray(act_space.low, np.float32), np.asarray(act_space.high, np.float32)
        self.actor = SquashedGaussianActor(obs_dim, act_dim, cfg.hidden, action_low=a_low, action_high=a_high).to(device)
        self.q1 = QCritic(obs_dim, act_dim, cfg.hidden).to(device)
        self.q2 = QCritic(obs_dim, act_dim, cfg.hidden).to(device)
        self.q1_t = QCritic(obs_dim, act_dim, cfg.hidden).to(device)
        self.q2_t = QCritic(obs_dim, act_dim, cfg.hidden).to(device)
        self.q1_t.load_state_dict(self.q1.state_dict())
        self.q2_t.load_state_dict(self.q2.state_dict())
        # Target critics are only ever changed by Polyak averaging, never by an optimiser.
        for p in (*self.q1_t.parameters(), *self.q2_t.parameters()):
            p.requires_grad_(False)
        self.pi_opt  = torch.optim.Adam(self.actor.parameters(), lr=cfg.actor_lr)
        self.q1_opt  = torch.optim.Adam(self.q1.parameters(),   lr=cfg.critic_lr)
        self.q2_opt  = torch.optim.Adam(self.q2.parameters(),   lr=cfg.critic_lr)
        # The temperature is optimised in log space; alpha = exp(log_alpha).
        self.log_alpha = torch.tensor(0.0, requires_grad=True, device=device)
        self.a_opt = torch.optim.Adam([self.log_alpha], lr=cfg.alpha_lr)
        self.target_entropy = - cfg.target_entropy_scale * act_dim

    @property
    def alpha(self):
        """Current entropy temperature, exp(log_alpha)."""
        return self.log_alpha.exp()

    def select(self, o, deterministic=False):
        """Return an action for a single observation `o` as a NumPy array.

        With `deterministic=True` the tanh-squashed mean is used; otherwise the
        action is sampled from the policy. No gradients are recorded.
        """
        o = to_tensor(o, self.device).unsqueeze(0)   # add batch dimension
        with torch.no_grad():
            if deterministic:
                a = self.actor.act_mean(o)
            else:
                a, _ = self.actor.sample(o)
        return a.cpu().numpy()[0]

    def update(self, batch):
        """Run one SAC gradient step on `batch`.

        Args:
            batch: dict with tensors under 'obs', 'obs2', 'acts', 'rews', 'done'
                (as produced by `ReplayBuffer.sample`).

        Returns:
            dict of Python floats: critic losses 'lq1'/'lq2', actor loss 'lpi',
            and the temperature 'alpha' after its update.
        """
        o, o2, a, r, d = batch['obs'], batch['obs2'], batch['acts'], batch['rews'], batch['done']
        tau = self.cfg.tau
        # The temperature is a constant for the critic and actor losses.
        alpha = self.alpha.detach()

        # ---- critics: regress onto the entropy-regularised TD target ----
        with torch.no_grad():
            a2, logp2 = self.actor.sample(o2)
            q_targ = torch.min(self.q1_t(o2, a2), self.q2_t(o2, a2)) - alpha * logp2
            backup = r + self.cfg.gamma * (1 - d) * q_targ
        q1 = self.q1(o, a); q2 = self.q2(o, a)
        lq1 = F.mse_loss(q1, backup); lq2 = F.mse_loss(q2, backup)
        self.q1_opt.zero_grad(set_to_none=True); lq1.backward(); self.q1_opt.step()
        self.q2_opt.zero_grad(set_to_none=True); lq2.backward(); self.q2_opt.step()

        # ---- actor: freeze the critics so backward() does not compute their (unused) gradients ----
        critic_params = [*self.q1.parameters(), *self.q2.parameters()]
        for p in critic_params:
            p.requires_grad_(False)
        try:
            api, logpi = self.actor.sample(o)
            qpi = torch.min(self.q1(o, api), self.q2(o, api))
            lpi = (alpha * logpi - qpi).mean()
            self.pi_opt.zero_grad(set_to_none=True); lpi.backward(); self.pi_opt.step()
        finally:
            for p in critic_params:
                p.requires_grad_(True)

        # ---- temperature: move the policy entropy towards target_entropy ----
        lalpha = -(self.log_alpha * (logpi.detach() + self.target_entropy)).mean()
        self.a_opt.zero_grad(set_to_none=True); lalpha.backward(); self.a_opt.step()

        # ---- target critics: pt <- (1 - tau) * pt + tau * p ----
        with torch.no_grad():
            for net, targ in ((self.q1, self.q1_t), (self.q2, self.q2_t)):
                for p, pt in zip(net.parameters(), targ.parameters()):
                    pt.mul_(1 - tau).add_(p, alpha=tau)
        return dict(lq1=lq1.item(), lq2=lq2.item(), lpi=lpi.item(), alpha=self.alpha.item())

# ------------- Eval & Record -------------
def eval_policy(env_fn, agent, episodes=5, seed=None):
    """Roll out the agent's deterministic policy and summarise the episode returns.

    Args:
        env_fn: zero-argument callable returning a fresh environment.
        agent: object with `select(obs, deterministic=True)`.
        episodes: number of evaluation episodes.
        seed: if given, the environment is reset once with this seed before the episodes start.

    Returns:
        `(mean_return, std_return)` over the episodes, as Python floats.
    """
    env = env_fn()
    try:
        if seed is not None:
            env.reset(seed=seed)
        rets = []
        for _ in range(episodes):
            o, _info = env.reset()
            done, ep_ret = False, 0.0
            while not done:
                o, r, term, trunc, _info = env.step(agent.select(o, deterministic=True))
                ep_ret += float(r)
                done = term or trunc
            rets.append(ep_ret)
    finally:
        # Release the environment even if the rollout raises.
        env.close()
    return float(np.mean(rets)), float(np.std(rets))

def record_one_episode(agent, cfg: SACConfig, out_folder="videos", name_prefix="sac_lander_demo"):
    """Run one deterministic episode wrapped in `RecordVideo` and return the output folder.

    Args:
        agent: object with `select(obs, deterministic=True)`.
        cfg: configuration; only `cfg.env_id` is read here.
        out_folder: directory for the video (created if missing).
        name_prefix: filename prefix handed to `RecordVideo`.

    Returns:
        `out_folder`; the video file itself is written by the `RecordVideo` wrapper.
    """
    os.makedirs(out_folder, exist_ok=True)
    # render_mode must be rgb_array for RecordVideo
    env = gym.make(cfg.env_id, render_mode="rgb_array")
    try:
        env = RecordVideo(env, video_folder=out_folder, name_prefix=name_prefix, episode_trigger=lambda ep: True)
        o, _ = env.reset()
        done = False
        while not done:
            o, _, term, trunc, _ = env.step(agent.select(o, deterministic=True))
            done = term or trunc
    finally:
        # Close the env (and the recorder, once wrapped) even if the rollout raises.
        env.close()
    return out_folder  # mp4 is saved automatically

# ------------- Train + Record -------------
def train_and_record():
    """Train SAC with the default `SACConfig`, evaluating periodically, then record one demo episode.

    Prints per-episode returns, periodic loss/temperature values and evaluation
    results; finally prints the absolute path of the video folder.
    """
    cfg = SACConfig()
    set_seed(cfg.seed)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def make_env(): return gym.make(cfg.env_id)
    env = make_env()
    try:
        obs_space, act_space = env.observation_space, env.action_space
        assert len(act_space.shape) == 1, "Continuous Box action space required."
        # The space has its own RNG; seed it so the random warm-up actions are reproducible.
        act_space.seed(cfg.seed)

        agent = SACAgent(obs_space, act_space, cfg, device)
        buf = ReplayBuffer(obs_space.shape[0], act_space.shape[0], size=300_000, device=device)

        o, _ = env.reset(seed=cfg.seed)
        ep_ret, ep_len = 0.0, 0
        print(f"Device: {device} | Env: {cfg.env_id} | Obs: {obs_space.shape} | Act: {act_space.shape}")

        for t in range(1, cfg.steps + 1):
            # Random actions during warm-up, policy samples afterwards.
            a = act_space.sample() if t < cfg.start_steps else agent.select(o)
            o2, r, term, trunc, _ = env.step(a)
            # Store only true termination as `done`: a time-limit truncation is not a
            # terminal state, so the TD target must still bootstrap from o2.
            buf.store(o, a, r, o2, float(term))
            o = o2; ep_ret += float(r); ep_len += 1
            if term or trunc:
                print(f"[Step {t:>7}] EpRet: {ep_ret:7.1f} | EpLen: {ep_len:4d}")
                o, _ = env.reset(); ep_ret, ep_len = 0.0, 0

            if t >= cfg.update_after and t % cfg.update_every == 0:
                info = agent.update(buf.sample(cfg.batch_size))
                if t % 2000 == 0:
                    print(f"[Upd {t:>7}] alpha={info['alpha']:.3f} | Lq1={info['lq1']:.3f} | Lq2={info['lq2']:.3f} | Lpi={info['lpi']:.3f}")

            if t % 10_000 == 0:
                mean_ret, std_ret = eval_policy(make_env, agent, episodes=cfg.eval_episodes)
                print(f"== Eval @ {t:>7} steps: mean={mean_ret:.1f} ± {std_ret:.1f}")
    finally:
        # Release the training environment even if training is interrupted or fails.
        env.close()

    print("Training done. Recording deterministic demo...")
    folder = record_one_episode(agent, cfg, out_folder=cfg.video_folder, name_prefix=cfg.video_name)
    print(f"Saved video(s) to: {os.path.abspath(folder)}")


train_and_record()


Device: cpu | Env: LunarLanderContinuous-v3 | Obs: (8,) | Act: (2,)
[Step      80] EpRet:    74.9 | EpLen:   80
[Step     207] EpRet:  -302.5 | EpLen:  127
[Step     315] EpRet:  -309.4 | EpLen:  108
[Step     413] EpRet:  -403.5 | EpLen:   98
[Step     513] EpRet:  -250.8 | EpLen:  100
[Step     686] EpRet:  -294.8 | EpLen:  173
[Step     814] EpRet:   -21.9 | EpLen:  128
[Step     980] EpRet:   -65.5 | EpLen:  166
[Step    1064] EpRet:  -195.7 | EpLen:   84
[Step    1189] EpRet:   -61.2 | EpLen:  125
[Step    1283] EpRet:  -167.3 | EpLen:   94
[Step    1418] EpRet:  -135.4 | EpLen:  135
[Step    1518] EpRet:   -46.0 | EpLen:  100
[Step    2518] EpRet:    14.6 | EpLen: 1000
[Step    2640] EpRet:  -155.2 | EpLen:  122
[Step    2785] EpRet:  -199.2 | EpLen:  145
[Step    2918] EpRet:  -373.3 | EpLen:  133
[Step    3005] EpRet:   -70.4 | EpLen:   87
[Step    3085] EpRet:  -206.4 | EpLen:   80
[Step    3188] EpRet:   -64.1 | EpLen:  103
[Step    3349] EpRet:  -106.4 | EpLen:  161
[Step   

  logger.warn(
  IMAGEMAGICK_BINARY = r"C:\Program Files\ImageMagick-6.8.8-Q16\magick.exe"


Saved video(s) to: /content/videos
