In [None]:
# Copyright (c) 2022-2025, The Isaac Lab Project Developers.
# All rights reserved.
#
# SPDX-License-Identifier: BSD-3-Clause

"""Script to train RL agent with RSL-RL."""

"""Launch Isaac Sim Simulator first."""

import argparse
import sys

from isaaclab.app import AppLauncher

# local imports
import cli_args  # isort: skip


# add argparse arguments
parser = argparse.ArgumentParser(description="Train an RL agent with RSL-RL.")
parser.add_argument("--video", action="store_true", default=False, help="Record videos during training.")
parser.add_argument("--video_length", type=int, default=200, help="Length of the recorded video (in steps).")
parser.add_argument("--video_interval", type=int, default=2000, help="Interval between video recordings (in steps).")
parser.add_argument("--num_envs", type=int, default=None, help="Number of environments to simulate.")
parser.add_argument("--task", type=str, default=None, help="Name of the task.")
parser.add_argument("--seed", type=int, default=None, help="Seed used for the environment")
parser.add_argument("--max_iterations", type=int, default=None, help="RL Policy training iterations.")

parser.set_defaults(
    task="Isaac-Cartpole-v0",
    num_envs=128
)

# append RSL-RL cli arguments
cli_args.add_rsl_rl_args(parser)
# append AppLauncher cli args
AppLauncher.add_app_launcher_args(parser)
args_cli, hydra_args = parser.parse_known_args()

# always enable cameras to record video
if args_cli.video:
    args_cli.enable_cameras = True

# clear out sys.argv for Hydra
sys.argv = [sys.argv[0]] + hydra_args

# launch omniverse app
app_launcher = AppLauncher(args_cli)
simulation_app = app_launcher.app

"""Rest everything follows."""

import gymnasium as gym
import os
import torch
from datetime import datetime

from rsl_rl.runners import OnPolicyRunner

from isaaclab.envs import (
    DirectMARLEnv,
    DirectMARLEnvCfg,
    DirectRLEnvCfg,
    ManagerBasedRLEnvCfg,
    multi_agent_to_single_agent,
)
from isaaclab.utils.dict import print_dict
from isaaclab.utils.io import dump_pickle, dump_yaml

from isaaclab_rl.rsl_rl import RslRlOnPolicyRunnerCfg, RslRlVecEnvWrapper

import isaaclab_tasks  # noqa: F401
from isaaclab_tasks.utils import get_checkpoint_path
from isaaclab_tasks.utils.hydra import hydra_task_config

import transformable_quadruped_wheelchair_isaaclab.tasks.locomotion # 追加

torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = False

# @hydra_task_config(args_cli.task, "rsl_rl_cfg_entry_point")
# def main(env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg | DirectMARLEnvCfg, agent_cfg: RslRlOnPolicyRunnerCfg):
"""Train with RSL-RL agent."""
# override configurations with non-hydra CLI arguments
agent_cfg = cli_args.update_rsl_rl_cfg(agent_cfg, args_cli)
env_cfg.scene.num_envs = args_cli.num_envs if args_cli.num_envs is not None else env_cfg.scene.num_envs
agent_cfg.max_iterations = (
    args_cli.max_iterations if args_cli.max_iterations is not None else agent_cfg.max_iterations
)

# set the environment seed
# note: certain randomizations occur in the environment initialization so we set the seed here
env_cfg.seed = agent_cfg.seed
env_cfg.sim.device = args_cli.device if args_cli.device is not None else env_cfg.sim.device

# specify directory for logging experiments
log_root_path = os.path.join("logs", "rsl_rl", agent_cfg.experiment_name)
log_root_path = os.path.abspath(log_root_path)
print(f"[INFO] Logging experiment in directory: {log_root_path}")
# specify directory for logging runs: {time-stamp}_{run_name}
log_dir = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
# This way, the Ray Tune workflow can extract experiment name.
print(f"Exact experiment name requested from command line: {log_dir}")
if agent_cfg.run_name:
    log_dir += f"_{agent_cfg.run_name}"
log_dir = os.path.join(log_root_path, log_dir)

# create isaac environment
env = gym.make(args_cli.task, cfg=env_cfg, render_mode="rgb_array" if args_cli.video else None)

# convert to single-agent instance if required by the RL algorithm
if isinstance(env.unwrapped, DirectMARLEnv):
    env = multi_agent_to_single_agent(env)

# save resume path before creating a new log_dir
if agent_cfg.resume:
    resume_path = get_checkpoint_path(log_root_path, agent_cfg.load_run, agent_cfg.load_checkpoint)

# wrap for video recording
if args_cli.video:
    video_kwargs = {
        "video_folder": os.path.join(log_dir, "videos", "train"),
        "step_trigger": lambda step: step % args_cli.video_interval == 0,
        "video_length": args_cli.video_length,
        "disable_logger": True,
    }
    print("[INFO] Recording videos during training.")
    print_dict(video_kwargs, nesting=4)
    env = gym.wrappers.RecordVideo(env, **video_kwargs)

# wrap around environment for rsl-rl
env = RslRlVecEnvWrapper(env)

# create runner from rsl-rl
runner = OnPolicyRunner(env, agent_cfg.to_dict(), log_dir=log_dir, device=agent_cfg.device)
# write git state to logs
runner.add_git_repo_to_log(__file__)
# load the checkpoint
if agent_cfg.resume:
    print(f"[INFO]: Loading model checkpoint from: {resume_path}")
    # load previously trained model
    runner.load(resume_path)

# dump the configuration into log-directory
dump_yaml(os.path.join(log_dir, "params", "env.yaml"), env_cfg)
dump_yaml(os.path.join(log_dir, "params", "agent.yaml"), agent_cfg)
dump_pickle(os.path.join(log_dir, "params", "env.pkl"), env_cfg)
dump_pickle(os.path.join(log_dir, "params", "agent.pkl"), agent_cfg)

# run training
runner.learn(num_learning_iterations=agent_cfg.max_iterations, init_at_random_ep_len=True)

# close the simulator
env.close()
simulation_app.close()


[INFO][AppLauncher]: Loading experience file: C:\Users\admin5050\akamisaka\IsaacLab\apps\isaaclab.python.kit


  _warn(("h5py is running against HDF5 {0} when it was built against {1}, "


AttributeError: '_WindowsSelectorEventLoop' object has no attribute '_old_agen_hooks'

NameError: name 'agent_cfg' is not defined

: 

In [34]:
# %% セル１: インポート＆GymVecEnvラッパー（前出のものを流用）
import os, numpy as np, gymnasium as gym, torch
from types import SimpleNamespace
from rsl_rl.env import VecEnv
from rsl_rl.runners.on_policy_runner import OnPolicyRunner

class GymVecEnv(VecEnv):
    def __init__(self, env_name, num_envs=1, device="cpu"):
        self.envs = [gym.make(env_name) for _ in range(num_envs)]
        self.num_envs = num_envs
        self.device = torch.device(device)
        single = self.envs[0]
        self.num_actions = np.prod(single.action_space.shape)
        self.num_obs     = np.prod(single.observation_space.shape)
        self.max_episode_length = getattr(single, "_max_episode_steps", 1000)
        self._obs = None
        self.episode_length_buf = torch.zeros(
            (self.num_envs,), dtype=torch.long, device=self.device
        )
        self.reset()

    def seed(self, seed=0):
        for i, e in enumerate(self.envs):
            e.reset(seed=seed+i)
        return seed

    def reset(self):
        obs = np.stack([e.reset()[0] for e in self.envs], 0)
        self._obs = torch.from_numpy(obs).float().to(self.device)
        return self._obs

    def step(self, actions: torch.Tensor):
        acts = actions.cpu().numpy()

        # print(actions.min(), actions.max())


        obs_b, rew_b, done_b = [], [], []
        for env, a in zip(self.envs, acts):
            o, r, ter, tru, _ = env.step(a)
            done = ter or tru
            if done:
                o, _ = env.reset()
            obs_b.append(o); rew_b.append(r); done_b.append(done)
        # obs = torch.from_numpy(np.stack(obs_b,0)).float().to(self.device)
        # self._obs = obs
        # rew = torch.from_numpy(np.array(rew_b)).float().to(self.device)
        # dones = torch.from_numpy(np.array(done_b, dtype=int)).to(self.device)
        # return obs, rew, dones, {}
        # バッチ化
        obs = torch.from_numpy(np.stack(obs_b, 0)).float().to(self.device)
        self._obs = obs
        rew = torch.from_numpy(np.array(rew_b)).float().to(self.device)
        dones = torch.from_numpy(np.array(done_b, dtype=int)).to(self.device)

        # ── ここから extras を必ず返す──
        # policy 用観測
        policy_obs = obs
        critic_obs = obs
        # # critic 用観測（必要なら zeros で代用）
        # critic_obs = torch.zeros_like(obs)

        extras = {
            "observations": {
                "policy": policy_obs,
                "critic": critic_obs,
            }
        }
        return obs, rew, dones, extras

    def get_observations(self):
        """
        OnPolicyRunner が期待する形式:
          obs, extras = get_observations()
        extras["observations"] は dict で、
          extras["observations"]["policy"] → policy 用観測 tensor
          extras["observations"]["critic"] → critic 用観測（ここではなし or zeros）
        のように使われます。
        """
        # policy 用観測
        policy_obs = self._obs
        critic_obs = self._obs
        # critic 用観測がない場合は空/dummy tensor でも構わない
        # ここでは単純に policy_obs を再利用します
        # critic_obs = torch.zeros_like(self._obs)

        extras = {
            "observations": {
                "policy": policy_obs,
                "critic": critic_obs,
            }
        }
        return policy_obs, extras
    
    def close(self):
        for e in self.envs:
            e.close()
    
    @property
    def episode_length_buf(self) -> torch.Tensor:
        return self._episode_length_buf

    @episode_length_buf.setter
    def episode_length_buf(self, value: torch.Tensor):
        self._episode_length_buf = value

# %% セル２: PPO 用設定辞書をべた書き → Runner 初期化＆学習
TASK        = "Pendulum-v1"
NUM_ENVS    = 1
TOTAL_STEPS = 200000
STEPS_PER_ENV = 200

DEVICE      = "cuda:0"

# VecEnv 作成
venv = GymVecEnv(TASK, NUM_ENVS, device=DEVICE)

# %% セル２: PPO 用設定辞書をべた書き → Runner 初期化＆学習

train_cfg = {
    # １ env あたり何ステップ収集するか
    "num_steps_per_env": STEPS_PER_ENV,
    # 最大イテレーション数（PPOループ回数）
    "max_iterations":   TOTAL_STEPS//STEPS_PER_ENV,
    # チェックポイント保存間隔
    "save_interval":    1000,
    # 実験名（ログフォルダ用）
    "experiment_name":  "pendulum_ppo",
    # 経験的正規化を使うか否か
    "empirical_normalization": True,

    # ポリシーのネットワーク構成
    "policy": {
        "class_name":        "ActorCritic",
        "init_noise_std":    0.1,
        "actor_hidden_dims": [64, 64],
        "critic_hidden_dims":[64, 64],
        "activation":        "tanh",
        "apply_tanh":   True, 
    },

    # アルゴリズムのハイパーパラメータ
    "algorithm": {
        "class_name":           "PPO",
        "value_loss_coef":      1.0,
        "use_clipped_value_loss": True,
        "clip_param":           0.2,
        "entropy_coef":         0.01,
        "num_learning_epochs":  5,
        "num_mini_batches":     4,
        "learning_rate":        1.0e-3,
        "schedule":             "adaptive",
        "gamma":                0.99,
        "lam":                  0.95,
        "desired_kl":           0.01,
        "max_grad_norm":        1.0,
    },
}

log_dir = "./pendulum_ppo_logs"
os.makedirs(log_dir, exist_ok=True)
runner = OnPolicyRunner(venv, train_cfg, log_dir=log_dir, device=DEVICE)
print("=== Expert (PPO) Training Start ===")
# train() の代わりに learn() を使う
runner.learn(
    num_learning_iterations=train_cfg["max_iterations"],
    init_at_random_ep_len=True,    # 必要に応じてランダム初期長を有効化
)
print("=== Expert Training Done ===")

ActorCritic.__init__ got unexpected arguments, which will be ignored: ['apply_tanh']
Actor MLP: Sequential(
  (0): Linear(in_features=3, out_features=64, bias=True)
  (1): Tanh()
  (2): Linear(in_features=64, out_features=64, bias=True)
  (3): Tanh()
  (4): Linear(in_features=64, out_features=1, bias=True)
)
Critic MLP: Sequential(
  (0): Linear(in_features=3, out_features=64, bias=True)
  (1): Tanh()
  (2): Linear(in_features=64, out_features=64, bias=True)
  (3): Tanh()
  (4): Linear(in_features=64, out_features=1, bias=True)
)
=== Expert (PPO) Training Start ===
################################################################################
                      [1m Learning iteration 0/1000 [0m                       

                       Computation: 216 steps/s (collection: 0.800s, learning 0.124s)
               Value function loss: 17217.4637
                    Surrogate loss: 0.0163
             Mean action noise std: 0.10
                 Mean total reward: -1666.02
   

KeyboardInterrupt: 

In [33]:
import torch, os, cv2, numpy as np, gymnasium as gym
from gymnasium.wrappers import TimeLimit

policy_fn = runner.get_inference_policy(device="cpu")

os.makedirs("./videos", exist_ok=True)
video_path = "./videos/expert_pendulum.mp4"

env = TimeLimit(gym.make("Pendulum-v1", render_mode="rgb_array"), max_episode_steps=1000)

obs, _ = env.reset()
frame = env.render()
h, w, _ = frame.shape

writer = cv2.VideoWriter(video_path,
                         cv2.VideoWriter_fourcc(*"mp4v"),
                         30, (w, h))

for _ in range(1000):
    # ① 現フレームを書き込み
    writer.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))

    # ② obs -> Tensor
    obs_t = torch.from_numpy(obs[None, ...]).float()
    with torch.no_grad():
        action_t, *_ = policy_fn(obs_t)          # (1,1) Tensor

    # ③ Tensor -> (1,) ndarray
    action = action_t.squeeze().cpu().numpy().flatten()  # shape (1,)
    if action.ndim == 0:                               # 保険
        action = np.array([action])

    # ④ 環境へ入力
    obs, _, done, truncated, _ = env.step(action)

    # ⑤ 次フレーム取得
    frame = env.render()
    if done or truncated:
        writer.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
        break

writer.release()
env.close()
print("MP4 を保存しました:", video_path)


MP4 を保存しました: ./videos/expert_pendulum.mp4


In [3]:
import gymnasium as gym
import torch
import numpy as np

from stable_baselines3 import PPO
# imitation ライブラリから必要なモジュールをインポート
from imitation.data.rollout import rollout, make_sample_until, generate_trajectories
from imitation.algorithms.bc      import BC
from imitation.algorithms.dagger  import SimpleDAggerTrainer, LinearBetaSchedule

from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.policies import BasePolicy

vec_env = DummyVecEnv([lambda: gym.make("Pendulum-v1")] * 1)

import shutil, os
scratch_dir = "./dagger_scratch"
if os.path.exists(scratch_dir):
    shutil.rmtree(scratch_dir)

expert_fn = runner.get_inference_policy()

class ExpertWrapper(BasePolicy):
    def __init__(self, fn, observation_space, action_space, device="cpu"):
        super().__init__(observation_space, action_space, device, squash_output=False)
        self.fn = fn
    def _predict(self, obs, deterministic=True):
        obs_t = torch.as_tensor(obs, dtype=torch.float32, device=self.device)
        with torch.no_grad():
            return self.fn(obs_t)           # Tensor をそのまま返す
    def forward(self, obs, deterministic=False):
        return self._predict(obs, deterministic)

expert = ExpertWrapper(
    fn=runner.get_inference_policy(),
    observation_space=vec_env.observation_space,
    action_space     =vec_env.action_space,
    device="cpu",
)

# class ExpertWrapper(BasePolicy):
#     """rsl-rl の推論関数 `fn` を SB3 Policy 風に包む."""
#     def __init__(self, fn, observation_space, action_space, device="cpu"):
#         super().__init__(observation_space, action_space, device, squash_output=False)
#         self.fn = fn                                    # rsl-rl の推論関数
    
#     # ここを修正 ───────────────────────────
#     def _predict(self, obs, deterministic=True):
#         obs_t = torch.as_tensor(obs, dtype=torch.float32, device=self.device)
#         with torch.no_grad():
#             act_t = self.fn(obs_t)      # すでに Tensor
#         return act_t                    # ← NumPy に変換しない！
#     # ────────────────────────────────────

#     # forward も Tensor を返すよう揃える
#     def forward(self, obs, deterministic=False):
#         return self._predict(obs, deterministic)
    
    # # SB3 が内部で呼ぶメソッド
    # def _predict(self, obs, deterministic=True):
    #     obs_t = torch.as_tensor(obs, dtype=torch.float32, device=self.device)
    #     with torch.no_grad():
    #         act_t = self.fn(obs_t)
    #     return act_t.cpu().numpy()

    # # PyTorch の forward（BC 学習では使わないのでダミーで可）
    # def forward(self, obs, deterministic=False):
    #     return torch.as_tensor(self._predict(obs, deterministic), device=self.device)
    
# class ExpertWrapper:
#     def __init__(self, fn, observation_space, action_space, device="cpu"):
#         self.fn                = fn
#         self.observation_space = observation_space
#         self.action_space      = action_space
#         self.device            = torch.device(device)

#     def __call__(self, obs: np.ndarray, state=None, dones=None):
#         # NumPy→Tensor
#         obs_t = torch.from_numpy(obs).float().to(self.device)
#         # 行動だけ返す expert_fn を呼び出し
#         with torch.no_grad():
#             act_t = self.fn(obs_t)            # 返り値は Tensor のみ
#         # Tensor→NumPy
#         act = act_t.cpu().numpy()
#         # DAgger 用に、(actions, next_state) のタプルで返す
#         # next_state を持たないモデルなので None を渡す
#         return act, None

# class ExpertWrapper:
#     def __init__(self, fn, observation_space, action_space, device="cpu"):
#         self.fn                = fn
#         self.observation_space = observation_space
#         self.action_space      = action_space
#         self.device            = torch.device(device)

#     def _forward(self, obs):
#         obs_t = torch.as_tensor(obs).float().to(self.device)
#         with torch.no_grad():
#             act_t = self.fn(obs_t)
#         return act_t.cpu().numpy()

#     # SB3 互換
#     def predict(self, obs, deterministic=True):
#         return self._forward(obs), None

#     # # callable 用としても動く
#     # def __call__(self, obs, state=None, dones=None):
#     #     return self._forward(obs), None

expert = ExpertWrapper(
    runner.get_inference_policy(),
    observation_space=vec_env.observation_space,
    action_space=vec_env.action_space,
    device="cpu",
)
# ─── 2) Student (SB3 PPO) の準備 ────────────────────────────
env     = gym.make("Pendulum-v1")
student = PPO("MlpPolicy", env, verbose=1)

# ─── 3) Expert Trajectories の収集 ─────────────────────────
# def collect_expert_trajs(policy, venv, min_episodes: int, seed: int):
#     """Generate trajectories from the expert policy."""
#     sample_until = make_sample_until(min_episodes=min_episodes)
#     trajs = generate_trajectories(
#         policy               = policy,                     # ExpertWrapper
#         venv                 = venv,
#         sample_until         = sample_until,
#         deterministic_policy = False,                      # ← ここを False に
#         rng                  = np.random.default_rng(seed),
#     )
#     return trajs

seed = 0
# expert_trajs = collect_expert_trajs(expert, vec_env, min_episodes=5, seed=seed)
# print(f"Collected {len(expert_trajs)} expert episodes.")

def collect_expert_trajs(policy):
    sample_until = make_sample_until(min_episodes=5)
    return generate_trajectories(
        policy               = policy,
        venv                 = vec_env,
        sample_until         = sample_until,
        deterministic_policy = False,      # ← ここだけで十分
        rng                  = np.random.default_rng(seed),
    )

expert_trajs = collect_expert_trajs(expert)
print(len(expert_trajs), "expert episodes collected")

# from types import MethodType
# def _predict(self, obs, deterministic=True):
#     # obs が NumPy なら Tensor に変換
#     obs_t = torch.as_tensor(obs, dtype=torch.float32).to(next(self.parameters()).device)
#     with torch.no_grad():
#         act_t = self(obs_t)             # forward 呼び出し
#     return act_t.cpu().numpy(), None    # SB3 と同じ戻り値 (action, state)
# bc_trainer.policy.predict = MethodType(_predict, bc_trainer.policy)
# 1️⃣  ラッパークラスはそのまま

class BCPredictOnly(torch.nn.Module):
    def __init__(self, net):
        super().__init__()
        self.net = net                          # 元ネットを登録
        self.device = next(net.parameters()).device  # ← 追加: device を持たせる

    # BC 学習用
    def forward(self, obs):
        return self.net(obs)

    # ロールアウト用
    def predict(self, obs, deterministic=True):
        obs_t = torch.as_tensor(obs, dtype=torch.float32, device=self.device)
        with torch.no_grad():
            act_t = self.net(obs_t)
        return act_t.cpu().numpy(), None        # SB3 と同じ戻り値


########################################
# ② _policy の差し替えはそのまま
########################################
# bc_trainer = BC(
#     observation_space = vec_env.observation_space,
#     action_space      = vec_env.action_space,
#     rng               = np.random.default_rng(seed),
# )
# bc_trainer._policy = BCPredictOnly(bc_trainer.policy)   # ← OK

bc_trainer = BC(
    observation_space = vec_env.observation_space,
    action_space      = vec_env.action_space,
    rng               = np.random.default_rng(seed),
)

from types import MethodType
import torch

def bc_predict(self, obs, state=None, episode_start=None, deterministic=True):
    dev = next(self.parameters()).device
    obs_t = torch.as_tensor(obs, dtype=torch.float32, device=dev)

    with torch.no_grad():
        out = self(obs_t)                # ← forward の戻り値

    # --- ここを追加 --------------------------------------------------
    # BC ネットは (acts,) のタプルを返すので 1 要素目を取り出す
    if isinstance(out, (tuple, list)):
        out = out[0]

    # Distribution 型を返す実装の場合に備えた保険
    if isinstance(out, torch.distributions.Distribution):
        out = out.mean if deterministic else out.sample()
    # ---------------------------------------------------------------

    return out.cpu().numpy(), None       # SB3 互換の戻り値

bc_trainer.policy.predict = MethodType(bc_predict, bc_trainer.policy)


# ─── 4) DAggerTrainer の初期化 ─────────────────────────────
# dagger_trainer = SimpleDAggerTrainer(
#     venv             = vec_env,                   # 学習＆ロールアウト用 VecEnv
#     scratch_dir      = "./dagger_scratch",        # 中間結果保存フォルダ
#     expert_policy    = expert,                    # ExpertWrapper
#     bc_trainer       = BC(
#         observation_space=vec_env.observation_space,
#         action_space     =vec_env.action_space,
#         rng              = np.random.default_rng(seed),
#     ),
#     expert_trajs     = expert_trajs,              # 事前に集めた expert_demo
#     rng              = np.random.default_rng(seed),
#     beta_schedule    = LinearBetaSchedule(5),     # 5 ラウンドで β:1 → 0
# )

# dagger_trainer = SimpleDAggerTrainer(
#     venv          = vec_env,
#     scratch_dir   = "./dagger_scratch",
#     expert_policy = expert,
#     bc_trainer    = bc_trainer,         # ← 追加済みの bc_trainer を渡す
#     expert_trajs  = expert_trajs,
#     rng           = np.random.default_rng(seed),
#     beta_schedule = LinearBetaSchedule(5),
# )

# # ─── 5) DAgger 学習の実行 ─────────────────────────────────
# dagger_trainer.train(
#     total_timesteps             = 100_000,          # student に与える学習ステップ数
#     rollout_round_min_episodes  = 5,                # 各ラウンドで最低収集エピソード数
#     rollout_round_min_timesteps = 2000,             # 各ラウンドで最低収集ステップ数
#     bc_train_kwargs             = {"n_epochs": 10}, # BC 更新時のエポック数
#     deterministic_policy        = False
# )

dagger_trainer = SimpleDAggerTrainer(
    venv          = vec_env,
    scratch_dir   = "./dagger_scratch",
    expert_policy = expert,
    bc_trainer    = bc_trainer,
    expert_trajs  = expert_trajs,
    rng           = np.random.default_rng(seed),
    beta_schedule = LinearBetaSchedule(5),
)

dagger_trainer.train(
    total_timesteps            = 1000,
    rollout_round_min_episodes = 5,
    rollout_round_min_timesteps= 2000,
    bc_train_kwargs            = {"n_epochs": 10},
)

# ─── 6) 訓練済みポリシーの保存 ─────────────────────────────
ckpt_path, policy_path = dagger_trainer.save_trainer()
print(f"Saved DAgger checkpoint to: {ckpt_path}")
print(f"Saved DAgger policy to: {policy_path}")

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
5 expert episodes collected


Saving the dataset (0/1 shards):   0%|          | 0/1 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1 [00:00<?, ? examples/s]

0batch [00:00, ?batch/s]

---------------------------------
| batch_size        | 32        |
| bc/               |           |
|    batch          | 0         |
|    ent_loss       | -0.00142  |
|    entropy        | 1.42      |
|    epoch          | 0         |
|    l2_loss        | 0         |
|    l2_norm        | 70.5      |
|    loss           | 0.955     |
|    neglogp        | 0.956     |
|    prob_true_act  | 0.385     |
|    samples_so_far | 32        |
| rollout/          |           |
|    return_max     | -860      |
|    return_mean    | -1.23e+03 |
|    return_min     | -1.53e+03 |
|    return_std     | 235       |
---------------------------------


489batch [00:03, 220.41batch/s]

---------------------------------
| batch_size        | 32        |
| bc/               |           |
|    batch          | 500       |
|    ent_loss       | -0.000917 |
|    entropy        | 0.917     |
|    epoch          | 5         |
|    l2_loss        | 0         |
|    l2_norm        | 71.4      |
|    loss           | 0.416     |
|    neglogp        | 0.417     |
|    prob_true_act  | 0.659     |
|    samples_so_far | 16032     |
| rollout/          |           |
|    return_max     | -1.55e+03 |
|    return_mean    | -1.68e+03 |
|    return_min     | -1.84e+03 |
|    return_std     | 113       |
---------------------------------


930batch [00:06, 144.90batch/s]

Saved DAgger checkpoint to: c:\Users\admin5050\akamisaka\TransformableQuadrupedWheelchairIsaacLab\exts\transformable_quadruped_wheelchair_isaaclab\transformable_quadruped_wheelchair_isaaclab\envs\dagger_scratch\checkpoint-001.pt
Saved DAgger policy to: c:\Users\admin5050\akamisaka\TransformableQuadrupedWheelchairIsaacLab\exts\transformable_quadruped_wheelchair_isaaclab\transformable_quadruped_wheelchair_isaaclab\envs\dagger_scratch\policy-001.pt





In [11]:
# import os, cv2, numpy as np, gymnasium as gym
# from gymnasium.wrappers import TimeLimit

# # 0) 出力先フォルダ
# os.makedirs("./videos", exist_ok=True)
# video_path = "./videos/pendulum_dagger.avi"   # .mp4 にしたい場合は拡張子と fourcc を変える

# # 1) 学習済み policy
# policy = dagger_trainer.bc_trainer.policy     # バッチ次元を付けて predict する

# # 2) 環境 (rgb_array) + 1000 ステップ制限
# env = TimeLimit(
#     gym.make("Pendulum-v1", render_mode="rgb_array"),
#     max_episode_steps=1000
# )

# # 3) reset → 最初の obs とフレーム
# obs, _ = env.reset()
# frame = env.render()              # ここで初回フレームを取得
# h, w, _ = frame.shape

# # 4) VideoWriter 初期化
# writer = cv2.VideoWriter(
#     video_path,
#     cv2.VideoWriter_fourcc(*"XVID"),  # MP4 にしたいなら "mp4v" と .mp4
#     30,                               # FPS
#     (w, h)
# )

# # 5) ループ
# for _ in range(1000):
#     writer.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))   # 直前のフレームを書き込む

#     action_b, _ = policy.predict(obs[None, ...], deterministic=True)
#     obs, _, done, truncated, _ = env.step(action_b[0])

#     frame = env.render()                                   # 次のフレーム
#     if done or truncated:
#         writer.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))  # 最終フレームも保存
#         break

# # 6) 後処理
# writer.release()
# env.close()
# print("OpenCV で保存:", video_path)

import os, cv2, numpy as np, gymnasium as gym
from gymnasium.wrappers import TimeLimit

# ------------------------- 設定 --------------------------
os.makedirs("./videos", exist_ok=True)
video_path = "./videos/pendulum_dagger.mp4"   # ← .mp4 に変更

policy = dagger_trainer.bc_trainer.policy     # 学習済みポリシー

env = TimeLimit(
    gym.make("Pendulum-v1", render_mode="rgb_array"),
    max_episode_steps=1000
)

# ------------------ 初期フレームと VideoWriter ------------
obs, _ = env.reset()
frame = env.render()                # 最初のフレーム (H,W,3) RGB
h, w, _ = frame.shape

writer = cv2.VideoWriter(
    video_path,
    cv2.VideoWriter_fourcc(*"mp4v"),  # ← fourcc を MP4 用に
    30,                               # FPS
    (w, h)
)

# ---------------------- ループ ----------------------------
for _ in range(1000):
    writer.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))

    action_b, _ = policy.predict(obs[None, ...], deterministic=True)
    obs, _, done, truncated, _ = env.step(action_b[0])

    frame = env.render()
    if done or truncated:
        writer.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
        break

# -------------------- 後処理 ------------------------------
writer.release()
env.close()
print("MP4 を保存しました:", video_path)


MP4 を保存しました: ./videos/pendulum_dagger.mp4
