In [1]:
import torch

# Sanity-check the PyTorch install and report whether a CUDA device is usable.
cuda_ok = torch.cuda.is_available()
print("Torch version:", torch.__version__)
print("CUDA available:", cuda_ok)
print("CUDA device count:", torch.cuda.device_count())
if cuda_ok:
    # Only query the device name when CUDA is actually available.
    print("GPU name:", torch.cuda.get_device_name(0))

Torch version: 2.5.1
CUDA available: True
CUDA device count: 1
GPU name: NVIDIA GeForce RTX 4080 SUPER


In [2]:
# Cell 1: quick checks
# Report the interpreter, OS, PyTorch and MuJoCo versions used by this run.
import sys, platform
import torch

python_version = sys.version.split()[0]
cuda_available = torch.cuda.is_available()
print("Python:", python_version)
print("Platform:", platform.platform())
print("PyTorch:", torch.__version__, "| CUDA available:", cuda_available)

# MuJoCo is imported last so the lines above are still printed if it is missing.
import mujoco
mujoco_version = mujoco.__version__
print("MuJoCo package version:", mujoco_version)

Python: 3.10.19
Platform: Windows-10-10.0.26200-SP0
PyTorch: 2.5.1 | CUDA available: True
MuJoCo package version: 3.4.0


In [3]:
# Cell 2: imports
import os
import gymnasium as gym
import numpy as np
import torch
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import VecNormalize, DummyVecEnv
from stable_baselines3.common.callbacks import EvalCallback, CheckpointCallback
from stable_baselines3.common.evaluation import evaluate_policy

# Train on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Using device:", device)

Using device: cuda


In [4]:
# Cell 3: env setup
# Builds the vectorized, normalized training env and a separate single-env
# evaluation env. ENV_ID, n_envs, seed, train_env and eval_env are consumed by
# later cells.
ENV_ID = "HumanoidStandup-v5"   # requires mujoco >= 2.3
n_envs = 4               # on Windows + single CPU, 4 is reasonable; reduce if CPU bound
seed = 0

# Make vectorized training envs (each one is wrapped in a Monitor that logs
# episode statistics under monitor_dir).
train_env = make_vec_env(
    ENV_ID,
    n_envs=n_envs,
    seed=seed,
    monitor_dir="./logs/train_monitor"
)

# VecNormalize (important for continuous control): running mean/std
# normalization of observations and rewards, with observations clipped to +/-10.
train_env = VecNormalize(train_env, norm_obs=True, norm_reward=True, clip_obs=10.0)

# Eval env: seeded differently from training so evaluation episodes are not
# the same rollouts the policy was trained on.
eval_env = make_vec_env(
    ENV_ID,
    n_envs=1,
    seed=seed + 100,
    monitor_dir="./logs/eval_monitor"
)
# training=False freezes the running statistics of the eval wrapper, so
# evaluation rollouts never update them; the stats are expected to come from
# the training env (EvalCallback synchronizes them before each evaluation).
# norm_reward=False keeps evaluation returns in the env's raw reward scale.
eval_env = VecNormalize(
    eval_env,
    training=False,
    norm_obs=True,
    norm_reward=False,
    clip_obs=10.0,
)

print("Obs space:", train_env.observation_space)
print("Act space:", train_env.action_space)

Obs space: Box(-inf, inf, (348,), float64)
Act space: Box(-0.4, 0.4, (17,), float32)


In [7]:
# Cell 4: create PPO
# Separate 2x512 MLPs for the policy (pi) and the value function (vf).
# net_arch is passed as a plain dict: the list-of-dict form
# ([dict(pi=..., vf=...)]) is deprecated since Stable-Baselines3 1.8.
policy_kwargs = dict(net_arch=dict(pi=[512, 512], vf=[512, 512]))

model = PPO(
    "MlpPolicy",
    train_env,
    verbose=1,
    device=device,
    seed=seed,            # seed network init and action sampling as well as the envs
    n_steps=2048,         # rollout length per env between updates
    batch_size=64,        # minibatch size for each gradient step
    n_epochs=10,          # optimization passes over each rollout buffer
    learning_rate=3e-4,
    gamma=0.99,
    gae_lambda=0.95,
    clip_range=0.2,
    ent_coef=1e-3,
    policy_kwargs=policy_kwargs,
)

Using cuda device


In [8]:
# Cell 5: callbacks and save dirs
for out_dir in ("./models", "./checkpoints", "./best_model"):
    os.makedirs(out_dir, exist_ok=True)

# Desired intervals expressed in environment timesteps.
EVAL_EVERY_TIMESTEPS = 100_000
CHECKPOINT_EVERY_TIMESTEPS = 500_000

# Callback frequencies are counted in calls to env.step() on the vectorized
# env, and each call advances num_envs timesteps. Divide by the number of envs
# so the intervals above really are measured in timesteps.
steps_per_call = train_env.num_envs

# Periodically evaluate the deterministic policy on eval_env and keep the
# best-scoring model under ./best_model/.
eval_callback = EvalCallback(
    eval_env,
    best_model_save_path="./best_model/",
    log_path="./logs/eval",
    eval_freq=max(EVAL_EVERY_TIMESTEPS // steps_per_call, 1),
    n_eval_episodes=5,
    deterministic=True,
    render=False
)

# Periodic snapshots of the model. save_vecnormalize=True also stores the
# VecNormalize statistics next to each checkpoint, because a checkpoint is only
# usable together with the normalization it was trained with.
checkpoint_callback = CheckpointCallback(
    save_freq=max(CHECKPOINT_EVERY_TIMESTEPS // steps_per_call, 1),
    save_path="./checkpoints/",
    name_prefix="ppo_humanoid_standup",
    save_vecnormalize=True,
)

In [9]:
# Cell 6: train
# Training budget: start with 5M steps; increase if you have time/resources.
TOTAL_TIMESTEPS = 5_000_000
training_callbacks = [eval_callback, checkpoint_callback]

model.learn(total_timesteps=TOTAL_TIMESTEPS, callback=training_callbacks)

# Persist the final policy together with the VecNormalize statistics it was
# trained with; both files are needed to run the policy later.
model.save("./models/ppo_humanoid_standup_final")
train_env.save("./models/vecnormalize_train.pkl")

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1e+03    |
|    ep_rew_mean     | 3.93e+04 |
| time/              |          |
|    fps             | 1039     |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 8192     |
---------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 1e+03      |
|    ep_rew_mean          | 3.96e+04   |
| time/                   |            |
|    fps                  | 642        |
|    iterations           | 2          |
|    time_elapsed         | 25         |
|    total_timesteps      | 16384      |
| train/                  |            |
|    approx_kl            | 0.39317805 |
|    clip_fraction        | 0.699      |
|    clip_range           | 0.2        |
|    entropy_loss         | -24.1      |
|    explained_variance   | -0.335     |
|    learning_rate        | 0.0003     |
|   

In [10]:
# Cell 7: load model and evaluate
# Reload the final policy and score it on a fresh env that reuses the saved
# training normalization statistics.
model = PPO.load("./models/ppo_humanoid_standup_final", device=device)

venv = VecNormalize.load(
    "./models/vecnormalize_train.pkl",
    make_vec_env(ENV_ID, n_envs=1, seed=42),
)
# Freeze the running statistics and report rewards in the env's raw scale.
venv.training, venv.norm_reward = False, False

mean_reward, std_reward = evaluate_policy(
    model,
    venv,
    n_eval_episodes=10,
    deterministic=True,
)
print(f"Mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")

Mean reward: 139755.11 +/- 29780.18


In [15]:
import os
# Request the GLFW OpenGL backend for MuJoCo rendering.
os.environ["MUJOCO_GL"] = "glfw"

import gymnasium as gym
import imageio
import numpy as np
from IPython.display import Video, display
import tempfile

from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize

ENV_ID = "HumanoidStandup-v5"
N_EVAL_EPISODES = 10   # not used in this cell: exactly one episode is recorded
MAX_STEPS = 1500       # hard cap on recorded steps
FPS = 30               # playback frame rate of the encoded video

# Create a single rendering env and wrap it so the policy sees observations
# normalized with the saved training statistics.
env = gym.make(ENV_ID, render_mode="rgb_array")
venv = DummyVecEnv([lambda: env])
venv = VecNormalize.load("./models/vecnormalize_train.pkl", venv)
venv.training = False      # do not update the running statistics while recording
venv.norm_reward = False   # keep rewards in the env's raw scale

# Load model.
# NOTE(review): best_model was saved at some point during training, whereas
# vecnormalize_train.pkl holds the statistics from the END of training, so the
# two may not match exactly. The final model saved alongside those statistics
# is "./models/ppo_humanoid_standup_final".
model = PPO.load("./best_model/best_model", device="cpu")

frames = []
try:
    obs = venv.reset()

    # Capture the initial pose before any action is taken.
    frame = env.render()
    if frame is not None:
        frames.append(frame)

    for step in range(MAX_STEPS):
        action, _ = model.predict(obs, deterministic=True)
        obs, rewards, dones, infos = venv.step(action)

        if dones[0]:
            # DummyVecEnv has already auto-reset the env at this point, so a
            # frame rendered now would show the next episode's initial state.
            break

        frame = env.render()
        if frame is not None:
            frames.append(frame)
finally:
    # Release the renderer / OpenGL context even if the rollout fails.
    venv.close()

ep_length = len(frames)
print(f"Episode: {ep_length} frames")

# Encode the recorded episode to a temporary .mp4 and display it inline.
if frames:
    # Close the temp-file handle before the writer opens the same path.
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp_file:
        tmp = tmp_file.name

    # The context manager finalizes the video file even if encoding fails.
    with imageio.get_writer(tmp, fps=FPS, codec="libx264") as writer:
        for f in frames:
            writer.append_data(f)

    display(Video(tmp, embed=True))
else:
    print("No frames were rendered; nothing to display.")

Episode: 1000 frames
