In [1]:
import torch
# Report the installed PyTorch build and the CUDA hardware it can see.
_cuda_report = (
    ("Torch version:", torch.__version__),
    ("CUDA available:", torch.cuda.is_available()),
    ("CUDA device count:", torch.cuda.device_count()),
)
for _label, _value in _cuda_report:
    print(_label, _value)

# The device name is only queried when a CUDA device is actually usable.
if torch.cuda.is_available():
    print("GPU name:", torch.cuda.get_device_name(0))

Torch version: 2.5.1
CUDA available: True
CUDA device count: 1
GPU name: NVIDIA GeForce RTX 4080 SUPER


In [2]:
# Quick checks: interpreter, OS, PyTorch/CUDA and MuJoCo versions.
import platform
import sys

import torch

python_version = sys.version.split()[0]  # drop the build/compiler suffix
cuda_visible = torch.cuda.is_available()

print("Python:", python_version)
print("Platform:", platform.platform())
print("PyTorch:", torch.__version__, "| CUDA available:", cuda_visible)

# Imported after the prints above, so those diagnostics still appear if this
# import fails.
import mujoco

print("MuJoCo package version:", mujoco.__version__)

Python: 3.10.19
Platform: Windows-10-10.0.26200-SP0
PyTorch: 2.5.1 | CUDA available: True
MuJoCo package version: 3.4.0


In [3]:
# Imports
import os
import gymnasium as gym
import numpy as np
import torch
from stable_baselines3 import SAC
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import VecNormalize, DummyVecEnv
from stable_baselines3.common.callbacks import EvalCallback, CheckpointCallback
from stable_baselines3.common.evaluation import evaluate_policy

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Using device:", device)

Using device: cuda


In [4]:
# Env setup
ENV_ID = "Pusher-v5"   # requires mujoco >= 2.3
n_envs = 4               # on Windows + single CPU, 4 is reasonable; reduce if CPU bound
seed = 0

# Vectorized training envs; episode statistics are logged under
# ./logs/train_monitor.
train_env = make_vec_env(
    ENV_ID,
    n_envs=n_envs,
    seed=seed,
    monitor_dir="./logs/train_monitor",
)

# VecNormalize (important for continuous control): normalize observations and
# rewards with running statistics, clipping normalized observations to +/-10.
train_env = VecNormalize(train_env, norm_obs=True, norm_reward=True, clip_obs=10.0)

# Evaluation env: a single sub-env, seeded differently from the training envs.
eval_env = make_vec_env(
    ENV_ID,
    n_envs=1,
    seed=seed + 100,
    monitor_dir="./logs/eval_monitor",
)
# training=False keeps evaluation rollouts from updating the running
# observation statistics; SB3's EvalCallback copies the statistics from the
# training env before each evaluation, so the eval env must not drift away
# from them on its own. Rewards stay un-normalized so that reported returns
# are in the environment's own units.
eval_env = VecNormalize(
    eval_env,
    training=False,
    norm_obs=True,
    norm_reward=False,
    clip_obs=10.0,
)

print("Obs space:", train_env.observation_space)
print("Act space:", train_env.action_space)

Obs space: Box(-inf, inf, (23,), float64)
Act space: Box(-2.0, 2.0, (7,), float32)


In [5]:
# Create SAC
# Builds the SAC agent on the normalized, vectorized training env.
model = SAC(
    "MlpPolicy",
    train_env,
    learning_rate=2e-4,        # slightly smaller to stabilize updates
    buffer_size=int(1e6),      # large replay buffer
    batch_size=256,
    gamma=0.995,               # higher discount to value future progress
    tau=0.005,
    ent_coef="auto",           # automatic entropy tuning
    train_freq=1,              # train after every vectorized env step
    # NOTE(review): with n_envs > 1 each vectorized step collects n_envs
    # transitions, so gradient_steps=1 means roughly one update per n_envs
    # transitions (the training log shows 74 updates at 400 timesteps).
    # gradient_steps=-1 would do one update per collected transition —
    # confirm which ratio is intended.
    gradient_steps=1,
    policy_kwargs=dict(net_arch=[512, 512]),
    # Seed the agent as well as the envs so network initialization and
    # action/batch sampling are reproducible across runs.
    seed=seed,
    device=device,
    verbose=1,
)

Using cuda device


In [6]:
# Callbacks and save dirs
for out_dir in ("./models", "./checkpoints", "./best_model"):
    os.makedirs(out_dir, exist_ok=True)

# SB3 callbacks count calls to the vectorized env's step(), and each call
# advances n_envs timesteps. The intervals below are expressed in timesteps
# and converted to callback calls; without the division, evaluation and
# checkpointing would happen n_envs times less often than stated.
EVAL_EVERY_TIMESTEPS = 100_000
CHECKPOINT_EVERY_TIMESTEPS = 500_000

# Periodically evaluates the deterministic policy on eval_env and keeps the
# best-scoring model under ./best_model/.
eval_callback = EvalCallback(
    eval_env,
    best_model_save_path="./best_model/",
    log_path="./logs/eval",
    eval_freq=max(EVAL_EVERY_TIMESTEPS // n_envs, 1),
    n_eval_episodes=5,
    deterministic=True,
    render=False,
)

# Periodically snapshots the model under ./checkpoints/.
checkpoint_callback = CheckpointCallback(
    save_freq=max(CHECKPOINT_EVERY_TIMESTEPS // n_envs, 1),
    save_path="./checkpoints/",
    name_prefix="sac_pusher",
)

In [7]:
# Train
# Run the full training budget with both callbacks attached, then persist the
# final policy together with the training env's VecNormalize statistics.
TOTAL_TIMESTEPS = 5_000_000
training_callbacks = [eval_callback, checkpoint_callback]
model.learn(
    total_timesteps=TOTAL_TIMESTEPS,
    callback=training_callbacks,
)

model.save("./models/sac_pusher_final")
# The normalization statistics are needed again whenever the saved policy is
# reloaded for evaluation or rendering.
train_env.save("./models/vecnormalize_train.pkl")

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 100      |
|    ep_rew_mean     | -154     |
| time/              |          |
|    episodes        | 4        |
|    fps             | 180      |
|    time_elapsed    | 2        |
|    total_timesteps | 400      |
| train/             |          |
|    actor_loss      | -9.97    |
|    critic_loss     | 0.467    |
|    ent_coef        | 0.985    |
|    ent_coef_loss   | -0.171   |
|    learning_rate   | 0.0002   |
|    n_updates       | 74       |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 100      |
|    ep_rew_mean     | -155     |
| time/              |          |
|    episodes        | 8        |
|    fps             | 160      |
|    time_elapsed    | 4        |
|    total_timesteps | 800      |
| train/             |          |
|    actor_loss      | -11.8    |
|    critic_loss     | 0.411    |
|    ent_coef 

In [8]:
# Load model and evaluate
# Reload the final policy and the normalization statistics saved after training.
model = SAC.load("./models/sac_pusher_final", device=device)

# Fresh single-env setup wrapped with the saved statistics; stop updating the
# running statistics and report raw (un-normalized) rewards.
raw_eval_env = make_vec_env(ENV_ID, n_envs=1, seed=42)
venv = VecNormalize.load("./models/vecnormalize_train.pkl", raw_eval_env)
venv.training, venv.norm_reward = False, False

mean_reward, std_reward = evaluate_policy(
    model,
    venv,
    n_eval_episodes=10,
    deterministic=True,
)
print(f"Mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")

Mean reward: -26.32 +/- 4.86


In [9]:
# Visualize the agent
# Rolls out the saved policy for a few episodes, keeps the frames of the
# longest one, and shows it inline as an MP4.
import os
os.environ["MUJOCO_GL"] = "glfw"  # select the GLFW OpenGL backend for MuJoCo

import gymnasium as gym
import imageio
import numpy as np
from IPython.display import Video, display
import tempfile

from stable_baselines3 import SAC
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize

ENV_ID = "Pusher-v5"
N_EVAL_EPISODES = 10
MAX_STEPS = 1500   # safety cap on steps per episode
FPS = 30

# create env
# The raw env is kept in `env` so frames can be rendered from it directly,
# while `venv` applies the saved observation normalization for the policy.
env = gym.make(ENV_ID, render_mode="rgb_array")
venv = DummyVecEnv([lambda: env])
venv = VecNormalize.load("./models/vecnormalize_train.pkl", venv)
venv.training = False      # do not update the running statistics
venv.norm_reward = False   # report raw rewards

# load model
# NOTE(review): vecnormalize_train.pkl holds the statistics saved with the
# final model, while best_model may have been saved at a different point in
# training, so its observations could be normalized slightly differently from
# what it saw when it was selected. Confirm this is acceptable, or load
# ./models/sac_pusher_final instead.
model = SAC.load("./best_model/best_model", device="cpu")

best_frames = []
best_length = 0

try:
    for ep in range(N_EVAL_EPISODES):
        obs = venv.reset()
        frames = []

        # Frame of the initial state, before any action is taken.
        frame = env.render()
        if frame is not None:
            frames.append(frame)

        for step in range(MAX_STEPS):
            action, _ = model.predict(obs, deterministic=True)
            obs, rewards, dones, infos = venv.step(action)

            # DummyVecEnv resets the env as soon as an episode ends, so a
            # frame rendered now would already show the next episode's
            # initial state. Stop before rendering it.
            if dones[0]:
                break

            frame = env.render()
            if frame is not None:
                frames.append(frame)

        ep_length = len(frames)
        print(f"Episode {ep+1}: {ep_length} frames")

        # NOTE(review): when every episode has the same length (as in the
        # recorded run, 100 frames each) this keeps the first episode —
        # confirm whether selecting by episode return is wanted instead.
        if ep_length > best_length:
            best_length = ep_length
            best_frames = frames
finally:
    # Release the renderer and simulator even if the rollout fails.
    venv.close()

print(f"\nLongest episode length: {best_length} frames")

# save and display only the longest episode
# Close the temp-file handle before the video writer opens the same path;
# an open handle can block the second open on Windows.
with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp_file:
    tmp = tmp_file.name

# The context manager finalizes the video file even if appending a frame fails.
with imageio.get_writer(tmp, fps=FPS, codec="libx264") as writer:
    for f in best_frames:
        writer.append_data(f)

display(Video(tmp, embed=True))

Episode 1: 100 frames
Episode 2: 100 frames
Episode 3: 100 frames
Episode 4: 100 frames
Episode 5: 100 frames
Episode 6: 100 frames
Episode 7: 100 frames
Episode 8: 100 frames
Episode 9: 100 frames
Episode 10: 100 frames

Longest episode length: 100 frames
