In [2]:
!pip install pygame

Collecting pygame
  Downloading pygame-2.6.1-cp311-cp311-win_amd64.whl.metadata (13 kB)
Downloading pygame-2.6.1-cp311-cp311-win_amd64.whl (10.6 MB)
   ---------------------------------------- 0.0/10.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/10.6 MB ? eta -:--:--
   - -------------------------------------- 0.5/10.6 MB 1.7 MB/s eta 0:00:07
   - -------------------------------------- 0.5/10.6 MB 1.7 MB/s eta 0:00:07
   --- ------------------------------------ 1.0/10.6 MB 1.4 MB/s eta 0:00:08
   ---- ----------------------------------- 1.3/10.6 MB 1.3 MB/s eta 0:00:08
   ----- ---------------------------------- 1.6/10.6 MB 1.4 MB/s eta 0:00:07
   -------- ------------------------------- 2.4/10.6 MB 1.7 MB/s eta 0:00:05
   ---------- ----------------------------- 2.9/10.6 MB 1.9 MB/s eta 0:00:05
   ------------- -------------------------- 3.7/10.6 MB 2.1 MB/s eta 0:00:04
   --------------- ------------------------ 4.2/10.6 MB 2.2 MB/s eta 0:00:03
   ---------------

In [3]:
!pip install tensorboard


Collecting tensorboard
  Downloading tensorboard-2.20.0-py3-none-any.whl.metadata (1.8 kB)
Collecting grpcio>=1.48.2 (from tensorboard)
  Downloading grpcio-1.76.0-cp311-cp311-win_amd64.whl.metadata (3.8 kB)
Collecting markdown>=2.6.8 (from tensorboard)
  Downloading markdown-3.10-py3-none-any.whl.metadata (5.1 kB)
Collecting protobuf!=4.24.0,>=3.19.6 (from tensorboard)
  Downloading protobuf-6.33.2-cp310-abi3-win_amd64.whl.metadata (593 bytes)
Collecting tensorboard-data-server<0.8.0,>=0.7.0 (from tensorboard)
  Downloading tensorboard_data_server-0.7.2-py3-none-any.whl.metadata (1.1 kB)
Collecting werkzeug>=1.0.1 (from tensorboard)
  Downloading werkzeug-3.1.4-py3-none-any.whl.metadata (4.0 kB)
Downloading tensorboard-2.20.0-py3-none-any.whl (5.5 MB)
   ---------------------------------------- 0.0/5.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/5.5 MB ? eta -:--:--
   - -------------------------------------- 0.3/5.5 MB ? eta -:--:--
   --- -----------------------

In [1]:
import os
import numpy as np
import gymnasium as gym
import register_envs
import pygame

from stable_baselines3 import SAC
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.evaluation import evaluate_policy

  from pkg_resources import resource_stream, resource_exists


In [2]:
def make_env():
    env = env = gym.make("BallPlate-v0", render_mode=None)
    env = Monitor(env)   # records episode reward & length
    return env

env = make_vec_env(make_env, n_envs=1)

In [3]:
print("Action Space:", env.action_space)
print("Observation Space:", env.observation_space)

Action Space: Box(-1.0, 1.0, (2,), float32)
Observation Space: Box(-inf, inf, (6,), float32)


In [4]:
model = SAC(
    policy="MlpPolicy",
    env=env,

    learning_rate=3e-4,
    buffer_size=200_000,
    learning_starts=5_000,

    batch_size=256,
    tau=0.005,
    gamma=0.99,

    train_freq=1,
    gradient_steps=1,

    ent_coef="auto",
    verbose=1,
    tensorboard_log="./sac_ballplate_tensorboard/"
)

Using cuda device


In [5]:
TOTAL_TIMESTEPS = 300_000
model.learn(total_timesteps=TOTAL_TIMESTEPS)

Logging to ./sac_ballplate_tensorboard/SAC_1
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 49.2     |
|    ep_rew_mean     | -86.7    |
| time/              |          |
|    episodes        | 4        |
|    fps             | 1883     |
|    time_elapsed    | 0        |
|    total_timesteps | 197      |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 47.6     |
|    ep_rew_mean     | -90.3    |
| time/              |          |
|    episodes        | 8        |
|    fps             | 1969     |
|    time_elapsed    | 0        |
|    total_timesteps | 381      |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 85.2     |
|    ep_rew_mean     | -69.5    |
| time/              |          |
|    episodes        | 12       |
|    fps             | 2243     |
|    time_elapsed    | 0        |
|  

<stable_baselines3.sac.sac.SAC at 0x240072e2790>

In [6]:
model.save("sac_ballplate")

In [3]:
model = SAC.load("sac_ballplate", env=env)

In [8]:
mean_reward, std_reward = evaluate_policy(
    model,
    env,
    n_eval_episodes=20,
    deterministic=True,
    return_episode_rewards=False
)

print("===== Evaluation Summary =====")
print(f"Mean Episode Reward : {mean_reward:.2f}")
print(f"Std Episode Reward  : {std_reward:.2f}")

===== Evaluation Summary =====
Mean Episode Reward : 920.37
Std Episode Reward  : 250.72


In [9]:
episode_rewards, episode_lengths = evaluate_policy(
    model,
    env,
    n_eval_episodes=10,
    deterministic=True,
    return_episode_rewards=True
)

print("===== Episode-wise Metrics =====")
for i, (r, l) in enumerate(zip(episode_rewards, episode_lengths), 1):
    print(f"Episode {i:02d} | Reward: {r:.2f} | Length: {l}")

===== Episode-wise Metrics =====
Episode 01 | Reward: 890.89 | Length: 1200
Episode 02 | Reward: 909.31 | Length: 1200
Episode 03 | Reward: 937.20 | Length: 1200
Episode 04 | Reward: 901.12 | Length: 1200
Episode 05 | Reward: 945.92 | Length: 1200
Episode 06 | Reward: 958.98 | Length: 1200
Episode 07 | Reward: 903.08 | Length: 1200
Episode 08 | Reward: 928.41 | Length: 1200
Episode 09 | Reward: 931.17 | Length: 1200
Episode 10 | Reward: 908.96 | Length: 1200


last cell for demo 

In [None]:
eval_env = env = gym.make("BallPlate-v0", render_mode="human")

episodes = 5
for ep in range(episodes):
    obs, _ = eval_env.reset()
    done = False
    total_reward = 0.0
    steps = 0

    while not done:
        action, _ = model.predict(obs, deterministic=True)
        obs, reward, done, truncated, _ = eval_env.step(action)

        total_reward += reward
        steps += 1

        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                done = True

    print(f"[Render Eval] Episode {ep+1} | Total Reward: {total_reward:.2f} | Steps: {steps}")

eval_env.close()


[Render Eval] Episode 1 | Total Reward: 3878.65 | Steps: 5185
