In [1]:
import datetime
import numpy as np
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.callbacks import BaseCallback, EvalCallback, CallbackList
from env import UArmEnv

pybullet build time: Jan 29 2025 23:16:28


In [2]:
def _make_uarm_env():
    """Build one UArmEnv instance with rendering disabled."""
    return UArmEnv(render=False)


# Training and evaluation each get their own single-environment vectorized
# wrapper, so evaluation episodes never disturb the training environment.
env = DummyVecEnv([_make_uarm_env])
eval_env = DummyVecEnv([_make_uarm_env])




In [None]:
class RewardAndGoalLoggingCallback(BaseCallback):
    """Periodically print the reward collected since the last report and the current goal.

    The reward of every environment step is accumulated; each time training
    advances past another multiple of ``log_interval`` timesteps, one line is
    printed with a timestamp, the timestep counter, the accumulated reward and
    the ``goal_pos`` attribute of the first training environment. The
    accumulator is then reset.

    Args:
        log_interval: Number of timesteps between two reports. Must be positive.
        verbose: Verbosity level forwarded to ``BaseCallback``.

    Raises:
        ValueError: If ``log_interval`` is not positive.
    """

    def __init__(self, log_interval=20000, verbose=0):
        super().__init__(verbose)
        if log_interval <= 0:
            raise ValueError(
                f"log_interval must be a positive number of timesteps, got {log_interval}"
            )
        self.log_interval = log_interval
        self.cumulative_reward = 0
        # Index of the last log_interval-sized window that was already reported.
        self._last_logged_window = 0

    def _on_training_start(self):
        # Align with the current timestep counter so that a training run which
        # does not start at zero does not emit a report on its very first step.
        self._last_logged_window = self.num_timesteps // self.log_interval

    def _on_step(self):
        """Accumulate this step's reward and report when a new window begins.

        Returns:
            Always ``True``; this callback never stops training.
        """
        # Sum over all parallel environments (identical to rewards[0] when there
        # is a single one) and accumulate as a Python float rather than float32.
        self.cumulative_reward += float(np.sum(self.locals["rewards"]))

        # num_timesteps advances by the number of environments per call, so an
        # exact `% log_interval == 0` test can be skipped over; detect the
        # crossing of a window boundary instead.
        window = self.num_timesteps // self.log_interval
        if window > self._last_logged_window:
            self._last_logged_window = window
            current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            # Only the first environment's goal is shown.
            goal_pos = self.training_env.get_attr("goal_pos")[0]
            print(f"time: {current_time} 📈 Step {self.num_timesteps} | Reward (last {self.log_interval}): {self.cumulative_reward:.2f} | 🎯 Goal: {np.round(goal_pos, 3)}")
            self.cumulative_reward = 0
        return True


In [4]:
class CheckpointCallback(BaseCallback):
    """Save the model being trained every ``save_freq`` callback calls.

    Each checkpoint is written to
    ``{save_path}/{name_prefix}_{num_timesteps}.zip``.

    Args:
        save_freq: Number of callback calls between two checkpoints.
        save_path: Directory part of the checkpoint file path.
        name_prefix: File name prefix placed before the timestep counter.
        verbose: When truthy, a message is printed after each save.
    """

    def __init__(self, save_freq, save_path, name_prefix="model", verbose=1):
        super().__init__(verbose)
        self.name_prefix = name_prefix
        self.save_path = save_path
        self.save_freq = save_freq

    def _on_step(self) -> bool:
        """Write a checkpoint when due; always lets training continue."""
        # Nothing to do between checkpoints.
        if self.n_calls % self.save_freq != 0:
            return True

        checkpoint_file = f"{self.save_path}/{self.name_prefix}_{self.num_timesteps}.zip"
        self.model.save(checkpoint_file)
        if self.verbose:
            print(f"💾 Saved checkpoint to {checkpoint_file}")
        return True


In [None]:
# Settings shared by all three callbacks: evaluation, checkpointing and reward
# logging use the same interval, and evaluation results and checkpoints are
# written to the same directory.
CHECKPOINT_DIR = "./ppo_checkpoints/"
CALLBACK_INTERVAL = 20000

# Evaluates the policy deterministically on eval_env and stores the best model.
eval_callback = EvalCallback(
    eval_env,
    best_model_save_path=CHECKPOINT_DIR,
    log_path=CHECKPOINT_DIR,
    eval_freq=CALLBACK_INTERVAL,
    deterministic=True,
    render=False,
)

# Writes a numbered snapshot of the model at every interval.
checkpoint_callback = CheckpointCallback(
    save_freq=CALLBACK_INTERVAL,
    save_path=CHECKPOINT_DIR,
    name_prefix="ppo_uarm_checkpoint",
)

# Prints the accumulated training reward and the current goal at every interval.
reward_logging_callback = RewardAndGoalLoggingCallback(log_interval=CALLBACK_INTERVAL)

# The callbacks are invoked in this order on every step.
callback = CallbackList([eval_callback, checkpoint_callback, reward_logging_callback])


In [6]:
# PPO hyperparameters gathered in one place so they are easy to review and tune.
ppo_hyperparams = {
    "ent_coef": 0.005,
    "n_steps": 2048,
    "batch_size": 256,
    "gae_lambda": 0.95,
    "gamma": 0.99,
    "verbose": 0,
}

# MLP policy trained on the vectorized training environment.
model = PPO("MlpPolicy", env, **ppo_hyperparams)




In [None]:
# Train, then store the final policy. The training environment is closed in a
# `finally` block so that it is released even when training fails or the cell
# is interrupted; the final model is only written after a complete run.
# NOTE(review): eval_env is not closed here; close it once no further
# evaluation is needed.
try:
    model.learn(total_timesteps=1_500_000, callback=callback)
    model.save("ppo_uarm_final")
finally:
    env.close()

⏱️ Timeout at Step 500 | EE: [ 0.133 -0.196  0.009] | Goal: [ 0.043 -0.324  0.025]
⏱️ Timeout at Step 500 | EE: [-0.015 -0.03   0.284] | Goal: [ 0.042 -0.321  0.025]
⏱️ Timeout at Step 500 | EE: [0.039 0.047 0.303] | Goal: [ 0.045 -0.316  0.025]
⏱️ Timeout at Step 500 | EE: [ 0.082 -0.063  0.324] | Goal: [-0.169 -0.221  0.03 ]
⏱️ Timeout at Step 500 | EE: [ 0.013 -0.189  0.039] | Goal: [ 0.111 -0.25   0.06 ]
⏱️ Timeout at Step 500 | EE: [-0.12  -0.149  0.023] | Goal: [ 0.106 -0.251  0.025]
⏱️ Timeout at Step 500 | EE: [-0.071 -0.22  -0.008] | Goal: [ 0.109 -0.246  0.025]
⏱️ Timeout at Step 500 | EE: [-0.057 -0.068  0.204] | Goal: [-0.169 -0.221  0.03 ]
⏱️ Timeout at Step 500 | EE: [-0.122 -0.12   0.098] | Goal: [ 0.037 -0.319  0.06 ]
⏱️ Timeout at Step 500 | EE: [ 0.008 -0.062  0.232] | Goal: [ 0.038 -0.254  0.06 ]
⏱️ Timeout at Step 500 | EE: [-0.131 -0.128  0.09 ] | Goal: [ 0.106 -0.247  0.06 ]
⏱️ Timeout at Step 500 | EE: [ 0.058 -0.133  0.191] | Goal: [-0.169 -0.221  0.03 ]
⏱️ Time



⏱️ Timeout at Step 500 | EE: [-0.009 -0.191  0.072] | Goal: [ 0.035 -0.32   0.025]
⏱️ Timeout at Step 500 | EE: [-0.005 -0.194  0.078] | Goal: [ 0.105 -0.248  0.025]
⏱️ Timeout at Step 500 | EE: [-0.006 -0.192  0.073] | Goal: [ 0.106 -0.323  0.025]
⏱️ Timeout at Step 500 | EE: [-0.008 -0.194  0.077] | Goal: [ 0.038 -0.247  0.025]
⏱️ Timeout at Step 500 | EE: [-0.005 -0.192  0.073] | Goal: [ 0.114 -0.324  0.025]
Eval num_timesteps=20000, episode_reward=-77.67 +/- 17.39
Episode length: 500.00 +/- 0.00
New best mean reward!
time: 2025-06-20 15:25:37 📈 Step 20000 | Reward (last 20000): -5292.63 | 🎯 Goal: [ 0.042 -0.25   0.025]
⏱️ Timeout at Step 500 | EE: [ 0.167 -0.126 -0.011] | Goal: [ 0.042 -0.25   0.025]
⏱️ Timeout at Step 500 | EE: [-0.031 -0.031  0.268] | Goal: [ 0.044 -0.252  0.06 ]
⏱️ Timeout at Step 500 | EE: [ 0.065 -0.082  0.174] | Goal: [ 0.041 -0.249  0.06 ]
⏱️ Timeout at Step 500 | EE: [-0.013 -0.215 -0.005] | Goal: [ 0.112 -0.324  0.025]
⏱️ Timeout at Step 500 | EE: [ 0.157 