In [None]:
from stable_baselines3.common.callbacks import BaseCallback
import numpy as np
import datetime


class RewardAndGoalLoggingCallback(BaseCallback):
    def __init__(self, log_interval=20000, verbose=0):
        super().__init__(verbose)
        self.log_interval = log_interval
        self.cumulative_reward = 0
        self.step_counter = 0

    def _on_step(self):
        reward = self.locals["rewards"][0]
        self.cumulative_reward += reward
        self.step_counter += 1
        self.save_path="./ppo_checkpoints"

        if self.step_counter % self.log_interval == 0:
            # Get the goal_pos from the environment (VecEnv-safe)
            curent_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            goal_pos = self.training_env.get_attr("goal_pos")[0]
            # ef_pos = self.training_env.get_attr("robot")[0].get_joint_obs()["ee_pos"]
            print(f"time: {curent_time} 📈 Step {self.num_timesteps} | Reward (last {self.log_interval}): {round(self.cumulative_reward, 4)} | 🎯 Goal: {np.round(goal_pos, 3)}")
            self.cumulative_reward = 0

        # if self.num_timesteps % self.log_interval == 0:
            path = f"{self.save_path}/ppo_checkpoint_{self.num_timesteps}.zip"
            self.model.save(path)
            # print(f"💾 Model saved at step {self.num_timesteps} → {path}")


        return True

In [2]:
from stable_baselines3 import PPO
from env import UArmEnv

# Create environment and model
env = UArmEnv(render=False)
model = PPO(
    "MlpPolicy",
    env,
    ent_coef=0.005,        # Encourages broader exploration without overwhelming reward
    n_steps=2048,          # Longer rollout → better credit assignment
    batch_size=256,        # Good match for long rollouts
    gae_lambda=0.95,       # Slightly longer-term reward tracking
    gamma=0.99,            # Standard long-term discount
    verbose=0
)

# Add logging callback
reward_logger = RewardAndGoalLoggingCallback(log_interval=20000)

# Train with logging
model.learn(total_timesteps=2_000_000, callback=reward_logger)

env.close()

model.save("ppo_uarm_final_again")


pybullet build time: Jan 29 2025 23:16:28


⏱️ Timeout at Step 500 | EE: [0.08  0.067 0.303] | Goal: [-0.169 -0.151  0.03 ]
⏱️ Timeout at Step 500 | EE: [-0.131 -0.093  0.122] | Goal: [ 0.11  -0.321  0.025]
⏱️ Timeout at Step 500 | EE: [-0.094 -0.104  0.141] | Goal: [ 0.11  -0.254  0.06 ]
⏱️ Timeout at Step 500 | EE: [ 0.114 -0.183  0.223] | Goal: [ 0.107 -0.32   0.06 ]
⏱️ Timeout at Step 500 | EE: [ 0.041 -0.041  0.28 ] | Goal: [ 0.107 -0.324  0.025]
⏱️ Timeout at Step 500 | EE: [ 0.106 -0.225  0.161] | Goal: [ 0.106 -0.245  0.06 ]
⏱️ Timeout at Step 500 | EE: [-0.06  -0.175  0.065] | Goal: [-0.169 -0.151  0.03 ]
⏱️ Timeout at Step 500 | EE: [-0.181 -0.166  0.175] | Goal: [ 0.112 -0.252  0.06 ]
⏱️ Timeout at Step 500 | EE: [ 0.165 -0.155 -0.003] | Goal: [ 0.109 -0.324  0.06 ]
⏱️ Timeout at Step 500 | EE: [-0.148 -0.118 -0.014] | Goal: [ 0.109 -0.317  0.025]
⏱️ Timeout at Step 500 | EE: [-0.026 -0.183  0.082] | Goal: [ 0.038 -0.319  0.06 ]
⏱️ Timeout at Step 500 | EE: [ 0.004 -0.309  0.229] | Goal: [ 0.105 -0.32   0.025]
⏱️ Time

AttributeError: 'RewardAndGoalLoggingCallback' object has no attribute 'save_path'

In [None]:
env.close()

model.save("ppo_uarm_overtrained")