In [1]:
from stable_baselines3.common.callbacks import BaseCallback
import numpy as np

class RewardAndGoalLoggingCallback(BaseCallback):
    """Print accumulated reward, goal position and end-effector position at intervals.

    On every callback step the first entry of ``self.locals["rewards"]`` is
    added to a running total. Every ``log_interval`` callback steps a one-line
    summary is printed and the running total is reset.

    Only index 0 of the rewards and of the ``get_attr`` results is used, so
    with more than one environment the remaining ones are ignored.

    Args:
        log_interval: Number of callback steps between two printed summaries.
            Must be a positive number.
        verbose: Verbosity level forwarded to ``BaseCallback``.

    Raises:
        ValueError: If ``log_interval`` is not positive.
    """

    def __init__(self, log_interval=20000, verbose=0):
        super().__init__(verbose)
        # Fail fast: a zero interval would otherwise raise ZeroDivisionError
        # from the modulo in `_on_step`, in the middle of training.
        if log_interval <= 0:
            raise ValueError(f"log_interval must be positive, got {log_interval!r}")
        self.log_interval = log_interval
        self.cumulative_reward = 0.0  # reward summed since the last printed summary
        self.step_counter = 0         # number of `_on_step` calls so far

    def _on_step(self):
        """Accumulate the latest reward and print a summary every ``log_interval`` calls.

        Returns:
            bool: Always ``True``.
        """
        # Convert to a built-in float so the running sum does not depend on
        # the scalar type of the rewards container.
        # NOTE(review): presumably a NumPy array from the vec-env - confirm.
        reward = float(self.locals["rewards"][0])
        self.cumulative_reward += reward
        self.step_counter += 1

        if self.step_counter % self.log_interval == 0:
            # Read goal and end-effector position from the first environment.
            goal_pos = self.training_env.get_attr("goal_pos")[0]
            ef_pos = self.training_env.get_attr("robot")[0].get_joint_obs()["ee_pos"]
            print(
                f"📈 Step {self.num_timesteps} | "
                f"Reward (last {self.log_interval}): {round(self.cumulative_reward, 4)} | "
                f"🎯 Goal: {np.round(goal_pos, 3)} | "
                f"🤖 EE Pos: {np.round(ef_pos, 3)} "
            )
            # Start a fresh accumulation window.
            self.cumulative_reward = 0.0

        return True


In [None]:
from stable_baselines3 import PPO
from env import UArmEnv

# Create the environment first so it can be released in the `finally` below
# no matter how model construction or training ends.
env = UArmEnv(render=False)
try:
    model = PPO(
        "MlpPolicy",
        env,
        ent_coef=0.005,        # Encourages broader exploration without overwhelming reward
        n_steps=2048,          # Longer rollout → better credit assignment
        batch_size=256,        # Good match for long rollouts
        gae_lambda=0.95,       # Slightly longer-term reward tracking
        gamma=0.99,            # Standard long-term discount
        verbose=1
    )

    # Add logging callback
    reward_logger = RewardAndGoalLoggingCallback(log_interval=20000)

    # Train with logging
    model.learn(total_timesteps=1_000_000, callback=reward_logger)
finally:
    # Always close the environment, even when training raises or the cell is
    # interrupted; otherwise it stays open in the kernel.
    env.close()

# Reached only when training finished without an exception.
model.save("ppo_uarm_pos_final_30")

pybullet build time: Jan 29 2025 23:16:28


camera_linkUsing cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.




camera_linkGoal position set to: [ 0.04390762 -0.32124802  0.06      ]
🎉 Goal reached! EE: [ 0.063 -0.316  0.07 ] | Goal: [ 0.044 -0.321  0.06 ] | Reward: -0.0221
camera_linkGoal position set to: [ 0.10544503 -0.25465036  0.06      ]
camera_linkGoal position set to: [-0.16939274 -0.31760068  0.03      ]
camera_linkGoal position set to: [ 0.04086447 -0.32363899  0.06      ]
camera_linkGoal position set to: [ 0.11148211 -0.25302931  0.025     ]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 416      |
|    ep_rew_mean     | -99.8    |
| time/              |          |
|    fps             | 507      |
|    iterations      | 1        |
|    time_elapsed    | 4        |
|    total_timesteps | 2048     |
---------------------------------
🎉 Goal reached! EE: [ 0.098 -0.264  0.03 ] | Goal: [ 0.111 -0.253  0.025] | Reward: -0.0188
camera_linkGoal position set to: [ 0.17986938 -0.24742816  0.06      ]
camera_linkGoal position set to: [-0.16939274 -0.1

In [None]:
# Save the current `model` under the file name "ppo_uarm".
# Relies on `model` having been created by the training cell above, so that
# cell must have been run in this kernel first.
model.save("ppo_uarm")
1 000 000