In [None]:
# ULTIMATE HUMANOID WALKING TRAINING - ALL LATEST IMPROVEMENTS
# Includes: Basic training, Quick improvement, Curriculum learning, Advanced rewards
# No demo data needed - pure RL from scratch with best methods!

import numpy as np
import gymnasium as gym
import mujoco
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.callbacks import EvalCallback, CheckpointCallback
import time
import os

# Startup banner summarising what this notebook provides (one line per entry).
print(
    "🚀 ULTIMATE HUMANOID WALKING TRAINING SUITE",
    "=" * 50,
    "✅ All latest improvements loaded!",
    "🎯 Includes: Basic → Quick Improve → Curriculum → Advanced",
    "🚫 No demo data needed - pure RL from scratch!",
    "📊 Tensorboard monitoring included!",
    sep="\n",
)




✅ New working approach loaded successfully!
🎯 Using step-by-step training: Standing → Walking
🚫 No demo data needed - pure RL from scratch!


In [None]:
# DEFINE ALL ENVIRONMENT CLASSES (Basic → Improved)
# All environments included directly in notebook

class StandingHumanoidEnv(gym.Env):
    """Phase 1: Learn to stand upright and maintain balance.

    Observation: ``qpos`` followed by ``qvel`` as a float32 vector.
    Action: one control value per actuator, clipped to ``±ACTION_LIMIT``.
    An episode terminates when ``qpos[2]`` (height) drops below
    ``FALL_HEIGHT`` and is truncated after ``MAX_EPISODE_STEPS`` steps.
    """

    ACTION_LIMIT = 0.3        # symmetric bound applied to every control
    MAX_EPISODE_STEPS = 500   # time limit (truncation)
    FALL_HEIGHT = 0.5         # height below which the episode terminates

    def __init__(self, xml_path="humanoid.xml"):
        """Load the MuJoCo model from ``xml_path`` and define the spaces."""
        super().__init__()
        self.model = mujoco.MjModel.from_xml_path(xml_path)
        self.data = mujoco.MjData(self.model)
        self.viewer = None  # created lazily by render(); "disabled" if unavailable

        obs_dim = self.model.nq + self.model.nv
        self.observation_space = gym.spaces.Box(-np.inf, np.inf, shape=(obs_dim,), dtype=np.float32)
        self.action_space = gym.spaces.Box(
            -self.ACTION_LIMIT, self.ACTION_LIMIT, shape=(self.model.nu,), dtype=np.float32
        )

        self.step_count = 0

    def reset(self, seed=None, options=None):
        """Put the humanoid back in a slightly perturbed standing pose.

        Args:
            seed: Optional seed forwarded to ``gym.Env.reset`` so that the
                perturbations drawn from ``self.np_random`` are reproducible.
            options: Accepted for Gymnasium API compatibility; unused.

        Returns:
            ``(observation, info)`` where ``info`` is an empty dict.
        """
        super().reset(seed=seed)

        # Clear controls, time and solver warm-start left over from the
        # previous episode so that a seeded reset is fully reproducible.
        mujoco.mj_resetData(self.model, self.data)

        self.data.qpos[:] = 0
        self.data.qpos[2] = 1.3  # Standing height
        self.data.qpos[3] = 1.0  # Upright quaternion

        # Add tiny random perturbations (drawn from the env's seeded RNG,
        # not the global NumPy RNG, so `seed` actually has an effect).
        self.data.qpos[:] += self.np_random.normal(0, 0.02, size=self.model.nq)
        self.data.qpos[2] = max(1.2, self.data.qpos[2])
        self.data.qvel[:] = 0

        mujoco.mj_forward(self.model, self.data)
        self.step_count = 0
        return self._get_obs(), {}

    def step(self, action):
        """Apply ``action`` for one simulation step.

        Returns:
            ``(obs, reward, terminated, truncated, info)`` following the
            Gymnasium step API; ``info`` is an empty dict.
        """
        self.data.ctrl[:] = np.clip(action, -self.ACTION_LIMIT, self.ACTION_LIMIT)
        mujoco.mj_step(self.model, self.data)
        # Count the step before checking the limit so an episode lasts exactly
        # MAX_EPISODE_STEPS steps (checking first allowed one extra step).
        self.step_count += 1

        obs = self._get_obs()
        reward = float(self._standing_reward())

        terminated = bool(self.data.qpos[2] < self.FALL_HEIGHT)
        truncated = self.step_count >= self.MAX_EPISODE_STEPS

        return obs, reward, terminated, truncated, {}

    def _get_obs(self):
        """Return ``[qpos, qvel]`` cast to the observation space dtype (float32)."""
        return np.concatenate([self.data.qpos, self.data.qvel]).astype(np.float32)

    def _standing_reward(self):
        """Reward standing tall: tiered height reward plus small shaping terms."""
        height = self.data.qpos[2]
        if height > 1.2:
            height_reward = 2.0
        elif height > 1.0:
            height_reward = 1.0
        elif height > 0.8:
            height_reward = 0.5
        else:
            height_reward = -1.0

        stability_bonus = 1.0 if height > 1.0 else 0.0
        control_penalty = -0.001 * np.sum(np.square(self.data.ctrl))
        survival_bonus = 0.1

        return height_reward + stability_bonus + control_penalty + survival_bonus

    def render(self, mode="human"):
        """Open (on first call) and refresh the passive MuJoCo viewer.

        If the viewer cannot be started, rendering is disabled for the rest of
        the environment's life and a warning explains why.
        """
        if self.viewer is None:
            try:
                import mujoco.viewer as viewer
                self.viewer = viewer.launch_passive(self.model, self.data)
            except Exception as exc:  # e.g. headless machine; keep training usable
                import warnings
                warnings.warn(f"MuJoCo viewer unavailable, rendering disabled: {exc}")
                self.viewer = "disabled"
        if self.viewer != "disabled" and hasattr(self.viewer, 'sync'):
            self.viewer.sync()

    def close(self):
        """Close the viewer window, if one was opened."""
        if self.viewer is not None and self.viewer != "disabled" and hasattr(self.viewer, "close"):
            self.viewer.close()
        self.viewer = None

class WalkingHumanoidEnv(StandingHumanoidEnv):
    """Phase 2: Learn to walk forward while maintaining balance.

    Same observations and reset as the standing environment, but with a wider
    action range and a reward that also pays for forward velocity.
    """

    ACTION_LIMIT = 0.5          # symmetric bound applied to every control
    MAX_EPISODE_STEPS = 1000    # time limit (truncation)
    FALL_HEIGHT = 0.4           # height below which the episode terminates
    MAX_LATERAL_OFFSET = 2.0    # |qpos[1]| beyond which the episode terminates

    def __init__(self, xml_path="humanoid.xml"):
        """Build the standing env, then widen the action space to ±0.5."""
        super().__init__(xml_path)
        self.action_space = gym.spaces.Box(-0.5, 0.5, shape=(self.model.nu,), dtype=np.float32)

    def step(self, action):
        """Apply ``action`` for one simulation step.

        Returns:
            ``(obs, reward, terminated, truncated, info)`` following the
            Gymnasium step API; ``info`` is an empty dict.
        """
        self.data.ctrl[:] = np.clip(action, -0.5, 0.5)
        mujoco.mj_step(self.model, self.data)
        # Count the step before checking the limit so an episode lasts exactly
        # MAX_EPISODE_STEPS steps (checking first allowed one extra step).
        self.step_count += 1

        obs = self._get_obs()
        reward = float(self._walking_reward())

        # Cast to plain bool: comparisons on qpos entries yield NumPy scalars.
        terminated = bool(
            self.data.qpos[2] < self.FALL_HEIGHT
            or abs(self.data.qpos[1]) > self.MAX_LATERAL_OFFSET
        )
        truncated = self.step_count >= self.MAX_EPISODE_STEPS

        return obs, reward, terminated, truncated, {}

    def _walking_reward(self):
        """Sum of height, forward-velocity, lateral, control and survival terms."""
        pos = self.data.qpos
        vel = self.data.qvel

        height = pos[2]
        forward_vel = vel[0]
        side_pos = abs(pos[1])

        # Height reward (most important)
        if height > 1.1:
            height_reward = 3.0
        elif height > 0.9:
            height_reward = 2.0
        elif height > 0.6:
            height_reward = 1.0
        else:
            height_reward = -2.0

        # Forward movement reward: only paid above 0.2, capped at speed 1.5
        if forward_vel > 0.2:
            forward_reward = 2.0 * min(forward_vel, 1.5)
        else:
            forward_reward = 0.0

        # Penalties and bonuses
        side_penalty = -1.0 * side_pos
        control_penalty = -0.01 * np.sum(np.square(self.data.ctrl))
        survival_bonus = 0.2

        return height_reward + forward_reward + side_penalty + control_penalty + survival_bonus

print("✅ Basic environments defined!")

# Test basic standing environment
standing_env = StandingHumanoidEnv()
obs, info = standing_env.reset()
print(f"📊 Standing env - Obs: {obs.shape}, Actions: {standing_env.action_space}")

# Quick standing test: apply zero control for up to 100 steps.
total_reward = 0.0
steps_taken = 0
for i in range(100):
    action = np.zeros(standing_env.action_space.shape[0])
    obs, reward, terminated, truncated, _ = standing_env.step(action)
    total_reward += reward
    steps_taken += 1
    if terminated or truncated:
        break

# Average over the steps actually executed; dividing by a fixed 100 would
# under-report the mean whenever the episode ends early.
avg_reward = total_reward / steps_taken
print(f"📈 Standing test: {avg_reward:.2f} avg reward")
print("✅ Basic environments ready!")


🧪 Testing Standing Environment...
✅ Standing environment created!
📊 Observation shape: (55,)
🎮 Action space: Box(-0.3, 0.3, (21,), float32)
🎯 Goal: Learn to stand upright (height > 1.0m)
📈 Zero control test: 3.10 avg reward
✅ Environment ready for training!


In [None]:
# ADVANCED IMPROVED WALKING ENVIRONMENT
# Enhanced reward function with curriculum learning support

class ImprovedWalkingEnv(gym.Env):
    """Enhanced walking environment with better reward shaping and curriculum learning.

    ``difficulty`` (0.5 = easier, 2.0 = harder) scales the episode length,
    reset noise, action magnitude, target speed, termination thresholds and
    overall reward magnitude. At the default ``difficulty=1.0`` every scaled
    quantity equals its base value.
    """

    def __init__(self, xml_path="humanoid.xml", difficulty=1.0):
        """Load the MuJoCo model and configure the curriculum level.

        Args:
            xml_path: Path to the MuJoCo XML model.
            difficulty: Positive curriculum multiplier (0.5 = easier, 2.0 = harder).

        Raises:
            ValueError: If ``difficulty`` is not strictly positive (several
                quantities are divided by it).
        """
        super().__init__()
        if difficulty <= 0:
            raise ValueError(f"difficulty must be positive, got {difficulty}")

        self.model = mujoco.MjModel.from_xml_path(xml_path)
        self.data = mujoco.MjData(self.model)
        self.viewer = None  # created lazily by render(); "disabled" if unavailable

        # Difficulty scaling (0.5 = easier, 2.0 = harder)
        self.difficulty = difficulty

        obs_dim = self.model.nq + self.model.nv
        self.observation_space = gym.spaces.Box(-np.inf, np.inf, shape=(obs_dim,), dtype=np.float32)
        self.action_space = gym.spaces.Box(-0.8, 0.8, shape=(self.model.nu,), dtype=np.float32)

        # Episode parameters
        self.max_episode_steps = int(1500 / difficulty)  # Easier = longer episodes
        self.step_count = 0

        # Reward tracking
        self.prev_x_pos = 0.0
        self.cumulative_distance = 0.0
        self.stability_bonus = 0.0

    def reset(self, seed=None, options=None):
        """Return the humanoid to a randomised standing pose.

        Args:
            seed: Optional seed forwarded to ``gym.Env.reset`` so that the
                noise drawn from ``self.np_random`` is reproducible.
            options: Accepted for Gymnasium API compatibility; unused.

        Returns:
            ``(observation, info)`` where ``info`` is an empty dict.
        """
        super().reset(seed=seed)

        # Clear controls, time and solver warm-start left over from the
        # previous episode so that a seeded reset is fully reproducible.
        mujoco.mj_resetData(self.model, self.data)

        # Reset to standing with slight randomization
        self.data.qpos[:] = 0
        self.data.qpos[2] = 1.3  # Standing height
        self.data.qpos[3] = 1.0  # Upright quaternion

        # Add small perturbations for robustness.
        # Easier = less noise, so the scale grows with difficulty
        # (it was previously divided by difficulty, which inverted this).
        noise_scale = 0.03 * self.difficulty
        self.data.qpos[:] += self.np_random.normal(0, noise_scale, size=self.model.nq)
        self.data.qpos[2] = max(1.1, self.data.qpos[2])

        # Near-zero velocities with a small random component
        self.data.qvel[:] = self.np_random.normal(0, 0.02, size=self.model.nv)

        mujoco.mj_forward(self.model, self.data)

        # Reset tracking
        self.step_count = 0
        self.prev_x_pos = self.data.qpos[0]
        self.cumulative_distance = 0.0
        self.stability_bonus = 0.0

        return self._get_obs(), {}

    def step(self, action):
        """Apply ``action`` for one simulation step.

        Returns:
            ``(obs, reward, terminated, truncated, info)`` following the
            Gymnasium step API; ``info`` is an empty dict.
        """
        # Apply action with scaling.
        # Easier = smaller actions, so the scale grows with difficulty
        # (it was previously divided by difficulty, which inverted this).
        action_scale = 0.7 * self.difficulty
        self.data.ctrl[:] = np.clip(np.asarray(action) * action_scale, -0.8, 0.8)

        # Step simulation
        mujoco.mj_step(self.model, self.data)
        # Count the step before checking the limit so an episode lasts exactly
        # max_episode_steps steps (checking first allowed one extra step).
        self.step_count += 1

        obs = self._get_obs()
        reward = float(self._compute_enhanced_reward())

        # Enhanced termination conditions
        terminated = self._is_terminated()
        truncated = self.step_count >= self.max_episode_steps

        # Update tracking (only forward motion accumulates)
        current_x = self.data.qpos[0]
        forward_progress = max(0, current_x - self.prev_x_pos)
        self.cumulative_distance += forward_progress
        self.prev_x_pos = current_x

        return obs, reward, terminated, truncated, {}

    def _get_obs(self):
        """Return ``[qpos, qvel]`` cast to the observation space dtype (float32)."""
        return np.concatenate([self.data.qpos, self.data.qvel]).astype(np.float32)

    def _compute_enhanced_reward(self):
        """Enhanced reward function with better walking incentives.

        Note: this also increments ``self.stability_bonus`` as a side effect,
        so it should be called exactly once per step.
        """
        pos = self.data.qpos
        vel = self.data.qvel

        # Position and velocity components actually used by the reward
        y, z = pos[1], pos[2]
        vx, vy = vel[0], vel[1]

        # Quaternion (w, x, y, z) is stored in qpos[3:7]; only w and z are needed
        w, qz = pos[3], pos[6]

        reward = 0.0

        # 1. FORWARD VELOCITY (primary objective)
        target_speed = 1.2 * self.difficulty
        if vx > 0.3:
            forward_reward = 4.0 * min(vx / target_speed, 1.5)
            # Bonus for maintaining good speed
            if 0.8 <= vx <= 2.0:
                forward_reward += 2.0
        else:
            forward_reward = -0.5  # Penalty for not moving forward
        reward += forward_reward

        # 2. HEIGHT STABILITY (crucial for walking)
        target_height = 1.3
        height_diff = abs(z - target_height)
        if height_diff < 0.1:
            height_reward = 3.0
        elif height_diff < 0.3:
            height_reward = 2.0 * (1.0 - height_diff / 0.3)
        else:
            height_reward = -2.0
        reward += height_reward

        # 3. UPRIGHT ORIENTATION
        # For a unit quaternion, 2*(w^2 + z^2) - 1 == 1 - 2*(x^2 + y^2), i.e. the
        # vertical component of the body's z-axis (1 when perfectly upright).
        upright_factor = 2 * (w**2 + qz**2) - 1
        upright_reward = 2.0 * max(0, upright_factor)
        reward += upright_reward

        # 4. LATERAL STABILITY
        lateral_penalty = -2.0 * (abs(vy) + abs(y))
        reward += lateral_penalty

        # 5. GAIT SMOOTHNESS (encourage natural walking)
        action_smoothness = -0.01 * np.sum(np.square(self.data.ctrl))
        reward += action_smoothness

        # 6. PROGRESS BONUS (capped at 3.0)
        if self.cumulative_distance > 0:
            progress_bonus = min(self.cumulative_distance * 0.5, 3.0)
            reward += progress_bonus

        # 7. STABILITY BONUS (for staying upright over time, capped at 1.0)
        if z > 1.0 and abs(y) < 0.3:
            self.stability_bonus += 0.01
            reward += min(self.stability_bonus, 1.0)

        # 8. STEP SURVIVAL BONUS
        reward += 0.05

        # 9. PENALTY SYSTEM
        if z < 0.5:  # Fallen
            reward -= 10.0
        if abs(y) > 2.0:  # Too far sideways
            reward -= 5.0
        if vx < -0.5:  # Moving backward
            reward -= 2.0

        # 10. CURRICULUM SCALING
        reward *= (0.5 + 0.5 * self.difficulty)  # Scale rewards with difficulty

        return reward

    def _is_terminated(self):
        """Return True when the humanoid has fallen or strayed out of bounds."""
        pos = self.data.qpos

        # More forgiving termination for easier training: a lower difficulty
        # lowers the fall threshold and widens the lateral corridor.
        # (min_height was previously divided by difficulty, which made easy
        # stages terminate at a *higher* height, i.e. less forgiving.)
        min_height = 0.4 * self.difficulty
        max_lateral = 3.0 / self.difficulty

        if pos[2] < min_height:
            return True
        if abs(pos[1]) > max_lateral:
            return True
        if pos[0] < -3.0:  # Moved too far backward
            return True

        return False

    def render(self, mode="human"):
        """Open (on first call) and refresh the passive MuJoCo viewer.

        If the viewer cannot be started, rendering is disabled for the rest of
        the environment's life and a warning explains why.
        """
        if self.viewer is None:
            try:
                import mujoco.viewer as viewer
                self.viewer = viewer.launch_passive(self.model, self.data)
            except Exception as exc:  # e.g. headless machine; keep training usable
                import warnings
                warnings.warn(f"MuJoCo viewer unavailable, rendering disabled: {exc}")
                self.viewer = "disabled"

        if self.viewer != "disabled" and hasattr(self.viewer, 'sync'):
            self.viewer.sync()

    def close(self):
        """Close the viewer window, if one was opened."""
        if self.viewer is not None and self.viewer != "disabled" and hasattr(self.viewer, "close"):
            self.viewer.close()
        self.viewer = None

print("✅ Advanced ImprovedWalkingEnv defined!")

# Output folders for saved models and logs; a no-op when they already exist.
for _output_dir in ("models", "logs"):
    os.makedirs(_output_dir, exist_ok=True)

print(
    "📁 Model and log directories created",
    "🎯 Ready for training with multiple approaches!",
    sep="\n",
)


🚀 PHASE 1: Training Standing Model
Using cpu device
✅ Standing model created
🎯 Training to stand upright...
📊 Small actions (-0.3 to 0.3) for gentle learning


In [None]:
# TRAINING METHOD 1: BASIC STEP-BY-STEP TRAINING
# Start here if you're new or want the simple approach

def basic_training():
    """Basic step-by-step training: Standing → Walking.

    Phase 1 trains PPO on ``StandingHumanoidEnv`` and saves
    ``basic_standing_model``. Phase 2 continues from that policy on
    ``WalkingHumanoidEnv`` (falling back to a fresh model if it cannot be
    reused) and saves ``basic_walking_model``.

    Returns:
        The trained walking ``PPO`` model.
    """
    print("🚀 BASIC STEP-BY-STEP TRAINING")
    print("=" * 40)

    # Phase 1: Standing
    print("🎯 Phase 1: Training Standing Model...")
    standing_env_vec = DummyVecEnv([lambda: Monitor(StandingHumanoidEnv())])

    standing_model = PPO(
        "MlpPolicy", standing_env_vec,
        learning_rate=3e-4, n_steps=1024, batch_size=32, n_epochs=5,
        gamma=0.99, verbose=1, tensorboard_log="./logs/basic/",
        policy_kwargs=dict(net_arch=dict(pi=[128, 128], vf=[128, 128]))
    )

    standing_model.learn(total_timesteps=50000, progress_bar=True, tb_log_name="basic_standing")
    standing_model.save("basic_standing_model")
    standing_env_vec.close()
    print("✅ Standing model trained!")

    # Phase 2: Walking
    print("\n🎯 Phase 2: Training Walking Model...")
    walking_env_vec = DummyVecEnv([lambda: Monitor(WalkingHumanoidEnv())])

    try:
        # The standing policy was saved with a Box(-0.3, 0.3) action space but
        # the walking env uses Box(-0.5, 0.5). PPO.load rejects an env whose
        # spaces differ from the saved ones, so without this override the load
        # always failed and the warm start silently never happened. The two
        # spaces have the same shape, so the saved weights remain compatible.
        walking_model = PPO.load(
            "basic_standing_model",
            env=walking_env_vec,
            custom_objects={"action_space": walking_env_vec.action_space},
        )
        print("✅ Loaded standing model as base")
    except Exception as exc:
        # Best-effort fallback, but say why instead of hiding the failure
        # (and no longer swallow KeyboardInterrupt/SystemExit).
        print(f"⚠️ Could not reuse standing model ({exc}); starting from scratch")
        walking_model = PPO(
            "MlpPolicy", walking_env_vec,
            learning_rate=2e-4, n_steps=2048, batch_size=64, n_epochs=8,
            gamma=0.99, verbose=1, tensorboard_log="./logs/basic/",
            policy_kwargs=dict(net_arch=dict(pi=[256, 256], vf=[256, 256]))
        )
        print("✅ Created new walking model")

    walking_model.learn(total_timesteps=100000, progress_bar=True,
                       tb_log_name="basic_walking", reset_num_timesteps=False)
    walking_model.save("basic_walking_model")
    print("✅ Walking model trained!")

    return walking_model

# TRAINING METHOD 2: QUICK IMPROVEMENT
# Use this if you already have a model and want to improve it quickly

def quick_improvement():
    """Quick improvement of existing walking model.

    Loads ``basic_walking_model`` (or, failing that, ``walking_model``),
    fine-tunes it on ``WalkingHumanoidEnv`` with a lower learning rate and
    saves the result as ``quick_improved_model``.

    Returns:
        The fine-tuned ``PPO`` model, or ``None`` if no saved model could be
        loaded.
    """
    print("🔥 QUICK IMPROVEMENT TRAINING")
    print("=" * 40)

    env = DummyVecEnv([lambda: Monitor(WalkingHumanoidEnv())])

    # Fine-tune with lower learning rate. It is injected at load time:
    # assigning `model.learning_rate` afterwards does not rebuild the schedule
    # the optimizer is driven by, so the new value would never be applied.
    fine_tune_lr = 1e-4

    # Saved models to try, in order of preference, with their success message.
    candidates = (
        ("basic_walking_model", "✅ Loaded basic walking model"),
        ("walking_model", "✅ Loaded previous walking model"),
    )

    model = None
    for model_path, loaded_message in candidates:
        try:
            model = PPO.load(
                model_path, env=env,
                custom_objects={"learning_rate": fine_tune_lr},
            )
        except Exception as exc:
            # Keep trying the next candidate, but report why this one failed
            # (missing file, incompatible spaces, ...) instead of hiding it.
            print(f"⚠️ Could not load '{model_path}': {exc}")
            continue
        print(loaded_message)
        break

    if model is None:
        print("❌ No existing model found. Run basic_training() first!")
        env.close()
        return None

    print(f"🎯 Fine-tuning with learning rate: {model.learning_rate}")

    model.learn(total_timesteps=150000, progress_bar=True,
               tb_log_name="quick_improve", reset_num_timesteps=False)
    model.save("quick_improved_model")
    print("✅ Quick improvement completed!")

    return model

# TRAINING METHOD 3: CURRICULUM LEARNING (BEST RESULTS)
# Advanced method that starts easy and gets progressively harder

def curriculum_training(difficulties=(0.5, 0.7, 1.0, 1.3, 1.5), base_timesteps=50000):
    """Advanced curriculum learning - start easy, get harder.

    One PPO model is trained through a sequence of ``ImprovedWalkingEnv``
    stages. After each stage the model is saved to
    ``models/curriculum_stage_<n>`` and evaluated for up to 200 deterministic
    steps; the final model is saved as ``curriculum_final_model``.

    Args:
        difficulties: Difficulty value for each stage, in training order
            (easy to hard).
        base_timesteps: Each stage trains for
            ``int(base_timesteps * (1.0 + difficulty))`` timesteps.

    Returns:
        The trained ``PPO`` model.

    Raises:
        ValueError: If ``difficulties`` is empty.
    """
    print("🎓 CURRICULUM LEARNING TRAINING")
    print("=" * 50)

    difficulties = list(difficulties)  # Easy to hard
    if not difficulties:
        raise ValueError("difficulties must contain at least one stage")
    num_stages = len(difficulties)

    model = None
    env = None

    for i, difficulty in enumerate(difficulties):
        print(f"\n🎯 STAGE {i+1}/{num_stages}: Difficulty {difficulty}")

        # Create environment with current difficulty (bind `difficulty` as a
        # default argument so the factory does not see later loop values).
        previous_env = env
        env = DummyVecEnv([lambda d=difficulty: Monitor(ImprovedWalkingEnv(difficulty=d))])

        if model is not None:
            model.set_env(env)
            # The previous stage's environment is no longer referenced by the model.
            previous_env.close()
            print(f"✅ Continuing from previous stage")
        else:
            # NOTE: the model is created only once, so learning_rate and
            # ent_coef are derived from the first stage's difficulty and are
            # not re-scaled in later stages.
            model = PPO(
                "MlpPolicy", env,
                learning_rate=3e-4 / difficulty,
                n_steps=2048, batch_size=64, n_epochs=10,
                gamma=0.99, gae_lambda=0.95, clip_range=0.2,
                ent_coef=0.01 / difficulty, verbose=1,
                tensorboard_log="./logs/curriculum/",
                policy_kwargs=dict(net_arch=dict(pi=[512, 512, 256], vf=[512, 512, 256]))
            )
            print(f"✅ Created model for stage {i+1}")

        # Train for this stage
        timesteps = int(base_timesteps * (1.0 + difficulty))
        print(f"Training for {timesteps} timesteps...")

        model.learn(total_timesteps=timesteps, progress_bar=True,
                   tb_log_name=f"curriculum_stage_{i+1}_diff_{difficulty}",
                   reset_num_timesteps=False)

        model.save(f"models/curriculum_stage_{i+1}")

        # Quick test: one deterministic rollout of at most 200 steps
        print(f"🧪 Testing stage {i+1}...")
        test_env = ImprovedWalkingEnv(difficulty=difficulty)
        try:
            obs, _ = test_env.reset()
            total_reward = 0
            steps_run = 0

            for step in range(200):
                action, _ = model.predict(obs, deterministic=True)
                obs, reward, terminated, truncated, _ = test_env.step(action)
                total_reward += reward
                steps_run = step + 1
                if terminated or truncated:
                    break

            avg_reward = total_reward / steps_run
            distance = test_env.data.qpos[0]
        finally:
            test_env.close()
        print(f"📊 Stage {i+1}: Avg reward={avg_reward:.2f}, Distance={distance:.2f}m")

    model.save("curriculum_final_model")
    print("✅ Curriculum training completed!")
    return model

# Menu of the training entry points defined above, with rough durations.
print(
    "✅ All training methods defined!",
    "\n🎯 CHOOSE YOUR TRAINING APPROACH:",
    "1. basic_training()      - Simple, reliable (30 min)",
    "2. quick_improvement()   - Fast improvement (20 min)",
    "3. curriculum_training() - Best results (60 min)",
    "\nRun one of these functions to start training!",
    sep="\n",
)


Output()

🎯 Starting Phase 1 training...
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 145      |
|    ep_rew_mean     | 173      |
| time/              |          |
|    fps             | 6453     |
|    iterations      | 1        |
|    time_elapsed    | 0        |
|    total_timesteps | 1024     |
---------------------------------


----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 154        |
|    ep_rew_mean          | 184        |
| time/                   |            |
|    fps                  | 3822       |
|    iterations           | 2          |
|    time_elapsed         | 0          |
|    total_timesteps      | 2048       |
| train/                  |            |
|    approx_kl            | 0.03220638 |
|    clip_fraction        | 0.338      |
|    clip_range           | 0.2        |
|    entropy_loss         | -29.8      |
|    explained_variance   | -0.0174    |
|    learning_rate        | 0.0003     |
|    loss                 | 220        |
|    n_updates            | 5          |
|    policy_gradient_loss | -0.063     |
|    std                  | 1          |
|    value_loss           | 615        |
----------------------------------------


----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 153        |
|    ep_rew_mean          | 210        |
| time/                   |            |
|    fps                  | 3535       |
|    iterations           | 3          |
|    time_elapsed         | 0          |
|    total_timesteps      | 3072       |
| train/                  |            |
|    approx_kl            | 0.02677293 |
|    clip_fraction        | 0.273      |
|    clip_range           | 0.2        |
|    entropy_loss         | -29.8      |
|    explained_variance   | 0.34       |
|    learning_rate        | 0.0003     |
|    loss                 | 287        |
|    n_updates            | 10         |
|    policy_gradient_loss | -0.0611    |
|    std                  | 0.999      |
|    value_loss           | 668        |
----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 153         |
|    ep_rew_mean          | 207         |
| time/                   |             |
|    fps                  | 3370        |
|    iterations           | 4           |
|    time_elapsed         | 1           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.024350924 |
|    clip_fraction        | 0.249       |
|    clip_range           | 0.2         |
|    entropy_loss         | -29.8       |
|    explained_variance   | 0.507       |
|    learning_rate        | 0.0003      |
|    loss                 | 342         |
|    n_updates            | 15          |
|    policy_gradient_loss | -0.0548     |
|    std                  | 0.999       |
|    value_loss           | 703         |
-----------------------------------------


----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 153        |
|    ep_rew_mean          | 210        |
| time/                   |            |
|    fps                  | 3298       |
|    iterations           | 5          |
|    time_elapsed         | 1          |
|    total_timesteps      | 5120       |
| train/                  |            |
|    approx_kl            | 0.02549538 |
|    clip_fraction        | 0.254      |
|    clip_range           | 0.2        |
|    entropy_loss         | -29.8      |
|    explained_variance   | 0.668      |
|    learning_rate        | 0.0003     |
|    loss                 | 239        |
|    n_updates            | 20         |
|    policy_gradient_loss | -0.0567    |
|    std                  | 1          |
|    value_loss           | 659        |
----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 151         |
|    ep_rew_mean          | 214         |
| time/                   |             |
|    fps                  | 3230        |
|    iterations           | 6           |
|    time_elapsed         | 1           |
|    total_timesteps      | 6144        |
| train/                  |             |
|    approx_kl            | 0.029266905 |
|    clip_fraction        | 0.302       |
|    clip_range           | 0.2         |
|    entropy_loss         | -29.8       |
|    explained_variance   | 0.761       |
|    learning_rate        | 0.0003      |
|    loss                 | 234         |
|    n_updates            | 25          |
|    policy_gradient_loss | -0.0592     |
|    std                  | 1           |
|    value_loss           | 540         |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 155         |
|    ep_rew_mean          | 211         |
| time/                   |             |
|    fps                  | 3177        |
|    iterations           | 7           |
|    time_elapsed         | 2           |
|    total_timesteps      | 7168        |
| train/                  |             |
|    approx_kl            | 0.028646328 |
|    clip_fraction        | 0.304       |
|    clip_range           | 0.2         |
|    entropy_loss         | -29.8       |
|    explained_variance   | 0.802       |
|    learning_rate        | 0.0003      |
|    loss                 | 204         |
|    n_updates            | 30          |
|    policy_gradient_loss | -0.061      |
|    std                  | 1           |
|    value_loss           | 520         |
-----------------------------------------


----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 153        |
|    ep_rew_mean          | 208        |
| time/                   |            |
|    fps                  | 3092       |
|    iterations           | 8          |
|    time_elapsed         | 2          |
|    total_timesteps      | 8192       |
| train/                  |            |
|    approx_kl            | 0.03468039 |
|    clip_fraction        | 0.328      |
|    clip_range           | 0.2        |
|    entropy_loss         | -29.8      |
|    explained_variance   | 0.807      |
|    learning_rate        | 0.0003     |
|    loss                 | 181        |
|    n_updates            | 35         |
|    policy_gradient_loss | -0.0612    |
|    std                  | 1          |
|    value_loss           | 401        |
----------------------------------------


----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 154        |
|    ep_rew_mean          | 211        |
| time/                   |            |
|    fps                  | 3029       |
|    iterations           | 9          |
|    time_elapsed         | 3          |
|    total_timesteps      | 9216       |
| train/                  |            |
|    approx_kl            | 0.03880105 |
|    clip_fraction        | 0.359      |
|    clip_range           | 0.2        |
|    entropy_loss         | -29.8      |
|    explained_variance   | 0.861      |
|    learning_rate        | 0.0003     |
|    loss                 | 136        |
|    n_updates            | 40         |
|    policy_gradient_loss | -0.0658    |
|    std                  | 1          |
|    value_loss           | 344        |
----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 153         |
|    ep_rew_mean          | 211         |
| time/                   |             |
|    fps                  | 2986        |
|    iterations           | 10          |
|    time_elapsed         | 3           |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.029639436 |
|    clip_fraction        | 0.321       |
|    clip_range           | 0.2         |
|    entropy_loss         | -29.8       |
|    explained_variance   | 0.866       |
|    learning_rate        | 0.0003      |
|    loss                 | 107         |
|    n_updates            | 45          |
|    policy_gradient_loss | -0.0586     |
|    std                  | 1           |
|    value_loss           | 357         |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 152         |
|    ep_rew_mean          | 212         |
| time/                   |             |
|    fps                  | 2943        |
|    iterations           | 11          |
|    time_elapsed         | 3           |
|    total_timesteps      | 11264       |
| train/                  |             |
|    approx_kl            | 0.037364706 |
|    clip_fraction        | 0.351       |
|    clip_range           | 0.2         |
|    entropy_loss         | -29.8       |
|    explained_variance   | 0.893       |
|    learning_rate        | 0.0003      |
|    loss                 | 52.9        |
|    n_updates            | 50          |
|    policy_gradient_loss | -0.0593     |
|    std                  | 1           |
|    value_loss           | 301         |
-----------------------------------------


----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 152        |
|    ep_rew_mean          | 210        |
| time/                   |            |
|    fps                  | 2940       |
|    iterations           | 12         |
|    time_elapsed         | 4          |
|    total_timesteps      | 12288      |
| train/                  |            |
|    approx_kl            | 0.04193095 |
|    clip_fraction        | 0.387      |
|    clip_range           | 0.2        |
|    entropy_loss         | -29.8      |
|    explained_variance   | 0.916      |
|    learning_rate        | 0.0003     |
|    loss                 | 132        |
|    n_updates            | 55         |
|    policy_gradient_loss | -0.0645    |
|    std                  | 1          |
|    value_loss           | 250        |
----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 154         |
|    ep_rew_mean          | 210         |
| time/                   |             |
|    fps                  | 2916        |
|    iterations           | 13          |
|    time_elapsed         | 4           |
|    total_timesteps      | 13312       |
| train/                  |             |
|    approx_kl            | 0.043908425 |
|    clip_fraction        | 0.36        |
|    clip_range           | 0.2         |
|    entropy_loss         | -29.8       |
|    explained_variance   | 0.915       |
|    learning_rate        | 0.0003      |
|    loss                 | 108         |
|    n_updates            | 60          |
|    policy_gradient_loss | -0.0628     |
|    std                  | 0.998       |
|    value_loss           | 212         |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 153         |
|    ep_rew_mean          | 211         |
| time/                   |             |
|    fps                  | 2919        |
|    iterations           | 14          |
|    time_elapsed         | 4           |
|    total_timesteps      | 14336       |
| train/                  |             |
|    approx_kl            | 0.043656632 |
|    clip_fraction        | 0.403       |
|    clip_range           | 0.2         |
|    entropy_loss         | -29.7       |
|    explained_variance   | 0.929       |
|    learning_rate        | 0.0003      |
|    loss                 | 59.4        |
|    n_updates            | 65          |
|    policy_gradient_loss | -0.0654     |
|    std                  | 0.997       |
|    value_loss           | 188         |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 153         |
|    ep_rew_mean          | 214         |
| time/                   |             |
|    fps                  | 2906        |
|    iterations           | 15          |
|    time_elapsed         | 5           |
|    total_timesteps      | 15360       |
| train/                  |             |
|    approx_kl            | 0.043859802 |
|    clip_fraction        | 0.383       |
|    clip_range           | 0.2         |
|    entropy_loss         | -29.7       |
|    explained_variance   | 0.942       |
|    learning_rate        | 0.0003      |
|    loss                 | 75.6        |
|    n_updates            | 70          |
|    policy_gradient_loss | -0.0624     |
|    std                  | 0.998       |
|    value_loss           | 177         |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 154         |
|    ep_rew_mean          | 218         |
| time/                   |             |
|    fps                  | 2909        |
|    iterations           | 16          |
|    time_elapsed         | 5           |
|    total_timesteps      | 16384       |
| train/                  |             |
|    approx_kl            | 0.033113178 |
|    clip_fraction        | 0.317       |
|    clip_range           | 0.2         |
|    entropy_loss         | -29.7       |
|    explained_variance   | 0.928       |
|    learning_rate        | 0.0003      |
|    loss                 | 68.4        |
|    n_updates            | 75          |
|    policy_gradient_loss | -0.0634     |
|    std                  | 0.996       |
|    value_loss           | 253         |
-----------------------------------------


----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 154        |
|    ep_rew_mean          | 223        |
| time/                   |            |
|    fps                  | 2904       |
|    iterations           | 17         |
|    time_elapsed         | 5          |
|    total_timesteps      | 17408      |
| train/                  |            |
|    approx_kl            | 0.04225651 |
|    clip_fraction        | 0.366      |
|    clip_range           | 0.2        |
|    entropy_loss         | -29.7      |
|    explained_variance   | 0.954      |
|    learning_rate        | 0.0003     |
|    loss                 | 86.3       |
|    n_updates            | 80         |
|    policy_gradient_loss | -0.0629    |
|    std                  | 0.997      |
|    value_loss           | 153        |
----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 155         |
|    ep_rew_mean          | 223         |
| time/                   |             |
|    fps                  | 2900        |
|    iterations           | 18          |
|    time_elapsed         | 6           |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.029934378 |
|    clip_fraction        | 0.28        |
|    clip_range           | 0.2         |
|    entropy_loss         | -29.7       |
|    explained_variance   | 0.919       |
|    learning_rate        | 0.0003      |
|    loss                 | 91.1        |
|    n_updates            | 85          |
|    policy_gradient_loss | -0.056      |
|    std                  | 0.997       |
|    value_loss           | 288         |
-----------------------------------------


----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 156        |
|    ep_rew_mean          | 228        |
| time/                   |            |
|    fps                  | 2908       |
|    iterations           | 19         |
|    time_elapsed         | 6          |
|    total_timesteps      | 19456      |
| train/                  |            |
|    approx_kl            | 0.03305886 |
|    clip_fraction        | 0.286      |
|    clip_range           | 0.2        |
|    entropy_loss         | -29.7      |
|    explained_variance   | 0.957      |
|    learning_rate        | 0.0003     |
|    loss                 | 64.4       |
|    n_updates            | 90         |
|    policy_gradient_loss | -0.0452    |
|    std                  | 0.996      |
|    value_loss           | 153        |
----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 157         |
|    ep_rew_mean          | 229         |
| time/                   |             |
|    fps                  | 2903        |
|    iterations           | 20          |
|    time_elapsed         | 7           |
|    total_timesteps      | 20480       |
| train/                  |             |
|    approx_kl            | 0.046964314 |
|    clip_fraction        | 0.387       |
|    clip_range           | 0.2         |
|    entropy_loss         | -29.7       |
|    explained_variance   | 0.954       |
|    learning_rate        | 0.0003      |
|    loss                 | 40.6        |
|    n_updates            | 95          |
|    policy_gradient_loss | -0.0647     |
|    std                  | 0.995       |
|    value_loss           | 169         |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 160         |
|    ep_rew_mean          | 236         |
| time/                   |             |
|    fps                  | 2912        |
|    iterations           | 21          |
|    time_elapsed         | 7           |
|    total_timesteps      | 21504       |
| train/                  |             |
|    approx_kl            | 0.051127642 |
|    clip_fraction        | 0.379       |
|    clip_range           | 0.2         |
|    entropy_loss         | -29.7       |
|    explained_variance   | 0.967       |
|    learning_rate        | 0.0003      |
|    loss                 | 22.4        |
|    n_updates            | 100         |
|    policy_gradient_loss | -0.061      |
|    std                  | 0.993       |
|    value_loss           | 113         |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 162         |
|    ep_rew_mean          | 242         |
| time/                   |             |
|    fps                  | 2910        |
|    iterations           | 22          |
|    time_elapsed         | 7           |
|    total_timesteps      | 22528       |
| train/                  |             |
|    approx_kl            | 0.027516218 |
|    clip_fraction        | 0.245       |
|    clip_range           | 0.2         |
|    entropy_loss         | -29.6       |
|    explained_variance   | 0.932       |
|    learning_rate        | 0.0003      |
|    loss                 | 95.4        |
|    n_updates            | 105         |
|    policy_gradient_loss | -0.0498     |
|    std                  | 0.992       |
|    value_loss           | 241         |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 162         |
|    ep_rew_mean          | 248         |
| time/                   |             |
|    fps                  | 2917        |
|    iterations           | 23          |
|    time_elapsed         | 8           |
|    total_timesteps      | 23552       |
| train/                  |             |
|    approx_kl            | 0.031786934 |
|    clip_fraction        | 0.273       |
|    clip_range           | 0.2         |
|    entropy_loss         | -29.6       |
|    explained_variance   | 0.928       |
|    learning_rate        | 0.0003      |
|    loss                 | 91.1        |
|    n_updates            | 110         |
|    policy_gradient_loss | -0.0493     |
|    std                  | 0.992       |
|    value_loss           | 203         |
-----------------------------------------


----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 166        |
|    ep_rew_mean          | 256        |
| time/                   |            |
|    fps                  | 2914       |
|    iterations           | 24         |
|    time_elapsed         | 8          |
|    total_timesteps      | 24576      |
| train/                  |            |
|    approx_kl            | 0.02492904 |
|    clip_fraction        | 0.256      |
|    clip_range           | 0.2        |
|    entropy_loss         | -29.6      |
|    explained_variance   | 0.929      |
|    learning_rate        | 0.0003     |
|    loss                 | 85.2       |
|    n_updates            | 115        |
|    policy_gradient_loss | -0.0523    |
|    std                  | 0.991      |
|    value_loss           | 232        |
----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 168         |
|    ep_rew_mean          | 262         |
| time/                   |             |
|    fps                  | 2918        |
|    iterations           | 25          |
|    time_elapsed         | 8           |
|    total_timesteps      | 25600       |
| train/                  |             |
|    approx_kl            | 0.034209576 |
|    clip_fraction        | 0.277       |
|    clip_range           | 0.2         |
|    entropy_loss         | -29.6       |
|    explained_variance   | 0.94        |
|    learning_rate        | 0.0003      |
|    loss                 | 44.2        |
|    n_updates            | 120         |
|    policy_gradient_loss | -0.0578     |
|    std                  | 0.99        |
|    value_loss           | 168         |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 170         |
|    ep_rew_mean          | 271         |
| time/                   |             |
|    fps                  | 2919        |
|    iterations           | 26          |
|    time_elapsed         | 9           |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.030314274 |
|    clip_fraction        | 0.238       |
|    clip_range           | 0.2         |
|    entropy_loss         | -29.6       |
|    explained_variance   | 0.925       |
|    learning_rate        | 0.0003      |
|    loss                 | 109         |
|    n_updates            | 125         |
|    policy_gradient_loss | -0.0514     |
|    std                  | 0.989       |
|    value_loss           | 216         |
-----------------------------------------


----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 173        |
|    ep_rew_mean          | 277        |
| time/                   |            |
|    fps                  | 2920       |
|    iterations           | 27         |
|    time_elapsed         | 9          |
|    total_timesteps      | 27648      |
| train/                  |            |
|    approx_kl            | 0.02421929 |
|    clip_fraction        | 0.232      |
|    clip_range           | 0.2        |
|    entropy_loss         | -29.6      |
|    explained_variance   | 0.925      |
|    learning_rate        | 0.0003     |
|    loss                 | 65.4       |
|    n_updates            | 130        |
|    policy_gradient_loss | -0.049     |
|    std                  | 0.99       |
|    value_loss           | 247        |
----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 177         |
|    ep_rew_mean          | 288         |
| time/                   |             |
|    fps                  | 2927        |
|    iterations           | 28          |
|    time_elapsed         | 9           |
|    total_timesteps      | 28672       |
| train/                  |             |
|    approx_kl            | 0.033311572 |
|    clip_fraction        | 0.266       |
|    clip_range           | 0.2         |
|    entropy_loss         | -29.6       |
|    explained_variance   | 0.948       |
|    learning_rate        | 0.0003      |
|    loss                 | 76.7        |
|    n_updates            | 135         |
|    policy_gradient_loss | -0.0556     |
|    std                  | 0.989       |
|    value_loss           | 197         |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 178         |
|    ep_rew_mean          | 296         |
| time/                   |             |
|    fps                  | 2926        |
|    iterations           | 29          |
|    time_elapsed         | 10          |
|    total_timesteps      | 29696       |
| train/                  |             |
|    approx_kl            | 0.037511483 |
|    clip_fraction        | 0.321       |
|    clip_range           | 0.2         |
|    entropy_loss         | -29.6       |
|    explained_variance   | 0.949       |
|    learning_rate        | 0.0003      |
|    loss                 | 73          |
|    n_updates            | 140         |
|    policy_gradient_loss | -0.0622     |
|    std                  | 0.988       |
|    value_loss           | 179         |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 181         |
|    ep_rew_mean          | 306         |
| time/                   |             |
|    fps                  | 2931        |
|    iterations           | 30          |
|    time_elapsed         | 10          |
|    total_timesteps      | 30720       |
| train/                  |             |
|    approx_kl            | 0.038580455 |
|    clip_fraction        | 0.329       |
|    clip_range           | 0.2         |
|    entropy_loss         | -29.6       |
|    explained_variance   | 0.963       |
|    learning_rate        | 0.0003      |
|    loss                 | 48.1        |
|    n_updates            | 145         |
|    policy_gradient_loss | -0.0651     |
|    std                  | 0.989       |
|    value_loss           | 145         |
-----------------------------------------


----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 183        |
|    ep_rew_mean          | 316        |
| time/                   |            |
|    fps                  | 2926       |
|    iterations           | 31         |
|    time_elapsed         | 10         |
|    total_timesteps      | 31744      |
| train/                  |            |
|    approx_kl            | 0.03459051 |
|    clip_fraction        | 0.305      |
|    clip_range           | 0.2        |
|    entropy_loss         | -29.6      |
|    explained_variance   | 0.961      |
|    learning_rate        | 0.0003     |
|    loss                 | 57.9       |
|    n_updates            | 150        |
|    policy_gradient_loss | -0.0588    |
|    std                  | 0.988      |
|    value_loss           | 161        |
----------------------------------------


----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 185        |
|    ep_rew_mean          | 325        |
| time/                   |            |
|    fps                  | 2932       |
|    iterations           | 32         |
|    time_elapsed         | 11         |
|    total_timesteps      | 32768      |
| train/                  |            |
|    approx_kl            | 0.02945223 |
|    clip_fraction        | 0.287      |
|    clip_range           | 0.2        |
|    entropy_loss         | -29.5      |
|    explained_variance   | 0.949      |
|    learning_rate        | 0.0003     |
|    loss                 | 50.6       |
|    n_updates            | 155        |
|    policy_gradient_loss | -0.0544    |
|    std                  | 0.989      |
|    value_loss           | 132        |
----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 189         |
|    ep_rew_mean          | 334         |
| time/                   |             |
|    fps                  | 2928        |
|    iterations           | 33          |
|    time_elapsed         | 11          |
|    total_timesteps      | 33792       |
| train/                  |             |
|    approx_kl            | 0.025504328 |
|    clip_fraction        | 0.232       |
|    clip_range           | 0.2         |
|    entropy_loss         | -29.5       |
|    explained_variance   | 0.971       |
|    learning_rate        | 0.0003      |
|    loss                 | 61.1        |
|    n_updates            | 160         |
|    policy_gradient_loss | -0.0495     |
|    std                  | 0.987       |
|    value_loss           | 147         |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 193         |
|    ep_rew_mean          | 342         |
| time/                   |             |
|    fps                  | 2931        |
|    iterations           | 34          |
|    time_elapsed         | 11          |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.035252906 |
|    clip_fraction        | 0.304       |
|    clip_range           | 0.2         |
|    entropy_loss         | -29.5       |
|    explained_variance   | 0.968       |
|    learning_rate        | 0.0003      |
|    loss                 | 57.1        |
|    n_updates            | 165         |
|    policy_gradient_loss | -0.0567     |
|    std                  | 0.987       |
|    value_loss           | 118         |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 196         |
|    ep_rew_mean          | 348         |
| time/                   |             |
|    fps                  | 2929        |
|    iterations           | 35          |
|    time_elapsed         | 12          |
|    total_timesteps      | 35840       |
| train/                  |             |
|    approx_kl            | 0.038788013 |
|    clip_fraction        | 0.318       |
|    clip_range           | 0.2         |
|    entropy_loss         | -29.5       |
|    explained_variance   | 0.961       |
|    learning_rate        | 0.0003      |
|    loss                 | 31          |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0529     |
|    std                  | 0.987       |
|    value_loss           | 116         |
-----------------------------------------


----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 199        |
|    ep_rew_mean          | 357        |
| time/                   |            |
|    fps                  | 2929       |
|    iterations           | 36         |
|    time_elapsed         | 12         |
|    total_timesteps      | 36864      |
| train/                  |            |
|    approx_kl            | 0.02720388 |
|    clip_fraction        | 0.244      |
|    clip_range           | 0.2        |
|    entropy_loss         | -29.5      |
|    explained_variance   | 0.964      |
|    learning_rate        | 0.0003     |
|    loss                 | 52.8       |
|    n_updates            | 175        |
|    policy_gradient_loss | -0.0499    |
|    std                  | 0.986      |
|    value_loss           | 136        |
----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | 361         |
| time/                   |             |
|    fps                  | 2934        |
|    iterations           | 37          |
|    time_elapsed         | 12          |
|    total_timesteps      | 37888       |
| train/                  |             |
|    approx_kl            | 0.032119423 |
|    clip_fraction        | 0.289       |
|    clip_range           | 0.2         |
|    entropy_loss         | -29.5       |
|    explained_variance   | 0.965       |
|    learning_rate        | 0.0003      |
|    loss                 | 51.1        |
|    n_updates            | 180         |
|    policy_gradient_loss | -0.0542     |
|    std                  | 0.986       |
|    value_loss           | 124         |
-----------------------------------------


----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 203        |
|    ep_rew_mean          | 371        |
| time/                   |            |
|    fps                  | 2934       |
|    iterations           | 38         |
|    time_elapsed         | 13         |
|    total_timesteps      | 38912      |
| train/                  |            |
|    approx_kl            | 0.04529118 |
|    clip_fraction        | 0.328      |
|    clip_range           | 0.2        |
|    entropy_loss         | -29.5      |
|    explained_variance   | 0.973      |
|    learning_rate        | 0.0003     |
|    loss                 | 33.5       |
|    n_updates            | 185        |
|    policy_gradient_loss | -0.0539    |
|    std                  | 0.985      |
|    value_loss           | 98.8       |
----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 205         |
|    ep_rew_mean          | 378         |
| time/                   |             |
|    fps                  | 2938        |
|    iterations           | 39          |
|    time_elapsed         | 13          |
|    total_timesteps      | 39936       |
| train/                  |             |
|    approx_kl            | 0.030361533 |
|    clip_fraction        | 0.278       |
|    clip_range           | 0.2         |
|    entropy_loss         | -29.5       |
|    explained_variance   | 0.967       |
|    learning_rate        | 0.0003      |
|    loss                 | 24.6        |
|    n_updates            | 190         |
|    policy_gradient_loss | -0.0494     |
|    std                  | 0.985       |
|    value_loss           | 126         |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 207         |
|    ep_rew_mean          | 385         |
| time/                   |             |
|    fps                  | 2936        |
|    iterations           | 40          |
|    time_elapsed         | 13          |
|    total_timesteps      | 40960       |
| train/                  |             |
|    approx_kl            | 0.039028075 |
|    clip_fraction        | 0.304       |
|    clip_range           | 0.2         |
|    entropy_loss         | -29.5       |
|    explained_variance   | 0.976       |
|    learning_rate        | 0.0003      |
|    loss                 | 25.4        |
|    n_updates            | 195         |
|    policy_gradient_loss | -0.0534     |
|    std                  | 0.986       |
|    value_loss           | 76.7        |
-----------------------------------------


---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 208       |
|    ep_rew_mean          | 392       |
| time/                   |           |
|    fps                  | 2941      |
|    iterations           | 41        |
|    time_elapsed         | 14        |
|    total_timesteps      | 41984     |
| train/                  |           |
|    approx_kl            | 0.0382798 |
|    clip_fraction        | 0.33      |
|    clip_range           | 0.2       |
|    entropy_loss         | -29.5     |
|    explained_variance   | 0.974     |
|    learning_rate        | 0.0003    |
|    loss                 | 24.8      |
|    n_updates            | 200       |
|    policy_gradient_loss | -0.0602   |
|    std                  | 0.984     |
|    value_loss           | 87.3      |
---------------------------------------


----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 211        |
|    ep_rew_mean          | 399        |
| time/                   |            |
|    fps                  | 2937       |
|    iterations           | 42         |
|    time_elapsed         | 14         |
|    total_timesteps      | 43008      |
| train/                  |            |
|    approx_kl            | 0.03792181 |
|    clip_fraction        | 0.335      |
|    clip_range           | 0.2        |
|    entropy_loss         | -29.5      |
|    explained_variance   | 0.985      |
|    learning_rate        | 0.0003     |
|    loss                 | 20.3       |
|    n_updates            | 205        |
|    policy_gradient_loss | -0.0592    |
|    std                  | 0.986      |
|    value_loss           | 60         |
----------------------------------------


----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 214        |
|    ep_rew_mean          | 407        |
| time/                   |            |
|    fps                  | 2941       |
|    iterations           | 43         |
|    time_elapsed         | 14         |
|    total_timesteps      | 44032      |
| train/                  |            |
|    approx_kl            | 0.03547118 |
|    clip_fraction        | 0.304      |
|    clip_range           | 0.2        |
|    entropy_loss         | -29.5      |
|    explained_variance   | 0.966      |
|    learning_rate        | 0.0003     |
|    loss                 | 21.7       |
|    n_updates            | 210        |
|    policy_gradient_loss | -0.054     |
|    std                  | 0.985      |
|    value_loss           | 79.6       |
----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 216         |
|    ep_rew_mean          | 415         |
| time/                   |             |
|    fps                  | 2939        |
|    iterations           | 44          |
|    time_elapsed         | 15          |
|    total_timesteps      | 45056       |
| train/                  |             |
|    approx_kl            | 0.042989664 |
|    clip_fraction        | 0.363       |
|    clip_range           | 0.2         |
|    entropy_loss         | -29.5       |
|    explained_variance   | 0.984       |
|    learning_rate        | 0.0003      |
|    loss                 | 19.8        |
|    n_updates            | 215         |
|    policy_gradient_loss | -0.0586     |
|    std                  | 0.985       |
|    value_loss           | 61.9        |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 216         |
|    ep_rew_mean          | 419         |
| time/                   |             |
|    fps                  | 2940        |
|    iterations           | 45          |
|    time_elapsed         | 15          |
|    total_timesteps      | 46080       |
| train/                  |             |
|    approx_kl            | 0.050034672 |
|    clip_fraction        | 0.366       |
|    clip_range           | 0.2         |
|    entropy_loss         | -29.4       |
|    explained_variance   | 0.978       |
|    learning_rate        | 0.0003      |
|    loss                 | 13.3        |
|    n_updates            | 220         |
|    policy_gradient_loss | -0.0571     |
|    std                  | 0.983       |
|    value_loss           | 58.4        |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 220         |
|    ep_rew_mean          | 428         |
| time/                   |             |
|    fps                  | 2939        |
|    iterations           | 46          |
|    time_elapsed         | 16          |
|    total_timesteps      | 47104       |
| train/                  |             |
|    approx_kl            | 0.048108853 |
|    clip_fraction        | 0.36        |
|    clip_range           | 0.2         |
|    entropy_loss         | -29.4       |
|    explained_variance   | 0.981       |
|    learning_rate        | 0.0003      |
|    loss                 | 16.1        |
|    n_updates            | 225         |
|    policy_gradient_loss | -0.0577     |
|    std                  | 0.983       |
|    value_loss           | 62.1        |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 221         |
|    ep_rew_mean          | 431         |
| time/                   |             |
|    fps                  | 2939        |
|    iterations           | 47          |
|    time_elapsed         | 16          |
|    total_timesteps      | 48128       |
| train/                  |             |
|    approx_kl            | 0.041228235 |
|    clip_fraction        | 0.329       |
|    clip_range           | 0.2         |
|    entropy_loss         | -29.4       |
|    explained_variance   | 0.982       |
|    learning_rate        | 0.0003      |
|    loss                 | 12.9        |
|    n_updates            | 230         |
|    policy_gradient_loss | -0.0581     |
|    std                  | 0.983       |
|    value_loss           | 62.2        |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 222         |
|    ep_rew_mean          | 434         |
| time/                   |             |
|    fps                  | 2943        |
|    iterations           | 48          |
|    time_elapsed         | 16          |
|    total_timesteps      | 49152       |
| train/                  |             |
|    approx_kl            | 0.030495314 |
|    clip_fraction        | 0.276       |
|    clip_range           | 0.2         |
|    entropy_loss         | -29.4       |
|    explained_variance   | 0.979       |
|    learning_rate        | 0.0003      |
|    loss                 | 21.9        |
|    n_updates            | 235         |
|    policy_gradient_loss | -0.0537     |
|    std                  | 0.982       |
|    value_loss           | 72.4        |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 222         |
|    ep_rew_mean          | 438         |
| time/                   |             |
|    fps                  | 2942        |
|    iterations           | 49          |
|    time_elapsed         | 17          |
|    total_timesteps      | 50176       |
| train/                  |             |
|    approx_kl            | 0.032849446 |
|    clip_fraction        | 0.277       |
|    clip_range           | 0.2         |
|    entropy_loss         | -29.4       |
|    explained_variance   | 0.965       |
|    learning_rate        | 0.0003      |
|    loss                 | 19.6        |
|    n_updates            | 240         |
|    policy_gradient_loss | -0.0547     |
|    std                  | 0.981       |
|    value_loss           | 82          |
-----------------------------------------


✅ Phase 1 completed - Standing model saved!

🧪 Testing standing model...
Step   0: Height=1.30, Reward=3.10
Step 100: Height=1.19, Reward=2.10
Step 200: Height=1.09, Reward=2.10
Standing test ended at step 278
📊 Standing Test Results:
  Average reward: 1.98
  Final height: 0.48
  Success: ✅ YES


In [None]:
# ACTUALLY RUN TRAINING (Choose ONE of these)
# Uncomment the training method you want to use
#
# This cell binds `trained_model` to the result of the selected training
# routine and prints usage instructions.  With every option left commented
# out, `trained_model` stays None and a reminder is printed at the end.

# Default placeholder.  It is assigned BEFORE the options below on purpose:
# assigning it after them would overwrite (and silently discard) the model
# returned by whichever training call was uncommented.
trained_model = None

# Option 1: Basic Training (recommended for first time)
# trained_model = basic_training()

# Option 2: Quick Improvement (if you have existing model)
# trained_model = quick_improvement()

# Option 3: Curriculum Learning (best results but takes longer)
# trained_model = curriculum_training()

print("🎯 TRAINING INSTRUCTIONS:")
print("1. Uncomment ONE of the training lines above")
print("2. Run this cell to start training")
print("3. Wait for training to complete")
print("4. Continue to next cell for testing")

print("\n💡 RECOMMENDATIONS:")
print("- First time: Use basic_training()")
print("- Have existing model: Use quick_improvement()")
print("- Want best results: Use curriculum_training()")

print("\n📊 MONITOR TRAINING:")
print("Run in terminal: tensorboard --logdir=./logs/")
print("Then open: http://localhost:6006")

# Warn only when no model was produced, i.e. no option was uncommented
# (or the selected routine itself returned None).
if trained_model is None:
    print("\n⚠️  No training method selected yet!")
    print("Uncomment one of the training lines above to start.")


Output()


🚶 PHASE 2: Training Walking Model
⚠️  Creating new walking model
Using cpu device
🎯 Training walking...
📊 Larger actions (-0.5 to 0.5) for walking movement


---------------------------------
| rollout/           |          |
|    ep_len_mean     | 170      |
|    ep_rew_mean     | 258      |
| time/              |          |
|    fps             | 6989     |
|    iterations      | 1        |
|    time_elapsed    | 0        |
|    total_timesteps | 2048     |
---------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 176         |
|    ep_rew_mean          | 271         |
| time/                   |             |
|    fps                  | 3811        |
|    iterations           | 2           |
|    time_elapsed         | 1           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.027547084 |
|    clip_fraction        | 0.345       |
|    clip_range           | 0.2         |
|    entropy_loss         | -29.8       |
|    explained_variance   | -0.00697    |
|    learning_rate        | 0.0002      |
|    loss                 | 180         |
|    n_updates            | 8           |
|    policy_gradient_loss | -0.0676     |
|    std                  | 0.999       |
|    value_loss           | 722         |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 175         |
|    ep_rew_mean          | 282         |
| time/                   |             |
|    fps                  | 3164        |
|    iterations           | 3           |
|    time_elapsed         | 1           |
|    total_timesteps      | 6144        |
| train/                  |             |
|    approx_kl            | 0.023137331 |
|    clip_fraction        | 0.268       |
|    clip_range           | 0.2         |
|    entropy_loss         | -29.8       |
|    explained_variance   | 0.457       |
|    learning_rate        | 0.0002      |
|    loss                 | 297         |
|    n_updates            | 16          |
|    policy_gradient_loss | -0.0572     |
|    std                  | 0.998       |
|    value_loss           | 785         |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 169         |
|    ep_rew_mean          | 274         |
| time/                   |             |
|    fps                  | 2970        |
|    iterations           | 4           |
|    time_elapsed         | 2           |
|    total_timesteps      | 8192        |
| train/                  |             |
|    approx_kl            | 0.025486428 |
|    clip_fraction        | 0.304       |
|    clip_range           | 0.2         |
|    entropy_loss         | -29.8       |
|    explained_variance   | 0.629       |
|    learning_rate        | 0.0002      |
|    loss                 | 252         |
|    n_updates            | 24          |
|    policy_gradient_loss | -0.0654     |
|    std                  | 0.998       |
|    value_loss           | 748         |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 172         |
|    ep_rew_mean          | 275         |
| time/                   |             |
|    fps                  | 2894        |
|    iterations           | 5           |
|    time_elapsed         | 3           |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.028331209 |
|    clip_fraction        | 0.293       |
|    clip_range           | 0.2         |
|    entropy_loss         | -29.8       |
|    explained_variance   | 0.741       |
|    learning_rate        | 0.0002      |
|    loss                 | 172         |
|    n_updates            | 32          |
|    policy_gradient_loss | -0.0651     |
|    std                  | 0.999       |
|    value_loss           | 551         |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 172         |
|    ep_rew_mean          | 277         |
| time/                   |             |
|    fps                  | 2835        |
|    iterations           | 6           |
|    time_elapsed         | 4           |
|    total_timesteps      | 12288       |
| train/                  |             |
|    approx_kl            | 0.025997855 |
|    clip_fraction        | 0.303       |
|    clip_range           | 0.2         |
|    entropy_loss         | -29.8       |
|    explained_variance   | 0.821       |
|    learning_rate        | 0.0002      |
|    loss                 | 159         |
|    n_updates            | 40          |
|    policy_gradient_loss | -0.0649     |
|    std                  | 0.998       |
|    value_loss           | 464         |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 174         |
|    ep_rew_mean          | 279         |
| time/                   |             |
|    fps                  | 2787        |
|    iterations           | 7           |
|    time_elapsed         | 5           |
|    total_timesteps      | 14336       |
| train/                  |             |
|    approx_kl            | 0.028996604 |
|    clip_fraction        | 0.315       |
|    clip_range           | 0.2         |
|    entropy_loss         | -29.7       |
|    explained_variance   | 0.867       |
|    learning_rate        | 0.0002      |
|    loss                 | 187         |
|    n_updates            | 48          |
|    policy_gradient_loss | -0.0675     |
|    std                  | 0.997       |
|    value_loss           | 497         |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 174         |
|    ep_rew_mean          | 280         |
| time/                   |             |
|    fps                  | 2755        |
|    iterations           | 8           |
|    time_elapsed         | 5           |
|    total_timesteps      | 16384       |
| train/                  |             |
|    approx_kl            | 0.028529102 |
|    clip_fraction        | 0.307       |
|    clip_range           | 0.2         |
|    entropy_loss         | -29.7       |
|    explained_variance   | 0.887       |
|    learning_rate        | 0.0002      |
|    loss                 | 113         |
|    n_updates            | 56          |
|    policy_gradient_loss | -0.0656     |
|    std                  | 0.996       |
|    value_loss           | 372         |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 172         |
|    ep_rew_mean          | 281         |
| time/                   |             |
|    fps                  | 2696        |
|    iterations           | 9           |
|    time_elapsed         | 6           |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.031234115 |
|    clip_fraction        | 0.316       |
|    clip_range           | 0.2         |
|    entropy_loss         | -29.7       |
|    explained_variance   | 0.898       |
|    learning_rate        | 0.0002      |
|    loss                 | 50.1        |
|    n_updates            | 64          |
|    policy_gradient_loss | -0.0653     |
|    std                  | 0.996       |
|    value_loss           | 311         |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 170         |
|    ep_rew_mean          | 282         |
| time/                   |             |
|    fps                  | 2664        |
|    iterations           | 10          |
|    time_elapsed         | 7           |
|    total_timesteps      | 20480       |
| train/                  |             |
|    approx_kl            | 0.036890697 |
|    clip_fraction        | 0.371       |
|    clip_range           | 0.2         |
|    entropy_loss         | -29.7       |
|    explained_variance   | 0.935       |
|    learning_rate        | 0.0002      |
|    loss                 | 79.1        |
|    n_updates            | 72          |
|    policy_gradient_loss | -0.0718     |
|    std                  | 0.994       |
|    value_loss           | 244         |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 175         |
|    ep_rew_mean          | 291         |
| time/                   |             |
|    fps                  | 2629        |
|    iterations           | 11          |
|    time_elapsed         | 8           |
|    total_timesteps      | 22528       |
| train/                  |             |
|    approx_kl            | 0.038945973 |
|    clip_fraction        | 0.367       |
|    clip_range           | 0.2         |
|    entropy_loss         | -29.6       |
|    explained_variance   | 0.934       |
|    learning_rate        | 0.0002      |
|    loss                 | 82.6        |
|    n_updates            | 80          |
|    policy_gradient_loss | -0.0705     |
|    std                  | 0.992       |
|    value_loss           | 229         |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 176         |
|    ep_rew_mean          | 303         |
| time/                   |             |
|    fps                  | 2603        |
|    iterations           | 12          |
|    time_elapsed         | 9           |
|    total_timesteps      | 24576       |
| train/                  |             |
|    approx_kl            | 0.030798187 |
|    clip_fraction        | 0.331       |
|    clip_range           | 0.2         |
|    entropy_loss         | -29.6       |
|    explained_variance   | 0.945       |
|    learning_rate        | 0.0002      |
|    loss                 | 73.4        |
|    n_updates            | 88          |
|    policy_gradient_loss | -0.0665     |
|    std                  | 0.992       |
|    value_loss           | 230         |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 182         |
|    ep_rew_mean          | 315         |
| time/                   |             |
|    fps                  | 2584        |
|    iterations           | 13          |
|    time_elapsed         | 10          |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.033323467 |
|    clip_fraction        | 0.33        |
|    clip_range           | 0.2         |
|    entropy_loss         | -29.6       |
|    explained_variance   | 0.948       |
|    learning_rate        | 0.0002      |
|    loss                 | 63.4        |
|    n_updates            | 96          |
|    policy_gradient_loss | -0.0673     |
|    std                  | 0.991       |
|    value_loss           | 243         |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 183         |
|    ep_rew_mean          | 319         |
| time/                   |             |
|    fps                  | 2560        |
|    iterations           | 14          |
|    time_elapsed         | 11          |
|    total_timesteps      | 28672       |
| train/                  |             |
|    approx_kl            | 0.029502798 |
|    clip_fraction        | 0.299       |
|    clip_range           | 0.2         |
|    entropy_loss         | -29.6       |
|    explained_variance   | 0.937       |
|    learning_rate        | 0.0002      |
|    loss                 | 69.5        |
|    n_updates            | 104         |
|    policy_gradient_loss | -0.0634     |
|    std                  | 0.99        |
|    value_loss           | 238         |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 185         |
|    ep_rew_mean          | 330         |
| time/                   |             |
|    fps                  | 2543        |
|    iterations           | 15          |
|    time_elapsed         | 12          |
|    total_timesteps      | 30720       |
| train/                  |             |
|    approx_kl            | 0.039915103 |
|    clip_fraction        | 0.377       |
|    clip_range           | 0.2         |
|    entropy_loss         | -29.5       |
|    explained_variance   | 0.958       |
|    learning_rate        | 0.0002      |
|    loss                 | 36.7        |
|    n_updates            | 112         |
|    policy_gradient_loss | -0.0703     |
|    std                  | 0.987       |
|    value_loss           | 143         |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 187         |
|    ep_rew_mean          | 343         |
| time/                   |             |
|    fps                  | 2533        |
|    iterations           | 16          |
|    time_elapsed         | 12          |
|    total_timesteps      | 32768       |
| train/                  |             |
|    approx_kl            | 0.029423345 |
|    clip_fraction        | 0.309       |
|    clip_range           | 0.2         |
|    entropy_loss         | -29.5       |
|    explained_variance   | 0.939       |
|    learning_rate        | 0.0002      |
|    loss                 | 106         |
|    n_updates            | 120         |
|    policy_gradient_loss | -0.0598     |
|    std                  | 0.987       |
|    value_loss           | 268         |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 189         |
|    ep_rew_mean          | 353         |
| time/                   |             |
|    fps                  | 2521        |
|    iterations           | 17          |
|    time_elapsed         | 13          |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.028678412 |
|    clip_fraction        | 0.307       |
|    clip_range           | 0.2         |
|    entropy_loss         | -29.5       |
|    explained_variance   | 0.957       |
|    learning_rate        | 0.0002      |
|    loss                 | 59.4        |
|    n_updates            | 128         |
|    policy_gradient_loss | -0.0618     |
|    std                  | 0.984       |
|    value_loss           | 231         |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 194         |
|    ep_rew_mean          | 369         |
| time/                   |             |
|    fps                  | 2511        |
|    iterations           | 18          |
|    time_elapsed         | 14          |
|    total_timesteps      | 36864       |
| train/                  |             |
|    approx_kl            | 0.035873666 |
|    clip_fraction        | 0.354       |
|    clip_range           | 0.2         |
|    entropy_loss         | -29.4       |
|    explained_variance   | 0.967       |
|    learning_rate        | 0.0002      |
|    loss                 | 42.8        |
|    n_updates            | 136         |
|    policy_gradient_loss | -0.0695     |
|    std                  | 0.982       |
|    value_loss           | 122         |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 198         |
|    ep_rew_mean          | 387         |
| time/                   |             |
|    fps                  | 2505        |
|    iterations           | 19          |
|    time_elapsed         | 15          |
|    total_timesteps      | 38912       |
| train/                  |             |
|    approx_kl            | 0.031822324 |
|    clip_fraction        | 0.309       |
|    clip_range           | 0.2         |
|    entropy_loss         | -29.4       |
|    explained_variance   | 0.946       |
|    learning_rate        | 0.0002      |
|    loss                 | 52.5        |
|    n_updates            | 144         |
|    policy_gradient_loss | -0.0641     |
|    std                  | 0.981       |
|    value_loss           | 192         |
-----------------------------------------


----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | 394        |
| time/                   |            |
|    fps                  | 2506       |
|    iterations           | 20         |
|    time_elapsed         | 16         |
|    total_timesteps      | 40960      |
| train/                  |            |
|    approx_kl            | 0.03052684 |
|    clip_fraction        | 0.296      |
|    clip_range           | 0.2        |
|    entropy_loss         | -29.4      |
|    explained_variance   | 0.958      |
|    learning_rate        | 0.0002     |
|    loss                 | 45.7       |
|    n_updates            | 152        |
|    policy_gradient_loss | -0.062     |
|    std                  | 0.98       |
|    value_loss           | 219        |
----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 203         |
|    ep_rew_mean          | 415         |
| time/                   |             |
|    fps                  | 2499        |
|    iterations           | 21          |
|    time_elapsed         | 17          |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.035193313 |
|    clip_fraction        | 0.334       |
|    clip_range           | 0.2         |
|    entropy_loss         | -29.4       |
|    explained_variance   | 0.973       |
|    learning_rate        | 0.0002      |
|    loss                 | 33.5        |
|    n_updates            | 160         |
|    policy_gradient_loss | -0.0641     |
|    std                  | 0.979       |
|    value_loss           | 127         |
-----------------------------------------


---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 206       |
|    ep_rew_mean          | 429       |
| time/                   |           |
|    fps                  | 2485      |
|    iterations           | 22        |
|    time_elapsed         | 18        |
|    total_timesteps      | 45056     |
| train/                  |           |
|    approx_kl            | 0.0326303 |
|    clip_fraction        | 0.327     |
|    clip_range           | 0.2       |
|    entropy_loss         | -29.3     |
|    explained_variance   | 0.966     |
|    learning_rate        | 0.0002    |
|    loss                 | 43.9      |
|    n_updates            | 168       |
|    policy_gradient_loss | -0.066    |
|    std                  | 0.977     |
|    value_loss           | 155       |
---------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 205         |
|    ep_rew_mean          | 444         |
| time/                   |             |
|    fps                  | 2487        |
|    iterations           | 23          |
|    time_elapsed         | 18          |
|    total_timesteps      | 47104       |
| train/                  |             |
|    approx_kl            | 0.029678859 |
|    clip_fraction        | 0.302       |
|    clip_range           | 0.2         |
|    entropy_loss         | -29.3       |
|    explained_variance   | 0.959       |
|    learning_rate        | 0.0002      |
|    loss                 | 33.3        |
|    n_updates            | 176         |
|    policy_gradient_loss | -0.0619     |
|    std                  | 0.976       |
|    value_loss           | 166         |
-----------------------------------------


----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 208        |
|    ep_rew_mean          | 472        |
| time/                   |            |
|    fps                  | 2490       |
|    iterations           | 24         |
|    time_elapsed         | 19         |
|    total_timesteps      | 49152      |
| train/                  |            |
|    approx_kl            | 0.02625882 |
|    clip_fraction        | 0.278      |
|    clip_range           | 0.2        |
|    entropy_loss         | -29.3      |
|    explained_variance   | 0.949      |
|    learning_rate        | 0.0002     |
|    loss                 | 29.7       |
|    n_updates            | 184        |
|    policy_gradient_loss | -0.0593    |
|    std                  | 0.976      |
|    value_loss           | 167        |
----------------------------------------


----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 210        |
|    ep_rew_mean          | 482        |
| time/                   |            |
|    fps                  | 2487       |
|    iterations           | 25         |
|    time_elapsed         | 20         |
|    total_timesteps      | 51200      |
| train/                  |            |
|    approx_kl            | 0.02582422 |
|    clip_fraction        | 0.273      |
|    clip_range           | 0.2        |
|    entropy_loss         | -29.3      |
|    explained_variance   | 0.967      |
|    learning_rate        | 0.0002     |
|    loss                 | 57.9       |
|    n_updates            | 192        |
|    policy_gradient_loss | -0.0603    |
|    std                  | 0.976      |
|    value_loss           | 208        |
----------------------------------------


----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 214        |
|    ep_rew_mean          | 498        |
| time/                   |            |
|    fps                  | 2482       |
|    iterations           | 26         |
|    time_elapsed         | 21         |
|    total_timesteps      | 53248      |
| train/                  |            |
|    approx_kl            | 0.03584201 |
|    clip_fraction        | 0.334      |
|    clip_range           | 0.2        |
|    entropy_loss         | -29.3      |
|    explained_variance   | 0.977      |
|    learning_rate        | 0.0002     |
|    loss                 | 25.1       |
|    n_updates            | 200        |
|    policy_gradient_loss | -0.0669    |
|    std                  | 0.977      |
|    value_loss           | 98.6       |
----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 215         |
|    ep_rew_mean          | 520         |
| time/                   |             |
|    fps                  | 2478        |
|    iterations           | 27          |
|    time_elapsed         | 22          |
|    total_timesteps      | 55296       |
| train/                  |             |
|    approx_kl            | 0.036381774 |
|    clip_fraction        | 0.332       |
|    clip_range           | 0.2         |
|    entropy_loss         | -29.3       |
|    explained_variance   | 0.974       |
|    learning_rate        | 0.0002      |
|    loss                 | 30.1        |
|    n_updates            | 208         |
|    policy_gradient_loss | -0.0591     |
|    std                  | 0.977       |
|    value_loss           | 122         |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 218         |
|    ep_rew_mean          | 543         |
| time/                   |             |
|    fps                  | 2479        |
|    iterations           | 28          |
|    time_elapsed         | 23          |
|    total_timesteps      | 57344       |
| train/                  |             |
|    approx_kl            | 0.028801767 |
|    clip_fraction        | 0.301       |
|    clip_range           | 0.2         |
|    entropy_loss         | -29.3       |
|    explained_variance   | 0.971       |
|    learning_rate        | 0.0002      |
|    loss                 | 21.1        |
|    n_updates            | 216         |
|    policy_gradient_loss | -0.0586     |
|    std                  | 0.977       |
|    value_loss           | 124         |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 220         |
|    ep_rew_mean          | 562         |
| time/                   |             |
|    fps                  | 2478        |
|    iterations           | 29          |
|    time_elapsed         | 23          |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.028168906 |
|    clip_fraction        | 0.286       |
|    clip_range           | 0.2         |
|    entropy_loss         | -29.3       |
|    explained_variance   | 0.957       |
|    learning_rate        | 0.0002      |
|    loss                 | 34.8        |
|    n_updates            | 224         |
|    policy_gradient_loss | -0.0588     |
|    std                  | 0.976       |
|    value_loss           | 205         |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 222         |
|    ep_rew_mean          | 569         |
| time/                   |             |
|    fps                  | 2478        |
|    iterations           | 30          |
|    time_elapsed         | 24          |
|    total_timesteps      | 61440       |
| train/                  |             |
|    approx_kl            | 0.045596074 |
|    clip_fraction        | 0.376       |
|    clip_range           | 0.2         |
|    entropy_loss         | -29.3       |
|    explained_variance   | 0.98        |
|    learning_rate        | 0.0002      |
|    loss                 | 19.1        |
|    n_updates            | 232         |
|    policy_gradient_loss | -0.0684     |
|    std                  | 0.974       |
|    value_loss           | 103         |
-----------------------------------------


----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 224        |
|    ep_rew_mean          | 593        |
| time/                   |            |
|    fps                  | 2483       |
|    iterations           | 31         |
|    time_elapsed         | 25         |
|    total_timesteps      | 63488      |
| train/                  |            |
|    approx_kl            | 0.03458765 |
|    clip_fraction        | 0.318      |
|    clip_range           | 0.2        |
|    entropy_loss         | -29.2      |
|    explained_variance   | 0.978      |
|    learning_rate        | 0.0002     |
|    loss                 | 27.3       |
|    n_updates            | 240        |
|    policy_gradient_loss | -0.0622    |
|    std                  | 0.973      |
|    value_loss           | 113        |
----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 226         |
|    ep_rew_mean          | 609         |
| time/                   |             |
|    fps                  | 2482        |
|    iterations           | 32          |
|    time_elapsed         | 26          |
|    total_timesteps      | 65536       |
| train/                  |             |
|    approx_kl            | 0.029036082 |
|    clip_fraction        | 0.295       |
|    clip_range           | 0.2         |
|    entropy_loss         | -29.2       |
|    explained_variance   | 0.965       |
|    learning_rate        | 0.0002      |
|    loss                 | 41.3        |
|    n_updates            | 248         |
|    policy_gradient_loss | -0.0571     |
|    std                  | 0.973       |
|    value_loss           | 169         |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 227         |
|    ep_rew_mean          | 620         |
| time/                   |             |
|    fps                  | 2486        |
|    iterations           | 33          |
|    time_elapsed         | 27          |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.027387535 |
|    clip_fraction        | 0.262       |
|    clip_range           | 0.2         |
|    entropy_loss         | -29.2       |
|    explained_variance   | 0.948       |
|    learning_rate        | 0.0002      |
|    loss                 | 30          |
|    n_updates            | 256         |
|    policy_gradient_loss | -0.05       |
|    std                  | 0.971       |
|    value_loss           | 195         |
-----------------------------------------


----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 230        |
|    ep_rew_mean          | 623        |
| time/                   |            |
|    fps                  | 2486       |
|    iterations           | 34         |
|    time_elapsed         | 28         |
|    total_timesteps      | 69632      |
| train/                  |            |
|    approx_kl            | 0.03860855 |
|    clip_fraction        | 0.355      |
|    clip_range           | 0.2        |
|    entropy_loss         | -29.2      |
|    explained_variance   | 0.973      |
|    learning_rate        | 0.0002     |
|    loss                 | 30.4       |
|    n_updates            | 264        |
|    policy_gradient_loss | -0.0603    |
|    std                  | 0.97       |
|    value_loss           | 111        |
----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 233         |
|    ep_rew_mean          | 634         |
| time/                   |             |
|    fps                  | 2489        |
|    iterations           | 35          |
|    time_elapsed         | 28          |
|    total_timesteps      | 71680       |
| train/                  |             |
|    approx_kl            | 0.030219227 |
|    clip_fraction        | 0.299       |
|    clip_range           | 0.2         |
|    entropy_loss         | -29.1       |
|    explained_variance   | 0.972       |
|    learning_rate        | 0.0002      |
|    loss                 | 17.2        |
|    n_updates            | 272         |
|    policy_gradient_loss | -0.056      |
|    std                  | 0.969       |
|    value_loss           | 115         |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 236         |
|    ep_rew_mean          | 666         |
| time/                   |             |
|    fps                  | 2492        |
|    iterations           | 36          |
|    time_elapsed         | 29          |
|    total_timesteps      | 73728       |
| train/                  |             |
|    approx_kl            | 0.034041576 |
|    clip_fraction        | 0.313       |
|    clip_range           | 0.2         |
|    entropy_loss         | -29.1       |
|    explained_variance   | 0.961       |
|    learning_rate        | 0.0002      |
|    loss                 | 30.8        |
|    n_updates            | 280         |
|    policy_gradient_loss | -0.0581     |
|    std                  | 0.969       |
|    value_loss           | 115         |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 236         |
|    ep_rew_mean          | 678         |
| time/                   |             |
|    fps                  | 2492        |
|    iterations           | 37          |
|    time_elapsed         | 30          |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.037779804 |
|    clip_fraction        | 0.333       |
|    clip_range           | 0.2         |
|    entropy_loss         | -29.1       |
|    explained_variance   | 0.964       |
|    learning_rate        | 0.0002      |
|    loss                 | 27.3        |
|    n_updates            | 288         |
|    policy_gradient_loss | -0.0623     |
|    std                  | 0.967       |
|    value_loss           | 112         |
-----------------------------------------


----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 238        |
|    ep_rew_mean          | 681        |
| time/                   |            |
|    fps                  | 2496       |
|    iterations           | 38         |
|    time_elapsed         | 31         |
|    total_timesteps      | 77824      |
| train/                  |            |
|    approx_kl            | 0.03744704 |
|    clip_fraction        | 0.327      |
|    clip_range           | 0.2        |
|    entropy_loss         | -29.1      |
|    explained_variance   | 0.966      |
|    learning_rate        | 0.0002     |
|    loss                 | 23.9       |
|    n_updates            | 296        |
|    policy_gradient_loss | -0.0569    |
|    std                  | 0.968      |
|    value_loss           | 130        |
----------------------------------------


----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 240        |
|    ep_rew_mean          | 681        |
| time/                   |            |
|    fps                  | 2496       |
|    iterations           | 39         |
|    time_elapsed         | 31         |
|    total_timesteps      | 79872      |
| train/                  |            |
|    approx_kl            | 0.04267271 |
|    clip_fraction        | 0.378      |
|    clip_range           | 0.2        |
|    entropy_loss         | -29.1      |
|    explained_variance   | 0.977      |
|    learning_rate        | 0.0002     |
|    loss                 | 26.2       |
|    n_updates            | 304        |
|    policy_gradient_loss | -0.0648    |
|    std                  | 0.968      |
|    value_loss           | 102        |
----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 239         |
|    ep_rew_mean          | 680         |
| time/                   |             |
|    fps                  | 2499        |
|    iterations           | 40          |
|    time_elapsed         | 32          |
|    total_timesteps      | 81920       |
| train/                  |             |
|    approx_kl            | 0.030973183 |
|    clip_fraction        | 0.299       |
|    clip_range           | 0.2         |
|    entropy_loss         | -29.1       |
|    explained_variance   | 0.958       |
|    learning_rate        | 0.0002      |
|    loss                 | 28.8        |
|    n_updates            | 312         |
|    policy_gradient_loss | -0.0552     |
|    std                  | 0.968       |
|    value_loss           | 178         |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 241         |
|    ep_rew_mean          | 701         |
| time/                   |             |
|    fps                  | 2498        |
|    iterations           | 41          |
|    time_elapsed         | 33          |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.040569533 |
|    clip_fraction        | 0.393       |
|    clip_range           | 0.2         |
|    entropy_loss         | -29.1       |
|    explained_variance   | 0.977       |
|    learning_rate        | 0.0002      |
|    loss                 | 20.4        |
|    n_updates            | 320         |
|    policy_gradient_loss | -0.0709     |
|    std                  | 0.967       |
|    value_loss           | 89.6        |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 243         |
|    ep_rew_mean          | 717         |
| time/                   |             |
|    fps                  | 2501        |
|    iterations           | 42          |
|    time_elapsed         | 34          |
|    total_timesteps      | 86016       |
| train/                  |             |
|    approx_kl            | 0.031794887 |
|    clip_fraction        | 0.357       |
|    clip_range           | 0.2         |
|    entropy_loss         | -29.1       |
|    explained_variance   | 0.977       |
|    learning_rate        | 0.0002      |
|    loss                 | 34.8        |
|    n_updates            | 328         |
|    policy_gradient_loss | -0.0667     |
|    std                  | 0.965       |
|    value_loss           | 112         |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 244         |
|    ep_rew_mean          | 731         |
| time/                   |             |
|    fps                  | 2498        |
|    iterations           | 43          |
|    time_elapsed         | 35          |
|    total_timesteps      | 88064       |
| train/                  |             |
|    approx_kl            | 0.037908185 |
|    clip_fraction        | 0.354       |
|    clip_range           | 0.2         |
|    entropy_loss         | -29         |
|    explained_variance   | 0.975       |
|    learning_rate        | 0.0002      |
|    loss                 | 22          |
|    n_updates            | 336         |
|    policy_gradient_loss | -0.0613     |
|    std                  | 0.964       |
|    value_loss           | 92.5        |
-----------------------------------------


---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 247       |
|    ep_rew_mean          | 743       |
| time/                   |           |
|    fps                  | 2501      |
|    iterations           | 44        |
|    time_elapsed         | 36        |
|    total_timesteps      | 90112     |
| train/                  |           |
|    approx_kl            | 0.0355871 |
|    clip_fraction        | 0.314     |
|    clip_range           | 0.2       |
|    entropy_loss         | -29       |
|    explained_variance   | 0.965     |
|    learning_rate        | 0.0002    |
|    loss                 | 24.1      |
|    n_updates            | 344       |
|    policy_gradient_loss | -0.0552   |
|    std                  | 0.964     |
|    value_loss           | 109       |
---------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 248         |
|    ep_rew_mean          | 752         |
| time/                   |             |
|    fps                  | 2502        |
|    iterations           | 45          |
|    time_elapsed         | 36          |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.045857806 |
|    clip_fraction        | 0.409       |
|    clip_range           | 0.2         |
|    entropy_loss         | -29         |
|    explained_variance   | 0.984       |
|    learning_rate        | 0.0002      |
|    loss                 | 17.7        |
|    n_updates            | 352         |
|    policy_gradient_loss | -0.0695     |
|    std                  | 0.964       |
|    value_loss           | 67.5        |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 248         |
|    ep_rew_mean          | 757         |
| time/                   |             |
|    fps                  | 2505        |
|    iterations           | 46          |
|    time_elapsed         | 37          |
|    total_timesteps      | 94208       |
| train/                  |             |
|    approx_kl            | 0.034643047 |
|    clip_fraction        | 0.365       |
|    clip_range           | 0.2         |
|    entropy_loss         | -29         |
|    explained_variance   | 0.981       |
|    learning_rate        | 0.0002      |
|    loss                 | 17          |
|    n_updates            | 360         |
|    policy_gradient_loss | -0.0606     |
|    std                  | 0.963       |
|    value_loss           | 91.9        |
-----------------------------------------


----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 250        |
|    ep_rew_mean          | 760        |
| time/                   |            |
|    fps                  | 2507       |
|    iterations           | 47         |
|    time_elapsed         | 38         |
|    total_timesteps      | 96256      |
| train/                  |            |
|    approx_kl            | 0.03497222 |
|    clip_fraction        | 0.34       |
|    clip_range           | 0.2        |
|    entropy_loss         | -29        |
|    explained_variance   | 0.98       |
|    learning_rate        | 0.0002     |
|    loss                 | 29.2       |
|    n_updates            | 368        |
|    policy_gradient_loss | -0.0611    |
|    std                  | 0.963      |
|    value_loss           | 117        |
----------------------------------------


----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 248        |
|    ep_rew_mean          | 759        |
| time/                   |            |
|    fps                  | 2507       |
|    iterations           | 48         |
|    time_elapsed         | 39         |
|    total_timesteps      | 98304      |
| train/                  |            |
|    approx_kl            | 0.03960548 |
|    clip_fraction        | 0.366      |
|    clip_range           | 0.2        |
|    entropy_loss         | -29        |
|    explained_variance   | 0.977      |
|    learning_rate        | 0.0002     |
|    loss                 | 15         |
|    n_updates            | 376        |
|    policy_gradient_loss | -0.0632    |
|    std                  | 0.961      |
|    value_loss           | 86.3       |
----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 249         |
|    ep_rew_mean          | 760         |
| time/                   |             |
|    fps                  | 2509        |
|    iterations           | 49          |
|    time_elapsed         | 39          |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.042626806 |
|    clip_fraction        | 0.386       |
|    clip_range           | 0.2         |
|    entropy_loss         | -29         |
|    explained_variance   | 0.979       |
|    learning_rate        | 0.0002      |
|    loss                 | 16.3        |
|    n_updates            | 384         |
|    policy_gradient_loss | -0.0644     |
|    std                  | 0.961       |
|    value_loss           | 70.5        |
-----------------------------------------


✅ Phase 2 completed - Walking model saved!


In [None]:
# COMPREHENSIVE TESTING OF TRAINED MODELS
# Tests all available models and shows performance comparison

def test_model(model_name, env_class, difficulty=1.0, steps=500):
    """Roll out a saved PPO policy and summarise its performance.

    Args:
        model_name: Name/path handed to ``PPO.load``.
        env_class: Environment class to instantiate. ``ImprovedWalkingEnv`` is
            built with ``difficulty``; any other class is built without arguments.
        difficulty: Forwarded to ``ImprovedWalkingEnv`` only.
        steps: Upper bound on environment steps. The rollout also stops early
            once 3 episodes have finished.

    Returns:
        A dict with ``avg_reward`` (mean episode return), ``avg_distance`` and
        ``max_distance`` (taken from ``env.data.qpos[0]`` at episode end) and
        ``episodes`` (number of finished episodes), or ``None`` if any
        exception was raised while loading or running the model.
    """
    try:
        if env_class is ImprovedWalkingEnv:
            env = env_class(difficulty=difficulty)
        else:
            env = env_class()
        model = PPO.load(model_name, env=env)

        obs, _ = env.reset()
        total_reward = 0
        episode_rewards = []
        distances = []
        episode_count = 0

        for i in range(steps):
            action, _ = model.predict(obs, deterministic=True)
            obs, reward, terminated, truncated, _ = env.step(action)
            total_reward += reward

            if i % 100 == 0:
                pos = env.data.qpos
                vel = env.data.qvel
                print(f"  Step {i:3d}: Height={pos[2]:.2f}, X={pos[0]:.2f}, VelX={vel[0]:.2f}, R={reward:.2f}")

            if terminated or truncated:
                episode_count += 1
                episode_rewards.append(total_reward)
                # Read the position before reset() overwrites the simulator state.
                distances.append(env.data.qpos[0])

                print(f"  🏁 Episode {episode_count}: Reward={total_reward:.1f}, Distance={env.data.qpos[0]:.2f}m")
                obs, _ = env.reset()
                total_reward = 0

                if episode_count >= 3:  # Test 3 episodes
                    break

        # Calculate metrics
        if episode_rewards:
            avg_reward = np.mean(episode_rewards)
            avg_distance = np.mean(distances)
            max_distance = max(distances)
        else:
            # No episode ended within the step budget. Report the return of the
            # unfinished episode so avg_reward is always an episode return and
            # stays comparable across models (the previous per-step fallback
            # mixed units with the per-episode case above).
            avg_reward = total_reward
            avg_distance = max_distance = env.data.qpos[0]

        return {
            'avg_reward': avg_reward,
            'avg_distance': avg_distance,
            'max_distance': max_distance,
            'episodes': episode_count
        }

    except Exception as e:
        # Best-effort: a missing or incompatible model must not abort the comparison.
        print(f"  ❌ Error testing {model_name}: {str(e)[:50]}...")
        return None

def comprehensive_test():
    """Evaluate every known saved model and print a ranked comparison.

    Each candidate is passed to ``test_model``; candidates for which it returns
    a falsy result are left out of the comparison.

    Returns:
        The ``model_file`` of the entry with the highest ``avg_reward``, or
        ``None`` when no candidate produced a result.
    """
    print("🧪 COMPREHENSIVE MODEL TESTING")
    print("=" * 50)

    # (saved model file, environment class, human-readable label)
    candidates = [
        ("basic_standing_model", StandingHumanoidEnv, "Basic Standing"),
        ("basic_walking_model", WalkingHumanoidEnv, "Basic Walking"),
        ("quick_improved_model", WalkingHumanoidEnv, "Quick Improved"),
        ("curriculum_final_model", ImprovedWalkingEnv, "Curriculum Final"),
        ("walking_model", WalkingHumanoidEnv, "Previous Walking"),  # If exists
    ]

    evaluated = []

    for model_file, env_cls, label in candidates:
        print(f"\n🔍 Testing {label} ({model_file})...")

        metrics = test_model(model_file, env_cls)
        if not metrics:
            continue

        # Tag the metrics so the ranking below can name the model.
        metrics['name'] = label
        metrics['model_file'] = model_file
        evaluated.append(metrics)

        print(f"📊 {label} Results:")
        print(f"   Average reward: {metrics['avg_reward']:.2f}")
        print(f"   Average distance: {metrics['avg_distance']:.2f}m")
        print(f"   Max distance: {metrics['max_distance']:.2f}m")
        print(f"   Episodes completed: {metrics['episodes']}")

    if not evaluated:
        print("❌ No models found to test. Train a model first!")
        return None

    print("\n📈 PERFORMANCE COMPARISON")
    print("=" * 50)

    # Highest average reward first.
    evaluated.sort(key=lambda entry: entry['avg_reward'], reverse=True)

    print("Ranking by Average Reward:")
    for rank, entry in enumerate(evaluated, start=1):
        print(f"{rank}. {entry['name']:20} | Reward: {entry['avg_reward']:6.2f} | Distance: {entry['max_distance']:5.2f}m")

    winner = evaluated[0]
    print(f"\n🥇 BEST MODEL: {winner['name']}")
    print(f"   File: {winner['model_file']}")
    print(f"   Performance: {winner['avg_reward']:.2f} avg reward, {winner['max_distance']:.2f}m max distance")

    return winner['model_file']

# Evaluate all saved models and remember which one ranked first
best_model_name = comprehensive_test()

if not best_model_name:
    # comprehensive_test() returned nothing usable
    print("\n⚠️  No trained models found.")
    print("Go back and run one of the training methods first!")
else:
    print(f"\n✅ Testing completed! Best model: {best_model_name}")
    print("\n🎬 Ready for visualization in next cell!")



🧪 Testing walking model...
Step   0: Height=1.30, X=-0.00, VelX=-0.00, Reward=3.17
Step 100: Height=1.22, X=0.02, VelX=0.29, Reward=3.76
Step 200: Height=1.12, X=-0.28, VelX=-0.99, Reward=3.00
Walking test ended at step 280

📊 Walking Test Results:
  Average reward: 2.63
  Final height: 0.40
  Distance traveled: -0.89 meters
  Success: ❌ NO
⚠️  May need more training or parameter tuning


In [None]:
# 🎬 ULTIMATE HUMANOID VIEWER AND ANALYZER

def interactive_viewer(model_name, env_class=None, difficulty=1.0):
    """Run a saved PPO policy in an endless loop, rendering and printing stats.

    Args:
        model_name: Name/path handed to ``PPO.load``.
        env_class: Environment class to use. When ``None`` it is chosen from the
            model name: ``"standing"`` -> ``StandingHumanoidEnv``,
            ``"curriculum"`` -> ``ImprovedWalkingEnv``, otherwise
            ``WalkingHumanoidEnv``.
        difficulty: Forwarded to ``ImprovedWalkingEnv`` only.

    The loop runs until interrupted with Ctrl+C. Any other exception is
    reported and followed by a listing of which known model files load.
    """
    print(f"🎬 Loading {model_name} for interactive viewing...")

    try:
        # Auto-detect environment type
        if env_class is None:
            lowered_name = model_name.lower()
            if "standing" in lowered_name:
                env_class = StandingHumanoidEnv
            elif "curriculum" in lowered_name:
                env_class = ImprovedWalkingEnv
            else:
                env_class = WalkingHumanoidEnv

        # Create environment
        if env_class is ImprovedWalkingEnv:
            env = env_class(difficulty=difficulty)
        else:
            env = env_class()
        model = PPO.load(model_name, env=env)

        print(f"✅ Loaded {model_name} successfully!")
        print("🎮 Interactive Controls:")
        print("   - Runs automatically")
        print("   - Stats printed every 100 steps")
        print("   - Auto-restart on episode end")
        print("   - Press Ctrl+C in terminal to stop")

        obs, _ = env.reset()
        step_count = 0
        episode_count = 0
        total_reward = 0
        best_distance = 0
        render_warning_shown = False

        try:
            while True:
                action, _ = model.predict(obs, deterministic=True)
                obs, reward, terminated, truncated, _ = env.step(action)
                total_reward += reward
                step_count += 1

                # Track the farthest X every step so the episode summary is
                # accurate even between the 100-step stat prints.
                best_distance = max(best_distance, env.data.qpos[0])

                # Render
                try:
                    env.render()
                except Exception:
                    # Only Exception is caught: a bare except here would swallow
                    # Ctrl+C and make the viewer impossible to stop.
                    if not render_warning_shown:
                        print("⚠️  Visual rendering not available, showing stats only")
                        render_warning_shown = True

                # Print stats
                if step_count % 100 == 0:
                    pos = env.data.qpos
                    vel = env.data.qvel

                    print(f"Step {step_count:4d} | Height: {pos[2]:5.2f} | X: {pos[0]:6.2f} | VelX: {vel[0]:5.2f} | Reward: {reward:6.2f}")

                # Episode management
                if terminated or truncated:
                    episode_count += 1
                    final_pos = env.data.qpos

                    print(f"\n🏁 Episode {episode_count} Summary:")
                    print(f"   Steps: {step_count}")
                    print(f"   Total reward: {total_reward:.1f}")
                    print(f"   Distance: {final_pos[0]:.2f}m")
                    print(f"   Best distance so far: {best_distance:.2f}m")
                    print(f"   Average reward: {total_reward/step_count:.3f}")
                    print()

                    obs, _ = env.reset()
                    step_count = 0
                    total_reward = 0
                    time.sleep(1.0)  # Brief pause between episodes

                time.sleep(0.02)  # 50 FPS

        except KeyboardInterrupt:
            print("\n🛑 Viewer stopped by user")
            print(f"📊 Final Stats: {episode_count} episodes, best distance: {best_distance:.2f}m")

    except Exception as e:
        print(f"❌ Error loading {model_name}: {e}")
        print("Available models:")
        available_models = ["basic_standing_model", "basic_walking_model", "quick_improved_model",
                          "curriculum_final_model", "walking_model"]
        for candidate in available_models:
            # A model counts as available if PPO.load succeeds on it.
            try:
                PPO.load(candidate)
                print(f"  ✅ {candidate}")
            except Exception:
                print(f"  ❌ {candidate}")

def quick_comparison_view():
    """Run a short (at most 200 step) rollout per walking model and print a summary.

    For each model the average per-step reward and the final ``qpos[0]`` are
    printed. A model that cannot be loaded or run is reported together with the
    reason, and the comparison continues with the next one.
    """
    print("⚡ QUICK COMPARISON VIEWER")
    print("=" * 30)

    models_to_compare = [
        ("basic_walking_model", WalkingHumanoidEnv),
        ("quick_improved_model", WalkingHumanoidEnv),
        ("curriculum_final_model", ImprovedWalkingEnv),
    ]

    for model_name, env_class in models_to_compare:
        try:
            print(f"\n🔍 Quick test: {model_name}")
            if env_class is ImprovedWalkingEnv:
                env = env_class(difficulty=1.0)
            else:
                env = env_class()
            model = PPO.load(model_name, env=env)

            obs, _ = env.reset()
            total_reward = 0
            steps_taken = 0

            for _ in range(200):
                action, _ = model.predict(obs, deterministic=True)
                obs, reward, terminated, truncated, _ = env.step(action)
                total_reward += reward
                steps_taken += 1

                # Only the first episode is measured.
                if terminated or truncated:
                    break

            distance = env.data.qpos[0]
            avg_reward = total_reward / steps_taken
            print(f"   📊 {model_name}: {avg_reward:.2f} avg reward, {distance:.2f}m distance")

        except Exception as e:
            # Catch Exception (not a bare except) so Ctrl+C still interrupts,
            # and show the cause so real errors are not mistaken for a missing file.
            print(f"   ❌ {model_name}: Not available ({type(e).__name__}: {e})")

# Print the viewer menu, probe which saved models can be loaded, and run the
# quick comparison automatically when at least one model is available.
print("🎬 ULTIMATE HUMANOID VIEWER SUITE")
print("=" * 50)

print("\n🎮 OPTION 1: Interactive Viewer (Recommended)")
print("Choose a model to view interactively:")

# Check available models
available_models = []
model_options = [
    ("basic_standing_model", "Basic Standing"),
    ("basic_walking_model", "Basic Walking"),
    ("quick_improved_model", "Quick Improved"),
    ("curriculum_final_model", "Curriculum Final"),
    ("walking_model", "Previous Walking")
]

for model_file, description in model_options:
    # A model counts as available if PPO.load succeeds on it. Only Exception is
    # caught so that Ctrl+C during a slow load still interrupts the cell.
    try:
        PPO.load(model_file)
        available_models.append((model_file, description))
        print(f"  ✅ {description}: interactive_viewer('{model_file}')")
    except Exception:
        print(f"  ❌ {description}: Not trained yet")

print("\n⚡ OPTION 2: Quick Comparison")
print("  quick_comparison_view()  # Compare all models quickly")

print("\n🖥️  OPTION 3: Terminal Viewer (External)")
print("Run this in terminal for any model:")
# NOTE(review): the printed snippet exec()s the raw text of a .ipynb file, which
# is presumably JSON rather than Python source -- confirm whether this works and
# consider pointing at an exported .py script instead.
print("mjpython -c \"")
print("import sys; sys.path.append('.')")
print("exec(open('train_humanoid.ipynb').read())  # Load notebook functions")
print("interactive_viewer('basic_walking_model')  # Replace with your model")
print("\"")

print("\n💡 USAGE EXAMPLES:")
if available_models:
    best_model = available_models[0][0]  # First available model
    print(f"interactive_viewer('{best_model}')  # View best available model")
    print("quick_comparison_view()  # Compare all models")
else:
    print("No models available yet. Train a model first!")

print("\n📊 EXPECTED IMPROVEMENTS:")
print("✅ Positive rewards (3-7+ instead of negative)")
print("✅ Forward walking (2-5+ meters per episode)")
print("✅ Stable upright posture (height > 1.0m)")
print("✅ Consistent walking behavior")
print("🚫 No more 'flying' or dimensional issues!")

# Auto-run quick comparison if models exist
if available_models:
    print("\n🚀 Running quick comparison of available models...")
    quick_comparison_view()
else:
    print("\n⚠️  No trained models found!")
    print("Go back and uncomment a training method in the previous cell.")


🎬 How to view your trained walking humanoid:

💻 OPTION 1: Direct Python viewer
Copy and run this code in a new cell:

# Quick inline viewer
test_env = WalkingHumanoidEnv()
obs, _ = test_env.reset()

for i in range(1000):
    action, _ = walking_model.predict(obs, deterministic=True)
    obs, reward, terminated, truncated, _ = test_env.step(action)
    test_env.render()  # This will open MuJoCo viewer
    time.sleep(0.02)

    if terminated or truncated:
        obs, _ = test_env.reset()
        print(f"Episode restarted at step {i}")

    if i % 200 == 0:
        pos = test_env.data.qpos
        print(f"Step {i}: Height={pos[2]:.2f}, Distance={pos[0]:.2f}")


🖥️  OPTION 2: Terminal viewer (recommended)
Run this command in terminal:
mjpython -c "
from simple_stand_walk import WalkingHumanoidEnv; from stable_baselines3 import PPO; import time
env = WalkingHumanoidEnv(); model = PPO.load('walking_model', env=env)
obs, _ = env.reset()
for i in range(2000):
    action, _ = model.predict(obs

In [10]:
# CORRECT: Use the NEW working models
# Replays the saved "walking_model" policy for 1000 steps with rendering,
# printing a status line every 100 steps and restarting on episode end.
import time

from simple_stand_walk import WalkingHumanoidEnv
from stable_baselines3 import PPO

# Create the NEW environment (no dimensional issues)
test_env = WalkingHumanoidEnv()

# Load the NEW walking model
walking_model = PPO.load("walking_model", env=test_env)

obs, _ = test_env.reset()

try:
    for i in range(1000):
        action, _ = walking_model.predict(obs, deterministic=True)
        obs, reward, terminated, truncated, _ = test_env.step(action)
        test_env.render()  # This will show PROPER standing/walking

        if i % 100 == 0:
            pos = test_env.data.qpos
            print(f"Step {i}: Height={pos[2]:.2f}, Distance={pos[0]:.2f}, Reward={reward:.2f}")

        if terminated or truncated:
            obs, _ = test_env.reset()
            print(f"Episode restarted at step {i}")

        time.sleep(0.02)
finally:
    # Release the environment (and any viewer it opened) even when the cell
    # is interrupted or a step raises.
    test_env.close()

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Step 0: Height=1.33, Distance=0.02, Reward=3.16
Step 100: Height=1.18, Distance=0.13, Reward=3.09
Step 200: Height=1.11, Distance=0.31, Reward=5.37
Episode restarted at step 273
Step 300: Height=1.27, Distance=0.04, Reward=4.37
Step 400: Height=1.18, Distance=-0.01, Reward=3.14
Step 500: Height=1.05, Distance=-0.36, Reward=1.99
Episode restarted at step 583
Step 600: Height=1.29, Distance=-0.01, Reward=3.74
Step 700: Height=1.19, Distance=0.10, Reward=3.16
Step 800: Height=1.00, Distance=0.42, Reward=4.94
Episode restarted at step 860
Step 900: Height=1.25, Distance=0.08, Reward=4.00


In [None]:
# CORRECT: Use the NEW working models
# Replays the saved "walking_model" policy for 1000 steps with rendering,
# printing a status line every 100 steps and restarting on episode end.
import time

from simple_stand_walk import WalkingHumanoidEnv
from stable_baselines3 import PPO

# Create the NEW environment (no dimensional issues)
test_env = WalkingHumanoidEnv()

# Load the NEW walking model
walking_model = PPO.load("walking_model", env=test_env)

obs, _ = test_env.reset()

try:
    for i in range(1000):
        action, _ = walking_model.predict(obs, deterministic=True)
        obs, reward, terminated, truncated, _ = test_env.step(action)
        test_env.render()  # This will show PROPER standing/walking

        if i % 100 == 0:
            pos = test_env.data.qpos
            print(f"Step {i}: Height={pos[2]:.2f}, Distance={pos[0]:.2f}, Reward={reward:.2f}")

        if terminated or truncated:
            obs, _ = test_env.reset()
            print(f"Episode restarted at step {i}")

        time.sleep(0.02)
finally:
    # Release the environment (and any viewer it opened) even when the cell
    # is interrupted or a step raises.
    test_env.close()

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Step 0: Height=1.33, Distance=0.02, Reward=3.16
Step 100: Height=1.18, Distance=0.13, Reward=3.09
Step 200: Height=1.11, Distance=0.31, Reward=5.37
Episode restarted at step 273
Step 300: Height=1.27, Distance=0.04, Reward=4.37
Step 400: Height=1.18, Distance=-0.01, Reward=3.14
Step 500: Height=1.05, Distance=-0.36, Reward=1.99
Episode restarted at step 583
Step 600: Height=1.29, Distance=-0.01, Reward=3.74
Step 700: Height=1.19, Distance=0.10, Reward=3.16
Step 800: Height=1.00, Distance=0.42, Reward=4.94
Episode restarted at step 860
Step 900: Height=1.25, Distance=0.08, Reward=4.00
