# PPO training for Task 7 pendulum

Reproduces the PPO workflow from `task_2_lqr_balance_data_gen.ipynb`, but runs directly on the custom PyBullet + Pinocchio environment implemented in `scripts/task_7_env.py`.

In [1]:
import os
from pathlib import Path

import numpy as np
from stable_baselines3 import PPO
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv

In [2]:
import sys

# Resolve the project root: the working directory itself when it contains a
# `notebooks` folder, otherwise fall back to its parent directory.
current = Path.cwd()
PROJECT_ROOT = current if (current / 'notebooks').exists() else current.parent

# Work from the project root so relative paths such as `data/` resolve there.
os.chdir(PROJECT_ROOT)

# The project root must be importable before the local env module is loaded.
project_root_str = str(PROJECT_ROOT)
if project_root_str not in sys.path:
    sys.path.insert(0, project_root_str)

from scripts.task_7_env import Task7PendulumEnv

# Models live alongside the data: both names refer to the same directory.
DATA_DIR = MODELS_DIR = Path('data')
for directory in (DATA_DIR, MODELS_DIR):
    directory.mkdir(exist_ok=True)

print('Project root:', PROJECT_ROOT)

Project root: /home/acepeax/Desktop/Studies/MVA/Robotics/Project


pybullet build time: Dec  4 2025 20:11:42


## Environment helpers

In [3]:
# Passed to Task7PendulumEnv as `max_steps`; also the default step cap of the
# rollout helpers defined later in the notebook.
MAX_STEPS = 1000
# Default value for the env's `should_balance` option.
SHOULD_BALANCE = True


def make_task7_env(gui=False, should_balance=SHOULD_BALANCE):
    """Return a zero-argument factory that builds a Monitor-wrapped Task 7 env.

    Args:
        gui: Forwarded to ``Task7PendulumEnv`` as ``gui``.
        should_balance: Forwarded to ``Task7PendulumEnv`` as ``should_balance``.

    Returns:
        A callable taking no arguments. Each call constructs a new
        ``Task7PendulumEnv`` (with ``max_steps=MAX_STEPS``) wrapped in ``Monitor``.
    """
    def _init():
        # MAX_STEPS is looked up when the factory is called, not when it is built.
        env_kwargs = dict(gui=gui, should_balance=should_balance, max_steps=MAX_STEPS)
        return Monitor(Task7PendulumEnv(**env_kwargs))

    return _init

# DummyVecEnv takes a list of env factories; a single headless env is used here.
env_factories = [make_task7_env(gui=False, should_balance=SHOULD_BALANCE)]
train_env = DummyVecEnv(env_factories)

print('Observation space:', train_env.observation_space)
print('Action space:', train_env.action_space)

Observation space: Box(-inf, inf, (14,), float32)
Action space: Discrete(3)


## Train PPO

In [4]:
# Fixed seed so that re-running the notebook reproduces the same policy
# initialisation and sampling stream (PPO forwards it to its RNGs and the env).
PPO_SEED = 42

model = PPO(
    policy='MlpPolicy',
    env=train_env,
    verbose=0,
    n_steps=2048,        # environment steps collected per policy update
    batch_size=256,      # minibatch size used during each update
    gamma=0.99,          # discount factor
    gae_lambda=0.95,     # GAE bias/variance trade-off
    ent_coef=0.0,        # no entropy bonus
    learning_rate=3e-4,
    clip_range=0.2,      # PPO clipping parameter
    tensorboard_log='runs/task7_ppo',
    seed=PPO_SEED,
)

In [5]:
# Total number of environment steps to train for.
TOTAL_TIMESTEPS = 200_000
model.learn(progress_bar=True, total_timesteps=TOTAL_TIMESTEPS)

Output()

KeyboardInterrupt: 

## Save / load the trained policy

In [8]:
# Checkpoint the current policy inside the models directory.
MODEL_PATH = MODELS_DIR.joinpath('test_ppo_robot_arm_balance')
model.save(path=MODEL_PATH)
print('Model saved to', MODEL_PATH)

Model saved to data/test_ppo_robot_arm_balance


In [5]:
# Load a stored checkpoint and attach the training env to it. Note that this
# file name is not the one written by the save cell above
# ('test_ppo_robot_arm_balance'), so the file must already exist on disk.
MODEL_PATH = MODELS_DIR.joinpath('ppo_robot_arm_balance')
model = PPO.load(MODEL_PATH, env=train_env)

## Rollout helpers

In [6]:
# Release the training env here: the rollout below runs on a separate
# `eval_env`, and `model.predict` is called on raw observations.
train_env.close()

def rollout_episode(env, model, max_steps=MAX_STEPS, deterministic=True):
    """Run a single episode of ``model`` on ``env`` and collect its trace.

    Args:
        env: Environment whose ``reset()`` returns ``(obs, info)`` and whose
            ``step(action)`` returns ``(obs, reward, terminated, truncated, info)``.
        model: Policy exposing ``predict(obs, deterministic=...)`` that returns
            ``(action, state)``.
        max_steps: Upper bound on the number of steps taken.
        deterministic: Forwarded to ``model.predict``.

    Returns:
        Tuple ``(rewards, infos)``: a NumPy array with one reward per step and
        the list of per-step ``info`` objects returned by ``env.step``.
    """
    observation, _reset_info = env.reset()
    reward_log, info_log = [], []
    for _step in range(max_steps):
        action, _policy_state = model.predict(observation, deterministic=deterministic)
        observation, step_reward, terminated, truncated, step_info = env.step(action)
        reward_log.append(step_reward)
        info_log.append(step_info)
        # Stop as soon as the environment signals the end of the episode.
        episode_over = terminated or truncated
        if episode_over:
            break
    return np.array(reward_log), info_log

# GUI-enabled environment used for the evaluation rollout.
eval_env = Task7PendulumEnv(
    gui=True,
    max_steps=MAX_STEPS,
    should_balance=SHOULD_BALANCE,
)

startThreads creating 1 threads.
starting thread 0
started thread 0 
argc=2
argv[0] = --unused
argv[1] = --start_demo_name=Physics Server
ExampleBrowserThreadFunc started
X11 functions dynamically loaded using dlopen/dlsym OK!
X11 functions dynamically loaded using dlopen/dlsym OK!
Creating context
Created GL 3.3 context
Direct GLX rendering context obtained
Making context current
GL_VENDOR=Intel
GL_RENDERER=Mesa Intel(R) Iris(R) Xe Graphics (RPL-U)
GL_VERSION=4.6 (Core Profile) Mesa 25.0.7-0ubuntu0.24.04.1
GL_SHADING_LANGUAGE_VERSION=4.60
pthread_getconcurrency()=0
Version = 4.6 (Core Profile) Mesa 25.0.7-0ubuntu0.24.04.1
Vendor = Intel
Renderer = Mesa Intel(R) Iris(R) Xe Graphics (RPL-U)
b3Printf: Selected demo: Physics Server
startThreads creating 1 threads.
starting thread 0
started thread 0 
MotionThreadFunc thread started
ven = Intel
Workaround for some crash in the Intel OpenGL driver on Linux/Ubuntu
ven = Intel
Workaround for some crash in the Intel OpenGL driver on Linux/Ubunt

In [7]:
# Run one deterministic evaluation episode and summarise it.
rewards, infos = rollout_episode(eval_env, model)
final_info = infos[-1] if infos else None

print('Episode length:', len(rewards))
print('Total reward:', rewards.sum())
print('Final info:', final_info)

Episode length: 1000
Total reward: 918.1066546742811
Final info: {'failure': False, 'success': True, 'state': Task7Observation(vector=array([ 2.8460357e-02,  1.2364244e+00,  4.9987969e-01,  4.1536069e-03,
        5.7339168e-01, -1.3670009e-01,  4.2259592e-01, -2.0999467e-02,
       -8.5006677e-06, -2.0414682e-03, -6.5909126e-03, -2.9060680e-01,
        3.3564192e-01,  1.5748497e-02], dtype=float32), pendulum_angle=0.028460356521870178, pendulum_velocity=1.2364244288488333, end_effector_position=array([-8.50066739e-06, -2.04146823e-03, -6.59091238e-03])), 'y_target': 0.45}


## Export rollouts for datasets

In [12]:
def record_task7_dataset(
    model,
    n_episodes=5,
    max_steps=MAX_STEPS,
    should_balance=SHOULD_BALANCE,
    filename='task7_ppo_dataset',
    max_attempts=None,
):
    """Roll out ``model`` deterministically and save the kept episodes as ``.npz``.

    A fresh headless ``Task7PendulumEnv`` is created for recording and is
    always closed, even if a rollout raises or is interrupted. An episode is
    kept only if it lasted exactly ``max_steps`` steps and, when the env ended
    it, the final ``info`` reported ``success`` without ``failure``. Rejected
    episodes are retried.

    Args:
        model: Policy exposing ``predict(obs, deterministic=...)``.
        n_episodes: Number of accepted episodes to record.
        max_steps: Step cap per episode; also forwarded to the env.
        should_balance: Forwarded to ``Task7PendulumEnv``.
        filename: Stem of the output file, written to
            ``DATA_DIR / f'{filename}.npz'``.
        max_attempts: Optional cap on the total number of episodes attempted
            (accepted or rejected). ``None`` (default) retries without limit,
            which never returns if the policy cannot produce an accepted episode.

    Returns:
        Path of the saved ``.npz`` file. It contains ``observations``,
        ``actions``, ``rewards``, ``episode_ids`` and ``max_steps``.

    Raises:
        RuntimeError: If ``max_attempts`` is set and is reached before
            ``n_episodes`` episodes have been accepted.
    """
    env = Task7PendulumEnv(
        max_steps=max_steps, should_balance=should_balance, gui=False
    )
    all_obs, all_actions, all_rewards, episode_ids = [], [], [], []
    recorded = 0
    attempts = 0
    try:
        while recorded < n_episodes:
            if max_attempts is not None and attempts >= max_attempts:
                raise RuntimeError(
                    f'Only {recorded}/{n_episodes} episodes recorded '
                    f'after {attempts} attempts'
                )
            attempts += 1

            obs, _ = env.reset()
            local_obs, local_actions, local_rewards = [], [], []
            # Stays True when the loop runs out of steps without the env ending
            # the episode; otherwise it is decided from the final `info`.
            success = True
            for _ in range(max_steps):
                local_obs.append(obs.copy())
                action, _ = model.predict(obs, deterministic=True)
                action = int(action)
                local_actions.append(action)
                obs, reward, terminated, truncated, info = env.step(action)
                local_rewards.append(reward)
                if terminated or truncated:
                    success = info.get('success', False) and not info.get('failure', False)
                    break

            # Keep only full-length, successful episodes.
            if success and len(local_obs) == max_steps:
                all_obs.extend(local_obs)
                all_actions.extend(local_actions)
                all_rewards.extend(local_rewards)
                episode_ids.extend([recorded] * len(local_obs))
                recorded += 1
                print(f'Recorded episode {recorded}/{n_episodes}')
            else:
                print('Episode failed, retrying...')
    finally:
        # Release the simulator even when a rollout raises or is interrupted.
        env.close()

    save_path = DATA_DIR / f'{filename}.npz'
    np.savez(
        save_path,
        observations=np.array(all_obs, dtype=np.float32),
        actions=np.array(all_actions, dtype=np.int64),
        rewards=np.array(all_rewards, dtype=np.float32),
        episode_ids=np.array(episode_ids, dtype=np.int32),
        max_steps=max_steps,
    )
    print('Dataset saved to', save_path)
    return save_path

In [13]:
# Example usage once a good policy is trained: record three accepted episodes.
dataset_path = record_task7_dataset(
    model,
    n_episodes=3,
    filename='task7_ppo_balance_rollouts',
)

Recorded episode 1/3
Recorded episode 2/3
Recorded episode 3/3
Dataset saved to data/task7_ppo_balance_rollouts.npz


In [10]:
# Shut down the environments, evaluation env first.
# NOTE(review): `train_env` is already closed in the rollout-helpers cell;
# confirm that calling close() a second time is safe for this env.
for env_to_close in (eval_env, train_env):
    env_to_close.close()

numActiveThreads = 0
stopping threads
Thread with taskId 0 exiting
destroy semaphore
semaphore destroyed
Thread TERMINATED
destroy main semaphore
main semaphore destroyed
finished
numActiveThreads = 0
btShutDownExampleBrowser stopping threads
Thread with taskId 0 exiting
Thread TERMINATED
destroy semaphore
semaphore destroyed
destroy main semaphore
main semaphore destroyed
