# PPO training for Task 7 pendulum

Reproduces the PPO workflow from `task_2_lqr_balance_data_gen.ipynb`, but runs directly on the custom PyBullet + Pinocchio environment implemented in `scripts/task_7_env.py`.

In [1]:
import os
from pathlib import Path

import numpy as np
from tqdm import tqdm
from stable_baselines3 import PPO
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv
from gymnasium.wrappers import RecordVideo
import control

In [7]:
# Locate the project root: use the current directory when it contains a
# `notebooks` folder, otherwise fall back to its parent (presumably the case
# where Jupyter was started from inside `notebooks/` — TODO confirm).
current = Path.cwd()

if (current / 'notebooks').exists():
    PROJECT_ROOT = current
else:
    PROJECT_ROOT = current.parent

# Work from the project root so the relative paths used below
# ('data', 'videos/...', 'runs/...') resolve against it.
os.chdir(PROJECT_ROOT)

# Put the root on sys.path so the `scripts` package import below succeeds.
import sys
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

from scripts.task_7_env import Task7PendulumEnv, Y_MAX

# Models are stored in the same folder as the datasets (MODELS_DIR is DATA_DIR).
DATA_DIR = Path('data')
MODELS_DIR = DATA_DIR
for directory in (DATA_DIR, MODELS_DIR):
    directory.mkdir(exist_ok=True)

print('Project root:', PROJECT_ROOT)

# Defaults used by the dataset-export section: `sim_substeps` passed to the
# env, episode length in seconds, and the folder that receives rollout videos.
DATASET_SIM_SUBSTEPS = 6
DATASET_DURATION_S = 30.0
DATASET_VIDEO_DIR = Path('videos/task7_dataset')

Project root: /home/acepeax/Desktop/Studies/MVA/Robotics/Project


## Environment helpers

In [3]:
# Settings shared by the training and evaluation environments; each one is
# forwarded to the Task7PendulumEnv keyword argument of the same (lowercase) name.
MAX_STEPS = 1000
SHOULD_BALANCE = True
SIM_SUBSTEPS = 12

def make_task7_env(gui=False, should_balance=SHOULD_BALANCE, sim_substeps=SIM_SUBSTEPS):
    """Return a zero-argument factory that builds a monitored Task 7 env.

    The factory form is what `DummyVecEnv` is given in this notebook: the
    environment is only constructed when the returned callable is invoked.

    Args:
        gui: Forwarded to `Task7PendulumEnv(gui=...)`.
        should_balance: Forwarded to `Task7PendulumEnv(should_balance=...)`.
        sim_substeps: Forwarded to `Task7PendulumEnv(sim_substeps=...)`.

    Returns:
        A callable that creates a `Task7PendulumEnv` (with
        `max_steps=MAX_STEPS`) wrapped in a `Monitor`.
    """

    def _init():
        # MAX_STEPS is read when the env is built, not when the factory is made.
        env_kwargs = {
            'max_steps': MAX_STEPS,
            'should_balance': should_balance,
            'gui': gui,
            'sim_substeps': sim_substeps,
        }
        return Monitor(Task7PendulumEnv(**env_kwargs))

    return _init

# Single headless (gui=False) environment wrapped in a DummyVecEnv; this is
# the env handed to PPO below.
train_env = DummyVecEnv([make_task7_env(gui=False, should_balance=SHOULD_BALANCE)])
print('Observation space:', train_env.observation_space)
print('Action space:', train_env.action_space)

Observation space: Box(-inf, inf, (14,), float32)
Action space: Discrete(3)


In [4]:
def rollout_episode(env, model, max_steps=MAX_STEPS, deterministic=True):
    """Run `model` in `env` for a single episode and collect the results.

    Args:
        env: Environment whose `reset()` returns `(obs, info)` and whose
            `step(action)` returns
            `(obs, reward, terminated, truncated, info)`.
        model: Object exposing `predict(obs, deterministic=...)` that returns
            `(action, state)`.
        max_steps: Upper bound on the number of environment steps.
        deterministic: Forwarded to `model.predict`.

    Returns:
        A tuple `(rewards, infos)`: the per-step rewards as a NumPy array and
        the list of per-step info objects.
    """
    reward_log, info_log = [], []
    observation, _ = env.reset()
    for _ in range(max_steps):
        action, _ = model.predict(observation, deterministic=deterministic)
        observation, reward, terminated, truncated, step_info = env.step(action)
        reward_log.append(reward)
        info_log.append(step_info)
        # Stop as soon as the env signals the end of the episode.
        episode_over = terminated or truncated
        if episode_over:
            break
    return np.array(reward_log), info_log

## Train PPO

In [4]:
# PPO agent with an MLP policy, trained on the vectorized Task 7 env.
# TensorBoard logs are written under 'runs/task7_ppo' (relative to the
# project root, since the setup cell chdir'ed there).
model = PPO(
    policy='MlpPolicy',
    env=train_env,
    verbose=0,
    n_steps=2048,
    batch_size=256,
    gamma=0.99,
    gae_lambda=0.95,
    ent_coef=0.0,
    learning_rate=3e-4,
    clip_range=0.2,
    tensorboard_log='runs/task7_ppo',
)

In [None]:
# Training budget, expressed in environment steps; a progress bar is shown
# while learning runs.
TOTAL_TIMESTEPS = 200_000
model.learn(total_timesteps=TOTAL_TIMESTEPS, progress_bar=True)

## Save / load the trained policy

In [8]:
# Persist the freshly trained policy under the data folder.
# NOTE(review): the load cell further down reads 'ppo_robot_arm_balance'
# (without the 'test_' prefix), so the model saved here is not the one that
# gets evaluated — confirm this is intentional.
MODEL_PATH = MODELS_DIR / 'test_ppo_robot_arm_balance'
model.save(MODEL_PATH)
print('Model saved to', MODEL_PATH)

Model saved to data/test_ppo_robot_arm_balance


In [5]:
# Load a saved policy from the data folder and attach the training env to it.
MODEL_PATH = MODELS_DIR / 'ppo_robot_arm_balance'
model = PPO.load(MODEL_PATH, train_env)

# The rollouts below step their own env and only call `model.predict`,
# so the training env is closed here.
train_env.close()

# Evaluation mode switch: record videos headlessly (True) or open a GUI env (False).
EVAL_RECORD_VIDEO = False
EVAL_VIDEO_DIR = Path('videos/task7_eval_rollouts')

def rollout_episode(env, model, max_steps=MAX_STEPS, deterministic=True):
    """Play one episode of `env` with `model` and return what happened.

    Args:
        env: Environment with `reset() -> (obs, info)` and
            `step(action) -> (obs, reward, terminated, truncated, info)`.
        model: Policy object exposing `predict(obs, deterministic=...)`
            returning `(action, state)`.
        max_steps: Maximum number of steps before the rollout is cut off.
        deterministic: Passed through to `model.predict`.

    Returns:
        `(rewards, infos)` — a NumPy array of per-step rewards and the list
        of per-step info objects, both of length equal to the steps taken.
    """
    obs, _ = env.reset()
    trajectory = []  # one (reward, info) pair per executed step
    for _ in range(max_steps):
        action, _ = model.predict(obs, deterministic=deterministic)
        obs, reward, terminated, truncated, info = env.step(action)
        trajectory.append((reward, info))
        if terminated or truncated:
            break
    rewards = [step_reward for step_reward, _ in trajectory]
    infos = [step_info for _, step_info in trajectory]
    return np.array(rewards), infos

# Build the evaluation env. Both branches use the same MAX_STEPS,
# SHOULD_BALANCE and SIM_SUBSTEPS as the training env.
if EVAL_RECORD_VIDEO:
    # Headless env rendering RGB frames, wrapped so that every episode
    # (the trigger always returns True) is written to EVAL_VIDEO_DIR.
    EVAL_VIDEO_DIR.mkdir(parents=True, exist_ok=True)
    base_eval_env = Task7PendulumEnv(
        max_steps=MAX_STEPS,
        should_balance=SHOULD_BALANCE,
        gui=False,
        sim_substeps=SIM_SUBSTEPS,
        render_mode='rgb_array',
    )
    eval_env = RecordVideo(
        base_eval_env,
        video_folder=str(EVAL_VIDEO_DIR),
        name_prefix='task7_eval',
        episode_trigger=lambda ep: True,
    )
else:
    # Interactive evaluation: open the env with its GUI enabled.
    eval_env = Task7PendulumEnv(
        max_steps=MAX_STEPS, should_balance=SHOULD_BALANCE, gui=True, sim_substeps=SIM_SUBSTEPS
    )

In [6]:
# Release the training env before opening the GUI evaluation env.
train_env.close()

# The previous cell may already have created `eval_env` (possibly with its own
# GUI window). Close it first: PyBullet is expected to allow only one
# in-process GUI connection — TODO confirm against Task7PendulumEnv.close().
_previous_eval_env = globals().get('eval_env')
if _previous_eval_env is not None:
    try:
        _previous_eval_env.close()
    except Exception as exc:  # best-effort cleanup; keep going
        print(f'[WARNING] Could not close previous eval_env: {exc}')

# Use the same sim_substeps as the training env and the other evaluation envs,
# so the policy is evaluated at the step length it was trained with.
eval_env = Task7PendulumEnv(
    max_steps=MAX_STEPS,
    should_balance=SHOULD_BALANCE,
    gui=True,
    sim_substeps=SIM_SUBSTEPS,
)

startThreads creating 1 threads.
starting thread 0
started thread 0 
argc=2
argv[0] = --unused
argv[1] = --start_demo_name=Physics Server
ExampleBrowserThreadFunc started
X11 functions dynamically loaded using dlopen/dlsym OK!
X11 functions dynamically loaded using dlopen/dlsym OK!
Creating context
Created GL 3.3 context
Direct GLX rendering context obtained
Making context current
GL_VENDOR=Intel
GL_RENDERER=Mesa Intel(R) Iris(R) Xe Graphics (RPL-U)
GL_VERSION=4.6 (Core Profile) Mesa 25.0.7-0ubuntu0.24.04.1
GL_SHADING_LANGUAGE_VERSION=4.60
pthread_getconcurrency()=0
Version = 4.6 (Core Profile) Mesa 25.0.7-0ubuntu0.24.04.1
Vendor = Intel
Renderer = Mesa Intel(R) Iris(R) Xe Graphics (RPL-U)
b3Printf: Selected demo: Physics Server
startThreads creating 1 threads.
starting thread 0
started thread 0 
MotionThreadFunc thread started
ven = Intel
Workaround for some crash in the Intel OpenGL driver on Linux/Ubuntu
ven = Intel
Workaround for some crash in the Intel OpenGL driver on Linux/Ubunt

In [None]:
# Run one deterministic episode with the loaded PPO policy and summarize it.
rewards, infos = rollout_episode(eval_env, model)
print('Episode length:', len(rewards))
print('Total reward:', rewards.sum())
# `infos` is empty only if no step was executed (e.g. max_steps == 0).
print('Final info:', infos[-1] if infos else None)

## Export rollouts for datasets

In [None]:
DATASET_VIDEO_DIR.mkdir(parents=True, exist_ok=True)
# Duration of one base physics step in seconds (presumably PyBullet's default
# fixed time step — confirm against the env implementation).
BASE_BULLET_DT = 1.0 / 240.0


def _episode_steps(duration_s, sim_substeps):
    """Return how many env steps cover `duration_s` seconds.

    One env step is taken to last `sim_substeps * BASE_BULLET_DT` seconds.
    The result is rounded to the nearest integer and is never below 1.
    """
    env_step_dt = sim_substeps * BASE_BULLET_DT
    n_steps = int(round(duration_s / env_step_dt))
    return n_steps if n_steps > 1 else 1

def _collect_task7_episode(env, model, max_steps):
    """Roll out one deterministic episode and return its trajectory.

    Returns:
        `(observations, actions, rewards, success)`. `observations[i]` is the
        observation seen *before* `actions[i]` was applied. `success` stays
        True when the loop ends without the env signalling termination;
        otherwise it is taken from the final `info` dict
        (`info['success']` set and `info['failure']` not set).
    """
    obs, _ = env.reset()
    observations, actions, rewards = [], [], []
    success = True
    for _ in tqdm(range(max_steps)):
        observations.append(obs.copy())
        action, _ = model.predict(obs, deterministic=True)
        action = int(action)
        actions.append(action)
        obs, reward, terminated, truncated, info = env.step(action)
        rewards.append(reward)
        if terminated or truncated:
            success = info.get('success', False) and not info.get('failure', False)
            break
    return observations, actions, rewards, success


def record_task7_dataset(
    model,
    n_episodes=5,
    should_balance=SHOULD_BALANCE,
    filename='task7_ppo_dataset',
    sim_substeps=DATASET_SIM_SUBSTEPS,
    episode_duration_s=DATASET_DURATION_S,
    video_folder=DATASET_VIDEO_DIR,
    record_video=True,
    max_attempts=None,
):
    """Record full-length successful rollouts of `model` into an `.npz` file.

    Episodes are retried until `n_episodes` of them are both successful and
    exactly `max_steps` long; shorter or failed episodes are discarded.

    Args:
        model: Policy exposing `predict(obs, deterministic=True)`.
        n_episodes: Number of accepted episodes to store.
        should_balance: Forwarded to `Task7PendulumEnv`.
        filename: Stem of the saved file (`DATA_DIR / f'{filename}.npz'`),
            also used as the video name prefix.
        sim_substeps: Forwarded to `Task7PendulumEnv`; together with
            `episode_duration_s` it fixes the number of env steps per episode.
        episode_duration_s: Episode length in seconds.
        video_folder: Folder handed to `RecordVideo`; created if missing.
        record_video: When True, only the first episode (index 0) is
            recorded; when False, no episode triggers a recording.
        max_attempts: Optional cap on the total number of episodes tried.
            `None` (default) keeps retrying indefinitely, as before.

    Returns:
        Path of the saved `.npz` file, which contains `observations`,
        `actions`, `rewards`, `episode_ids`, `max_steps`, `sim_substeps` and
        `episode_duration_s`.

    Raises:
        RuntimeError: If `max_attempts` is set and is exhausted before
            `n_episodes` episodes were accepted.
    """
    video_folder = Path(video_folder)
    video_folder.mkdir(parents=True, exist_ok=True)
    max_steps = _episode_steps(episode_duration_s, sim_substeps)
    print(f'Recording {n_episodes} episodes of {episode_duration_s}s (~{max_steps} env steps)')
    base_env = Task7PendulumEnv(
        max_steps=max_steps,
        should_balance=should_balance,
        gui=False,
        render_mode="rgb_array",
        sim_substeps=sim_substeps,
    )
    env = RecordVideo(
        base_env,
        video_folder=str(video_folder),
        name_prefix=filename,
        episode_trigger=(lambda ep: ep == 0) if record_video else (lambda ep: False),
    )
    all_obs, all_actions, all_rewards, episode_ids = [], [], [], []
    recorded = 0
    attempts = 0
    try:
        while recorded < n_episodes:
            if max_attempts is not None and attempts >= max_attempts:
                raise RuntimeError(
                    f'Only {recorded}/{n_episodes} episodes recorded '
                    f'after {attempts} attempts'
                )
            attempts += 1
            local_obs, local_actions, local_rewards, success = _collect_task7_episode(
                env, model, max_steps
            )
            # Keep only episodes that ran for the full duration without failing.
            if success and len(local_obs) == max_steps:
                all_obs.extend(local_obs)
                all_actions.extend(local_actions)
                all_rewards.extend(local_rewards)
                episode_ids.extend([recorded] * len(local_obs))
                recorded += 1
                print(f'Recorded episode {recorded}/{n_episodes}')
            else:
                print('[WARNING] Episode failed, retrying...')
    finally:
        # Always release the simulator / video writer, even if a rollout raised.
        env.close()
    save_path = DATA_DIR / f'{filename}.npz'
    np.savez(
        save_path,
        observations=np.array(all_obs, dtype=np.float32),
        actions=np.array(all_actions, dtype=np.int64),
        rewards=np.array(all_rewards, dtype=np.float32),
        episode_ids=np.array(episode_ids, dtype=np.int32),
        max_steps=max_steps,
        sim_substeps=sim_substeps,
        episode_duration_s=episode_duration_s,
    )
    print('Dataset saved to', save_path)
    return save_path

In [7]:
# Record 7 accepted episodes of 30 s each with the loaded policy, without
# video. The file name matches the one read by the dataset-inspection cell.
dataset_path = record_task7_dataset(
    model,
    n_episodes=7,
    filename='task7_ppo_balance_rollouts',
    sim_substeps=6,
    episode_duration_s=30.0,
    video_folder=DATASET_VIDEO_DIR,
    record_video=False,
)

  logger.warn(


Recording 7 episodes of 30.0s (~1200 env steps)


100%|█████████▉| 1199/1200 [00:02<00:00, 549.77it/s]


Recorded episode 1/7


100%|█████████▉| 1199/1200 [00:02<00:00, 539.69it/s]


Recorded episode 2/7


100%|█████████▉| 1199/1200 [00:02<00:00, 482.32it/s]


Recorded episode 3/7


100%|█████████▉| 1199/1200 [00:02<00:00, 446.95it/s]


Recorded episode 4/7


100%|█████████▉| 1199/1200 [00:02<00:00, 459.12it/s]


Recorded episode 5/7


100%|█████████▉| 1199/1200 [00:02<00:00, 448.81it/s]


Recorded episode 6/7


100%|█████████▉| 1199/1200 [00:02<00:00, 458.42it/s]

Recorded episode 7/7
Dataset saved to data/task7_ppo_balance_rollouts.npz





In [5]:
# Best-effort cleanup of any environments still open from the cells above.
# The original cell closed `train_env` twice; the second block is assumed to
# have been meant for `eval_env`, which is otherwise never closed before the
# LQR section opens another GUI env — TODO confirm.
for _env_name in ('train_env', 'eval_env'):
    _env = globals().get(_env_name)
    if _env is None:
        # Not created in this session; nothing to close.
        continue
    try:
        _env.close()
    except Exception as exc:  # keep cleanup non-fatal, but do not hide the cause
        print(f'[WARNING] Could not close {_env_name}: {exc}')

## Task 7 dataset inspection

In [8]:
DATASET_FILE = DATA_DIR / 'task7_ppo_balance_rollouts.npz'
if not DATASET_FILE.exists():
    raise FileNotFoundError(f'Missing dataset: {DATASET_FILE}')

# `np.load` on an .npz returns an archive object that keeps the file open;
# the context manager closes it once the arrays have been read into memory.
with np.load(DATASET_FILE) as dataset:
    observations = dataset['observations']
    actions = dataset['actions']
    episode_ids = dataset['episode_ids']
    # Metadata entries may be missing from a dataset file; fall back to the
    # notebook defaults. Membership tests are used instead of `.get`, which
    # presumably is not available on every NumPy version's NpzFile.
    sim_substeps_ds = int(
        dataset['sim_substeps'] if 'sim_substeps' in dataset else DATASET_SIM_SUBSTEPS
    )
    episode_duration_ds = float(
        dataset['episode_duration_s'] if 'episode_duration_s' in dataset else DATASET_DURATION_S
    )
    max_steps_ds = int(
        dataset['max_steps'] if 'max_steps' in dataset else observations.shape[0]
    )

print('Observations:', observations.shape)
print('Actions:', actions.shape)
print('Episodes IDs:', episode_ids.shape)
print('sim_substeps:', sim_substeps_ds, 'episode_duration:', episode_duration_ds, 'max_steps:', max_steps_ds)

Observations: (8400, 14)
Actions: (8400,)
Episodes IDs: (8400,)
sim_substeps: 6 episode_duration: 30.0 max_steps: 1200


### Build transitions

In [9]:
# Continuous command associated with each discrete action index (0, 1, 2).
ACTION_VALUES = np.array([-Y_MAX, 0.0, Y_MAX], dtype=np.float32)

# Pair each sample with its successor, dropping pairs that straddle two
# different episodes.
same_episode = episode_ids[:-1] == episode_ids[1:]
X, X_next = observations[:-1][same_episode], observations[1:][same_episode]

# Map the action indices to their continuous values, as a column vector.
U = ACTION_VALUES[actions[:-1][same_episode]].reshape(-1, 1)
print('Transitions:', X.shape, U.shape, X_next.shape)

Transitions: (8393, 14) (8393, 1) (8393, 14)


### Keep samples near the upright equilibrium

In [10]:
# Columns 0 and 1 of the observation are used as angle and angular velocity
# (assumed from the variable names — confirm against the env's observation layout).
theta, theta_dot = X[:, 0], X[:, 1]

# Keep only transitions that start close to the equilibrium.
local_mask = np.logical_and(np.abs(theta) < 0.3, np.abs(theta_dot) < 1.5)
X_loc, U_loc, X_next_loc = X[local_mask], U[local_mask], X_next[local_mask]
print('Local samples:', X_loc.shape[0], '/', X.shape[0])

Local samples: 7273 / 8393


### Fit a linear model

In [11]:
# Least-squares fit of x_next ≈ A x + B u on the local samples.
nx, nu = X_loc.shape[1], U_loc.shape[1]

# Regressors are [x, u] stacked column-wise; targets are the next states.
Z = np.concatenate([X_loc, U_loc], axis=1)
Y = X_next_loc
W = np.linalg.lstsq(Z, Y, rcond=None)[0]

# W maps [x, u] rows to x_next rows, so its transposed row slices give A and B.
A_lin, B_lin = W[:nx].T, W[nx:].T
print('A shape:', A_lin.shape)
print('B shape:', B_lin.shape)

A shape: (14, 14)
B shape: (14, 1)


### Design an LQR controller on the linearized model

In [12]:
# State cost: a small weight on every state component, with larger weights on
# the first two (the ones used as theta / theta_dot in the local mask).
state_weights = np.full(nx, 0.1)
state_weights[:2] = (8.0, 1.0)
Q, R = np.diag(state_weights), np.array([[0.5]])

# Discrete-time LQR on the fitted (A, B); returns the gain, the Riccati
# solution and the closed-loop eigenvalues.
K_lqr, S_lqr, eigvals_lqr = control.dlqr(A_lin, B_lin, Q, R)
print('K shape:', K_lqr.shape)
print('Eigenvalues:', eigvals_lqr)

K shape: (1, 14)
Eigenvalues: [-0.55807994+0.j         -0.12043677+0.j          0.01876407+0.j
  0.20117272+0.j          0.31488259+0.08737926j  0.31488259-0.08737926j
  0.33036833+0.j          0.7628735 +0.26921604j  0.7628735 -0.26921604j
  0.94788135+0.10315928j  0.94788135-0.10315928j  0.88290375+0.j
  0.9832996 +0.j          0.99999873+0.j        ]


### Rollout the LQR controller in the simulator

In [13]:
class Task7LQRPolicy:
    """Discrete-action policy built from a linear state-feedback gain.

    Provides the `predict(obs, deterministic)` call shape that
    `rollout_episode` uses, so it can be rolled out like the PPO model.

    Args:
        K: Feedback gain for a single control input, shape `(1, n)` or `(n,)`.
        action_values: Continuous command associated with each discrete
            action index. Defaults to the notebook-level `ACTION_VALUES`.
    """

    def __init__(self, K, action_values=None):
        self.K = K
        # The global table is only looked up when no explicit one is given.
        if action_values is None:
            action_values = ACTION_VALUES
        self.action_values = np.asarray(action_values, dtype=float)

    def predict(self, obs, deterministic=True):
        """Return `(action_index, None)` for observation `obs`.

        The control `u = -K @ obs` is clipped to the range of the available
        action values and mapped to the index of the nearest one.
        `deterministic` is accepted for interface compatibility and ignored.
        """
        gain = np.asarray(self.K, dtype=float).reshape(-1)
        state = np.asarray(obs, dtype=float).reshape(-1)
        # A 1-D dot product yields a true scalar; calling float() on a (1, 1)
        # array is deprecated in recent NumPy releases.
        u = -float(np.dot(gain, state))
        clipped = np.clip(u, self.action_values.min(), self.action_values.max())
        idx = int(np.argmin(np.abs(self.action_values - clipped)))
        return idx, None

# Wrap the LQR gain as a policy and open a GUI env to evaluate it.
# No `max_steps` is passed here, so the env's own default applies; the rollout
# below is still capped by `rollout_episode`'s `max_steps` argument.
lqr_policy = Task7LQRPolicy(K_lqr)
eval_env_lqr = Task7PendulumEnv(gui=True, should_balance=SHOULD_BALANCE, sim_substeps=SIM_SUBSTEPS)


startThreads creating 1 threads.
starting thread 0
started thread 0 
argc=2
argv[0] = --unused
argv[1] = --start_demo_name=Physics Server
ExampleBrowserThreadFunc started
X11 functions dynamically loaded using dlopen/dlsym OK!
X11 functions dynamically loaded using dlopen/dlsym OK!
Creating context
Created GL 3.3 context
Direct GLX rendering context obtained
Making context current
GL_VENDOR=Intel
GL_RENDERER=Mesa Intel(R) Iris(R) Xe Graphics (RPL-U)
GL_VERSION=4.6 (Core Profile) Mesa 25.0.7-0ubuntu0.24.04.1
GL_SHADING_LANGUAGE_VERSION=4.60
pthread_getconcurrency()=0
Version = 4.6 (Core Profile) Mesa 25.0.7-0ubuntu0.24.04.1
Vendor = Intel
Renderer = Mesa Intel(R) Iris(R) Xe Graphics (RPL-U)
b3Printf: Selected demo: Physics Server
startThreads creating 1 threads.
starting thread 0
started thread 0 
MotionThreadFunc thread started
ven = Intel
Workaround for some crash in the Intel OpenGL driver on Linux/Ubuntu
ven = Intel
Workaround for some crash in the Intel OpenGL driver on Linux/Ubunt

In [None]:
# Run one episode with the LQR policy and report its length and return.
rewards_lqr, infos_lqr = rollout_episode(eval_env_lqr, lqr_policy)
print('LQR episode length:', len(rewards_lqr))
print('LQR total reward:', rewards_lqr.sum())

In [15]:
# Close the LQR evaluation env (the captured output below shows the GUI threads shutting down).
eval_env_lqr.close()

numActiveThreads = 0
stopping threads
Thread with taskId 0 exiting
Thread TERMINATED
destroy semaphore
semaphore destroyed
destroy main semaphore
main semaphore destroyed
finished
numActiveThreads = 0
btShutDownExampleBrowser stopping threads
Thread with taskId 0 exiting
destroy semaphore
semaphore destroyed
Thread TERMINATED
destroy main semaphore
main semaphore destroyed
