In [1]:
"""
The simplest run of a gym possible. The agent takes random actions for the specified steps, and doesn't 'learn' anything. No ANN is built here.
"""
import gymnasium as gym
import plotly.express as px
import moviepy as mpy

env = gym.make("CartPole-v1", render_mode='rgb_array')

observation, info = env.reset(seed=42)
steps = 50 # run it this many steps regardless of what happens
render_rate = steps//4 
terminated = truncated = False
for i in range(steps):
    action = env.action_space.sample() # FIXME: choose a random action here instead of None. Hint: use the action space of the environment
    observation, reward, terminated, truncated, info = env.step(action)
    if i % render_rate == 0:
        print(f'Render of step {i}: ') 
        img = px.imshow(env.render()) # peek at the environment
        img.show()

    if terminated or truncated: # true if was stopped--ie the pole tipped too far or max episode length reached
        print(f'Render of TERMINATED step {i}: ')
        img = px.imshow(env.render())
        img.show()
        observation, info = env.reset() # reset environment for new episode
env.close()

Render of step 0: 


Render of TERMINATED step 10: 


Render of step 12: 


Render of TERMINATED step 20: 


Render of step 24: 


Render of step 36: 


Render of step 48: 


In [3]:
"""Here is one random episode that records a video of the environment"""
import moviepy as mpy

episode_reward = 0
reward = 0
terminated = truncated = False
obs, info = env.reset()
img_list = []
while not terminated and not truncated:
    action = env.action_space.sample()
    obs, reward, terminated, truncated, info = env.step(action)
    episode_reward += reward
    img_list.append(env.render())

print('episode reward was: ', episode_reward)
clip = mpy.ImageSequenceClip(img_list, fps=30)
clip.write_videofile('cartpolerandom.mp4', logger=None)

episode reward was:  21.0


In [4]:
"""Here is one that uses the pygame renderer"""
from time import sleep
import gymnasium as gym
env2 = gym.make("CartPole-v1", render_mode='human') # human renderer uses pygame
episode_reward = 0
reward = 0
step = 0
terminated = truncated = False
obs, info = env2.reset()
while not terminated and not truncated:
    action = env2.action_space.sample()
    obs, reward, terminated, truncated, info = env2.step(action)
    episode_reward += reward
    env2.render()
    step += 1
    sleep(0.01)  # sleep to slow down the rendering a bit

print('episode reward was: ', episode_reward)
print('steps taken: ', step)
env2.close()

episode reward was:  23.0
steps taken:  23


In [5]:
"""
Using stable_baselines3 to learn something in the most simple example. 
An ANN is actually built here. By calling PPO alg it will be used to estimate a policy and updated according to PPO
This will take a couple minutes or so to execute this cell
NOTE you will need to click near the bottom of this notebook cell output to make it "view as scrollable element" since the output is very long.
"""
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy 

env = make_vec_env("CartPole-v1", n_envs=10)
model = PPO("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=100_000, progress_bar=True)


2025-07-15 21:08:06.488020: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752613686.498995   35565 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752613686.502184   35565 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1752613686.510264   35565 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1752613686.510271   35565 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1752613686.510273   35565 computation_placer.cc:177] computation placer alr

Using cuda device


Output()


You are trying to run PPO on the GPU, but it is primarily intended to run on the CPU when not using a CNN policy (you are using ActorCriticPolicy which should be a MlpPolicy). See https://github.com/DLR-RM/stable-baselines3/issues/1245 for more info. You can pass `device='cpu'` or `export CUDA_VISIBLE_DEVICES=` to force using the CPU.Note: The model will train, but the GPU utilization will be poor and the training might take longer than on CPU.



---------------------------------
| rollout/           |          |
|    ep_len_mean     | 23.1     |
|    ep_rew_mean     | 23.1     |
| time/              |          |
|    fps             | 8599     |
|    iterations      | 1        |
|    time_elapsed    | 2        |
|    total_timesteps | 20480    |
---------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 36.6        |
|    ep_rew_mean          | 36.6        |
| time/                   |             |
|    fps                  | 2720        |
|    iterations           | 2           |
|    time_elapsed         | 15          |
|    total_timesteps      | 40960       |
| train/                  |             |
|    approx_kl            | 0.015917644 |
|    clip_fraction        | 0.277       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.68       |
|    explained_variance   | -0.00462    |
|    learning_rate        | 0.0003      |
|    loss                 | 2.94        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.029      |
|    value_loss           | 11.2        |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 94.3        |
|    ep_rew_mean          | 94.3        |
| time/                   |             |
|    fps                  | 2223        |
|    iterations           | 3           |
|    time_elapsed         | 27          |
|    total_timesteps      | 61440       |
| train/                  |             |
|    approx_kl            | 0.020169044 |
|    clip_fraction        | 0.252       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.631      |
|    explained_variance   | 0.396       |
|    learning_rate        | 0.0003      |
|    loss                 | 13.1        |
|    n_updates            | 20          |
|    policy_gradient_loss | -0.0391     |
|    value_loss           | 27.6        |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 181         |
|    ep_rew_mean          | 181         |
| time/                   |             |
|    fps                  | 2033        |
|    iterations           | 4           |
|    time_elapsed         | 40          |
|    total_timesteps      | 81920       |
| train/                  |             |
|    approx_kl            | 0.012308479 |
|    clip_fraction        | 0.162       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.583      |
|    explained_variance   | 0.411       |
|    learning_rate        | 0.0003      |
|    loss                 | 14.1        |
|    n_updates            | 30          |
|    policy_gradient_loss | -0.0257     |
|    value_loss           | 38.6        |
-----------------------------------------


-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 240         |
|    ep_rew_mean          | 240         |
| time/                   |             |
|    fps                  | 1931        |
|    iterations           | 5           |
|    time_elapsed         | 53          |
|    total_timesteps      | 102400      |
| train/                  |             |
|    approx_kl            | 0.014124905 |
|    clip_fraction        | 0.15        |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.537      |
|    explained_variance   | 0.589       |
|    learning_rate        | 0.0003      |
|    loss                 | 11.8        |
|    n_updates            | 40          |
|    policy_gradient_loss | -0.0162     |
|    value_loss           | 27.6        |
-----------------------------------------


<stable_baselines3.ppo.ppo.PPO at 0x7ff32b866ad0>

In [None]:
"""Here is one that uses the pygame renderer"""
from time import sleep
import gymnasium as gym
env3 = gym.make("CartPole-v1", render_mode='human') # human renderer uses pygame
episode_reward = 0
reward = 0
step = 0
obs, info = env3.reset()
terminated = truncated = False
while not terminated and not truncated:
    action = model.predict(obs, deterministic=False)[0]  # use the trained model to predict the action
    obs, reward, terminated, truncated, info = env3.step(action)
    episode_reward += reward
    env3.render()
    step += 1
    sleep(0.01)  # sleep to slow down the rendering a bit

print('episode reward was: ', episode_reward)
print('steps taken: ', step)
env3.close()

episode reward was:  293.0
steps taken:  293


In [None]:
"""Here is one episode using the trained algorithm to record a video of the environment"""
import moviepy as mpy
episode_reward = 0
reward = 0
step = 0
img_list = []
env2 = gym.make("CartPole-v1", render_mode='rgb_array')
obs, info = env2.reset()
terminated = truncated = False
while not terminated and not truncated:
    action = model.predict(obs, deterministic=False)[0]  # use the trained model to predict the action
    obs, reward, terminated, truncated, info = env2.step(action)
    episode_reward += reward
    img_list.append(env2.render())
    step += 1
clip = mpy.ImageSequenceClip(img_list, fps=30)
clip.write_videofile('cartpoleppo.mp4', logger=None)
print('episode reward was: ', episode_reward)
print('steps taken: ', step)