In [None]:
!apt-get install ffmpeg freeglut3-dev xvfb  # For visualization
!pip install stable-baselines3[extra]
!pip install pyglet==1.5.27

Reading package lists... Done
Building dependency tree       
Reading state information... Done
freeglut3-dev is already the newest version (2.8.1-3).
ffmpeg is already the newest version (7:3.4.11-0ubuntu0.1).
xvfb is already the newest version (2:1.19.6-1ubuntu4.12).
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 7 not upgraded.
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyglet==1.5.27
  Downloading pyglet-1.5.27-py3-none-any.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 6.6 MB/s 
[?25hInstalling collected packages: pyglet
Successfully installed pyglet-1.5.27


In [None]:
import stable_baselines3
stable_baselines3.__version__

'1.6.2'

In [None]:
import gym
import numpy as np
gym.__version__

'0.21.0'

In [None]:
from stable_baselines3 import PPO

In [None]:
from stable_baselines3.ppo import MlpPolicy

# The Pendulum-v1 environment
Description by gym: https://www.gymlibrary.dev/environments/classic_control/pendulum/
See more in the [gym code](https://github.com/openai/gym/blob/master/gym/envs/classic_control/pendulum.py).
See here to understand the forces to apply with the arrows: https://www.youtube.com/watch?v=F0kRQ-s4cWA
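The reward penalizes the squared angle from the upright position (plus smaller penalties on angular velocity and applied torque), so it is always negative or zero and is highest (0) when the pendulum is upright and still.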

In [None]:
# Environment the agent is trained on.
env = gym.make('Pendulum-v1')

# PPO agent with an MLP policy; every hyperparameter not listed keeps its default.
model = PPO(policy=MlpPolicy, env=env, verbose=0)

In [None]:
from stable_baselines3.common.evaluation import evaluate_policy

Let's evaluate the untrained agent; it should behave like a random agent.

In [None]:
# Use a separate environment for evaluation
eval_env = gym.make('Pendulum-v1')

# Untrained agent (policy as created above), before any call to learn()
mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=100)

print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")



mean_reward:-1201.89 +/- 337.40


## Train the agent and evaluate it

In [None]:
# Train the agent for 150000 steps
model.learn(total_timesteps=150000)

<stable_baselines3.ppo.ppo.PPO at 0x7fbc4b3e8610>

In [None]:
# Evaluate the trained agent on the separate evaluation environment (100 episodes)
mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=100)

# Report the two values returned by evaluate_policy
print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")

mean_reward:-967.14 +/- 75.41


### Prepare video recording

In [None]:
# Set up fake display; otherwise rendering will fail
import os
# Start Xvfb on display :1 with a 1024x768, 24-bit screen; the trailing "&" runs it in the background
os.system("Xvfb :1 -screen 0 1024x768x24 &")
# Make this process (and anything it launches) use display :1
os.environ['DISPLAY'] = ':1'

In [None]:
import base64
from pathlib import Path

from IPython import display as ipythondisplay

def show_videos(video_path='', prefix=''):
  """
  Display every matching ``.mp4`` file of a folder inline in the notebook.

  Taken from https://github.com/eleurent/highway-env

  :param video_path: (str) Path to the folder containing videos
  :param prefix: (str) Filter the videos, showing only the ones starting with this prefix
  """
  html = []
  # Path.glob yields files in arbitrary order; sort so the videos always appear in the same order.
  for mp4 in sorted(Path(video_path).glob("{}*.mp4".format(prefix))):
    # Embed the file content as a base64 data URI inside the <video> tag.
    video_b64 = base64.b64encode(mp4.read_bytes())
    html.append('''<video alt="{}" autoplay 
                    loop controls style="height: 400px;">
                    <source src="data:video/mp4;base64,{}" type="video/mp4" />
                </video>'''.format(mp4, video_b64.decode('ascii')))
  # A single HTML output holding all the players, separated by line breaks.
  ipythondisplay.display(ipythondisplay.HTML(data="<br>".join(html)))

In [None]:
from stable_baselines3.common.vec_env import VecVideoRecorder, DummyVecEnv

def record_video(env_id, model, video_length=500, prefix='', video_folder='videos/'):
  """
  Roll out ``model`` in a fresh ``env_id`` environment and record the rollout as a video.

  :param env_id: (str) Gym id of the environment to create and record
  :param model: (RL model) Agent whose ``predict`` method chooses the actions
  :param video_length: (int) Number of steps to run and record
  :param prefix: (str) Name prefix passed to the video recorder
  :param video_folder: (str) Folder passed to the video recorder
  """
  # Create the environment named by the caller instead of a fixed one.
  eval_env = DummyVecEnv([lambda: gym.make(env_id)])
  # Start the video at step=0 and record video_length steps
  eval_env = VecVideoRecorder(eval_env, video_folder=video_folder,
                              record_video_trigger=lambda step: step == 0, video_length=video_length,
                              name_prefix=prefix)

  try:
    obs = eval_env.reset()
    for _ in range(video_length):
      action, _ = model.predict(obs)
      obs, _, _, _ = eval_env.step(action)
  finally:
    # Close the video recorder even if the rollout raised
    eval_env.close()

### Visualize trained agent



In [None]:
# Record 500 steps of the trained PPO agent; the video file name starts with 'ppo-Pendulum-v1'
record_video('Pendulum-v1', model, video_length=500, prefix='ppo-Pendulum-v1')

Saving video to /content/videos/ppo-Pendulum-v1-step-0-to-step-500.mp4


In [None]:
# Display every video in 'videos' whose file name starts with 'ppo-Pendulum-v1'
show_videos('videos', prefix='ppo-Pendulum-v1')

*It* does not converge either, but it behaves a little better than the untrained agent.
See the discussion here: https://github.com/DLR-RM/stable-baselines3/issues/25
Don't expect the default hyperparameter values to work well.
To compare, you can record a video of the initial model by creating a fresh model and recording it without training.

In [None]:
# Fresh PPO agent that is never trained; it serves as the baseline for comparison.
untrained_model = PPO(policy=MlpPolicy, env=env, verbose=0)

In [None]:
# Record 500 steps of the untrained agent, then display the resulting video
record_video('Pendulum-v1', untrained_model, video_length=500, prefix='ppo-Pendulum-v1-untrained')
show_videos('videos', prefix='ppo-Pendulum-v1-untrained')

Saving video to /content/videos/ppo-Pendulum-v1-untrained-step-0-to-step-500.mp4


In [None]:
# PPO with explicitly chosen hyperparameters instead of the defaults.
# The values match the Pendulum-v1 listing at the end of this notebook,
# except that a single environment is used here rather than n_envs: 4.
super_model = PPO(
    MlpPolicy,
    env,
    verbose=0,
    learning_rate=1e-3,
    n_steps=1024,
    n_epochs=10,
    gamma=0.9,
    gae_lambda=0.95,
    clip_range=0.2,
    ent_coef=0.0,
    use_sde=True,
    sde_sample_freq=4,
)

In [None]:
# total_timesteps is a count of environment steps, so pass an int rather than the float 1e5.
super_model.learn(total_timesteps=100_000)

<stable_baselines3.ppo.ppo.PPO at 0x7fbbf3855400>

In [None]:
# Agent with the chosen hyperparameters, after training
mean_reward, std_reward = evaluate_policy(super_model, eval_env, n_eval_episodes=100)
print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")



mean_reward:-282.11 +/- 329.46


In [None]:
# Record 500 steps of the agent trained with the chosen hyperparameters, then display the video
record_video('Pendulum-v1', super_model, video_length=500, prefix='ppo-Pendulum-v1-super')
show_videos('videos', prefix='ppo-Pendulum-v1-super')

Saving video to /content/videos/ppo-Pendulum-v1-super-step-0-to-step-500.mp4


Pendulum-v1:

  n_envs: 4

  n_timesteps: !!float 1e5

  policy: 'MlpPolicy'

  n_steps: 1024

  gae_lambda: 0.95

  gamma: 0.9

  n_epochs: 10

  ent_coef: 0.0

  learning_rate: !!float 1e-3

  clip_range: 0.2

  use_sde: True

  sde_sample_freq: 4
  