# Stable Baselines3 Tutorial - Getting Started
- Create a basic RL model.
- Train it in a standard Gym env (CartPole-v1).
- Evaluate it with a hand-written helper and with the SB3 helper.

Also:
- Record a video of the agent.

## Definition of model and env

In [2]:
# Prerequisites: install ffmpeg, freeglut3-dev and xvfb beforehand (needed for visualization).

# Imports
import gym
import numpy as np

# Import the RL model.
from stable_baselines3 import PPO # RL algorithm.
# from stable_baselines3.ppo.policies import MlpPolicy # Not needed if the policy is specified by name at model creation.

In [3]:
# Environment: a standard Gym task.
# The agent's action space is deduced from the env's action space.
env = gym.make('CartPole-v1')

# Agent:
# - PPO is an actor-critic method: it uses a value function to improve the policy.
# - 'MlpPolicy' is used because a CartPole observation is a feature vector, not an image.
# NOTE: pass the policy by name rather than as an imported class (PPO(MlpPolicy, env, verbose=0)).
# Some algorithms, like SAC, have their own MlpPolicy, so the string form is the recommended option.
model = PPO(policy='MlpPolicy', env=env, verbose=0)

## Training and evaluation

In [4]:
# Helper function to evaluate the agent.
def evaluate(model, num_episodes=100):
    """
    Play several episodes with the agent and report the average episode reward.

    :param model: (RL model) agent providing ``get_env()`` and ``predict(obs)``
    :param num_episodes: (int) number of episodes to play
    :return: mean over episodes of the summed rewards
    """
    vec_env = model.get_env()  # Evaluate on the env attached to the model.
    returns_per_episode = []
    for _ in range(num_episodes):
        step_rewards = []
        obs = vec_env.reset()  # Fresh episode; first observation.
        while True:
            # The second value returned by predict (states) is only useful with LSTM policies.
            action, _state = model.predict(obs)
            # The env is vectorized, so step() returns arrays.
            obs, reward, done, _info = vec_env.step(action)
            step_rewards.append(reward)
            if done:
                break

        returns_per_episode.append(sum(step_rewards))

    mean_episode_reward = np.mean(returns_per_episode)
    print(f'Mean reward: {mean_episode_reward}, Num episodes: {num_episodes}')

    return mean_episode_reward

In [15]:
# Baseline: score the still-untrained (random) agent over 100 episodes with the
# evaluate() helper defined above, and keep the result in mean_reward_before_train.
mean_reward_before_train = evaluate(model, num_episodes=100)

Mean reward: 22.700000762939453, Num episodes: 100


In [16]:
# Same evaluation, this time with the helper that ships with Stable Baselines 3.
# It yields both the mean and the standard deviation of the reward over the evaluation episodes.
# NOTE(review): the model is still untrained here, yet the score recorded for this cell is far
# higher than the one printed by evaluate() — presumably because evaluate_policy acts
# deterministically by default while evaluate() samples actions; confirm against the SB3 docs.
from stable_baselines3.common.evaluation import evaluate_policy

mean_reward, std_reward = evaluate_policy(model=model, env=env, n_eval_episodes=100)
print(f'Mean reward: {mean_reward:.2f} +/- {std_reward:.2f}')



Mean reward: 114.24 +/- 15.04


In [17]:
# Training: let the agent learn for 10,000 timesteps.
model.learn(total_timesteps=10_000)

# Evaluation: score the trained agent over 100 episodes with the SB3 helper
# (evaluate_policy is imported in an earlier cell).
mean_reward, std_reward = evaluate_policy(model=model, env=env, n_eval_episodes=100)
print(f'Mean reward: {mean_reward:.2f} +/- {std_reward:.2f}')

Mean reward: 432.05 +/- 95.85


## Video recording

In [18]:
# Set up a fake display; otherwise rendering will fail.
import os
# Start the Xvfb virtual X server as display :1 (screen 0, 1024x768, 24-bit colour).
# The trailing "&" makes the shell launch it in the background, so os.system() returns
# right away and any Xvfb error message shows up later in the notebook output.
os.system("Xvfb :1 -screen 0 1024x768x24 &")
# Point DISPLAY at display :1 for everything run afterwards in this kernel.
# NOTE(review): DISPLAY is set unconditionally, even when Xvfb could not start
# (see the "Fatal server error" in the recorded output) — consider checking that first.
os.environ['DISPLAY'] = ':1'

In [19]:
import base64
from pathlib import Path

from IPython import display as ipythondisplay

def show_videos(video_path='', prefix=''):
  """
  Embed every matching ``.mp4`` file of a folder in the notebook output.

  Taken from https://github.com/eleurent/highway-env

  :param video_path: (str) Path to the folder containing videos
  :param prefix: (str) Filter the videos, showing only the ones starting with this prefix
  """
  from html import escape  # Local import: only needed by this helper.

  video_tags = []
  # glob() yields files in arbitrary, filesystem-dependent order; sort them so the
  # videos are always displayed in the same order.
  for mp4 in sorted(Path(video_path).glob("{}*.mp4".format(prefix))):
    # Inline the file as a base64 data URI so the video is part of the cell output.
    video_b64 = base64.b64encode(mp4.read_bytes())
    # The file path ends up inside an HTML attribute, so it must be escaped.
    video_tags.append('''<video alt="{}" autoplay 
                    loop controls style="height: 400px;">
                    <source src="data:video/mp4;base64,{}" type="video/mp4" />
                </video>'''.format(escape(str(mp4)), video_b64.decode('ascii')))
  ipythondisplay.display(ipythondisplay.HTML(data="<br>".join(video_tags)))

_XSERVTransmkdir: ERROR: euid != 0,directory /tmp/.X11-unix will not be created.
_XSERVTransSocketUNIXCreateListener: mkdir(/tmp/.X11-unix) failed, errno = 2
_XSERVTransMakeAllCOTSServerListeners: failed to create listener for local
(EE) 
Fatal server error:
(EE) Cannot establish any listening sockets - Make sure an X server isn't already running(EE) 


In [20]:
from stable_baselines3.common.vec_env import VecVideoRecorder, DummyVecEnv

def record_video(env_id, model, video_length=500, prefix='', video_folder='videos/'):
  """
  Run the agent in a fresh environment and record the rollout as a video.

  :param env_id: (str) id given to ``gym.make`` to build the environment
  :param model: (RL model) agent whose ``predict`` method chooses the actions
  :param video_length: (int) number of steps to run and record
  :param prefix: (str) name prefix given to the video recorder
  :param video_folder: (str) folder given to the video recorder
  """
  eval_env = DummyVecEnv([lambda: gym.make(env_id)])
  # Start the video at step=0 and record `video_length` steps
  eval_env = VecVideoRecorder(eval_env, video_folder=video_folder,
                              record_video_trigger=lambda step: step == 0, video_length=video_length,
                              name_prefix=prefix)

  try:
    obs = eval_env.reset()
    for _ in range(video_length):
      action, _state = model.predict(obs)
      obs, _, _, _ = eval_env.step(action)
  finally:
    # Close the video recorder even if the rollout fails, so the recorder
    # and the wrapped environment are always released.
    eval_env.close()

In [21]:
# Record 500 steps of the agent on CartPole-v1 and save the video; no video_folder is
# passed, so record_video's default folder 'videos/' is used.
# The result can be viewed inline with the show_videos() helper defined above,
# e.g. show_videos('videos/', prefix='ppo2-cartpole').
record_video('CartPole-v1', model, video_length=500, prefix='ppo2-cartpole')

Saving video to /Users/AlbertoH/projects/sb3-tests/videos/ppo2-cartpole-step-0-to-step-500.mp4


## Alternative: one-line training

In [None]:
# Create and train a model in a single statement.
# The policy class is inferred from its name; the environment is automatically created from its id.
# This works because both are registered.
# NOTE: this rebinds `model`, replacing the agent trained in the cells above.

model = PPO('MlpPolicy', "CartPole-v1", verbose=1).learn(1000)