# Testing RL agents with Stable Baselines 3

Install the dependencies with pip (for example, inside a fresh conda environment):

```
pip install highway-env
pip install stable-baselines3[extra]
pip install git+https://github.com/DLR-RM/stable-baselines3@feat/gymnasium-support

```

We can test every algorithm that supports discrete action spaces out of the box. The *Implemented Algorithms* table in the [Stable Baselines 3 README](https://github.com/DLR-RM/stable-baselines3/blob/master/README.md) gives an overview:

| **Name**      | `Discrete`     |
| ------------  | ------------------ |
| ARS           | ✓ |
| A2C           | ✓ |
| DDPG          | ✗ |
| DQN           | ✓ |
| HER           | ✓ |
| PPO           | ✓ |
| QR-DQN        | ✓ |
| RecurrentPPO  | ✓ |
| SAC           | ✗ |
| TD3           | ✗ |
| TQC           | ✗ |
| TRPO          | ✓ |
| MaskablePPO   | ✓ |

In [1]:
import gymnasium as gym
from stable_baselines3 import (
    A2C,
    DDPG,
    DQN,
    PPO,
    SAC,
    TD3,
    HerReplayBuffer
)
from PIL import Image
import os

In [2]:
# Shared environment for the whole notebook: the model cells below are built on it
# and render_model() rolls out in it. render_mode="rgb_array" is requested because
# render_model() turns the result of env.render() into an image via Image.fromarray().
env = gym.make("highway-fast-v0", render_mode="rgb_array")

def render_model(model, output_folder="output", num_episodes=100, render_env=None):
    """Roll out ``model`` and save every rendered frame as a numbered JPEG.

    Frames are numbered continuously across episodes (``0000000.jpg``,
    ``0000001.jpg``, ...) so the folder can be assembled into a single video.

    Args:
        model: Agent exposing ``predict(obs, deterministic=True)``; actions are
            always chosen deterministically.
        output_folder: Directory that receives the frames. Created (including
            parents) if it does not exist yet.
        num_episodes: Number of episodes to roll out.
        render_env: Environment to roll out in. When ``None`` (the default) the
            notebook-level ``env`` is used, which matches the previous behaviour.

    Note:
        The environment is closed when the function returns, also when a
        rollout step raises, so its rendering resources are always released.
    """
    # Fall back to the notebook-global environment so existing calls keep working.
    rollout_env = env if render_env is None else render_env

    # exist_ok avoids the check-then-create race of os.path.exists + os.makedirs.
    os.makedirs(output_folder, exist_ok=True)

    frame_counter = 0
    try:
        for _ in range(num_episodes):
            done = truncated = False
            obs, info = rollout_env.reset()
            # An episode ends either on termination or on truncation.
            while not (done or truncated):
                action, _states = model.predict(obs, deterministic=True)
                obs, reward, done, truncated, info = rollout_env.step(action)
                img = rollout_env.render()
                frame_path = os.path.join(output_folder, f"{frame_counter:07d}.jpg")
                Image.fromarray(img).save(frame_path)
                frame_counter += 1
    finally:
        rollout_env.close()

  logger.warn(


In [2]:
'''
    Reference: https://github.com/Farama-Foundation/HighwayEnv/blob/master/scripts/sb3_highway_dqn.py
'''
# Everything this run produces (TensorBoard logs, saved weights, rendered frames)
# goes below this one folder.
dqn_dir = "trained_models/stable_baselines3/highway_dqn/"

# Hyperparameters taken from the reference script linked above.
dqn_hyperparams = dict(
    policy_kwargs={"net_arch": [256, 256]},
    learning_rate=5e-4,
    buffer_size=15000,
    learning_starts=200,
    batch_size=32,
    gamma=0.8,
    train_freq=1,
    gradient_steps=1,
    target_update_interval=50,
    verbose=1,
    tensorboard_log=dqn_dir,
)

model_dqn = DQN('MlpPolicy', env, **dqn_hyperparams)
model_dqn.learn(20_000)
model_dqn.save(dqn_dir + "model_dqn")
render_model(model_dqn, output_folder=dqn_dir + "output")

  logger.warn(


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to trained_models/stable_baselines3/highway_dqn/DQN_2
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 8.25     |
|    ep_rew_mean      | 6.25     |
|    exploration_rate | 0.984    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 27       |
|    time_elapsed     | 1        |
|    total_timesteps  | 33       |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 10.8     |
|    ep_rew_mean      | 8.16     |
|    exploration_rate | 0.959    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 28       |
|    time_elapsed     | 3        |
|    total_timesteps  | 86       |
----------------------------------
----------------------------------
| rollout/            |          |
|  

In [6]:
'''
    Reference: https://github.com/Farama-Foundation/HighwayEnv/blob/master/scripts/sb3_highway_ppo.py
'''
# Single root folder for this run's TensorBoard logs, saved weights and frames.
ppo_dir = "trained_models/stable_baselines3/highway_ppo/"

# NOTE(review): n_cpu presumably mirrors a parallel-env count in the reference
# script; only one env is passed here, so it merely scales n_steps
# (64 * 12 // 6 = 128 steps per rollout).
n_cpu = 6
batch_size = 64
model_ppo = PPO("MlpPolicy",
            env,
            # Separate actor (pi) and critic (vf) networks are given as a plain
            # dict. The list-wrapped form [dict(pi=..., vf=...)] is deprecated
            # since SB3 1.8 and only produces a warning before being unwrapped.
            policy_kwargs=dict(net_arch=dict(pi=[256, 256], vf=[256, 256])),
            n_steps=batch_size * 12 // n_cpu,
            batch_size=batch_size,
            n_epochs=10,
            learning_rate=5e-4,
            gamma=0.8,
            verbose=2,
            tensorboard_log=ppo_dir)
model_ppo.learn(total_timesteps=20_000)
model_ppo.save(ppo_dir + "model_ppo")
render_model(model_ppo, output_folder=ppo_dir + "output")

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to trained_models/stable_baselines3/highway_ppo/PPO_2




---------------------------------
| rollout/           |          |
|    ep_len_mean     | 10.9     |
|    ep_rew_mean     | 8.18     |
| time/              |          |
|    fps             | 26       |
|    iterations      | 1        |
|    time_elapsed    | 4        |
|    total_timesteps | 128      |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 10.3        |
|    ep_rew_mean          | 7.75        |
| time/                   |             |
|    fps                  | 26          |
|    iterations           | 2           |
|    time_elapsed         | 9           |
|    total_timesteps      | 256         |
| train/                  |             |
|    approx_kl            | 0.008523207 |
|    clip_fraction        | 0.122       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.6        |
|    explained_variance   | -0.0137     |
|    learning_rate        | 0.

NameError: name 'render_model' is not defined

In [5]:
# Single root folder for this run's TensorBoard logs, saved weights and frames.
a2c_dir = "trained_models/stable_baselines3/highway_a2c/"

model_a2c = A2C('MlpPolicy', env,
            # Separate actor (pi) and critic (vf) networks are given as a plain
            # dict. The list-wrapped form [dict(pi=..., vf=...)] is deprecated
            # since SB3 1.8 and only produces a warning before being unwrapped.
            policy_kwargs=dict(net_arch=dict(pi=[256, 256], vf=[256, 256])),
            # 64 * 12 // 6 = 128 environment steps per update.
            n_steps=64 * 12 // 6,
            learning_rate=5e-4,
            gamma=0.8,
            verbose=2,
            tensorboard_log=a2c_dir)
model_a2c.learn(20_000)
model_a2c.save(a2c_dir + "model_a2c")
render_model(model_a2c, output_folder=a2c_dir + "output")

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.




Logging to trained_models/stable_baselines3/highway_a2c/A2C_3
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 26.8     |
|    ep_rew_mean        | 19.8     |
| time/                 |          |
|    fps                | 27       |
|    iterations         | 100      |
|    time_elapsed       | 462      |
|    total_timesteps    | 12800    |
| train/                |          |
|    entropy_loss       | -0.388   |
|    explained_variance | 0.065    |
|    learning_rate      | 0.0005   |
|    n_updates          | 99       |
|    policy_loss        | -0.00947 |
|    value_loss         | 0.222    |
------------------------------------
