# Stable Baselines3 Tutorial - Callbacks and hyperparameter tuning
- Comparing default and best hyperparameters in RL.
- Using callbacks for monitoring, auto-saving, model manipulation, progress bars...


In [1]:
# Dependencies: swig, tqdm

import gym
from stable_baselines3 import A2C, SAC, PPO, TD3

# 1. Hyperparameter tuning
Here we'll compare the performance of Soft Actor-Critic (SAC) on the Pendulum environment with default and "tuned" hyperparameters.

Resources:
- RL Baselines3 Zoo: https://github.com/DLR-RM/rl-baselines3-zoo
- Optuna: https://github.com/optuna/optuna

In [6]:
import numpy as np
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor

# Evaluation environment used by the evaluate_policy calls in this section.
# AH: wrapped with Monitor to prevent erroneous metrics.
eval_env = Monitor(
    env=gym.make('Pendulum-v0'),
)

## Default model

In [3]:
# Baseline ("default") SAC agent for this comparison: batch size 64 and a
# 64-64 network architecture, trained for 8000 timesteps with a fixed seed.
# `.learn` is chained so the trained model is what gets bound to the name.
default_model = SAC(
    policy='MlpPolicy',
    env='Pendulum-v0',
    batch_size=64,
    policy_kwargs={'net_arch': [64, 64]},
    seed=0,
    verbose=1,
).learn(total_timesteps=8000)

Using cpu device
Creating environment from the given name 'Pendulum-v0'
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/           |           |
|    ep_len_mean     | 200       |
|    ep_rew_mean     | -1.36e+03 |
| time/              |           |
|    episodes        | 4         |
|    fps             | 136       |
|    time_elapsed    | 5         |
|    total_timesteps | 800       |
| train/             |           |
|    actor_loss      | 20.3      |
|    critic_loss     | 0.968     |
|    ent_coef        | 0.812     |
|    ent_coef_loss   | -0.337    |
|    learning_rate   | 0.0003    |
|    n_updates       | 699       |
----------------------------------
----------------------------------
| rollout/           |           |
|    ep_len_mean     | 200       |
|    ep_rew_mean     | -1.55e+03 |
| time/              |           |
|    episodes        | 8         |
|    fps             | 129       |
|    time_e

In [10]:
# Score the default model on the shared evaluation env over 500 episodes and
# report the mean episode reward together with its standard deviation.
mean_reward, std_reward = evaluate_policy(
    model=default_model,
    env=eval_env,
    n_eval_episodes=500,
)
print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")

mean_reward:-182.34 +/- 97.99


## Tuned model

In [8]:
# "Tuned" SAC agent: batch size 256 and a 256-256 network architecture.
# Seed (0) and training budget (8000 timesteps) match the default model so
# that only the hyperparameters differ between the two runs.
tuned_model = SAC(
    policy='MlpPolicy',
    env='Pendulum-v0',
    batch_size=256,
    policy_kwargs={'net_arch': [256, 256]},
    seed=0,
    verbose=1,
).learn(total_timesteps=8000)

Using cpu device
Creating environment from the given name 'Pendulum-v0'
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/           |           |
|    ep_len_mean     | 200       |
|    ep_rew_mean     | -1.56e+03 |
| time/              |           |
|    episodes        | 4         |
|    fps             | 50        |
|    time_elapsed    | 15        |
|    total_timesteps | 800       |
| train/             |           |
|    actor_loss      | 24.8      |
|    critic_loss     | 0.259     |
|    ent_coef        | 0.814     |
|    ent_coef_loss   | -0.339    |
|    learning_rate   | 0.0003    |
|    n_updates       | 699       |
----------------------------------
----------------------------------
| rollout/           |           |
|    ep_len_mean     | 200       |
|    ep_rew_mean     | -1.61e+03 |
| time/              |           |
|    episodes        | 8         |
|    fps             | 46        |
|    time_e

In [11]:
# Score the tuned model on the shared evaluation env over 500 episodes and
# report the mean episode reward together with its standard deviation.
mean_reward, std_reward = evaluate_policy(
    model=tuned_model,
    env=eval_env,
    n_eval_episodes=500,
)
print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")

mean_reward:-141.48 +/- 90.55


# 2. Callbacks