In [None]:

!pip install stable-baselines3[extra]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting stable-baselines3[extra]
  Downloading stable_baselines3-1.5.0-py3-none-any.whl (177 kB)
[K     |████████████████████████████████| 177 kB 4.1 MB/s 
Collecting gym==0.21
  Downloading gym-0.21.0.tar.gz (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 43.9 MB/s 
Collecting autorom[accept-rom-license]~=0.4.2
  Downloading AutoROM-0.4.2-py3-none-any.whl (16 kB)
Collecting ale-py~=0.7.4
  Downloading ale_py-0.7.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.6 MB)
[K     |████████████████████████████████| 1.6 MB 39.3 MB/s 
Collecting AutoROM.accept-rom-license
  Downloading AutoROM.accept-rom-license-0.4.2.tar.gz (9.8 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Building wheels for collected packages: gym, AutoROM.accept-rom-license
  Bu

Callbacks
A callback is a set of functions that will be called at given stages of the training procedure. We can use callbacks to access the internal state of the RL model during training. They allow one to do monitoring, auto-saving, model manipulation, progress bars, …
To build a custom callback, you need to create a class that derives from BaseCallback. This will give you access to events (_on_training_start, _on_step) and useful variables (like self.model for the RL model).


In [None]:
from stable_baselines3.common.callbacks import BaseCallback


class CustomCallback(BaseCallback):
    """
    Template callback built on top of ``BaseCallback``.

    None of the hooks does any work (``_on_step`` simply returns ``True``);
    override the ones you need.

    :param verbose: (int) Verbosity level 0: no output 1: info 2: debug
    """
    def __init__(self, verbose=0):
        super().__init__(verbose)
        # The attributes listed here are defined in the base class and are
        # accessible from every hook of the callback:
        #
        #   self.model          the RL model (BaseAlgorithm)
        #   self.training_env   alias for self.model.get_env(), the environment
        #                       used for training (Union[gym.Env, VecEnv, None])
        #   self.n_calls        number of times the callback was called (int)
        #   self.num_timesteps  (int)
        #   self.locals         local variables (Dict[str, Any])
        #   self.globals        global variables (Dict[str, Any])
        #   self.logger         logger object, used to report things in the
        #                       terminal (stable_baselines3.common.logger)
        #   self.parent         parent object, sometimes useful for event
        #                       callbacks (Optional[BaseCallback])

    def _on_training_start(self) -> None:
        """
        Hook invoked before the first rollout starts.
        """

    def _on_rollout_start(self) -> None:
        """
        Hook invoked before new samples are collected.

        A rollout is the collection of environment interactions
        using the current policy.
        """

    def _on_step(self) -> bool:
        """
        Hook invoked by the model after each call to `env.step()`.

        For a child callback (of an `EventCallback`), it is invoked
        when the event is triggered.

        :return: (bool) If the callback returns False, training is aborted early.
        """
        return True

    def _on_rollout_end(self) -> None:
        """
        Hook invoked before the policy is updated.
        """

    def _on_training_end(self) -> None:
        """
        Hook invoked before exiting the `learn()` method.
        """

In [None]:
from stable_baselines3 import SAC
from stable_baselines3.common.callbacks import CheckpointCallback
# Checkpoint the model every 1000 steps; files go to ./logs/ and are
# named with the 'rl_model' prefix.
checkpoint_callback = CheckpointCallback(
    save_freq=1000,
    save_path='./logs/',
    name_prefix='rl_model',
)

# Passing the environment id as a string lets SAC create the environment.
model = SAC(policy='MlpPolicy', env='Pendulum-v1')
model.learn(total_timesteps=2000, callback=checkpoint_callback)

<stable_baselines3.sac.sac.SAC at 0x7fdd319e3350>

In [None]:
import gym

from stable_baselines3 import SAC
from stable_baselines3.common.callbacks import EvalCallback

# Evaluation runs on its own environment instance, separate from training
eval_env = gym.make('Pendulum-v1')

# Evaluate every 500 steps using deterministic actions, without rendering;
# the best model and the evaluation logs are both written under ./logs/
eval_callback = EvalCallback(
    eval_env=eval_env,
    best_model_save_path='./logs/',
    log_path='./logs/',
    eval_freq=500,
    deterministic=True,
    render=False,
)

model = SAC(policy='MlpPolicy', env='Pendulum-v1')
model.learn(total_timesteps=5000, callback=eval_callback)



Eval num_timesteps=500, episode_reward=-1747.92 +/- 144.14
Episode length: 200.00 +/- 0.00
New best mean reward!
Eval num_timesteps=1000, episode_reward=-1673.81 +/- 139.82
Episode length: 200.00 +/- 0.00
New best mean reward!
Eval num_timesteps=1500, episode_reward=-1377.39 +/- 138.81
Episode length: 200.00 +/- 0.00
New best mean reward!
Eval num_timesteps=2000, episode_reward=-1138.73 +/- 84.72
Episode length: 200.00 +/- 0.00
New best mean reward!
Eval num_timesteps=2500, episode_reward=-452.07 +/- 125.37
Episode length: 200.00 +/- 0.00
New best mean reward!
Eval num_timesteps=3000, episode_reward=-154.50 +/- 148.97
Episode length: 200.00 +/- 0.00
New best mean reward!
Eval num_timesteps=3500, episode_reward=-196.58 +/- 93.37
Episode length: 200.00 +/- 0.00
Eval num_timesteps=4000, episode_reward=-466.67 +/- 471.22
Episode length: 200.00 +/- 0.00
Eval num_timesteps=4500, episode_reward=-129.92 +/- 86.56
Episode length: 200.00 +/- 0.00
New best mean reward!
Eval num_timesteps=5000, ep

<stable_baselines3.sac.sac.SAC at 0x7fdc6a192450>

In [None]:
import gym

from stable_baselines3 import SAC
from stable_baselines3.common.callbacks import CallbackList, CheckpointCallback, EvalCallback

# First callback: periodic checkpoints (every 1000 steps) under ./logs/
checkpoint_callback = CheckpointCallback(
    save_freq=1000,
    save_path='./logs/',
)

# Second callback: evaluation every 500 steps on a separate environment
eval_env = gym.make('Pendulum-v1')
eval_callback = EvalCallback(
    eval_env=eval_env,
    best_model_save_path='./logs/best_model',
    log_path='./logs/results',
    eval_freq=500,
)

# Chain both callbacks into a single one
callback = CallbackList(callbacks=[checkpoint_callback, eval_callback])

model = SAC(policy='MlpPolicy', env='Pendulum-v1')
# A plain list can be passed instead of a CallbackList, i.e. this is equivalent to:
# model.learn(5000, callback=[checkpoint_callback, eval_callback])
model.learn(total_timesteps=5000, callback=callback)



Eval num_timesteps=500, episode_reward=-1550.58 +/- 204.18
Episode length: 200.00 +/- 0.00
New best mean reward!
Eval num_timesteps=1000, episode_reward=-1745.85 +/- 130.22
Episode length: 200.00 +/- 0.00
Eval num_timesteps=1500, episode_reward=-1320.89 +/- 53.21
Episode length: 200.00 +/- 0.00
New best mean reward!
Eval num_timesteps=2000, episode_reward=-1140.84 +/- 73.10
Episode length: 200.00 +/- 0.00
New best mean reward!
Eval num_timesteps=2500, episode_reward=-914.97 +/- 85.99
Episode length: 200.00 +/- 0.00
New best mean reward!
Eval num_timesteps=3000, episode_reward=-878.96 +/- 15.72
Episode length: 200.00 +/- 0.00
New best mean reward!
Eval num_timesteps=3500, episode_reward=-413.86 +/- 86.31
Episode length: 200.00 +/- 0.00
New best mean reward!
Eval num_timesteps=4000, episode_reward=-202.49 +/- 128.98
Episode length: 200.00 +/- 0.00
New best mean reward!
Eval num_timesteps=4500, episode_reward=-156.61 +/- 59.79
Episode length: 200.00 +/- 0.00
New best mean reward!
Eval num

<stable_baselines3.sac.sac.SAC at 0x7fdc67cde250>

In [None]:
import gym

from stable_baselines3 import SAC
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold

# Separate evaluation env. It must be the same task the model is trained on
# ('Pendulum-v1'): evaluating on a different environment such as CartPole
# would feed the policy observations/actions of the wrong shape and type,
# and the -200 reward threshold below is only meaningful for Pendulum.
eval_env = gym.make('Pendulum-v1')
# Stop training when the model reaches the reward threshold
callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=-200, verbose=1)
# The threshold check is triggered by EvalCallback each time a new best
# mean reward is found during evaluation.
eval_callback = EvalCallback(eval_env, callback_on_new_best=callback_on_best, verbose=1)

model = SAC('MlpPolicy', 'Pendulum-v1', verbose=1)
# Almost infinite number of timesteps, but the training will stop
# early as soon as the reward threshold is reached
model.learn(int(1e10), callback=eval_callback)

Using cpu device
Creating environment from the given name 'Pendulum-v1'
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/           |           |
|    ep_len_mean     | 200       |
|    ep_rew_mean     | -1.53e+03 |
| time/              |           |
|    episodes        | 4         |
|    fps             | 46        |
|    time_elapsed    | 17        |
|    total_timesteps | 800       |
| train/             |           |
|    actor_loss      | 25.2      |
|    critic_loss     | 0.221     |
|    ent_coef        | 0.812     |
|    ent_coef_loss   | -0.345    |
|    learning_rate   | 0.0003    |
|    n_updates       | 699       |
----------------------------------
----------------------------------
| rollout/           |           |
|    ep_len_mean     | 200       |
|    ep_rew_mean     | -1.57e+03 |
| time/              |           |
|    episodes        | 8         |
|    fps             | 42        |
|    time_e



Eval num_timesteps=10000, episode_reward=-121.73 +/- 77.76
Episode length: 200.00 +/- 0.00
---------------------------------
| eval/              |          |
|    mean_ep_length  | 200      |
|    mean_reward     | -122     |
| time/              |          |
|    total_timesteps | 10000    |
| train/             |          |
|    actor_loss      | 56.1     |
|    critic_loss     | 1.95     |
|    ent_coef        | 0.169    |
|    ent_coef_loss   | -0.214   |
|    learning_rate   | 0.0003   |
|    n_updates       | 9899     |
---------------------------------
New best mean reward!
Stopping training because the mean reward -121.73  is above the threshold -200


<stable_baselines3.sac.sac.SAC at 0x7fdc67c82990>

In [None]:
from stable_baselines3 import SAC
from stable_baselines3.common.callbacks import StopTrainingOnMaxEpisodes

# Callback that ends training once the maximum number of episodes (5) is reached
callback_max_episodes = StopTrainingOnMaxEpisodes(
    max_episodes=5,
    verbose=1,
)

model = SAC(policy='MlpPolicy', env='Pendulum-v1', verbose=1)
# The timestep budget is practically unbounded on purpose: the callback
# stops training early, as soon as the max number of episodes is reached
model.learn(total_timesteps=int(1e10), callback=callback_max_episodes)

Using cpu device
Creating environment from the given name 'Pendulum-v1'
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -1.6e+03 |
| time/              |          |
|    episodes        | 4        |
|    fps             | 45       |
|    time_elapsed    | 17       |
|    total_timesteps | 800      |
| train/             |          |
|    actor_loss      | 24.6     |
|    critic_loss     | 0.234    |
|    ent_coef        | 0.812    |
|    ent_coef_loss   | -0.341   |
|    learning_rate   | 0.0003   |
|    n_updates       | 699      |
---------------------------------
Stopping training with a total of 1000 steps because the SAC model reached max_episodes=5, by playing for 5 episodes 


<stable_baselines3.sac.sac.SAC at 0x7fdc67c759d0>

In [None]:
import gym

from stable_baselines3 import SAC
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnNoModelImprovement

# Evaluation runs on its own environment instance, separate from training
eval_env = gym.make("Pendulum-v1")

# Stop training if there is no improvement after more than 3 evaluations
stop_train_callback = StopTrainingOnNoModelImprovement(
    max_no_improvement_evals=3,
    min_evals=5,
    verbose=1,
)
# Evaluate every 1000 steps and run the stopping check after each evaluation
eval_callback = EvalCallback(
    eval_env=eval_env,
    eval_freq=1000,
    callback_after_eval=stop_train_callback,
    verbose=1,
)

model = SAC(policy="MlpPolicy", env="Pendulum-v1", learning_rate=1e-3, verbose=1)
# The timestep budget is practically unbounded on purpose: training stops
# early, as soon as the number of consecutive evaluations without model
# improvement is greater than 3
model.learn(total_timesteps=int(1e10), callback=eval_callback)

Using cpu device
Creating environment from the given name 'Pendulum-v1'
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -1.3e+03 |
| time/              |          |
|    episodes        | 4        |
|    fps             | 47       |
|    time_elapsed    | 16       |
|    total_timesteps | 800      |
| train/             |          |
|    actor_loss      | 21.7     |
|    critic_loss     | 0.0584   |
|    ent_coef        | 0.507    |
|    ent_coef_loss   | -0.993   |
|    learning_rate   | 0.001    |
|    n_updates       | 699      |
---------------------------------




Eval num_timesteps=1000, episode_reward=-1771.76 +/- 54.40
Episode length: 200.00 +/- 0.00
----------------------------------
| eval/              |           |
|    mean_ep_length  | 200       |
|    mean_reward     | -1.77e+03 |
| time/              |           |
|    total_timesteps | 1000      |
| train/             |           |
|    actor_loss      | 27.4      |
|    critic_loss     | 0.042     |
|    ent_coef        | 0.422     |
|    ent_coef_loss   | -1.21     |
|    learning_rate   | 0.001     |
|    n_updates       | 899       |
----------------------------------
New best mean reward!
----------------------------------
| rollout/           |           |
|    ep_len_mean     | 200       |
|    ep_rew_mean     | -1.42e+03 |
| time/              |           |
|    episodes        | 8         |
|    fps             | 41        |
|    time_elapsed    | 38        |
|    total_timesteps | 1600      |
| train/             |           |
|    actor_loss      | 46.4      |
|    critic_

<stable_baselines3.sac.sac.SAC at 0x7fdc67c259d0>