Add stats window argument (#1424)
* added stats_window_size argument

* updated changelog

* docstring info updated

* added missing tensorboard log docstring

* added stats_window_size argument for all models

* fixed stats_window_size test

* Update version

---------

Co-authored-by: Antonin Raffin <antonin.raffin@dlr.de>
jonasreiher and araffin committed Apr 5, 2023
1 parent 5a70af8 commit 12250eb
Showing 12 changed files with 54 additions and 8 deletions.
6 changes: 3 additions & 3 deletions docs/common/logger.rst
@@ -76,10 +76,10 @@ All ``eval/`` values are computed by the ``EvalCallback``.

rollout/
^^^^^^^^
- ``ep_len_mean``: Mean episode length (averaged over 100 episodes)
- ``ep_rew_mean``: Mean episodic training reward (averaged over 100 episodes), a ``Monitor`` wrapper is required to compute that value (automatically added by `make_vec_env`).
- ``ep_len_mean``: Mean episode length (averaged over ``stats_window_size`` episodes, 100 by default)
- ``ep_rew_mean``: Mean episodic training reward (averaged over ``stats_window_size`` episodes, 100 by default), a ``Monitor`` wrapper is required to compute that value (automatically added by `make_vec_env`).
- ``exploration_rate``: Current value of the exploration rate when using DQN, it corresponds to the fraction of actions taken randomly (epsilon of the "epsilon-greedy" exploration)
- ``success_rate``: Mean success rate during training (averaged over 100 episodes), you must pass an extra argument to the ``Monitor`` wrapper to log that value (``info_keywords=("is_success",)``) and provide ``info["is_success"]=True/False`` on the final step of the episode
- ``success_rate``: Mean success rate during training (averaged over ``stats_window_size`` episodes, 100 by default), you must pass an extra argument to the ``Monitor`` wrapper to log that value (``info_keywords=("is_success",)``) and provide ``info["is_success"]=True/False`` on the final step of the episode

time/
^^^^^
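The ``stats_window_size`` argument documented above can be passed to any of the algorithms touched by this commit. A minimal usage sketch (the environment id, window size, and timestep budget below are illustrative only, not taken from this commit):

from stable_baselines3 import PPO

# Average rollout/ep_rew_mean, rollout/ep_len_mean (and rollout/success_rate, if logged)
# over the last 10 episodes instead of the default 100, so the reported values
# react faster to recent episodes at the cost of a noisier estimate.
model = PPO("MlpPolicy", "CartPole-v1", stats_window_size=10, verbose=1)
model.learn(total_timesteps=10_000)

A smaller window gives a more responsive but noisier estimate; the default of 100 keeps the previous behaviour.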
5 changes: 3 additions & 2 deletions docs/misc/changelog.rst
@@ -4,7 +4,7 @@ Changelog
==========


Release 1.8.0a13 (WIP)
Release 1.8.0a14 (WIP)
--------------------------

.. warning::
@@ -32,6 +32,7 @@ New Features:
- Added multiprocessing support for ``HerReplayBuffer``
- ``HerReplayBuffer`` now supports all datatypes supported by ``ReplayBuffer``
- Provide more helpful failure messages when validating the ``observation_space`` of custom gym environments using ``check_env`` (@FieteO)
- Added ``stats_window_size`` argument to control smoothing in rollout logging (@jonasreiher)


`SB3-Contrib`_
@@ -1254,4 +1255,4 @@ And all the contributors:
@Gregwar @ycheng517 @quantitative-technologies @bcollazo @git-thor @TibiGG @cool-RR @MWeltevrede
@Melanol @qgallouedec @francescoluciano @jlp-ue @burakdmb @timothe-chaumont @honglu2875 @yuanmingqi
@anand-bala @hughperkins @sidney-tio @AlexPasqua @dominicgkerr @Akhilez @Rocamonde @tobirohrer @ZikangXiong
@DavyMorgan @luizapozzobon @Bonifatius94 @theSquaredError @harveybellini @DavyMorgan @FieteO
@DavyMorgan @luizapozzobon @Bonifatius94 @theSquaredError @harveybellini @DavyMorgan @FieteO @jonasreiher
4 changes: 4 additions & 0 deletions stable_baselines3/a2c/a2c.py
@@ -42,6 +42,8 @@ class A2C(OnPolicyAlgorithm):
:param sde_sample_freq: Sample a new noise matrix every n steps when using gSDE
Default: -1 (only sample at the beginning of the rollout)
:param normalize_advantage: Whether to normalize or not the advantage
:param stats_window_size: Window size for the rollout logging, specifying the number of episodes to average
the reported success rate, mean episode length, and mean reward over
:param tensorboard_log: the log location for tensorboard (if None, no logging)
:param policy_kwargs: additional arguments to be passed to the policy on creation
:param verbose: Verbosity level: 0 for no output, 1 for info messages (such as device or wrappers used), 2 for
@@ -74,6 +76,7 @@
use_sde: bool = False,
sde_sample_freq: int = -1,
normalize_advantage: bool = False,
stats_window_size: int = 100,
tensorboard_log: Optional[str] = None,
policy_kwargs: Optional[Dict[str, Any]] = None,
verbose: int = 0,
@@ -93,6 +96,7 @@ def __init__(
max_grad_norm=max_grad_norm,
use_sde=use_sde,
sde_sample_freq=sde_sample_freq,
stats_window_size=stats_window_size,
tensorboard_log=tensorboard_log,
policy_kwargs=policy_kwargs,
verbose=verbose,
8 changes: 6 additions & 2 deletions stable_baselines3/common/base_class.py
@@ -67,6 +67,8 @@ class BaseAlgorithm(ABC):
:param learning_rate: learning rate for the optimizer,
it can be a function of the current progress remaining (from 1 to 0)
:param policy_kwargs: Additional arguments to be passed to the policy on creation
:param stats_window_size: Window size for the rollout logging, specifying the number of episodes to average
the reported success rate, mean episode length, and mean reward over
:param tensorboard_log: the log location for tensorboard (if None, no logging)
:param verbose: Verbosity level: 0 for no output, 1 for info messages (such as device or wrappers used), 2 for
debug messages
@@ -95,6 +97,7 @@ def __init__(
env: Union[GymEnv, str, None],
learning_rate: Union[float, Schedule],
policy_kwargs: Optional[Dict[str, Any]] = None,
stats_window_size: int = 100,
tensorboard_log: Optional[str] = None,
verbose: int = 0,
device: Union[th.device, str] = "auto",
@@ -145,6 +148,7 @@
# this is used to update the learning rate
self._current_progress_remaining = 1
# Buffers for logging
self._stats_window_size = stats_window_size
self.ep_info_buffer = None # type: Optional[deque]
self.ep_success_buffer = None # type: Optional[deque]
# For logging (and TD3 delayed updates)
@@ -388,8 +392,8 @@ def _setup_learn(

if self.ep_info_buffer is None or reset_num_timesteps:
# Initialize buffers if they don't exist, or reinitialize if resetting counters
self.ep_info_buffer = deque(maxlen=100)
self.ep_success_buffer = deque(maxlen=100)
self.ep_info_buffer = deque(maxlen=self._stats_window_size)
self.ep_success_buffer = deque(maxlen=self._stats_window_size)

if self.action_noise is not None:
self.action_noise.reset()
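The buffers initialised in ``_setup_learn`` above are plain ``collections.deque`` objects, so the window is enforced entirely by the deque's ``maxlen``. A standalone sketch of that rolling behaviour (independent of Stable-Baselines3; the values are made up, and the "r" key simply mirrors the reward entry stored in episode infos):

from collections import deque

ep_info_buffer = deque(maxlen=3)  # same mechanism as stats_window_size=3
for ep_reward in [1.0, 2.0, 3.0, 4.0, 5.0]:
    ep_info_buffer.append({"r": ep_reward})

# Only the three most recent episodes remain, so a mean over the buffer
# covers exactly the last `maxlen` episodes.
print([info["r"] for info in ep_info_buffer])  # [3.0, 4.0, 5.0]
print(sum(info["r"] for info in ep_info_buffer) / len(ep_info_buffer))  # 4.0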
4 changes: 4 additions & 0 deletions stable_baselines3/common/off_policy_algorithm.py
@@ -52,6 +52,8 @@ class OffPolicyAlgorithm(BaseAlgorithm):
at a cost of more complexity.
See https://github.com/DLR-RM/stable-baselines3/issues/37#issuecomment-637501195
:param policy_kwargs: Additional arguments to be passed to the policy on creation
:param stats_window_size: Window size for the rollout logging, specifying the number of episodes to average
the reported success rate, mean episode length, and mean reward over
:param tensorboard_log: the log location for tensorboard (if None, no logging)
:param verbose: Verbosity level: 0 for no output, 1 for info messages (such as device or wrappers used), 2 for
debug messages
@@ -90,6 +92,7 @@ def __init__(
replay_buffer_kwargs: Optional[Dict[str, Any]] = None,
optimize_memory_usage: bool = False,
policy_kwargs: Optional[Dict[str, Any]] = None,
stats_window_size: int = 100,
tensorboard_log: Optional[str] = None,
verbose: int = 0,
device: Union[th.device, str] = "auto",
@@ -107,6 +110,7 @@ def __init__(
env=env,
learning_rate=learning_rate,
policy_kwargs=policy_kwargs,
stats_window_size=stats_window_size,
tensorboard_log=tensorboard_log,
verbose=verbose,
device=device,
4 changes: 4 additions & 0 deletions stable_baselines3/common/on_policy_algorithm.py
@@ -37,6 +37,8 @@ class OnPolicyAlgorithm(BaseAlgorithm):
instead of action noise exploration (default: False)
:param sde_sample_freq: Sample a new noise matrix every n steps when using gSDE
Default: -1 (only sample at the beginning of the rollout)
:param stats_window_size: Window size for the rollout logging, specifying the number of episodes to average
the reported success rate, mean episode length, and mean reward over
:param tensorboard_log: the log location for tensorboard (if None, no logging)
:param monitor_wrapper: When creating an environment, whether to wrap it
or not in a Monitor wrapper.
@@ -63,6 +65,7 @@ def __init__(
max_grad_norm: float,
use_sde: bool,
sde_sample_freq: int,
stats_window_size: int = 100,
tensorboard_log: Optional[str] = None,
monitor_wrapper: bool = True,
policy_kwargs: Optional[Dict[str, Any]] = None,
@@ -83,6 +86,7 @@
sde_sample_freq=sde_sample_freq,
support_multi_env=True,
seed=seed,
stats_window_size=stats_window_size,
tensorboard_log=tensorboard_log,
supported_action_spaces=supported_action_spaces,
)
4 changes: 4 additions & 0 deletions stable_baselines3/dqn/dqn.py
@@ -50,6 +50,8 @@ class DQN(OffPolicyAlgorithm):
:param exploration_initial_eps: initial value of random action probability
:param exploration_final_eps: final value of random action probability
:param max_grad_norm: The maximum value for the gradient clipping
:param stats_window_size: Window size for the rollout logging, specifying the number of episodes to average
the reported success rate, mean episode length, and mean reward over
:param tensorboard_log: the log location for tensorboard (if None, no logging)
:param policy_kwargs: additional arguments to be passed to the policy on creation
:param verbose: Verbosity level: 0 for no output, 1 for info messages (such as device or wrappers used), 2 for
@@ -86,6 +88,7 @@ def __init__(
exploration_initial_eps: float = 1.0,
exploration_final_eps: float = 0.05,
max_grad_norm: float = 10,
stats_window_size: int = 100,
tensorboard_log: Optional[str] = None,
policy_kwargs: Optional[Dict[str, Any]] = None,
verbose: int = 0,
@@ -108,6 +111,7 @@ def __init__(
replay_buffer_class=replay_buffer_class,
replay_buffer_kwargs=replay_buffer_kwargs,
policy_kwargs=policy_kwargs,
stats_window_size=stats_window_size,
tensorboard_log=tensorboard_log,
verbose=verbose,
device=device,
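Off-policy algorithms forward the argument through ``OffPolicyAlgorithm`` in the same way; a hedged sketch with DQN (again with a placeholder environment and values, not part of this commit):

from stable_baselines3 import DQN

# With stats_window_size=1, rollout/ep_rew_mean and rollout/ep_len_mean report
# only the most recently completed episode instead of a 100-episode average.
model = DQN("MlpPolicy", "CartPole-v1", stats_window_size=1, verbose=1)
model.learn(total_timesteps=5_000)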
4 changes: 4 additions & 0 deletions stable_baselines3/ppo/ppo.py
@@ -56,6 +56,8 @@ class PPO(OnPolicyAlgorithm):
because the clipping is not enough to prevent large update
see issue #213 (cf https://github.com/hill-a/stable-baselines/issues/213)
By default, there is no limit on the kl div.
:param stats_window_size: Window size for the rollout logging, specifying the number of episodes to average
the reported success rate, mean episode length, and mean reward over
:param tensorboard_log: the log location for tensorboard (if None, no logging)
:param policy_kwargs: additional arguments to be passed to the policy on creation
:param verbose: Verbosity level: 0 for no output, 1 for info messages (such as device or wrappers used), 2 for
@@ -91,6 +93,7 @@ def __init__(
use_sde: bool = False,
sde_sample_freq: int = -1,
target_kl: Optional[float] = None,
stats_window_size: int = 100,
tensorboard_log: Optional[str] = None,
policy_kwargs: Optional[Dict[str, Any]] = None,
verbose: int = 0,
@@ -110,6 +113,7 @@ def __init__(
max_grad_norm=max_grad_norm,
use_sde=use_sde,
sde_sample_freq=sde_sample_freq,
stats_window_size=stats_window_size,
tensorboard_log=tensorboard_log,
policy_kwargs=policy_kwargs,
verbose=verbose,
5 changes: 5 additions & 0 deletions stable_baselines3/sac/sac.py
@@ -65,6 +65,9 @@ class SAC(OffPolicyAlgorithm):
Default: -1 (only sample at the beginning of the rollout)
:param use_sde_at_warmup: Whether to use gSDE instead of uniform sampling
during the warm up phase (before learning starts)
:param stats_window_size: Window size for the rollout logging, specifying the number of episodes to average
the reported success rate, mean episode length, and mean reward over
:param tensorboard_log: the log location for tensorboard (if None, no logging)
:param policy_kwargs: additional arguments to be passed to the policy on creation
:param verbose: Verbosity level: 0 for no output, 1 for info messages (such as device or wrappers used), 2 for
debug messages
@@ -102,6 +105,7 @@ def __init__(
use_sde: bool = False,
sde_sample_freq: int = -1,
use_sde_at_warmup: bool = False,
stats_window_size: int = 100,
tensorboard_log: Optional[str] = None,
policy_kwargs: Optional[Dict[str, Any]] = None,
verbose: int = 0,
@@ -124,6 +128,7 @@ def __init__(
replay_buffer_class=replay_buffer_class,
replay_buffer_kwargs=replay_buffer_kwargs,
policy_kwargs=policy_kwargs,
stats_window_size=stats_window_size,
tensorboard_log=tensorboard_log,
verbose=verbose,
device=device,
5 changes: 5 additions & 0 deletions stable_baselines3/td3/td3.py
@@ -53,6 +53,9 @@ class TD3(OffPolicyAlgorithm):
:param target_policy_noise: Standard deviation of Gaussian noise added to target policy
(smoothing noise)
:param target_noise_clip: Limit for absolute value of target policy smoothing noise.
:param stats_window_size: Window size for the rollout logging, specifying the number of episodes to average
the reported success rate, mean episode length, and mean reward over
:param tensorboard_log: the log location for tensorboard (if None, no logging)
:param policy_kwargs: additional arguments to be passed to the policy on creation
:param verbose: Verbosity level: 0 for no output, 1 for info messages (such as device or wrappers used), 2 for
debug messages
@@ -87,6 +90,7 @@ def __init__(
policy_delay: int = 2,
target_policy_noise: float = 0.2,
target_noise_clip: float = 0.5,
stats_window_size: int = 100,
tensorboard_log: Optional[str] = None,
policy_kwargs: Optional[Dict[str, Any]] = None,
verbose: int = 0,
@@ -109,6 +113,7 @@ def __init__(
replay_buffer_class=replay_buffer_class,
replay_buffer_kwargs=replay_buffer_kwargs,
policy_kwargs=policy_kwargs,
stats_window_size=stats_window_size,
tensorboard_log=tensorboard_log,
verbose=verbose,
device=device,
2 changes: 1 addition & 1 deletion stable_baselines3/version.txt
@@ -1 +1 @@
1.8.0a13
1.8.0a14
11 changes: 11 additions & 0 deletions tests/test_logger.py
@@ -416,3 +416,14 @@ def test_human_output_format_no_crash_on_same_keys_different_tags():
{"key1/foo": "value1", "key1/bar": "value2", "key2/bizz": "value3", "key2/foo": "value4"},
{"key1/foo": None, "key2/bizz": None, "key1/bar": None, "key2/foo": None},
)


@pytest.mark.parametrize("algo", [A2C, DQN])
@pytest.mark.parametrize("stats_window_size", [1, 42])
def test_ep_buffers_stats_window_size(algo, stats_window_size):
"""Set stats_window_size for logging to non-default value and check if
ep_info_buffer and ep_success_buffer are initialized to the correct length"""
model = algo("MlpPolicy", "CartPole-v1", stats_window_size=stats_window_size)
model.learn(total_timesteps=10)
assert model.ep_info_buffer.maxlen == stats_window_size
assert model.ep_success_buffer.maxlen == stats_window_size
