Fix to use float64 actions for off policy algorithms #1572

Merged
Changes from all commits (17 commits)

4 changes: 3 additions & 1 deletion docs/misc/changelog.rst
@@ -4,7 +4,7 @@ Changelog
==========


Release 2.1.0a2 (WIP)
Release 2.1.0a3 (WIP)
--------------------------

Breaking Changes:
@@ -26,6 +26,7 @@ New Features:
Bug Fixes:
^^^^^^^^^^
- Relaxed check in logger that was causing issues on Windows with colorama
- Fixed off-policy algorithms with continuous float64 actions (see #1145) (@tobirohrer)

Deprecations:
^^^^^^^^^^^^^
@@ -34,6 +35,7 @@ Others:
^^^^^^^
- Updated GitHub issue templates
- Fix typo in gym patch error message (@lukashass)
- Refactor ``test_spaces.py`` tests

Documentation:
^^^^^^^^^^^^^^
23 changes: 21 additions & 2 deletions stable_baselines3/common/buffers.py
@@ -207,7 +207,9 @@ def __init__(
else:
self.next_observations = np.zeros((self.buffer_size, self.n_envs, *self.obs_shape), dtype=observation_space.dtype)

self.actions = np.zeros((self.buffer_size, self.n_envs, self.action_dim), dtype=action_space.dtype)
self.actions = np.zeros(
(self.buffer_size, self.n_envs, self.action_dim), dtype=self._maybe_cast_dtype(action_space.dtype)
)

self.rewards = np.zeros((self.buffer_size, self.n_envs), dtype=np.float32)
self.dones = np.zeros((self.buffer_size, self.n_envs), dtype=np.float32)
@@ -311,6 +313,21 @@ def _get_samples(self, batch_inds: np.ndarray, env: Optional[VecNormalize] = None
)
return ReplayBufferSamples(*tuple(map(self.to_torch, data)))

@staticmethod
def _maybe_cast_dtype(dtype: np.typing.DTypeLike) -> np.typing.DTypeLike:
"""
Cast ``np.float64`` action datatype to ``np.float32``,
keep the other dtypes unchanged.
See GH#1572 for more information.

:param dtype: The original action space dtype
:return: ``np.float32`` if the dtype was float64,
the original dtype otherwise.
"""
if dtype == np.float64:
return np.float32
return dtype


class RolloutBuffer(BaseBuffer):
"""
@@ -543,7 +560,9 @@ def __init__(
for key, _obs_shape in self.obs_shape.items()
}

self.actions = np.zeros((self.buffer_size, self.n_envs, self.action_dim), dtype=action_space.dtype)
self.actions = np.zeros(
(self.buffer_size, self.n_envs, self.action_dim), dtype=self._maybe_cast_dtype(action_space.dtype)
)
self.rewards = np.zeros((self.buffer_size, self.n_envs), dtype=np.float32)
self.dones = np.zeros((self.buffer_size, self.n_envs), dtype=np.float32)

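For context: both ``ReplayBuffer`` and ``DictReplayBuffer`` now route the action dtype through the new ``_maybe_cast_dtype`` helper, so a float64 ``Box`` action space is stored as float32 (halving the memory of the actions array and matching the float32 actions the policy produces), while all other dtypes pass through untouched. A minimal standalone sketch of that behavior (illustrative only, not part of the diff):

import numpy as np

def maybe_cast_dtype(dtype):
    # Standalone copy of the helper's logic: float64 actions are
    # stored as float32; every other dtype is kept unchanged.
    if dtype == np.float64:
        return np.float32
    return dtype

# A float64 action space yields a float32 storage array...
actions = np.zeros((100, 1, 2), dtype=maybe_cast_dtype(np.float64))
assert actions.dtype == np.float32
# ...while integer action dtypes (e.g. for Discrete spaces) are preserved.
assert maybe_cast_dtype(np.int64) == np.int64
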
2 changes: 1 addition & 1 deletion stable_baselines3/dqn/dqn.py
@@ -245,7 +245,7 @@ def predict(
if not deterministic and np.random.rand() < self.exploration_rate:
if self.policy.is_vectorized_observation(observation):
if isinstance(observation, dict):
n_batch = observation[list(observation.keys())[0]].shape[0]
n_batch = observation[next(iter(observation.keys()))].shape[0]
else:
n_batch = observation.shape[0]
action = np.array([self.action_space.sample() for _ in range(n_batch)])
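The ``predict`` tweak above swaps ``list(observation.keys())[0]`` for ``next(iter(observation.keys()))``: both return the first key of the dict observation, used to read off the batch size, but the iterator form avoids building the full key list. A quick sketch (hypothetical observation dict for illustration):

import numpy as np

observation = {"image": np.zeros((4, 84, 84)), "vector": np.zeros((4, 5))}

# Equivalent result, but next(iter(...)) stops after the first key
# instead of materializing every key in a list.
assert next(iter(observation.keys())) == list(observation.keys())[0]
n_batch = observation[next(iter(observation.keys()))].shape[0]
assert n_batch == 4
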
2 changes: 1 addition & 1 deletion stable_baselines3/version.txt
@@ -1 +1 @@
2.1.0a2
2.1.0a3
101 changes: 71 additions & 30 deletions tests/test_spaces.py
@@ -1,63 +1,67 @@
from dataclasses import dataclass
from typing import Dict, Optional

import gymnasium as gym
import numpy as np
import pytest
from gymnasium import spaces
from gymnasium.spaces.space import Space

from stable_baselines3 import A2C, DDPG, DQN, PPO, SAC, TD3
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy

BOX_SPACE_FLOAT64 = spaces.Box(low=-1, high=1, shape=(2,), dtype=np.float64)
BOX_SPACE_FLOAT32 = spaces.Box(low=-1, high=1, shape=(2,), dtype=np.float32)

class DummyMultiDiscreteSpace(gym.Env):
def __init__(self, nvec):
super().__init__()
self.observation_space = spaces.MultiDiscrete(nvec)
self.action_space = spaces.Box(low=-1, high=1, shape=(2,), dtype=np.float32)

def reset(self, *, seed: Optional[int] = None, options: Optional[Dict] = None):
if seed is not None:
super().reset(seed=seed)
return self.observation_space.sample(), {}
@dataclass
class DummyEnv(gym.Env):
observation_space: Space
action_space: Space

def step(self, action):
return self.observation_space.sample(), 0.0, False, False, {}


class DummyMultiBinary(gym.Env):
def __init__(self, n):
super().__init__()
self.observation_space = spaces.MultiBinary(n)
self.action_space = spaces.Box(low=-1, high=1, shape=(2,), dtype=np.float32)

def reset(self, *, seed: Optional[int] = None, options: Optional[Dict] = None):
if seed is not None:
super().reset(seed=seed)
return self.observation_space.sample(), {}

def step(self, action):
return self.observation_space.sample(), 0.0, False, False, {}


class DummyMultidimensionalAction(gym.Env):
class DummyMultidimensionalAction(DummyEnv):
def __init__(self):
super().__init__()
self.observation_space = spaces.Box(low=-1, high=1, shape=(2,), dtype=np.float32)
self.action_space = spaces.Box(low=-1, high=1, shape=(2, 2), dtype=np.float32)
super().__init__(
BOX_SPACE_FLOAT32,
spaces.Box(low=-1, high=1, shape=(2, 2), dtype=np.float32),
)

def reset(self, *, seed: Optional[int] = None, options: Optional[Dict] = None):
if seed is not None:
super().reset(seed=seed)
return self.observation_space.sample(), {}

def step(self, action):
return self.observation_space.sample(), 0.0, False, False, {}
class DummyMultiBinary(DummyEnv):
def __init__(self, n):
super().__init__(
spaces.MultiBinary(n),
BOX_SPACE_FLOAT32,
)


class DummyMultiDiscreteSpace(DummyEnv):
def __init__(self, nvec):
super().__init__(
spaces.MultiDiscrete(nvec),
BOX_SPACE_FLOAT32,
)
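
The three dummy envs above now share one ``@dataclass`` base: the decorator generates an ``__init__(self, observation_space, action_space)`` that assigns both fields, so each subclass only forwards its spaces to ``super().__init__``. A standalone sketch of the pattern (illustrative names, outside the diff):

from dataclasses import dataclass

import gymnasium as gym
from gymnasium import spaces
from gymnasium.spaces.space import Space

@dataclass
class MinimalDummyEnv(gym.Env):
    # @dataclass auto-generates __init__(observation_space, action_space)
    observation_space: Space
    action_space: Space

env = MinimalDummyEnv(spaces.MultiBinary(8), spaces.Box(low=-1, high=1, shape=(2,)))
assert env.action_space.shape == (2,)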


@pytest.mark.parametrize(
"env", [DummyMultiDiscreteSpace([4, 3]), DummyMultiBinary(8), DummyMultiBinary((3, 2)), DummyMultidimensionalAction()]
"env",
[
DummyMultiDiscreteSpace([4, 3]),
DummyMultiBinary(8),
DummyMultiBinary((3, 2)),
DummyMultidimensionalAction(),
],
)
def test_env(env):
# Check the env used for testing
@@ -127,3 +131,40 @@ def test_discrete_obs_space(model_class, env):
else:
kwargs = dict(n_steps=256)
model_class("MlpPolicy", env, **kwargs).learn(256)


@pytest.mark.parametrize("model_class", [SAC, TD3, PPO, DDPG, A2C])
@pytest.mark.parametrize(
"obs_space",
[
BOX_SPACE_FLOAT32,
BOX_SPACE_FLOAT64,
spaces.Dict({"a": BOX_SPACE_FLOAT32, "b": BOX_SPACE_FLOAT32}),
spaces.Dict({"a": BOX_SPACE_FLOAT32, "b": BOX_SPACE_FLOAT64}),
],
)
@pytest.mark.parametrize(
"action_space",
[
BOX_SPACE_FLOAT32,
BOX_SPACE_FLOAT64,
],
)
def test_float64_action_space(model_class, obs_space, action_space):
env = DummyEnv(obs_space, action_space)
env = gym.wrappers.TimeLimit(env, max_episode_steps=200)
if isinstance(env.observation_space, spaces.Dict):
policy = "MultiInputPolicy"
else:
policy = "MlpPolicy"

if model_class in [PPO, A2C]:
kwargs = dict(n_steps=64, policy_kwargs=dict(net_arch=[12]))
else:
kwargs = dict(learning_starts=60, policy_kwargs=dict(net_arch=[12]))

model = model_class(policy, env, **kwargs)
model.learn(64)
initial_obs, _ = env.reset()
action, _ = model.predict(initial_obs, deterministic=False)
assert action.dtype == env.action_space.dtype
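
End to end, the new test covers the scenario from issue #1145: off-policy algorithms mishandled continuous float64 actions. A hedged usage sketch of the fixed behavior, mirroring the test with a hypothetical standalone env:

import gymnasium as gym
import numpy as np
from gymnasium import spaces

from stable_baselines3 import SAC

class Float64ActionEnv(gym.Env):
    # Hypothetical minimal env with a float64 continuous action space.
    observation_space = spaces.Box(low=-1, high=1, shape=(2,), dtype=np.float32)
    action_space = spaces.Box(low=-1, high=1, shape=(2,), dtype=np.float64)

    def reset(self, *, seed=None, options=None):
        super().reset(seed=seed)
        return self.observation_space.sample(), {}

    def step(self, action):
        return self.observation_space.sample(), 0.0, False, False, {}

env = gym.wrappers.TimeLimit(Float64ActionEnv(), max_episode_steps=200)
model = SAC("MlpPolicy", env, learning_starts=60, policy_kwargs=dict(net_arch=[12]))
model.learn(64)  # exercises the replay buffer with float64 actions
obs, _ = env.reset()
action, _ = model.predict(obs, deterministic=False)
# predict() now returns actions matching the env's dtype, as the test asserts
assert action.dtype == env.action_space.dtype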