Remove "device" argument from policies (#141)
* Remove device arg from policies

* Clean up for PR

* Update test and doc

* Fix codestyle

Co-authored-by: Antonin RAFFIN <antonin.raffin@ensta.org>
qxcv and araffin committed Aug 23, 2020
1 parent 21e9994 commit 42ef6d4
Showing 10 changed files with 40 additions and 63 deletions.
9 changes: 9 additions & 0 deletions docs/index.rst
@@ -98,6 +98,15 @@ To cite this project in publications:
howpublished = {\url{https://github.com/DLR-RM/stable-baselines3}},
}
Contributing
------------

To anyone interested in making the RL baselines better, there are still some improvements
that need to be done.
You can check issues in the `repo <https://github.com/DLR-RM/stable-baselines3/issues>`_.

If you want to contribute, please read `CONTRIBUTING.md <https://github.com/DLR-RM/stable-baselines3/blob/master/CONTRIBUTING.md>`_ first.

Indices and tables
-------------------

3 changes: 2 additions & 1 deletion docs/misc/changelog.rst
@@ -3,11 +3,12 @@
Changelog
==========

Pre-Release 0.9.0a0 (WIP)
Pre-Release 0.9.0a1 (WIP)
------------------------------

Breaking Changes:
^^^^^^^^^^^^^^^^^
- Removed ``device`` keyword argument of policies; use ``policy.to(device)`` instead. (@qxcv)

New Features:
^^^^^^^^^^^^^
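Migration note: as the changelog entry above says, policies no longer take a
``device`` argument; build the policy first, then move it like any other torch
module. A minimal runnable sketch (the spaces and schedule below are
hypothetical placeholders, not part of this commit):

import gym

from stable_baselines3.common.policies import ActorCriticPolicy
from stable_baselines3.common.utils import get_device

obs_space = gym.spaces.Box(low=-1.0, high=1.0, shape=(4,))
action_space = gym.spaces.Discrete(2)
lr_schedule = lambda progress_remaining: 3e-4  # constant learning rate

# Before this commit:  ActorCriticPolicy(..., device="auto")
# After this commit: construct, then call .to(), as with any nn.Module.
policy = ActorCriticPolicy(obs_space, action_space, lr_schedule)
policy = policy.to(get_device("auto"))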
1 change: 0 additions & 1 deletion stable_baselines3/common/off_policy_algorithm.py
@@ -141,7 +141,6 @@ def __init__(
# Update policy keyword arguments
if sde_support:
self.policy_kwargs["use_sde"] = self.use_sde
self.policy_kwargs["device"] = self.device
# For gSDE only
self.use_sde_at_warmup = use_sde_at_warmup

1 change: 0 additions & 1 deletion stable_baselines3/common/on_policy_algorithm.py
@@ -116,7 +116,6 @@ def _setup_model(self) -> None:
self.action_space,
self.lr_schedule,
use_sde=self.use_sde,
device=self.device,
**self.policy_kwargs # pytype:disable=not-instantiable
)
self.policy = self.policy.to(self.device)
31 changes: 12 additions & 19 deletions stable_baselines3/common/policies.py
@@ -34,7 +34,6 @@ class BaseModel(nn.Module, ABC):
:param observation_space: (gym.spaces.Space) The observation space of the environment
:param action_space: (gym.spaces.Space) The action space of the environment
:param device: (Union[th.device, str]) Device on which the code should run.
:param features_extractor_class: (Type[BaseFeaturesExtractor]) Features extractor to use.
:param features_extractor_kwargs: (Optional[Dict[str, Any]]) Keyword arguments
to pass to the feature extractor.
@@ -52,7 +51,6 @@ def __init__(
self,
observation_space: gym.spaces.Space,
action_space: gym.spaces.Space,
device: Union[th.device, str] = "auto",
features_extractor_class: Type[BaseFeaturesExtractor] = FlattenExtractor,
features_extractor_kwargs: Optional[Dict[str, Any]] = None,
features_extractor: Optional[nn.Module] = None,
@@ -70,7 +68,6 @@ def __init__(

self.observation_space = observation_space
self.action_space = action_space
self.device = get_device(device)
self.features_extractor = features_extractor
self.normalize_images = normalize_images

@@ -112,6 +109,16 @@ def _get_data(self) -> Dict[str, Any]:
normalize_images=self.normalize_images,
)

@property
def device(self) -> th.device:
"""Infer which device this policy lives on by inspecting its parameters.
If it has no parameters, the 'auto' device is used as a fallback.
:return: (th.device)"""
for param in self.parameters():
return param.device
return get_device("auto")

def save(self, path: str) -> None:
"""
Save model to a given location.
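The new ``device`` property above uses a standard PyTorch idiom: a module's
device is wherever its parameters live. A standalone sketch of the same logic
(``TinyModel`` is a hypothetical illustration, not part of this commit):

import torch.nn as nn

from stable_baselines3.common.utils import get_device

class TinyModel(nn.Module):
    @property
    def device(self):
        # Return the device of the first parameter found, if any.
        for param in self.parameters():
            return param.device
        # No parameters yet: fall back to the 'auto' device (GPU if available).
        return get_device("auto")

model = TinyModel()
print(model.device)         # fallback: no parameters registered yet
model.layer = nn.Linear(2, 2)
print(model.device)         # now inferred from the Linear layer's parameters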
@@ -300,7 +307,6 @@ class ActorCriticPolicy(BasePolicy):
:param action_space: (gym.spaces.Space) Action space
:param lr_schedule: (Callable) Learning rate schedule (could be constant)
:param net_arch: ([int or dict]) The specification of the policy and value networks.
:param device: (str or th.device) Device on which the code should run.
:param activation_fn: (Type[nn.Module]) Activation function
:param ortho_init: (bool) Whether to use or not orthogonal initialization
:param use_sde: (bool) Whether to use State Dependent Exploration or not
@@ -332,7 +338,6 @@ def __init__(
action_space: gym.spaces.Space,
lr_schedule: Callable[[float], float],
net_arch: Optional[List[Union[int, Dict[str, List[int]]]]] = None,
device: Union[th.device, str] = "auto",
activation_fn: Type[nn.Module] = nn.Tanh,
ortho_init: bool = True,
use_sde: bool = False,
@@ -357,7 +362,6 @@ def __init__(
super(ActorCriticPolicy, self).__init__(
observation_space,
action_space,
device,
features_extractor_class,
features_extractor_kwargs,
optimizer_class=optimizer_class,
@@ -445,9 +449,7 @@ def _build(self, lr_schedule: Callable[[float], float]) -> None:
# Note: If net_arch is None and some features extractor is used,
# net_arch here is an empty list and mlp_extractor does not
# really contain any layers (acts like an identity module).
self.mlp_extractor = MlpExtractor(
self.features_dim, net_arch=self.net_arch, activation_fn=self.activation_fn, device=self.device
)
self.mlp_extractor = MlpExtractor(self.features_dim, net_arch=self.net_arch, activation_fn=self.activation_fn)

latent_dim_pi = self.mlp_extractor.latent_dim_pi

@@ -594,7 +596,6 @@ class ActorCriticCnnPolicy(ActorCriticPolicy):
:param action_space: (gym.spaces.Space) Action space
:param lr_schedule: (Callable) Learning rate schedule (could be constant)
:param net_arch: ([int or dict]) The specification of the policy and value networks.
:param device: (str or th.device) Device on which the code should run.
:param activation_fn: (Type[nn.Module]) Activation function
:param ortho_init: (bool) Whether to use or not orthogonal initialization
:param use_sde: (bool) Whether to use State Dependent Exploration or not
@@ -626,7 +627,6 @@ def __init__(
action_space: gym.spaces.Space,
lr_schedule: Callable,
net_arch: Optional[List[Union[int, Dict[str, List[int]]]]] = None,
device: Union[th.device, str] = "auto",
activation_fn: Type[nn.Module] = nn.Tanh,
ortho_init: bool = True,
use_sde: bool = False,
@@ -646,7 +646,6 @@ def __init__(
action_space,
lr_schedule,
net_arch,
device,
activation_fn,
ortho_init,
use_sde,
@@ -685,7 +684,6 @@ class ContinuousCritic(BaseModel):
:param activation_fn: (Type[nn.Module]) Activation function
:param normalize_images: (bool) Whether to normalize images or not,
dividing by 255.0 (True by default)
:param device: (Union[th.device, str]) Device on which the code should run.
:param n_critics: (int) Number of critic networks to create.
"""

@@ -698,15 +696,10 @@ def __init__(
features_dim: int,
activation_fn: Type[nn.Module] = nn.ReLU,
normalize_images: bool = True,
device: Union[th.device, str] = "auto",
n_critics: int = 2,
):
super().__init__(
observation_space,
action_space,
features_extractor=features_extractor,
normalize_images=normalize_images,
device=device,
observation_space, action_space, features_extractor=features_extractor, normalize_images=normalize_images,
)

action_dim = get_action_dim(self.action_space)
17 changes: 2 additions & 15 deletions stable_baselines3/dqn/policies.py
@@ -1,4 +1,4 @@
from typing import Any, Callable, Dict, List, Optional, Type, Union
from typing import Any, Callable, Dict, List, Optional, Type

import gym
import torch as th
@@ -15,7 +15,6 @@ class QNetwork(BasePolicy):
:param observation_space: (gym.spaces.Space) Observation space
:param action_space: (gym.spaces.Space) Action space
:param net_arch: (Optional[List[int]]) The specification of the policy and value networks.
:param device: (str or th.device) Device on which the code should run.
:param activation_fn: (Type[nn.Module]) Activation function
:param normalize_images: (bool) Whether to normalize images or not,
dividing by 255.0 (True by default)
@@ -28,16 +27,11 @@ def __init__(
features_extractor: nn.Module,
features_dim: int,
net_arch: Optional[List[int]] = None,
device: Union[th.device, str] = "auto",
activation_fn: Type[nn.Module] = nn.ReLU,
normalize_images: bool = True,
):
super(QNetwork, self).__init__(
observation_space,
action_space,
features_extractor=features_extractor,
normalize_images=normalize_images,
device=device,
observation_space, action_space, features_extractor=features_extractor, normalize_images=normalize_images,
)

if net_arch is None:
@@ -90,7 +84,6 @@ class DQNPolicy(BasePolicy):
:param action_space: (gym.spaces.Space) Action space
:param lr_schedule: (callable) Learning rate schedule (could be constant)
:param net_arch: (Optional[List[int]]) The specification of the policy and value networks.
:param device: (str or th.device) Device on which the code should run.
:param activation_fn: (Type[nn.Module]) Activation function
:param features_extractor_class: (Type[BaseFeaturesExtractor]) Features extractor to use.
:param features_extractor_kwargs: (Optional[Dict[str, Any]]) Keyword arguments
@@ -109,7 +102,6 @@ def __init__(
action_space: gym.spaces.Space,
lr_schedule: Callable,
net_arch: Optional[List[int]] = None,
device: Union[th.device, str] = "auto",
activation_fn: Type[nn.Module] = nn.ReLU,
features_extractor_class: Type[BaseFeaturesExtractor] = FlattenExtractor,
features_extractor_kwargs: Optional[Dict[str, Any]] = None,
@@ -120,7 +112,6 @@ def __init__(
super(DQNPolicy, self).__init__(
observation_space,
action_space,
device,
features_extractor_class,
features_extractor_kwargs,
optimizer_class=optimizer_class,
@@ -143,7 +134,6 @@ def __init__(
"net_arch": self.net_arch,
"activation_fn": self.activation_fn,
"normalize_images": normalize_images,
"device": device,
}

self.q_net, self.q_net_target = None, None
@@ -204,7 +194,6 @@ class CnnPolicy(DQNPolicy):
:param action_space: (gym.spaces.Space) Action space
:param lr_schedule: (callable) Learning rate schedule (could be constant)
:param net_arch: (Optional[List[int]]) The specification of the policy and value networks.
:param device: (str or th.device) Device on which the code should run.
:param activation_fn: (Type[nn.Module]) Activation function
:param features_extractor_class: (Type[BaseFeaturesExtractor]) Features extractor to use.
:param normalize_images: (bool) Whether to normalize images or not,
@@ -221,7 +210,6 @@ def __init__(
action_space: gym.spaces.Space,
lr_schedule: Callable,
net_arch: Optional[List[int]] = None,
device: Union[th.device, str] = "auto",
activation_fn: Type[nn.Module] = nn.ReLU,
features_extractor_class: Type[BaseFeaturesExtractor] = NatureCNN,
features_extractor_kwargs: Optional[Dict[str, Any]] = None,
@@ -234,7 +222,6 @@ def __init__(
action_space,
lr_schedule,
net_arch,
device,
activation_fn,
features_extractor_class,
features_extractor_kwargs,
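At the algorithm level nothing changes for users: the device is still chosen
on the algorithm, which now applies it with ``policy.to(device)`` instead of
forwarding a ``device`` entry through ``policy_kwargs``. A small sketch (the
environment id is just an example):

from stable_baselines3 import DQN

model = DQN("MlpPolicy", "CartPole-v0", device="cpu")
print(model.policy.device)  # cpu, inferred via the new BaseModel.device property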
12 changes: 1 addition & 11 deletions stable_baselines3/sac/policies.py
@@ -1,4 +1,4 @@
from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union
from typing import Any, Callable, Dict, List, Optional, Tuple, Type

import gym
import torch as th
@@ -38,7 +38,6 @@ class Actor(BasePolicy):
:param clip_mean: (float) Clip the mean output when using gSDE to avoid numerical instability.
:param normalize_images: (bool) Whether to normalize images or not,
dividing by 255.0 (True by default)
:param device: (Union[th.device, str]) Device on which the code should run.
"""

def __init__(
@@ -56,14 +55,12 @@ def __init__(
use_expln: bool = False,
clip_mean: float = 2.0,
normalize_images: bool = True,
device: Union[th.device, str] = "auto",
):
super(Actor, self).__init__(
observation_space,
action_space,
features_extractor=features_extractor,
normalize_images=normalize_images,
device=device,
squash_output=True,
)

@@ -196,7 +193,6 @@ class SACPolicy(BasePolicy):
:param action_space: (gym.spaces.Space) Action space
:param lr_schedule: (callable) Learning rate schedule (could be constant)
:param net_arch: (Optional[List[int]]) The specification of the policy and value networks.
:param device: (str or th.device) Device on which the code should run.
:param activation_fn: (Type[nn.Module]) Activation function
:param use_sde: (bool) Whether to use State Dependent Exploration or not
:param log_std_init: (float) Initial value for the log standard deviation
@@ -225,7 +221,6 @@ def __init__(
action_space: gym.spaces.Space,
lr_schedule: Callable,
net_arch: Optional[List[int]] = None,
device: Union[th.device, str] = "auto",
activation_fn: Type[nn.Module] = nn.ReLU,
use_sde: bool = False,
log_std_init: float = -3,
@@ -242,7 +237,6 @@ def __init__(
super(SACPolicy, self).__init__(
observation_space,
action_space,
device,
features_extractor_class,
features_extractor_kwargs,
optimizer_class=optimizer_class,
@@ -270,7 +264,6 @@ def __init__(
"net_arch": self.net_arch,
"activation_fn": self.activation_fn,
"normalize_images": normalize_images,
"device": device,
}
self.actor_kwargs = self.net_args.copy()
sde_kwargs = {
@@ -356,7 +349,6 @@ class CnnPolicy(SACPolicy):
:param action_space: (gym.spaces.Space) Action space
:param lr_schedule: (callable) Learning rate schedule (could be constant)
:param net_arch: (Optional[List[int]]) The specification of the policy and value networks.
:param device: (str or th.device) Device on which the code should run.
:param activation_fn: (Type[nn.Module]) Activation function
:param use_sde: (bool) Whether to use State Dependent Exploration or not
:param log_std_init: (float) Initial value for the log standard deviation
@@ -383,7 +375,6 @@ def __init__(
action_space: gym.spaces.Space,
lr_schedule: Callable,
net_arch: Optional[List[int]] = None,
device: Union[th.device, str] = "auto",
activation_fn: Type[nn.Module] = nn.ReLU,
use_sde: bool = False,
log_std_init: float = -3,
@@ -402,7 +393,6 @@ def __init__(
action_space,
lr_schedule,
net_arch,
device,
activation_fn,
use_sde,
log_std_init,
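The same applies to SAC: ``policy_kwargs`` still reaches the policy
constructor, it just no longer carries (or accepts) a ``device`` entry. A
short sketch (the environment id and hyperparameters are illustrative only):

from stable_baselines3 import SAC

model = SAC("MlpPolicy", "Pendulum-v0", policy_kwargs=dict(net_arch=[64, 64]), device="cpu")
print(model.policy.device)  # the algorithm moved the policy with .to(device)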
