Commit

appended documentation

AboudyKreidieh committed Sep 28, 2019
1 parent d764588 commit e1fdfcd
Showing 2 changed files with 44 additions and 2 deletions.
6 changes: 6 additions & 0 deletions hbaselines/hiro/algorithm.py
@@ -146,6 +146,11 @@ class TD3(object):
actor_update_freq : int
number of training steps per actor policy update step. The critic
policy is updated every training step.
meta_update_freq : int
number of training steps per meta policy update step. The actor policy
of the meta-policy is further updated at the frequency provided by the
actor_update_freq variable. Note that this value is only relevant when
using the GoalDirectedPolicy policy.
reward_scale : float
the value the reward should be scaled by
render : bool
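
The interaction between `actor_update_freq` and `meta_update_freq` documented in the attributes above can be summarized with a short sketch. This is an illustrative snippet only, not code from this commit; the `should_update` helper and the concrete frequency values are hypothetical.

```python
# Hypothetical helper illustrating the documented update frequencies.
actor_update_freq = 2   # critic updated every step, actor every 2 steps
meta_update_freq = 10   # meta (Manager) policy updated every 10 steps

def should_update(step):
    """Return which components get a gradient step at this training step."""
    return {
        "update_actor": step % actor_update_freq == 0,
        "update_meta": step % meta_update_freq == 0,
        # the meta-policy's actor is further delayed by actor_update_freq
        "update_meta_actor": step % (meta_update_freq * actor_update_freq) == 0,
    }
```

With these example values, `should_update(20)` would trigger the actor, the Manager critic, and the Manager actor in addition to the per-step critic update.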
@@ -917,6 +922,7 @@ def _evaluate(self, total_timesteps):
eval_action, _ = self._policy(
eval_obs,
apply_noise=False,
random_actions=False,
compute_q=False,
context=[getattr(self.eval_env, "current_context", None)],
episode_step=eval_episode_step)
40 changes: 38 additions & 2 deletions hbaselines/hiro/policy.py
@@ -627,7 +627,24 @@ def make_critic(self, obs, action, reuse=False, scope="qf"):
return qvalue_fn

def update(self, update_actor=True, **kwargs):
"""See parent class."""
"""Perform a gradient update step.
**Note**; The target update soft updates occur at the same frequency as
the actor update frequencies.
Parameters
----------
update_actor : bool
specifies whether to update the actor policy. The critic policy is
still updated if this value is set to False.
Returns
-------
float
critic loss
float
actor loss
"""
# Not enough samples in the replay buffer.
if not self.replay_buffer.can_sample(self.batch_size):
return 0, 0
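
For readers unfamiliar with the TD3 update, the control flow that this docstring and the buffer check above describe looks roughly like the following. It is a hedged sketch, not the hbaselines implementation; the `_train_critic`, `_train_actor`, and `_soft_update_targets` helpers are hypothetical placeholders.

```python
# Sketch of the update flow described in the docstring above. The helper
# methods are placeholders for the ops the real class would run.
def update(self, update_actor=True, **kwargs):
    # Not enough samples in the replay buffer: nothing to train on yet.
    if not self.replay_buffer.can_sample(self.batch_size):
        return 0, 0

    batch = self.replay_buffer.sample(self.batch_size)

    # The critic is trained on every call.
    critic_loss = self._train_critic(batch)          # hypothetical helper

    actor_loss = 0
    if update_actor:
        # The actor step and the target soft updates share one frequency.
        actor_loss = self._train_actor(batch)        # hypothetical helper
        self._soft_update_targets()                  # hypothetical helper

    return critic_loss, actor_loss
```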
@@ -1263,7 +1280,9 @@ def initialize(self):
self.meta_reward = 0

def update(self, update_actor=True, **kwargs):
"""See parent class.
"""Perform a gradient update step.
This is done both at the level of the Manager and Worker policies.
The kwargs argument for this method contains two additional terms:
@@ -1272,6 +1291,23 @@ def update(self, update_actor=True, **kwargs):
* update_meta_actor (bool): similar to the `update_policy` term, but
for the meta-policy. Note that, if `update_meta` is set to False,
this term is void.
**Note**: The target soft updates for both the Manager and the Worker
policies occur at the same frequency as their respective actor updates.

Parameters
----------
update_actor : bool
specifies whether to update the actor policy. The critic policy is
still updated if this value is set to False.
Returns
-------
(float, float)
manager critic loss, worker critic loss
(float, float)
manager actor loss, worker actor loss
"""
# Not enough samples in the replay buffer.
if not self.replay_buffer.can_sample(self.batch_size):
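
To make the Manager/Worker split and the `update_meta` / `update_meta_actor` flags above concrete, here is a minimal, self-contained sketch. The `manager_policy` and `worker_policy` arguments are hypothetical stand-ins for the two sub-policies; this is not the GoalDirectedPolicy implementation itself.

```python
def hierarchical_update(manager_policy, worker_policy, update_actor=True,
                        update_meta=True, update_meta_actor=True):
    """Sketch of a two-level update returning the documented loss tuples."""
    # The Worker (low-level) policy is updated on every call.
    worker_critic_loss, worker_actor_loss = worker_policy.update(
        update_actor=update_actor)

    manager_critic_loss, manager_actor_loss = 0, 0
    if update_meta:
        # update_meta_actor is ignored (void) when update_meta is False.
        manager_critic_loss, manager_actor_loss = manager_policy.update(
            update_actor=update_meta_actor)

    return ((manager_critic_loss, worker_critic_loss),
            (manager_actor_loss, worker_actor_loss))
```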
