
Fixes HER mixed ordering of desired_goal and achieved_goal (#1570)
* Change the ordering of achieved_goal and desired_goal to match the expected compute_reward argument order

* Update changelog.rst

* Update version

* Update version.txt

* Update changelog.rst

---------

Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com>
JonathanKuelz and qgallouedec committed Jun 21, 2023
1 parent 4eed3e9 commit f667f08
Showing 4 changed files with 8 additions and 7 deletions.
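For context: HER recomputes rewards through the env's compute_reward(achieved_goal, desired_goal, info) method, the GoalEnv convention inherited from Gym (now Gymnasium-Robotics), which takes the achieved goal first and the desired goal second. Before this commit, SB3 passed the goals in the opposite order. A minimal sketch of such a reward function (the 0.05 threshold and the vectorized form are illustrative assumptions, not part of this commit):

    import numpy as np

    def compute_reward(achieved_goal, desired_goal, info):
        # Sparse reward: 0.0 if the achieved goal lies within 0.05 of the
        # desired goal, -1.0 otherwise.
        distance = np.linalg.norm(achieved_goal - desired_goal, axis=-1)
        return np.where(distance > 0.05, -1.0, 0.0)

For a symmetric, distance-based reward like this one the swap is harmless, which is presumably why it went unnoticed; for any reward that treats the two goals asymmetrically, the old ordering produced wrong values.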
docs/misc/changelog.rst (3 additions, 2 deletions)
@@ -3,7 +3,7 @@
 Changelog
 ==========
 
-Release 2.0.0a13 (WIP)
+Release 2.0.0a14 (WIP)
 --------------------------
 
 **Gymnasium support**
@@ -42,6 +42,7 @@ Bug Fixes:
 - Fixed env checker to properly reset the env before calling ``step()`` when checking
   for ``Inf`` and ``NaN`` (@lutogniew)
 - Fixed HER ``truncate_last_trajectory()`` (@lbergmann1)
+- Fixed HER desired and achieved goal order in reward computation (@JonathanKuelz)
 
 Deprecations:
 ^^^^^^^^^^^^^

@@ -1347,7 +1348,7 @@ And all the contributors:
 @eleurent @ac-93 @cove9988 @theDebugger811 @hsuehch @Demetrio92 @thomasgubler @IperGiove @ScheiklP
 @simoninithomas @armandpl @manuel-delverme @Gautam-J @gianlucadecola @buoyancy99 @caburu @xy9485
 @Gregwar @ycheng517 @quantitative-technologies @bcollazo @git-thor @TibiGG @cool-RR @MWeltevrede
-@carlosluis @arjun-kg @tlpss
+@carlosluis @arjun-kg @tlpss @JonathanKuelz
 @Melanol @qgallouedec @francescoluciano @jlp-ue @burakdmb @timothe-chaumont @honglu2875
 @anand-bala @hughperkins @sidney-tio @AlexPasqua @dominicgkerr @Akhilez @Rocamonde @tobirohrer @ZikangXiong
 @DavyMorgan @luizapozzobon @Bonifatius94 @theSquaredError @harveybellini @DavyMorgan @FieteO @jonasreiher @npit @WeberSamuel @troiganto
stable_baselines3/common/env_checker.py (2 additions, 2 deletions)
@@ -128,10 +128,10 @@ def _check_goal_env_obs(obs: dict, observation_space: spaces.Dict, method_name:
     """
     Check that an environment implementing the `compute_reward()` method
     (previously known as GoalEnv in gym) contains at least three elements,
-    namely `observation`, `desired_goal`, and `achieved_goal`.
+    namely `observation`, `achieved_goal`, and `desired_goal`.
     """
     assert len(observation_space.spaces) >= 3, (
-        "A goal conditioned env must contain at least 3 observation keys: `observation`, `desired_goal`, and `achieved_goal`. "
+        "A goal conditioned env must contain at least 3 observation keys: `observation`, `achieved_goal`, and `desired_goal`. "
         f"The current observation contains {len(observation_space.spaces)} keys: {list(observation_space.spaces.keys())}"
     )
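As an aside, a goal-conditioned observation space exposing the three required keys could be declared as follows (shapes and dtypes are illustrative):

    import numpy as np
    from gymnasium import spaces

    observation_space = spaces.Dict(
        {
            "observation": spaces.Box(-np.inf, np.inf, shape=(10,), dtype=np.float64),
            "achieved_goal": spaces.Box(-np.inf, np.inf, shape=(3,), dtype=np.float64),
            "desired_goal": spaces.Box(-np.inf, np.inf, shape=(3,), dtype=np.float64),
        }
    )
    assert len(observation_space.spaces) >= 3  # the check above would pass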

stable_baselines3/her/her_replay_buffer.py (2 additions, 2 deletions)
@@ -312,15 +312,15 @@ def _get_virtual_samples(
         # Compute new reward
         rewards = self.env.env_method(
             "compute_reward",
-            # here we use the new desired goal
-            obs["desired_goal"],
             # the new state depends on the previous state and action
             # s_{t+1} = f(s_t, a_t)
             # so the next achieved_goal depends also on the previous state and action
             # because we are in a GoalEnv:
             # r_t = reward(s_t, a_t) = reward(next_achieved_goal, desired_goal)
             # therefore we have to use next_obs["achieved_goal"] and not obs["achieved_goal"]
             next_obs["achieved_goal"],
+            # here we use the new desired goal
+            obs["desired_goal"],
             infos,
             # we use the method of the first environment assuming that all environments are identical.
             indices=[0],
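In plain terms: when HER relabels a transition with a goal achieved later in the episode, the reward must be recomputed from the next achieved goal (first argument) and the substituted desired goal (second argument). A self-contained sketch of that relabeling step, using the same illustrative reward as above (the numbers are made up; this is not SB3 internals):

    import numpy as np

    def compute_reward(achieved_goal, desired_goal, info):
        distance = np.linalg.norm(achieved_goal - desired_goal, axis=-1)
        return np.where(distance > 0.05, -1.0, 0.0)

    # Hindsight relabeling: pretend a goal achieved later in the episode
    # was the desired goal all along, then recompute the reward.
    next_achieved_goal = np.array([0.51, 0.02, 0.30])  # achieved after the action, s_{t+1}
    new_desired_goal = np.array([0.50, 0.00, 0.30])    # sampled future achieved goal
    reward = compute_reward(next_achieved_goal, new_desired_goal, info={})
    print(reward)  # 0.0 -> the relabeled transition counts as a success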
stable_baselines3/version.txt (1 addition, 1 deletion)
@@ -1 +1 @@
-2.0.0a13
+2.0.0a14

