In [1]:
import time

import gymnasium as gym
import numpy as np

#from stable_baselines3 import A2C
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv
from stable_baselines3.common.utils import set_random_seed
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env

In [2]:
from typing import Callable


def make_env(env_id: str, rank: int, seed: int = 0) -> Callable:
    """
    Build a zero-argument factory that creates one environment for a
    multiprocessed (vectorized) setup.

    ``set_random_seed(seed)`` is called as soon as this function runs; the
    environment itself is only created when the returned factory is invoked.

    :param env_id: (str) the environment ID handed to ``gym.make``
    :param rank: (int) index of the subprocess; added to ``seed`` for the reset seed
    :param seed: (int) the initial seed for RNG
    :return: (Callable) factory returning the created and reset environment
    """
    set_random_seed(seed)

    def _init() -> gym.Env:
        # Fixed construction options shared by every worker.
        worker_env = gym.make(
            env_id,
            render_mode='rgb_array',
            reward_type='sparse',
            max_episode_steps=50,
        )
        # Offsetting by rank gives each worker its own reset seed.
        worker_env.reset(seed=seed + rank)
        return worker_env

    return _init

In [3]:
# Task to train on and how many environment copies to run side by side.
env_id = 'FetchSlide-v2'
num_cpu = 4

# Vectorized environment that the training cell passes to the model.
vec_env = make_vec_env(env_id=env_id, n_envs=num_cpu)

In [None]:
def env_creator(env_config=None):
    """
    Create a 'FetchReach-v2' environment and reset it once before returning it.

    :param env_config: optional configuration object. It is accepted but not
        read by this function; the default is ``None`` rather than a shared
        mutable ``{}`` so that no state can leak between calls.
    :return: the environment returned by ``gym.make`` after one ``reset()`` call
    """
    # Local import kept from the original so the function is self-contained.
    import gymnasium as gym

    # NOTE(review): the environment ID is hard-coded to FetchReach-v2, which
    # differs from the 'FetchSlide-v2' ID used to build `vec_env` in this
    # notebook -- confirm this is intended before using the two together.
    env = gym.make('FetchReach-v2', render_mode='rgb_array', reward_type='sparse', max_episode_steps=50)
    #env = SeededFetchReachEnv(env, 960)
    env.reset()
    return env

In [None]:
from stable_baselines3 import HerReplayBuffer, DDPG, DQN, SAC, TD3
from stable_baselines3.her.goal_selection_strategy import GoalSelectionStrategy
from stable_baselines3.common.envs import BitFlippingEnv

model_class = SAC  # works also with SAC, DDPG and TD3
N_BITS = 15

# Available strategies (cf paper): future, final, episode
goal_selection_strategy = "future" # equivalent to GoalSelectionStrategy.FUTURE

# Checkpoint location, defined once so save() and load() cannot drift apart.
model_path = "./her_Fetch-Reach_env02"

# Initialize the model on the vectorized training environment.
model = model_class(
    "MultiInputPolicy",
    vec_env,
    replay_buffer_class=HerReplayBuffer,
    # Parameters for HER
    replay_buffer_kwargs=dict(
        n_sampled_goal=4,
        goal_selection_strategy=goal_selection_strategy,
    ),
    learning_starts=600,
    batch_size=256,
    verbose=1,
    learning_rate=0.001,
    tensorboard_log="../R3L-LOGS/Ftch-Slide-04-MPI"
)

# Train the model
model.learn(500000)

model.save(model_path)
# Because it needs access to `env.compute_reward()`
# HER must be loaded with the env.
# The original cell passed an undefined name `env` here, which raised
# NameError only after the whole training run; reload with the same
# vectorized environment the model was trained on instead.
model = model_class.load(model_path, env=vec_env)

# Short deterministic rollout on the vectorized env. The VecEnv API differs
# from a plain gymnasium env: reset() returns only the batched observations,
# step() returns (obs, rewards, dones, infos), and finished sub-environments
# are reset automatically, so no manual reset is needed in the loop.
obs = vec_env.reset()
for _ in range(100):
    action, _states = model.predict(obs, deterministic=True)
    obs, rewards, dones, infos = vec_env.step(action)

Using cuda device
Logging to ../R3L-LOGS/Ftch-Slide-04-MPI/SAC_1
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 50       |
|    ep_rew_mean     | -50      |
|    success_rate    | 0.0      |
| time/              |          |
|    episodes        | 4        |
|    fps             | 413      |
|    time_elapsed    | 0        |
|    total_timesteps | 200      |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 50       |
|    ep_rew_mean     | -43.8    |
|    success_rate    | 0.125    |
| time/              |          |
|    episodes        | 8        |
|    fps             | 372      |
|    time_elapsed    | 1        |
|    total_timesteps | 400      |
---------------------------------
------------------------------------
| rollout/           |             |
|    ep_len_mean     | 50          |
|    ep_rew_mean     | -45.8       |
|    success_rate    | 0.083333336 |
| 

  logger.warn(


---------------------------------
| rollout/           |          |
|    ep_len_mean     | 50       |
|    ep_rew_mean     | -46.9    |
|    success_rate    | 0.0625   |
| time/              |          |
|    episodes        | 16       |
|    fps             | 118      |
|    time_elapsed    | 6        |
|    total_timesteps | 800      |
| train/             |          |
|    actor_loss      | -5.26    |
|    critic_loss     | 0.322    |
|    ent_coef        | 0.953    |
|    ent_coef_loss   | -0.324   |
|    learning_rate   | 0.001    |
|    n_updates       | 49       |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 50       |
|    ep_rew_mean     | -47.5    |
|    success_rate    | 0.05     |
| time/              |          |
|    episodes        | 20       |
|    fps             | 120      |
|    time_elapsed    | 8        |
|    total_timesteps | 1000     |
| train/             |          |
|    actor_los

In [None]:
import matplotlib.pyplot as plt
from IPython import display as ipythondisplay

# Dedicated single environment for visualisation. The original cell used an
# undefined name `env`; build one here from the same `env_id` the model was
# trained on, with rgb_array rendering so render() yields image frames.
env = gym.make(env_id, render_mode='rgb_array')

# Reset before the first render: gymnasium requires reset() to be called
# before render() on a freshly created environment.
obs, info = env.reset()
img = plt.imshow(env.render())

for _ in range(200):
    # `_states` (not `_`) so the loop variable is not rebound inside the body.
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, terminated, truncated, info = env.step(action)
    if terminated or truncated:
        obs, info = env.reset()
    img.set_data(env.render()) # Just update the data
    ipythondisplay.display(plt.gcf())
    ipythondisplay.clear_output(wait=True)