In [1]:
import os
from functools import partial

import gymnasium as gym
import numpy as np
from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler
from stable_baselines3 import SAC
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3 import HerReplayBuffer, DDPG, DQN, SAC, TD3
from stable_baselines3.her.goal_selection_strategy import GoalSelectionStrategy
from stable_baselines3.common.envs import BitFlippingEnv

In [2]:
from stable_baselines3.common.callbacks import BaseCallback

class ReportRewardCallback(BaseCallback):
    """Forward an ``episode_reward_mean`` metric to Ray Tune.

    The callback supports two ways of being attached:

    * As a regular training callback: on every step it looks for finished
      episodes in ``self.locals["infos"]`` (entries carrying an ``"episode"``
      dict) and reports the mean of their returns.
    * As a child of an ``EvalCallback`` (e.g. ``callback_on_new_best``): in
      that position the training ``locals`` are not expected to be populated,
      so the callback falls back to the parent's ``last_mean_reward``.
      NOTE(review): relies on the parent exposing ``last_mean_reward`` as
      ``EvalCallback`` does -- confirm against the installed SB3 version.

    Args:
        verbose: verbosity level passed through to ``BaseCallback``.
    """

    def __init__(self, verbose=0):
        super().__init__(verbose)

    def _on_step(self) -> bool:
        """Report the latest reward to Tune; always returns True (keep training)."""
        infos = self.locals.get("infos")
        if infos is None:
            infos = ()

        # Returns of the episodes that finished on this step, if any.
        episode_rewards = [info["episode"]["r"] for info in infos if "episode" in info]
        if episode_rewards:
            tune.report(episode_reward_mean=np.mean(episode_rewards))
            return True

        # No step infos available: use the evaluation result of the parent
        # callback. The finiteness check skips a parent that has not produced
        # a real evaluation result yet.
        parent_reward = getattr(getattr(self, "parent", None), "last_mean_reward", None)
        if parent_reward is not None and np.isfinite(parent_reward):
            tune.report(episode_reward_mean=float(parent_reward))
        return True


In [3]:
# Environment configuration shared by the cells below.
env_id = "FetchPickAndPlace-v2"  # Gymnasium id of the task to train on
num_cpu = 4                      # number of environment copies in the vectorized env

from stable_baselines3.common.env_util import make_vec_env

# Vectorized environment holding `num_cpu` copies of `env_id`; no `vec_env_cls`
# is passed, so make_vec_env picks its default vectorization backend.
vec_env = make_vec_env(env_id, n_envs=num_cpu)


In [4]:
def train_sac(config, checkpoint_dir=None, data_dir=None):
    """Train one SAC + HER trial with the hyperparameters in ``config``.

    Args:
        config: dict providing ``"lr"``, ``"buffer_size"``,
            ``"learning_starts"``, ``"train_freq"``, ``"gradient_steps"``,
            ``"ent_coef"`` and ``"tau"``.
        checkpoint_dir: optional directory. If it already contains
            ``checkpoint.zip`` the saved parameters are restored before
            training; after training the model is saved there again together
            with a placeholder ``results.npy``.
        data_dir: unused; kept so existing callers that pass it keep working.
    """
    # Build the environments inside the trial instead of closing over a
    # module-level instance: every trial gets its own simulator and no live
    # environment has to be shipped to the worker process.
    train_env = make_vec_env(env_id, n_envs=num_cpu)
    # Evaluation needs its own environment; evaluating on the training env
    # would reset and step it in the middle of data collection.
    eval_env = make_vec_env(env_id, n_envs=1)

    try:
        goal_selection_strategy = "future"
        model = SAC(
            "MultiInputPolicy",
            train_env,
            learning_rate=config["lr"],
            replay_buffer_class=HerReplayBuffer,
            # Parameters for HER
            replay_buffer_kwargs=dict(
                n_sampled_goal=4,
                goal_selection_strategy=goal_selection_strategy,
            ),
            buffer_size=config["buffer_size"],
            learning_starts=config["learning_starts"],
            train_freq=config["train_freq"],
            gradient_steps=config["gradient_steps"],
            ent_coef=config["ent_coef"],
            tau=config["tau"],
            verbose=1
        )

        # Single file name used for both restoring and saving.
        checkpoint_file = (
            os.path.join(checkpoint_dir, "checkpoint.zip") if checkpoint_dir else None
        )

        if checkpoint_file and os.path.exists(checkpoint_file):
            # `model.load(...)` is a classmethod returning a *new* model, so
            # calling it and dropping the result restores nothing. Load the
            # saved parameters into this model in place instead, which also
            # keeps the hyperparameters chosen for this trial.
            model.set_parameters(checkpoint_file)

        eval_callback = EvalCallback(
            eval_env,
            best_model_save_path="./logs/",
            log_path="./logs/",
            # Counted in callback calls, i.e. once per vectorized step.
            eval_freq=5000,
            deterministic=True,
            render=False,
            callback_on_new_best=ReportRewardCallback()
        )

        model.learn(
            total_timesteps=100000, callback=eval_callback
        )

        if checkpoint_file:
            model.save(checkpoint_file)
            np.save(os.path.join(checkpoint_dir, "results.npy"), np.array([1.0]))
    finally:
        # Release the simulators even when training fails.
        train_env.close()
        eval_env.close()

In [None]:
if __name__ == "__main__":
    # Hyperparameter search space sampled independently for every trial.
    config = {
        "lr": tune.loguniform(1e-4, 1e-1),
        "buffer_size": tune.choice([50000, 100000, 200000]),
        "learning_starts": tune.choice([0, 1000, 10000]),
        "train_freq": tune.choice([1, 10, 100]),
        "gradient_steps": tune.choice([1, 10, 100]),
        "ent_coef": tune.loguniform(1e-4, 1e-1),
        "tau": tune.uniform(0.005, 0.02),
    }

    # NOTE(review): no `time_attr` is given, so `max_t` / `grace_period` are
    # presumably compared against Tune's per-report iteration counter rather
    # than environment timesteps -- confirm that these values are intended.
    scheduler = ASHAScheduler(
        metric="episode_reward_mean",
        mode="max",
        max_t=100000,
        grace_period=10000,
        reduction_factor=2,
    )

    reporter = CLIReporter(
        metric_columns=["episode_reward_mean", "training_iteration"]
    )

    result = tune.run(
        partial(train_sac, data_dir="/tmp"),
        resources_per_trial={"cpu": 1},
        config=config,
        num_samples=10,
        scheduler=scheduler,
        progress_reporter=reporter,
    )

    best_trial = result.get_best_trial("episode_reward_mean", "max", "last")
    if best_trial is None:
        # Happens when no trial ever reported the metric (for example when
        # every trial failed); avoid an AttributeError on `None`.
        print("No trial reported 'episode_reward_mean'; no best trial available.")
    else:
        print(f"Best trial config: {best_trial.config}")
        print(f"Best trial final reward: {best_trial.last_result['episode_reward_mean']}")


2023-08-16 16:29:37,321	INFO worker.py:1612 -- Started a local Ray instance. View the dashboard at 127.0.0.1:8265
2023-08-16 16:29:38,329	INFO tune.py:226 -- Initializing Ray automatically. For cluster usage or custom Ray initialization, call `ray.init(...)` before `tune.run(...)`.
2023-08-16 16:29:38,362	INFO tune.py:657 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949

from ray.air import session

def train(config):
    # ...
    session.report({"metric": metric}, checkpoint=checkpoint)

For more information please see https://docs.ray.io/en/latest/tune/api/trainable.html



== Status ==
Current time: 2023-08-16 16:29:43 (running for 00:00:05.28)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 80000.000: None | Iter 40000.000: None | Iter 20000.000: None | Iter 10000.000: None
Logical resource usage: 10.0/20 CPUs, 0/0 GPUs
Result logdir: /home/cocp5/ray_results/train_sac_2023-08-16_16-29-38
Number of trials: 10/10 (10 PENDING)
+-----------------------+----------+-------+---------------+-------------+------------------+-------------------+-------------+------------+--------------+
| Trial name            | status   | loc   |   buffer_size |    ent_coef |   gradient_steps |   learning_starts |          lr |        tau |   train_freq |
|-----------------------+----------+-------+---------------+-------------+------------------+-------------------+-------------+------------+--------------|
| train_sac_b64c9_00000 | PENDING  |       |        200000 | 0.000117486 |                1 |             10000 | 0.000505342 | 0.00753809 |           10 |
| train_sac_b64

2023-08-16 16:29:52,023	ERROR tune_controller.py:911 -- Trial task failed for trial train_sac_b64c9_00005
Traceback (most recent call last):
  File "/home/cocp5/anaconda3/envs/R3L/lib/python3.9/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/home/cocp5/anaconda3/envs/R3L/lib/python3.9/site-packages/ray/_private/auto_init_hook.py", line 24, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/home/cocp5/anaconda3/envs/R3L/lib/python3.9/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/home/cocp5/anaconda3/envs/R3L/lib/python3.9/site-packages/ray/_private/worker.py", line 2493, in get
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(RuntimeError): [36mray::ImplicitFunc.train()[39m (pid=57767, ip=172.21.2.236, actor_id=3d2f6f364c793b3d1aa7571601000000, repr=func)
  File "/home/cocp5/anaconda3/envs/R3L/lib/python3.9/site-pac

Trial name
train_sac_b64c9_00003
train_sac_b64c9_00005
train_sac_b64c9_00007


2023-08-16 16:29:52,090	ERROR tune_controller.py:911 -- Trial task failed for trial train_sac_b64c9_00003
Traceback (most recent call last):
  File "/home/cocp5/anaconda3/envs/R3L/lib/python3.9/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/home/cocp5/anaconda3/envs/R3L/lib/python3.9/site-packages/ray/_private/auto_init_hook.py", line 24, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/home/cocp5/anaconda3/envs/R3L/lib/python3.9/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/home/cocp5/anaconda3/envs/R3L/lib/python3.9/site-packages/ray/_private/worker.py", line 2493, in get
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(RuntimeError): [36mray::ImplicitFunc.train()[39m (pid=57765, ip=172.21.2.236, actor_id=92cb131a3ca234887619fe9901000000, repr=func)
  File "/home/cocp5/anaconda3/envs/R3L/lib/python3.9/site-pac

[2m[36m(func pid=57770)[0m ---------------------------------
[2m[36m(func pid=57770)[0m | rollout/           |          |
[2m[36m(func pid=57770)[0m |    ep_len_mean     | 50       |
[2m[36m(func pid=57770)[0m |    ep_rew_mean     | -50      |
[2m[36m(func pid=57770)[0m |    success_rate    | 0.0      |
[2m[36m(func pid=57770)[0m | time/              |          |
[2m[36m(func pid=57770)[0m |    episodes        | 4        |
[2m[36m(func pid=57770)[0m |    fps             | 534      |
[2m[36m(func pid=57770)[0m |    time_elapsed    | 0        |
[2m[36m(func pid=57770)[0m |    total_timesteps | 200      |
[2m[36m(func pid=57770)[0m ---------------------------------
[2m[36m(func pid=57763)[0m ------------------------------------
[2m[36m(func pid=57763)[0m ------------------------------------


[2m[36m(func pid=57763)[0m   logger.warn(


== Status ==
Current time: 2023-08-16 16:29:53 (running for 00:00:15.43)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 80000.000: None | Iter 40000.000: None | Iter 20000.000: None | Iter 10000.000: None
Logical resource usage: 7.0/20 CPUs, 0/0 GPUs
Result logdir: /home/cocp5/ray_results/train_sac_2023-08-16_16-29-38
Number of trials: 10/10 (3 ERROR, 7 RUNNING)
+-----------------------+----------+--------------------+---------------+-------------+------------------+-------------------+-------------+------------+--------------+
| Trial name            | status   | loc                |   buffer_size |    ent_coef |   gradient_steps |   learning_starts |          lr |        tau |   train_freq |
|-----------------------+----------+--------------------+---------------+-------------+------------------+-------------------+-------------+------------+--------------|
| train_sac_b64c9_00000 | RUNNING  | 172.21.2.236:57762 |        200000 | 0.000117486 |                1 |             10000 

[2m[36m(func pid=57766)[0m   logger.warn([32m [repeated 4x across cluster][0m


[2m[36m(func pid=57764)[0m | train/             |          |[32m [repeated 18x across cluster][0m
[2m[36m(func pid=57764)[0m |    actor_loss      | 0.279    |[32m [repeated 18x across cluster][0m
[2m[36m(func pid=57764)[0m |    critic_loss     | 0.156    |[32m [repeated 18x across cluster][0m
[2m[36m(func pid=57764)[0m |    ent_coef        | 0.0292   |[32m [repeated 18x across cluster][0m
[2m[36m(func pid=57764)[0m |    learning_rate   | 0.000134 |[32m [repeated 18x across cluster][0m
[2m[36m(func pid=57764)[0m |    n_updates       | 159      |[32m [repeated 18x across cluster][0m
[2m[36m(func pid=57762)[0m ---------------------------------[32m [repeated 102x across cluster][0m
[2m[36m(func pid=57762)[0m | rollout/           |          |[32m [repeated 51x across cluster][0m
[2m[36m(func pid=57762)[0m |    ep_len_mean     | 50       |[32m [repeated 51x across cluster][0m
[2m[36m(func pid=57762)[0m |    ep_rew_mean     | -47.5    |[32m [re

[2m[36m(func pid=57762)[0m   logger.warn([32m [repeated 3x across cluster][0m


== Status ==
Current time: 2023-08-16 16:30:34 (running for 00:00:55.80)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 80000.000: None | Iter 40000.000: None | Iter 20000.000: None | Iter 10000.000: None
Logical resource usage: 7.0/20 CPUs, 0/0 GPUs
Result logdir: /home/cocp5/ray_results/train_sac_2023-08-16_16-29-38
Number of trials: 10/10 (3 ERROR, 7 RUNNING)
+-----------------------+----------+--------------------+---------------+-------------+------------------+-------------------+-------------+------------+--------------+
| Trial name            | status   | loc                |   buffer_size |    ent_coef |   gradient_steps |   learning_starts |          lr |        tau |   train_freq |
|-----------------------+----------+--------------------+---------------+-------------+------------------+-------------------+-------------+------------+--------------|
| train_sac_b64c9_00000 | RUNNING  | 172.21.2.236:57762 |        200000 | 0.000117486 |                1 |             10000 

[2m[36m(func pid=57764)[0m   logger.warn([32m [repeated 2x across cluster][0m


[2m[36m(func pid=57763)[0m ------------------------------------[32m [repeated 2x across cluster][0m
[2m[36m(func pid=57764)[0m ---------------------------------[32m [repeated 74x across cluster][0m
[2m[36m(func pid=57764)[0m | rollout/           |          |[32m [repeated 37x across cluster][0m
[2m[36m(func pid=57764)[0m |    ep_len_mean     | 50       |[32m [repeated 37x across cluster][0m
[2m[36m(func pid=57764)[0m |    ep_rew_mean     | -48      |[32m [repeated 37x across cluster][0m
[2m[36m(func pid=57764)[0m |    success_rate    | 0.04     |[32m [repeated 38x across cluster][0m
[2m[36m(func pid=57764)[0m | time/              |          |[32m [repeated 38x across cluster][0m
[2m[36m(func pid=57764)[0m |    episodes        | 424      |[32m [repeated 37x across cluster][0m
[2m[36m(func pid=57764)[0m |    fps             | 415      |[32m [repeated 37x across cluster][0m
[2m[36m(func pid=57764)[0m |    time_elapsed    | 51       |[32m [r

[2m[36m(func pid=57766)[0m   logger.warn(


[2m[36m(func pid=57766)[0m Eval num_timesteps=20000, episode_reward=-50.00 +/- 0.00
[2m[36m(func pid=57766)[0m Episode length: 50.00 +/- 0.00
[2m[36m(func pid=57766)[0m Success rate: 0.00%
[2m[36m(func pid=57766)[0m | eval/              |          |
[2m[36m(func pid=57766)[0m |    mean_ep_length  | 50       |
[2m[36m(func pid=57766)[0m |    mean_reward     | -50      |
[2m[36m(func pid=57764)[0m ---------------------------------[32m [repeated 58x across cluster][0m
[2m[36m(func pid=57764)[0m | rollout/           |          |[32m [repeated 28x across cluster][0m
[2m[36m(func pid=57764)[0m |    ep_len_mean     | 50       |[32m [repeated 28x across cluster][0m
[2m[36m(func pid=57764)[0m |    ep_rew_mean     | -46      |[32m [repeated 28x across cluster][0m
[2m[36m(func pid=57764)[0m |    success_rate    | 0.08     |[32m [repeated 29x across cluster][0m
[2m[36m(func pid=57764)[0m | time/              |          |[32m [repeated 29x across clust