# Import library

In [2]:
import time
import numpy as np
from ray import tune, train
import ray
from ray import tune
from ray.rllib.algorithms.a2c import A2C, A2CConfig

import gymnasium as gym
from ray.tune.registry import register_env

# Policy gradient

In [None]:
# Training configuration for an A2C grid search on CartPole-v1.
#
# Only settings that belong to A2C's documented configuration are kept. The
# earlier version of this dict also carried PPO-style SGD keys
# ("num_sgd_iter", "sgd_minibatch_size"), replay-buffer keys
# ("prioritized_replay*", "buffer_size") and an "optimization" sub-dict;
# none of those are A2C options, so they only made the setup misleading.
train_config = {
    "env": "CartPole-v1",  # MyCustomEnv_v0,
    "framework": "torch",
    "num_workers": 2,
    "num_cpus_per_worker": 3,
    "model": {
        "fcnet_hiddens": [512, 512, 256],
        "fcnet_activation": "relu",
    },
    # Tune expands this into one trial per learning rate.
    "lr": tune.grid_search([0.001, 0.0001]),
    "gamma": 0.99,
    "rollout_fragment_length": 500,
    "train_batch_size": 4000,
    "exploration_config": {},
}

# Stopping criteria belong to Tune, not to the algorithm config, so the
# episode budget that used to sit under train_config["stop"] lives here.
# A trial ends as soon as any one of these thresholds is reached.
stop_criteria = {
    "episode_reward_mean": 400,
    "episodes_total": 5000000,
}

# start to train
results = tune.run(
    A2C,
    config=train_config,
    stop=stop_criteria,
    verbose=1,
    # Checkpoint every iteration but keep only the most recent one on disk.
    checkpoint_freq=1,
    keep_checkpoints_num=1,
    checkpoint_score_attr="training_iteration",
)

In [3]:
# Build an A2CConfig with no overrides so its settings can be inspected.
config = A2CConfig()

In [4]:
# Bare expression: as the last line of a notebook cell it shows the object's repr.
config

<ray.rllib.algorithms.a2c.a2c.A2CConfig at 0x29e29cc10>

In [None]:
# Template for training-related settings.
# None marks a value that has not been chosen yet; replace it (or drop the
# key) before handing this dict to RLlib.
training_config = {
    "gamma": 0.99,  # discount factor
    "lr": tune.grid_search([0.001, 0.0001]),
    "grad_clip": None,  # TODO: pick a clipping threshold
    "grad_clip_by": None,  # TODO: pick the clipping mode
    "train_batch_size": 512,
    "model": {},  # TODO: fill in model options
    "optimizer": None,  # TODO: fill in optimizer options
}

# Templates for the remaining groups of algorithm settings.
# None marks a value that has not been chosen yet; replace it (or drop the
# key) before handing any of these dicts to RLlib.

environment_config = {
    "env": "CartPole-v1",
    # If True, try to render the environment on the local worker or on worker 1
    "render_env": False,
    "auto_wrap_old_gym_envs": None,  # TODO
}

algorithm_config = {
    "framework": "torch",
}

rollout_config = {
    "num_rollout_workers": None,  # TODO
    "num_envs_per_worker": None,  # TODO
    # Override the SampleCollector base class to implement your own
    # collection/buffering/retrieval logic.
    "sample_collector": None,  # TODO
}

evaluation_config = {
    "evaluation_interval": None,  # TODO
    "evaluation_duration": None,  # TODO
    # Key was previously misspelled as "evaluation_duration_utit".
    "evaluation_duration_unit": None,  # TODO
    "custom_evaluation_function": None,  # TODO
}

reporting_config = {}

checkpointing_config = {}

callback_config = {}

resources_config = {
    "num_gpus": None,  # TODO
    "num_cpus_per_worker": None,  # TODO
    "num_gpus_per_worker": None,  # TODO
    "num_learner_workers": None,  # TODO
    "num_cpus_per_learner_worker": None,  # TODO
    "num_gpus_per_learner_worker": None,  # TODO
    "local_gpu_idx": None,  # TODO
}

# Scaling Training with RLlib: Best Practices

## 1. Sample-Efficient Off-Policy Algorithms (e.g., DQN, SAC)

When the environment is slow and cannot be replicated, especially in scenarios involving physical systems, sample-efficient off-policy algorithms like DQN or SAC are recommended. These algorithms default to `num_workers: 0` for single-process operation. Be sure to set `num_gpus: 1` if GPU usage is desired. Additionally, consider exploring batch RL training with the offline data API.

## 2. Time-Efficient Algorithms (e.g., PPO, IMPALA, APEX)

For fast environments and small models (typical in RL scenarios), opt for time-efficient algorithms like PPO, IMPALA, or APEX. These algorithms can be efficiently scaled by increasing the `num_workers` to add rollout workers. Enabling vectorization for inference can further enhance efficiency. Don't forget to set `num_gpus: 1` if GPU acceleration is desired. If the learner becomes a bottleneck, leverage multiple GPUs for learning by setting `num_gpus > 1`.

## 3. Compute-Intensive Models and GPU Allocation

In scenarios where the model demands significant computational power (e.g., large deep residual networks) and inference becomes the bottleneck, consider allocating GPUs to workers. This can be achieved by setting `num_gpus_per_worker: 1`. If you possess only a single GPU, consider setting `num_workers: 0` to utilize the learner GPU for inference. For optimal GPU utilization, use a small number of GPU workers and a large number of environments per worker.

## 4. Scaling with Remote Worker Environments and Async Batching

When both the model and environment are compute-intensive, enabling remote worker environments with async batching can significantly enhance efficiency. Set `remote_worker_envs: True` and optionally configure `remote_env_batch_wait_ms`. This configuration batches inference on GPUs in the rollout workers while allowing environments to run asynchronously in separate actors, resembling the SEED architecture. To maximize GPU utilization, fine-tune the number of workers and environments per worker. If your environment requires GPUs to function or multi-node SGD is necessary, consider exploring DD-PPO.

These guidelines provide valuable insights into optimizing training efficiency based on the specific characteristics of your environment and model in RLlib.


If you are using a large number of workers (`num_workers >> 10`) and you observe worker failures for whatever reason, which would normally interrupt your RLlib training runs, consider using the config settings `ignore_worker_failures=True`, `recreate_failed_workers=True`, or `restart_failed_sub_environments=True`:

- `ignore_worker_failures`: When set to True, your Algorithm will not crash due to a single worker error but continue for as long as there is at least one functional worker remaining.
- `recreate_failed_workers`: When set to True, your Algorithm will attempt to replace/recreate any failed worker(s) with newly created one(s). This way, your number of workers will never decrease, even if some of them fail from time to time.
- `restart_failed_sub_environments`: When set to True and there is a failure in one of the vectorized sub-environments in one of your workers, the worker will try to recreate only the failed sub-environment and re-integrate the newly created one into your vectorized env stack on that worker.

Note that only one of ignore_worker_failures or recreate_failed_workers may be set to True (they are mutually exclusive settings). However, you can combine each of these with the restart_failed_sub_environments=True setting. Using these options will make your training runs much more stable and more robust against occasional OOM or other similar “once in a while” errors on your workers themselves or inside your environments.

The "monitor": true config can be used to save Gym episode videos to the result dir. For example: