# Our Environment

In [1]:
import torch
import tensorflow as tf
import os

# Report the PyTorch build and the CUDA toolkit it was compiled against.
for label, value in (
    ("PyTorch Version:", torch.__version__),
    ("CUDA Available:", torch.cuda.is_available()),
    ("CUDA Version:", torch.version.cuda),
):
    print(label, value)

# Count the GPUs TensorFlow can see. The stable `tf.config` entry point is
# used instead of the legacy `tf.config.experimental` alias.
tf_gpus = tf.config.list_physical_devices("GPU")
print("Num GPUs Available: ", len(tf_gpus))

2023-09-07 13:02:54.587348: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


PyTorch Version: 2.0.1+cu117
CUDA Available: True
CUDA Version: 11.7
Num GPUs Available:  4


In [4]:
import ray
from ray import air, tune
from ray.rllib.utils.test_utils import check_learning_achieved
from ray.rllib.policy.policy import PolicySpec
from ray.rllib.algorithms.ppo import PPOConfig

from custom_env import CustomEnvironment
from config import run_config

## The RLlib configuration
class Args:
    """Run-level settings for the experiment.

    Every parameter is optional; ``Args()`` yields the defaults listed
    below, so individual settings can be overridden without editing the
    class, e.g. ``Args(stop_iters=50)``.

    Args:
        run: Name of the algorithm to run.
        framework: Deep-learning framework identifier, "tf2" or "torch".
        stop_iters: Number of training iterations used as a stop criterion.
        stop_timesteps: Number of timesteps used as a stop criterion.
        stop_reward: Mean episode reward used as a stop criterion.
        as_test: Whether the run should be checked against ``stop_reward``
            once training has finished.
    """

    def __init__(
        self,
        run="PPO",
        framework="torch",
        stop_iters=5,
        stop_timesteps=100000,
        stop_reward=0.1,
        as_test=False,
    ):
        self.run = run
        self.framework = framework  # "tf2" or "torch"
        self.stop_iters = stop_iters
        self.stop_timesteps = stop_timesteps
        self.stop_reward = stop_reward
        self.as_test = as_test

args = Args()

## Generate the configuration
# `ignore_reinit_error=True` keeps this cell re-runnable in a live kernel:
# if an earlier run was interrupted before `ray.shutdown()` was reached,
# a plain `ray.init()` would raise because Ray is already initialized.
ray.init(ignore_reinit_error=True)

# Driver-side environment instance; the configuration below reads its
# observation/action spaces and its agent table.
env = CustomEnvironment(run_config["env"])

def _shared_policy_spec():
    """Build a fresh PolicySpec; both agent types are configured identically."""
    # NOTE(review): prey and predator both take the index-0 spaces of the
    # driver-side env -- confirm that predators really share them.
    return PolicySpec(
        policy_class=None,  # infer automatically from Algorithm
        observation_space=env.observation_space[0],  # if None infer automatically from env
        action_space=env.action_space[0],  # if None infer automatically from env
        config={"gamma": 0.85},  # main config plus this per-policy override
    )


def _policy_for_agent(agent_id, *_positional, **_keyword):
    """Map an agent id to "prey" (agent_type == 0) or "predator" (anything else)."""
    if env.agents[agent_id].agent_type == 0:
        return "prey"
    return "predator"


# The builder calls are applied in the same order as a single fluent chain.
config = PPOConfig()
config = config.rollouts(rollout_fragment_length="auto", num_rollout_workers=1)
config = config.environment(CustomEnvironment, env_config=run_config["env"])
config = config.framework(args.framework)
config = config.training(
    num_sgd_iter=10,
    sgd_minibatch_size=256,
    train_batch_size=4000,
)
config = config.multi_agent(
    policies={
        "prey": _shared_policy_spec(),
        "predator": _shared_policy_spec(),
    },
    policy_mapping_fn=_policy_for_agent,
    policies_to_train=["prey", "predator"],
)
config = config.rl_module(_enable_rl_module_api=True)
config = config.training(_enable_learner_api=True)
# NOTE(review): the saved output of this cell shows the trial ending in an
# NCCL ALLREDUCE watchdog timeout inside a 4-process group -- confirm whether
# the four single-GPU learner workers requested here are the cause.
config = config.resources(
    num_gpus_per_learner_worker=1,
    num_cpus_per_worker=4,
    num_learner_workers=4,
)


# Stopping criteria, handed to Tune through `RunConfig(stop=...)`.
stop = {
    "training_iteration": args.stop_iters,
    "timesteps_total": args.stop_timesteps,
    "episode_reward_mean": args.stop_reward,
}

## Run the experiment

tuner = tune.Tuner(
    args.run,
    param_space=config.to_dict(),
    run_config=air.RunConfig(stop=stop, verbose=3),
)

try:
    results = tuner.fit()

    # Failed trials are recorded on the result grid; print them here so a
    # failure is not only visible in the progress table.
    if results.errors:
        print(f"{results.num_errors} trial(s) ended with an error:")
        for trial_error in results.errors:
            print(trial_error)

    if args.as_test:
        print("Checking if learning goals were achieved")
        check_learning_achieved(results, args.stop_reward)
finally:
    # Always release the local Ray instance, even when the run or the
    # learning check raises; otherwise it stays alive in the kernel.
    ray.shutdown()



2023-09-07 13:49:03,306	INFO worker.py:1621 -- Started a local Ray instance.
2023-09-07 13:49:13,097	INFO tune.py:657 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949


0,1
Current time:,2023-09-07 14:22:42
Running for:,00:33:29.38
Memory:,503.0/1510.5 GiB

Trial name,# failures,error file
PPO_CustomEnvironment_540a9_00000,1,/home/tcazalet/ray_results/PPO/PPO_CustomEnvironment_540a9_00000_0_2023-09-07_13-49-13/error.txt

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CustomEnvironment_540a9_00000,ERROR,172.17.0.14:39835,2,109.675,8000,-180.471,-168.186,-205.908,1000


[2m[36m(PPO pid=39835)[0m Starting distributed worker processes: ['40168 (172.17.0.14)', '40169 (172.17.0.14)', '40170 (172.17.0.14)', '40171 (172.17.0.14)']
[2m[36m(_WrappedExecutable pid=40168)[0m Setting up process group for: env:// [rank=0, world_size=4]
[2m[36m(PPO pid=39835)[0m Trainable.setup took 27.535 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.
[2m[36m(PPO pid=39835)[0m Install gputil for GPU system monitoring.


Trial name,agent_timesteps_total,connector_metrics,counters,custom_metrics,date,done,episode_len_mean,episode_media,episode_reward_max,episode_reward_mean,episode_reward_min,episodes_this_iter,episodes_total,experiment_tag,hostname,info,iterations_since_restore,node_ip,num_agent_steps_sampled,num_agent_steps_trained,num_env_steps_sampled,num_env_steps_sampled_this_iter,num_env_steps_sampled_throughput_per_sec,num_env_steps_trained,num_env_steps_trained_this_iter,num_env_steps_trained_throughput_per_sec,num_faulty_episodes,num_healthy_workers,num_in_flight_async_reqs,num_remote_worker_restarts,num_steps_trained_this_iter,perf,pid,policy_reward_max,policy_reward_mean,policy_reward_min,sampler_perf,sampler_results,time_since_restore,time_this_iter_s,time_total_s,timers,timestamp,timesteps_total,training_iteration,trial_id
PPO_CustomEnvironment_540a9_00000,80056,"{'ObsPreprocessorConnector_ms': 0.01589655876159668, 'StateBufferConnector_ms': 0.011245906352996826, 'ViewRequirementAgentConnector_ms': 1.3507261872291565}","{'num_env_steps_sampled': 8000, 'num_env_steps_trained': 0, 'num_agent_steps_sampled': 80056, 'num_agent_steps_trained': 0}",{},2023-09-07_13-51-39,False,1000,{},-168.186,-180.471,-205.908,4,8,0,fe2e97c884b6,"{'learner': {'__all__': {'num_agent_steps_trained': 512.0, 'num_env_steps_trained': 7008.0, 'total_loss': 0.011249048166303302}, 'prey': {'total_loss': 0.011249048166303302, 'policy_loss': -0.004245043145475677, 'vf_loss': 0.011553500236900983, 'vf_loss_unclipped': 0.13543880806501496, 'vf_explained_var': -0.030111400152209904, 'entropy': 2.864132779140542, 'mean_kl_loss': 0.005250766952114171, 'curr_lr': 5e-05, 'curr_entropy_coeff': 0.0, 'curr_kl_coeff': 0.10000000149011612}, 'predator': {'total_loss': 0.0034155141580068105, 'policy_loss': -0.007571948348904831, 'vf_loss': 0.009203437678134321, 'vf_loss_unclipped': 0.016845857860261257, 'vf_explained_var': -0.035182035480537555, 'entropy': 2.6974907965555674, 'mean_kl_loss': 0.008920122762169979, 'curr_lr': 5e-05, 'curr_entropy_coeff': 0.0, 'curr_kl_coeff': 0.20000000298023224}}, 'num_env_steps_sampled': 8000, 'num_env_steps_trained': 0, 'num_agent_steps_sampled': 80056, 'num_agent_steps_trained': 0}",2,172.17.0.14,80056,0,8000,4000,79.3613,0,0,0,0,1,0,0,0,"{'cpu_util_percent': 27.529166666666665, 'ram_util_percent': 32.65833333333333}",39835,"{'prey': -10.000017745590373, 'predator': -5.30930330666487}","{'prey': -11.004339080255129, 'predator': -7.702771963824885}","{'prey': -20.018213682750982, 'predator': -10.286724535030958}","{'mean_raw_obs_processing_ms': 5.0570805068171785, 'mean_inference_ms': 4.263618604174226, 'mean_action_processing_ms': 1.4091045959798016, 'mean_env_wait_ms': 0.7565173665295403, 'mean_env_render_ms': 0.0}","{'episode_reward_max': -168.18557419865178, 'episode_reward_min': 
-205.9078273878741, 'episode_reward_mean': -180.470630131473, 'episode_len_mean': 1000.0, 'episode_media': {}, 'episodes_this_iter': 4, 'policy_reward_min': {'prey': -20.018213682750982, 'predator': -10.286724535030958}, 'policy_reward_max': {'prey': -10.000017745590373, 'predator': -5.30930330666487}, 'policy_reward_mean': {'prey': -11.004339080255129, 'predator': -7.702771963824885}, 'custom_metrics': {}, 'hist_stats': {'episode_reward': [-176.73007458832748, -168.18557419865178, -205.9078273878741, -186.21461408245963, -171.00724767910626, -191.29643696037257, -173.60985362675117, -170.81341252824083], 'episode_lengths': [1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000], 'policy_prey_reward': [-11.297270217849853, -10.018920058141273, -11.10468878310532, -10.007679710663314, -10.012103181017379, -11.132937203002562, -10.007063167804144, -10.99930513517291, -11.351434971007919, -10.011995595491562, -10.015792867790598, -10.020000000427286, -11.169568456729872, -11.50484213342416, -11.505469133888944, -10.011237864715516, -10.014010721946876, -10.012540756905995, -10.012497203066388, -11.375207322160653, -10.017897800485413, -10.011329213100295, -10.020000000427286, -10.011534212607815, -10.000017745590373, -10.006680043294052, -11.332454569286346, -11.463738998561645, -11.523662309511938, -10.006622524857397, -11.301981095974842, -11.2683206837666, -11.15885070691624, -20.00579366091105, -11.632797713668792, -11.271445796813785, -11.316454933203529, -20.016589091475492, -11.251291016538472, -11.198121146362672, -11.371809734871519, -11.191689498298555, -11.29045065444914, -11.361531870913629, -11.518455581999872, -11.446901541252332, -11.54629840645269, -11.591913625307459, -11.65803583943265, -11.480862942319105, -10.00413056525053, -10.004209727141054, -11.335690259473528, -10.002827149624522, -11.505497364544741, -11.653769531172024, -11.720885439533394, -10.01216636211859, -11.630227324452413, -11.688230335300219, -11.06189769514878, -11.18352672356134, 
-10.004616895007635, -10.012158798929487, -10.003207694373039, -10.007088654152945, -10.018897043297809, -11.662483088767413, -11.024152996824217, -11.29545205322791, -10.004418522988342, -10.020000000427286, -10.01657309470586, -10.017260982970518, -11.334549484669969, -20.013038581317495, -11.01637543175806, -10.994415756466285, -10.010000000278275, -10.010000000278275, -10.843449949399577, -11.052714956974562, -20.018213682750982, -10.94392069778632, -11.13444682544208, -10.761377623142007, -10.010000000278275, -10.010000000278275, -10.008636200214172, -10.871571558783131, -10.835415197671121, -10.015824305022877, -10.020000000427286, -11.079721698715517, -11.221416986896708, -10.006366684021533, -11.08519472882599, -10.004770558232751, -10.999211240560097, -10.015569143126529, -10.004757505656306, -10.01264050606879, -10.9452540532619, -10.723349778704996, -10.774579599482811, -10.015207047144692, -10.020000000427286, -11.262262699049552, -10.010000000278275, -10.010000000278275, -10.013417937363307, -10.020000000427286, -11.26284038831843, -11.204471379281118, -10.017316999415677, -11.084442210593595, -10.007663872936948, -11.02605253398473, -11.033559845320656, -10.009206233345518], 'policy_predator_reward': [-7.91576464166239, -8.655239331150312, -5.590650953824275, -6.775491958313885, -9.551985843986717, -9.200258357729737, -8.646243134058139, -10.286724535030958, -6.1710423149318645, -7.169921635124246, -5.30930330666487, -8.288972388564266, -7.436949666976132, -8.428831973102302, -8.253002536229111, -5.5639688438489365]}, 'sampler_perf': {'mean_raw_obs_processing_ms': 5.0570805068171785, 'mean_inference_ms': 4.263618604174226, 'mean_action_processing_ms': 1.4091045959798016, 'mean_env_wait_ms': 0.7565173665295403, 'mean_env_render_ms': 0.0}, 'num_faulty_episodes': 0, 'connector_metrics': {'ObsPreprocessorConnector_ms': 0.01589655876159668, 'StateBufferConnector_ms': 0.011245906352996826, 'ViewRequirementAgentConnector_ms': 
1.3507261872291565}}",109.675,50.4062,109.675,"{'training_iteration_time_ms': 54833.144, 'sample_time_ms': 45111.077, 'synch_weights_time_ms': 14.388}",1694094699,8000,2,540a9_00000


[2m[36m(_WrappedExecutable pid=40168)[0m [E ProcessGroupNCCL.cpp:828] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=1869, OpType=ALLREDUCE, Timeout(ms)=1800000) ran for 1800475 milliseconds before timing out.
[2m[36m(_WrappedExecutable pid=40168)[0m [E ProcessGroupNCCL.cpp:455] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
[2m[36m(_WrappedExecutable pid=40168)[0m [E ProcessGroupNCCL.cpp:460] To avoid data inconsistency, we are taking the entire process down.
[2m[36m(_WrappedExecutable pid=40168)[0m [2023-09-07 14:22:34,820 E 40168 40567] logging.cc:97: Unhandled exception: St13runtime_error. what(): [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=1869, OpType=ALLREDUCE, Timeout(ms)=1800000) ran for 1800475 milliseconds before timing out.
[2m[36m(_WrappedExecutable pid=40168)[0m [2023-09-07 14:22:34,912 E 40168 