# Some preliminary checks

In [1]:
import torch
import tensorflow as tf
import os

# Turn off Ray's log de-duplication (must be set before Ray is imported).
os.environ["RAY_DEDUP_LOGS"] = "0"

# Report the PyTorch build and whether a CUDA device is usable from it.
environment_report = (
    ("PyTorch Version:", torch.__version__),
    ("CUDA Available:", torch.cuda.is_available()),
    ("CUDA Version:", torch.version.cuda),
)
for label, value in environment_report:
    print(label, value)

PyTorch Version: 2.1.0
CUDA Available: False
CUDA Version: None


In [2]:
import psutil

# Derive the resource budget from the machine instead of hard-coding it, so the
# notebook does not have to be edited by hand on every new host.

# psutil.cpu_count() may return None when the count cannot be determined;
# fall back to a single CPU in that case.
num_cpus = psutil.cpu_count() or 1
# Number of CUDA devices visible to PyTorch (0 on CPU-only machines).
num_gpus = torch.cuda.device_count()

# print number of gpus / CPUs
print("Number of CPUs: ", num_cpus)

Number of CPUs:  12


# Training

In [3]:
from ray.rllib.policy.policy import PolicySpec
from ray.tune.registry import get_trainable_cls

from custom_env import CustomEnvironment
from config import run_config

## The RLlib configuration
class Args:
    """Small attribute container standing in for parsed command-line arguments.

    Attributes:
        run: name of the algorithm looked up in the Tune registry.
        framework: deep-learning backend, "tf2" or "torch".
    """

    def __init__(self):
        defaults = {
            "run": "PPO",
            "framework": "torch",  # "tf2" or "torch"
        }
        for field, value in defaults.items():
            setattr(self, field, value)


args = Args()

## Generate the configuration
# Driver-side environment instance; it is only used here to read the spaces
# and to look up agent types in the policy mapping function.
env = CustomEnvironment(run_config["env"])


def _make_policy_spec():
    """Build the PolicySpec used identically by the "prey" and "predator" policies."""
    # NOTE(review): both policies take their spaces from index 0 of the env's
    # spaces — confirm that prey and predators really share the same spaces.
    return PolicySpec(
        policy_class=None,  # infer automatically from Algorithm
        observation_space=env.observation_space[0],  # if None infer automatically from env
        action_space=env.action_space[0],  # if None infer automatically from env
        config={"gamma": 0.85},  # use main config plus this override
    )


def _policy_for_agent(agent_id, *_args, **_kwargs):
    """Map an agent to "prey" when its agent_type is 0, otherwise to "predator".

    The first parameter is deliberately not called `id`, to avoid shadowing the builtin.
    NOTE(review): this closes over the driver-side `env` above, so agent types are
    read from that instance — confirm they match the environments built by the workers.
    """
    return "prey" if env.agents[agent_id].agent_type == 0 else "predator"


config = (
    get_trainable_cls(args.run)
    .get_default_config()
    .environment(CustomEnvironment, env_config=run_config["env"])
    .framework(args.framework)
    .training(_enable_learner_api=True, num_sgd_iter=10, sgd_minibatch_size=256, train_batch_size=20000)
    .multi_agent(
        policies={
            "prey": _make_policy_spec(),
            "predator": _make_policy_spec(),
        },
        policy_mapping_fn=_policy_for_agent,
        policies_to_train=["prey", "predator"],
    )
    .rl_module(_enable_rl_module_api=True)
    .rollouts(
        rollout_fragment_length="auto",
        batch_mode='truncate_episodes',
        # Keep one CPU free for the driver / learner process.
        num_rollout_workers=num_cpus-1,
        num_envs_per_worker=1,
        #create_env_on_local_worker=False,
    )
    # This has to be specified every time (no known way to adjust it automatically).
    # All options are currently commented out, so this call leaves the defaults in place.
    .resources(
        #num_gpus = num_gpus,
        #num_gpus_per_worker=0,
        #num_cpus_per_worker=2,
        # learner workers
        #num_learner_workers=num_gpus,
        #num_gpus_per_learner_worker=1,
        #num_cpus_per_learner_worker=0,
    )
    .checkpointing(export_native_model_files=True)
)

2023-10-09 14:05:12,245	INFO util.py:159 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
2023-10-09 14:05:12,296	INFO util.py:159 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
  gym.logger.warn(f"Box bound precision lowered by casting to {self.dtype}")
  logger.warn(
  logger.warn(f"{pre} is not within the observation space.")


## To load a previously trained policy

In [4]:
from ray.rllib.policy.policy import Policy
from ray.rllib.algorithms.callbacks import DefaultCallbacks

# No checkpoint is selected by default; assign a checkpoint directory here to
# start training from previously saved policy weights.
path_to_checkpoint = None


def restore_weights(path_to_checkpoint, policy_type):
    """Return the weights of one policy saved inside an algorithm checkpoint.

    Args:
        path_to_checkpoint: directory of the algorithm checkpoint.
        policy_type: name of the policy sub-directory under `policies/`.

    Returns:
        Whatever `Policy.get_weights()` yields for the restored policy.
    """
    policy_dir = os.path.join(path_to_checkpoint, f"policies/{policy_type}")
    return Policy.from_checkpoint(policy_dir).get_weights()

if path_to_checkpoint is not None:
    class RestoreWeightsCallback(DefaultCallbacks):
        """Callback that loads saved "predator" and "prey" weights into a new algorithm.

        The checkpoint is read inside `on_algorithm_init` rather than in `__init__`:
        the previous `__init__` override skipped `DefaultCallbacks.__init__` and read
        both policy checkpoints from disk every time the callback class was instantiated.
        """

        def on_algorithm_init(self, *, algorithm: "Algorithm", **kwargs) -> None:
            """Overwrite both policies' weights right after the algorithm is built."""
            # NOTE(review): the config enables the learner / RLModule APIs — confirm
            # that weights set through `Algorithm.set_weights` are the ones trained on.
            algorithm.set_weights(
                {
                    policy_type: restore_weights(path_to_checkpoint, policy_type)
                    for policy_type in ("predator", "prey")
                }
            )

    config.callbacks(RestoreWeightsCallback)



In [5]:
import ray 
from ray import train, tune
from ray.air.integrations.wandb import WandbLoggerCallback
from ray.rllib.utils.test_utils import check_learning_achieved

# Stopping criteria and test switch for this experiment.
opti_config = {
    'stop_iters': 500,
    'stop_timesteps': 2000000,
    'stop_reward': 0.1,
    'as_test': False
}

ray.init(num_cpus=num_cpus, num_gpus=num_gpus)

# Everything after ray.init() runs under try/finally so that the local Ray
# instance is shut down even when the run fails; otherwise re-running this
# cell would hit an already-initialised Ray.
try:
    print("num CPUS rays sees :", ray.cluster_resources().get('CPU', 0))
    print("num GPUS rays sees :", ray.cluster_resources().get('GPU', 0))

    ## Run the experiment
    tuner = tune.Tuner(
        args.run,
        param_space=config,
        run_config=train.RunConfig(
            # The trial stops as soon as any one of these thresholds is reached.
            stop={
                "training_iteration": opti_config["stop_iters"],
                "timesteps_total": opti_config["stop_timesteps"],
                "episode_reward_mean": opti_config["stop_reward"],
            },
            verbose=3,
            callbacks=[WandbLoggerCallback(
                project="marl-rllib",
                group="PPO",
                # SECURITY: never commit the W&B key to the notebook. It is read from
                # the WANDB_API_KEY environment variable instead; the key that used to
                # be hard-coded here must be treated as leaked and revoked.
                api_key=os.environ.get("WANDB_API_KEY"),
                log_config=True,
            )],
            checkpoint_config=train.CheckpointConfig(
                checkpoint_at_end=True,
                checkpoint_frequency=10
            ),
        ),
    )
    results = tuner.fit()

    if opti_config["as_test"]:
        print("Checking if learning goals were achieved")
        check_learning_achieved(results, opti_config["stop_reward"])
finally:
    ray.shutdown()


2023-10-09 14:05:14,368	INFO worker.py:1642 -- Started a local Ray instance.
2023-10-09 14:05:14,858	INFO tune.py:645 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949


num CPUS rays sees : 12.0
num GPUS rays sees : 0


0,1
Current time:,2023-10-09 14:05:14
Running for:,00:00:00.12
Memory:,22.5/64.0 GiB

Trial name,status,loc
PPO_CustomEnvironment_1afa3_00000,PENDING,


2023-10-09 14:05:17,836	ERROR tune_controller.py:1502 -- Trial task failed for trial PPO_CustomEnvironment_1afa3_00000
Traceback (most recent call last):
  File "/Users/tanguy/miniforge3/envs/collective_env/lib/python3.11/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
             ^^^^^^^^^^^^^^^
  File "/Users/tanguy/miniforge3/envs/collective_env/lib/python3.11/site-packages/ray/_private/auto_init_hook.py", line 24, in auto_init_wrapper
    return fn(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^
  File "/Users/tanguy/miniforge3/envs/collective_env/lib/python3.11/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/Users/tanguy/miniforge3/envs/collective_env/lib/python3.11/site-packages/ray/_private/worker.py", line 2549, in get
    raise value
ray.exceptions.RayActorError: The actor died because of an error raised in its 

KeyError: PPO_CustomEnvironment_1afa3_00000

# Render episode 

### Retrieve checkpoint

In [None]:
# The Tuner above sets no default metric/mode, so name them explicitly: without
# them `get_best_result()` can only pick a result when there is a single trial.
best_checkpoint = results.get_best_result(metric="episode_reward_mean", mode="max").checkpoint
best_checkpoint

In [None]:
import ray
from ray.rllib.algorithms.algorithm import Algorithm

# Checkpoint to render. The path is expressed relative to the current user's home
# directory instead of one specific user's absolute path.
path_to_checkpoint = os.path.expanduser(
    "~/ray_results/PPO_2023-10-09_11-58-40/PPO_CustomEnvironment_6c9d5_00000_0_2023-10-09_11-58-40/checkpoint_000009"
)

# ignore_reinit_error lets this cell be re-run while Ray is still up.
ray.init(ignore_reinit_error=True)
algo = Algorithm.from_checkpoint(path_to_checkpoint)

# After loading the algorithm, list the policies held by the local worker.
local_worker = algo.workers.local_worker()
available_policy_ids = list(local_worker.policy_map.keys())
print("Available Policy IDs:", available_policy_ids)

### Run and plot

In [None]:
import numpy as np

def process_observations(observation, agent_ids, truncation=None, history=None):
    """Append one time step of per-agent data to an observation history.

    Args:
        observation: mapping from agent id to that agent's observation. Indices 2, 3
            and 4 of each observation are read as x location, y location and heading.
            Agents missing from the mapping contribute 0 for all three.
        agent_ids: iterable of agent ids; fixes the order of the per-step arrays.
        truncation: optional mapping from agent id to a truncation flag. When empty or
            None every agent counts as still in the game; otherwise it must hold an
            entry for every id in `agent_ids`.
        history: dict of lists with the keys "loc_x", "loc_y", "heading" and
            "still_in_the_game" to append to. Defaults to the module-level
            `observations` dict, which is what the original implementation used.

    Returns:
        The history dict that was appended to.
    """
    if history is None:
        history = observations

    # Freeze the iteration order once so that all four arrays line up.
    agent_ids = list(agent_ids)

    def _column(index):
        """Collect observation[agent][index] for every agent, 0 for absent agents."""
        return np.array(
            [observation[key][index] if key in observation else 0 for key in agent_ids]
        )

    if truncation:
        still_in_the_game = [0 if truncation[key] else 1 for key in agent_ids]
    else:
        still_in_the_game = [1 for _ in agent_ids]

    history["loc_x"].append(_column(2))
    history["loc_y"].append(_column(3))
    # Fixed: the heading history used to be filled with the y locations.
    history["heading"].append(_column(4))
    history["still_in_the_game"].append(np.array(still_in_the_game))

    return history

# Upper bound on the number of recorded time steps (including the reset frame).
MAX_STEPS = 500

# First available policy ID. It is kept for inspection only: the rollout below
# picks the policy per agent from its agent_type.
policy_id = available_policy_ids[0]

# Per-step history; process_observations appends one array per key and step.
observations = {"loc_x": [], "loc_y": [], "heading": [], "still_in_the_game": []}

try:
    observation, _info = env.reset()
    agent_ids = env._agent_ids
    # Fixed: the returned history dict used to be unpacked into four names, which
    # only bound the dict's key strings.
    observations = process_observations(observation, agent_ids)
    step_count = 1

    while step_count < MAX_STEPS:
        actions = {
            key: algo.compute_single_action(
                value, policy_id="prey" if env.agents[key].agent_type == 0 else "predator"
            ) for key, value in observation.items()
        }

        observation, _reward, termination, truncation, _info = env.step(actions)

        observations = process_observations(observation, agent_ids, truncation)

        step_count += 1

        # Stop stepping once the environment reports the episode as over for all
        # agents. If the env emits no "__all__" flag this never triggers and the
        # loop runs to MAX_STEPS as before.
        if termination.get("__all__", False) or truncation.get("__all__", False):
            break

    # Locations are multiplied by the stage size before plotting.
    stage_size = env.stage_size
    observations["loc_x"] = np.array(observations["loc_x"]) * stage_size
    observations["loc_y"] = np.array(observations["loc_y"]) * stage_size
    observations["still_in_the_game"] = np.array(observations["still_in_the_game"])
finally:
    # Release the environment and the Ray instance even if the rollout failed.
    env.close()
    ray.shutdown()


In [None]:
import importlib
import animation
# Reload the local `animation` module so that edits made to it since the first
# import are picked up without restarting the kernel.
importlib.reload(animation)

# Import after the reload so the name is bound to the reloaded function.
from animation import generate_animation

# Build the animation from the recorded per-step history and the environment.
ani = generate_animation(observations, env)

In [None]:
from IPython.display import HTML

# Show the animation inline as an HTML5 video.
# NOTE(review): presumably `ani` is a matplotlib animation, whose to_html5_video()
# needs a video encoder (ffmpeg) installed — confirm.
HTML(ani.to_html5_video())

# Retrain

In [None]:
from ray.rllib.policy.policy import Policy
from ray.rllib.algorithms.callbacks import DefaultCallbacks

# Materialise the best checkpoint on local disk once; both policies are read
# from the same directory instead of calling to_directory() once per policy.
best_checkpoint_dir = best_checkpoint.to_directory()


def restore_policy_and_weights(policy_type):
    """Return the weights of `policy_type` saved in the best checkpoint.

    Args:
        policy_type: name of the policy sub-directory under `policies/`.

    Returns:
        Whatever `Policy.get_weights()` yields for the restored policy.
    """
    checkpoint_path = os.path.join(best_checkpoint_dir, f"policies/{policy_type}")
    restored_policy = Policy.from_checkpoint(checkpoint_path)
    return restored_policy.get_weights()


restored_policy_predator_weights = restore_policy_and_weights("predator")
restored_policy_prey_weights = restore_policy_and_weights("prey")

# Fixed: the stopping criteria live in `opti_config`. `args` only carries `run`
# and `framework`, so reading `args.stop_reward` etc. raised AttributeError.
stop = {
    "episode_reward_mean": opti_config["stop_reward"],
    "timesteps_total": opti_config["stop_timesteps"],
    "training_iteration": opti_config["stop_iters"],
}


class RestoreWeightsCallback(DefaultCallbacks):
    """Callback that loads the restored weights into a freshly built algorithm."""

    def on_algorithm_init(self, *, algorithm: "Algorithm", **kwargs) -> None:
        """Overwrite both policies' weights right after the algorithm is built."""
        algorithm.set_weights(
            {
                "predator": restored_policy_predator_weights,
                "prey": restored_policy_prey_weights,
            }
        )


config.callbacks(RestoreWeightsCallback)

print("Starting new tune.Tuner().fit()")

ray.init()

# Start our actual experiment. The Tuner API is used, as in the training cell
# and as the message above announces, so `results` is a ResultGrid like there.
try:
    results = tune.Tuner(
        "PPO",
        param_space=config,
        run_config=train.RunConfig(stop=stop, verbose=1),
    ).fit()

    if opti_config["as_test"]:
        check_learning_achieved(results, opti_config["stop_reward"])
finally:
    # Always release the Ray instance so the cell can be re-run.
    ray.shutdown()