<a href="https://colab.research.google.com/github/Davidmenamm/Reinforcement-Learning---PPO-Algorithm-Communication/blob/main/PPO_PARAMS_RUN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **PPO PARAMS - All Algorithms**

# **Config**

## Google Drive Mount

In [None]:
# Connect to Google Drive Files
# The JSON configs are read from, and the run logs written to, paths under
# "/content/drive/My Drive/PPO/", so the Drive must be mounted first.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Install Dependencies

In [None]:
# Install Dependencies
! pip install gymnasium
! pip install pymunk
! pip install lz4

# box 2d dependency
! pip install swig
! pip install xvfbwrapper
! pip install pyvirtualdisplay

# pettingzoo
! pip install pettingzoo[all]

# jax parallel
! pip install jaxlib==0.4.26 -f https://storage.googleapis.com/jax-releases/jax_releases.html
! pip install jax-dataclasses
! pip install cflib

# ray
! pip install ray


# experiments
! pip install supersuit


# SUPRESS PRINT FOR A RUN, COMMENT IF WANT TO SEE PRINTS IN CONSOLE
# sys.stdout = open(os.devnull, 'w')

Looking in links: https://storage.googleapis.com/jax-releases/jax_releases.html
Collecting jaxlib==0.4.26
  Using cached jaxlib-0.4.26-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.8 kB)
Using cached jaxlib-0.4.26-cp310-cp310-manylinux2014_x86_64.whl (78.8 MB)
Installing collected packages: jaxlib
  Attempting uninstall: jaxlib
    Found existing installation: jaxlib 0.4.33
    Uninstalling jaxlib-0.4.33:
      Successfully uninstalled jaxlib-0.4.33
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
chex 0.1.87 requires jaxlib>=0.4.27, but you have jaxlib 0.4.26 which is incompatible.
jax 0.4.33 requires jaxlib<=0.4.33,>=0.4.33, but you have jaxlib 0.4.26 which is incompatible.
optax 0.2.3 requires jaxlib>=0.4.27, but you have jaxlib 0.4.26 which is incompatible.[0m[31m
[0mSuccessfully installed jaxlib-0.4.26
Collecting jaxlib (from jax-dataclasses)
  U

## Imports

In [None]:
# Import libraries
import os
import sys
import time
import yaml
import numpy as np
from datetime import datetime, timedelta
import ray
from ray.tune import run
from ray.tune.registry import register_env
from ray.rllib.env.wrappers.pettingzoo_env import PettingZooEnv, ParallelPettingZooEnv
from ray.rllib.algorithms.callbacks import DefaultCallbacks
from ray.rllib.env.multi_agent_episode import MultiAgentEpisode
from ray.rllib.algorithms.ppo import (
    PPOConfig,
    PPOTF1Policy,
    PPOTF2Policy,
    PPOTorchPolicy,
)
from ray import air, tune
from ray.air.constants import TRAINING_ITERATION
from ray.rllib.utils.metrics import (
    ENV_RUNNER_RESULTS,
    EPISODE_RETURN_MEAN,
    NUM_ENV_STEPS_SAMPLED_LIFETIME,
)
from gymnasium.spaces import Box
from ray.rllib.models.modelv2 import ModelV2
from ray.rllib.models.tf.tf_modelv2 import TFModelV2
from ray.rllib.models.tf.fcnet import FullyConnectedNetwork
from ray.rllib.models.torch.misc import SlimFC
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
from ray.rllib.models.torch.fcnet import FullyConnectedNetwork as TorchFC
from ray.rllib.utils.annotations import override
from ray.rllib.utils.framework import try_import_tf, try_import_torch
# pettingzoo envs
from pettingzoo.sisl import pursuit_v4, multiwalker_v9, waterworld_v4
from pettingzoo.butterfly import pistonball_v6

## Robotic Envs

In [None]:
# Clone CrazyRL repository and install it
!git clone https://github.com/ffelten/CrazyRL.git
os.chdir("CrazyRL")
!poetry install
os.chdir("..")

# Add CrazyRL directory to PYTHONPATH
crazy_rl_dir = os.path.abspath("CrazyRL")
if crazy_rl_dir not in sys.path:
    sys.path.append(crazy_rl_dir)

# Import personalized env
import crazy_rl
from crazy_rl.multi_agent.numpy.catch.catch import Catch
from crazy_rl.multi_agent.numpy.hover.hover import Hover
from crazy_rl.multi_agent.numpy.circle.circle import Circle
from crazy_rl.multi_agent.numpy.surround.surround import Surround

fatal: destination path 'CrazyRL' already exists and is not an empty directory.
/bin/bash: line 1: poetry: command not found


# Parameters

## Environments to Run

In [None]:
# environments to run
# Maps an environment name to a `(base_model, old_stack)` tuple, unpacked in
# that order by the training cell:
#   base_model - "mlp" uses `ppo_params`, anything else uses `ppo_params_cnn`;
#                "cnn" additionally adds a conv model config.
#   old_stack  - selects which of the two config branches in the training cell
#                is applied to the environment.
envs_to_run = {
    "Hover": ("mlp", True),
    "Circle": ("mlp", True),
    # "Pursuit": ("cnn", False),
    # "Multiwalker": ("cnn", False)
}

## PPO Params

### Mlp module

In [None]:
import json
from pathlib import Path

# base param path: directory on the mounted Drive holding every JSON config
# file loaded by this notebook (note the trailing slash - file names are
# appended to it directly)
params_base_path = "/content/drive/My Drive/PPO/CONFIG/"

# PPO params used when an environment's base model is "mlp"; the dict is
# unpacked into `PPOConfig().training(**ppo_params)` by the training cell
ppo_params_path = f'{params_base_path}ppo_params.json'
ppo_params = json.loads(Path(ppo_params_path).read_text())
ppo_params

{'lr': 0.0001,
 'use_critic': True,
 'use_gae': True,
 'lambda_': 0.95,
 'num_sgd_iter': 7,
 'train_batch_size': 4096,
 'sgd_minibatch_size': 1024,
 'entropy_coeff': 0.01,
 'vf_loss_coeff': 0.3,
 'clip_param': 0.1,
 'vf_clip_param': 10.0,
 'grad_clip': 0.4,
 'kl_coeff': 0.2,
 'kl_target': 0.01,
 'shuffle_sequences': True}

### Cnn module

In [None]:
# PPO params cnn: used instead of `ppo_params` when an environment's base
# model is not "mlp"; also unpacked into `PPOConfig().training(...)`
ppo_params_cnn_path = f'{params_base_path}ppo_params_cnn.json'
ppo_params_cnn = json.loads(Path(ppo_params_cnn_path).read_text())
ppo_params_cnn

{'model': {'uses_new_env_runners': True},
 'lr': 0.0001,
 'use_critic': True,
 'use_gae': True,
 'lambda_': 0.95,
 'num_sgd_iter': 7,
 'train_batch_size': 4096,
 'sgd_minibatch_size': 1024,
 'entropy_coeff': 0.01,
 'vf_loss_coeff': 0.3,
 'clip_param': 0.1,
 'vf_clip_param': 10.0,
 'grad_clip': 0.4,
 'kl_coeff': 0.2,
 'kl_target': 0.01,
 'shuffle_sequences': True}

## Workspace Params

In [None]:
# Workspace params (not PPO hyper-parameters): `num_workers` sets the number of
# env runners in the training cell, and `num_workers` / `max_episodes` are read
# by the logging callback for its time estimate
workspace_params_path = f'{params_base_path}workspace_params.json'
workspace_params = json.loads(Path(workspace_params_path).read_text())
workspace_params

{'num_workers': 10,
 'num_envs_per_worker': -1,
 'episode_aprox_length': -1,
 'max_episodes': -1,
 'verbosity': -1}

## Environment Params Path and Formatting

In [None]:
# env params paths: one JSON file per environment, all under `params_base_path`
# robotic (CrazyRL) environments
drone_hover_params_path = f'{params_base_path}drone_hover_params.json'
target_catch_params_path = f'{params_base_path}target_catch_params.json'
target_surround_params_path = f'{params_base_path}target_surround_params.json'
target_circle_params_path = f'{params_base_path}target_circle_params.json'
# PettingZoo environments
pursuit_params_path = f'{params_base_path}pursuit_params.json'
multiwalker_params_path = f'{params_base_path}multiwalker_params.json'
waterworld_params_path = f'{params_base_path}waterworld_params.json'
pistonball_params_path = f'{params_base_path}pistonball_params.json'

# Convert list values to NumPy arrays (used for the robotic env params)
def lists_to_np_arrays(d):
    """Return a new dict in which every `list` value of `d` is a NumPy array.

    Only top-level values that are `list` instances are converted with
    `np.array`; all other values are carried over unchanged. Keys and their
    order are preserved and `d` itself is not modified.
    """
    converted = {}
    for key, value in d.items():
        if isinstance(value, list):
            value = np.array(value)
        converted[key] = value
    return converted

## Drone Hover params

In [None]:
# drone hover: keyword arguments later passed as `Hover(**drone_hover_params)`;
# list values from the JSON file are converted to NumPy arrays
drone_hover_params = lists_to_np_arrays( json.loads(Path(drone_hover_params_path).read_text()) )
drone_hover_params

{'init_flying_pos': array([[2, 0, 0],
        [1, 2, 0]]),
 'drone_ids': array([0, 1]),
 'size': 3,
 'render_mode': None}

## Target Catch params

In [None]:
# target catch: keyword arguments later passed as `Catch(**target_catch_params)`;
# list values from the JSON file are converted to NumPy arrays
target_catch_params = lists_to_np_arrays( json.loads(Path(target_catch_params_path).read_text()) )
target_catch_params

{'drone_ids': array([0, 1]),
 'init_flying_pos': array([[2, 2, 0],
        [2, 2, 2]]),
 'init_target_location': array([1. , 1. , 2.5]),
 'render_mode': None,
 'target_speed': 0.1,
 'size': 3}

## Target Surround params

In [None]:
# target surround: keyword arguments later passed as
# `Surround(**target_surround_params)`; list values become NumPy arrays
target_surround_params = lists_to_np_arrays( json.loads(Path(target_surround_params_path).read_text()) )
target_surround_params

{'drone_ids': array([0, 1]),
 'init_flying_pos': array([[0, 1, 2],
        [0, 0, 1]]),
 'target_location': array([1. , 1. , 2.5]),
 'target_id': None,
 'render_mode': None,
 'size': 3,
 'multi_obj': False}

## Target Circle params

In [None]:
# target circle: keyword arguments later passed as
# `Circle(**target_circle_params)`; list values become NumPy arrays
target_circle_params = lists_to_np_arrays( json.loads(Path(target_circle_params_path).read_text()) )
target_circle_params

{'init_flying_pos': array([[1, 1, 1],
        [0, 0, 1]]),
 'drone_ids': array([0, 1]),
 'render_mode': None,
 'num_intermediate_points': 100,
 'size': 3}

## Pursuit params

In [None]:
# pursuit: keyword arguments later passed as `pursuit_v4.env(**pursuit_params)`
# (kept as plain JSON values, no NumPy conversion)
pursuit_params = json.loads(Path(pursuit_params_path).read_text())
pursuit_params

{'max_cycles': 500,
 'render_mode': 'rgb_array',
 'x_size': 20,
 'y_size': 20,
 'shared_reward': True,
 'n_evaders': 6,
 'n_pursuers': 2,
 'obs_range': 7,
 'n_catch': 2,
 'freeze_evaders': False,
 'tag_reward': 0.01,
 'catch_reward': 5.0,
 'urgency_reward': -0.1,
 'surround': True,
 'constraint_window': 1.0}

## Multiwalker params

In [None]:
# multiwalker: keyword arguments later passed as
# `multiwalker_v9.env(**multiwalker_params)` (plain JSON values)
multiwalker_params = json.loads(Path(multiwalker_params_path).read_text())
multiwalker_params

{'n_walkers': 2,
 'position_noise': 0.001,
 'angle_noise': 0.001,
 'forward_reward': 1.0,
 'terminate_reward': -40.0,
 'fall_reward': -4.0,
 'shared_reward': False,
 'terminate_on_fall': True,
 'remove_on_fall': True,
 'terrain_length': 200,
 'max_cycles': 500,
 'render_mode': 'rgb_array'}

## Waterworld params

In [None]:
# waterworld: keyword arguments later passed as
# `waterworld_v4.env(**waterworld_params)` (plain JSON values)
waterworld_params = json.loads(Path(waterworld_params_path).read_text())
waterworld_params

{'max_cycles': 500,
 'n_coop': 2,
 'n_sensors': 20,
 'sensor_range': 0.2,
 'radius': 0.015,
 'obstacle_radius': 0.2,
 'n_obstacles': 1,
 'obstacle_coord': [[0.5, 0.5]],
 'pursuer_max_accel': 0.1,
 'evader_speed': 0.1,
 'poison_speed': 0.1,
 'thrust_penalty': -1.0,
 'local_ratio': 1.0,
 'speed_features': True,
 'n_pursuers': 2,
 'n_evaders': 4,
 'n_poisons': 4,
 'poison_reward': -10.0,
 'food_reward': 20.0,
 'encounter_reward': 0.5,
 'render_mode': 'rgb_array'}

## Pistonball params

In [None]:
# pistonball: keyword arguments later passed as
# `pistonball_v6.env(**pistonball_params)` (plain JSON values)
pistonball_params = json.loads(Path(pistonball_params_path).read_text())
pistonball_params

{'render_mode': 'rgb_array',
 'n_pistons': 10,
 'time_penalty': -0.1,
 'continuous': True,
 'random_drop': True,
 'random_rotate': True,
 'ball_mass': 0.75,
 'ball_friction': 0.3,
 'ball_elasticity': 1.5,
 'max_cycles': 125}

# Setup Environments

In [None]:
# Environment creator
def env_creator(config):
    """Instantiate the environment selected by `config["name"]`.

    Args:
        config: Mapping with a "name" key holding one of the supported
            environment names.

    Returns:
        Tuple `(environment, is_parallel)`. `is_parallel` is True for the
        robotic environments and False for the PettingZoo ones; the caller
        uses it to choose between `ParallelPettingZooEnv` and `PettingZooEnv`.

    Raises:
        KeyError: If "name" is missing from `config` or is not a supported
            environment name.
    """
    # Factories are zero-argument lambdas so that only the requested
    # environment is ever constructed.
    parallel_factories = {
        # robotic envs
        'Hover': lambda: Hover(**drone_hover_params),
        'Catch': lambda: Catch(**target_catch_params),
        'Surround': lambda: Surround(**target_surround_params),
        'Circle': lambda: Circle(**target_circle_params),
    }
    sequential_factories = {
        # regular envs
        'Pursuit': lambda: pursuit_v4.env(**pursuit_params),
        'Multiwalker': lambda: multiwalker_v9.env(**multiwalker_params),
        'Waterworld': lambda: waterworld_v4.env(**waterworld_params),
        'Pistonball': lambda: pistonball_v6.env(**pistonball_params),
    }
    requested = config["name"]
    print(requested)
    # Build the environment and report which wrapper family it belongs to
    if requested in parallel_factories:
        return parallel_factories[requested](), True
    return sequential_factories[requested](), False

# Get environment specifications (obs space, act space, etc)
def get_specification(config):
    """Create a throw-away environment and read its spaces and agent list.

    Args:
        config: Mapping with a "name" key, forwarded to `env_creator`.

    Returns:
        Tuple `(obs_space, act_space, agents, is_parallel)`, where the two
        spaces are those of the first agent in `agents` and `is_parallel` is
        the flag returned by `env_creator`.
    """
    probe_env, is_parallel = env_creator(config)
    print(probe_env)
    # Reset before querying the agents; the returned observation is not needed.
    # NOTE(review): presumably `agents` is only populated after reset - confirm.
    probe_env.reset()
    agent_ids = probe_env.agents
    return (
        probe_env.observation_space(agent_ids[0]),
        probe_env.action_space(agent_ids[0]),
        agent_ids,
        is_parallel,
    )

# Register environment
def register_environment(env_name):
    """Register `env_name` with Ray Tune and return its specifications.

    A probe environment is built first (via `get_specification`) to learn the
    spaces, the agent list and whether the environment is parallel. The
    registered creator builds a fresh environment on every call and wraps it
    in `ParallelPettingZooEnv` or `PettingZooEnv` accordingly; the config
    argument passed to the creator is ignored.

    Returns:
        Tuple `(obs_space, act_space, agents)`.
    """
    env_name_config = {"name": env_name}
    # dummy env specifications
    obs_space, act_space, agents, is_parallel = get_specification(env_name_config)
    # Pick the RLlib wrapper that matches the environment's API
    wrapper_cls = ParallelPettingZooEnv if is_parallel else PettingZooEnv
    register_env(
        f"{env_name}",
        lambda config: wrapper_cls(env_creator(env_name_config)[0]),
    )
    return obs_space, act_space, agents

# Register available environments and save specifications.
# `envs_specifications` maps each name in `envs_to_run` to its
# `(obs_space, act_space, agents)` tuple; the training cell reads these back
# to build the multi-agent policies.
envs_specifications = {}
for env in envs_to_run.keys():
    obs_space, act_space, agents = register_environment(env)
    envs_specifications[env] = (obs_space, act_space, agents)

# display the collected specifications as the cell output
envs_specifications

Hover
Hover
Circle
Circle


{'Hover': (Box([-3. -3.  0. -3. -3.  0.], 3.0, (6,), float32),
  Box(-1.0, 1.0, (3,), float32),
  ['agent_0', 'agent_1']),
 'Circle': (Box([-3. -3.  0. -3. -3.  0.], 3.0, (6,), float32),
  Box(-1.0, 1.0, (3,), float32),
  ['agent_0', 'agent_1'])}

# Init Ray Ecosystem

In [None]:
# Ray ecosystem initialization
# NOTE(review): `ignore_reinit_error=True` presumably lets this cell be re-run
# while Ray is already initialised; `local_mode=True` looks like a debugging
# setting - confirm it is intended for the real training runs below.
ray.init(local_mode=True, ignore_reinit_error=True)

2024-10-15 14:33:45,437	INFO worker.py:1786 -- Started a local Ray instance.


0,1
Python version:,3.10.12
Ray version:,2.37.0


# Callback & Metrics

## Config & Paths

In [None]:
# Ensure the directory exists
def ensure_dir(directory):
    """Create `directory` and any missing parents unless the path already exists.

    If something already exists at `directory` nothing is done and no error is
    raised.
    """
    if os.path.exists(directory):
        return
    os.makedirs(directory, exist_ok=True)

# time now: timestamp taken once when this cell runs; it is embedded in the
# run directory and log file names built by the callback, so every callback
# created afterwards shares the same names
timeNow = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

## Callbacks

In [None]:
# Custom Callback
class CustomCallback(DefaultCallbacks):
    """RLlib callback that logs per-episode progress to text files on Drive.

    For every finished episode the callback appends the episode counter, the
    summed agent reward and an elapsed / remaining time estimate to three
    files inside
    `/content/drive/My Drive/PPO/<algo_mode>/<env_name>/rrun_<timeNow>/`.

    Args:
        env_name: Name of the environment being trained; used as the log
            sub-directory and as the header line of the log files.
        algo_mode: Label of the training mode; used as the top-level log
            directory.

    NOTE(review): the episode counter lives on the instance. The recorded run
    shows it restarting at 1 several times, which suggests that several
    instances append to the same files (and each re-truncates the episode and
    reward logs when it is created) - confirm whether one file per instance is
    wanted.
    """

    def __init__(self, env_name, algo_mode):
        # Let the RLlib base class run its own initialisation first
        super().__init__()
        self.env_name = env_name
        # variables
        self.start_time = time.time()
        self.episode_num = 1
        self.delta_timestep = 0
        # Kept for backward compatibility; the time estimate no longer uses it
        self.delta_timestep_sum = 0
        # base dir
        base_log_dir = f"/content/drive/My Drive/PPO/{algo_mode}/"
        # Only the directory of the environment being trained is created
        # (parents included); other environments get theirs when they are run.
        self.log_dir = f'{base_log_dir}{env_name}/rrun_{timeNow}'
        ensure_dir(self.log_dir)

        # log files
        self.episode_num_log = os.path.join(self.log_dir, f"episodes_{timeNow}.txt")
        self.episode_reward_log = os.path.join(self.log_dir, f"episode_rewards_{timeNow}.txt")
        self.episode_time_log = os.path.join(self.log_dir, f"time_{timeNow}.txt")

        # erase previous content and write the environment name as header
        # (the time log is append-only and is not reset here)
        for header_path in (self.episode_num_log, self.episode_reward_log):
            with open(header_path, 'w') as f:
                f.write(f"{self.env_name}\n")

    # on episode end
    def on_episode_end(
        self,
        *,
        worker,
        base_env,
        policies,
        episode: MultiAgentEpisode,
        **kwargs
    ):
        """Append the finished episode's number, reward and timing to the logs.

        Logging is best effort: any exception raised while writing is printed
        and swallowed so that training is never interrupted. The episode
        counter advances whether or not the writes succeeded.
        """
        print('BASE-ENV', self.env_name)
        episode_reward_total_sum = sum(episode.agent_rewards.values())

        try:
            # Single clock reading so every number in this entry is consistent
            elapsed_minutes = (time.time() - self.start_time) / 60
            elapsed_hours = elapsed_minutes / 60

            # Average minutes per episode seen so far by this instance.
            # (Summing the cumulative elapsed time of each episode, as done
            # previously, overstated the per-episode duration.)
            self.delta_timestep = elapsed_minutes / self.episode_num

            # Remaining time; clamped at zero so that a non-positive
            # `max_episodes` (e.g. the -1 "unset" value) or an overrun never
            # yields a negative estimate.
            max_episodes = workspace_params["max_episodes"]
            episodes_left = max(max_episodes - self.episode_num, 0)
            minutes_time_remaining = episodes_left * self.delta_timestep
            hours_time_remaining = minutes_time_remaining / 60

            # Log future time
            with open(self.episode_time_log, 'a') as f:
                f.write(
                    f"""
                    \nEpisode {self.episode_num} / {max_episodes}
                    Elapsed Time: {elapsed_hours:.2f} hours ({elapsed_minutes:.2f} minutes)
                    Approximate missing time {hours_time_remaining:.2f} hours ({minutes_time_remaining:.2f} minutes)
                    """
                )
                f.flush()  # Force the flush to disk after write
                print(f"Logged time for episode {self.episode_num}")

            # Save episodes in file
            with open(self.episode_num_log, 'a') as f:
                f.write(f"{self.episode_num}\n")
                f.flush()  # Force flush
                print(f"Logged episode number {self.episode_num}")

            # Save reward in file
            with open(self.episode_reward_log, 'a') as f:
                f.write(f"{episode_reward_total_sum}\n")
                f.flush()  # Force flush
                print(f"Logged reward for episode {self.episode_num}: {episode_reward_total_sum}")

        except Exception as e:
            print(f"Failed to write episode reward: {e}")

        # Count the episode even when logging failed, so later entries keep
        # the right episode number
        self.episode_num += 1


# Tuning & Training

In [None]:
# modes
#   'parameters' - default policy setup, logs written under "Parameters"
#   'multiple'   - one policy per agent, logs written under "Independent"
modes = ['parameters', 'multiple']  # , 'centralized']

# STOP CONDITIONS (identical for every run, so defined once)
stop = {
    TRAINING_ITERATION: 10,
    # "num_env_steps_sampled_lifetime": 100000,
    # f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": 7.99,
}

# run envs: one Tune experiment per (mode, environment) pair
for mode in modes:
    for env_name, setup in envs_to_run.items():
        base_model, old_stack = setup
        print('params ', mode, env_name, base_model, old_stack)

        # BASE CONFIG
        config = (
            PPOConfig()
            .framework("torch")
            .env_runners(batch_mode="complete_episodes", num_env_runners=workspace_params['num_workers'])
            .training( **(ppo_params if base_model == "mlp" else ppo_params_cnn) )
        )

        # STACK
        # old API stack
        if old_stack:
            config = (
                config.environment(f"{env_name}")
                .resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0")), num_cpus_per_worker=1)
            )
        # new API stack (RLModule/Learner + EnvRunner/ConnectorV2)
        else:
            _, _, agents = envs_specifications[env_name]
            config = (
                # Use the name the environment was registered under; the
                # previous literal "env_1" is never registered in this notebook.
                config.environment(env_name, env_config={"num_agents": len(agents)})
                .resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0")))
                .api_stack(enable_rl_module_and_learner=True, enable_env_runner_and_connector_v2=True)
                # NOTE(review): this requests one GPU for the learner - confirm
                # a GPU is always available where this branch runs.
                .learners(num_learners=1, num_gpus_per_learner=1)
            )

        # TYPE OF MODEL
        if base_model == "cnn":
            config = (
                config.rl_module(
                    model_config_dict={
                        "uses_new_env_runners": True,
                        "vf_share_layers": False,
                        "use_lstm": False,
                        "fcnet_hiddens": [128, 128],
                        "conv_filters": [[7, 7, 3], [32, 4, 2], [64, 8, 1]],
                    }
                )
            )

        # MODE
        if mode == "parameters":
            # callbacks
            config = (
                config.callbacks(lambda: CustomCallback(env_name, 'Parameters'))
            )
        elif mode == "multiple":
            # Multi-agent configuration
            obs_space, act_space, agents = envs_specifications[env_name]

            # Generate policies (every policy uses the same spec; `i` is unused)
            def gen_policy(i):
                return (None, obs_space, act_space, {})

            # Assigned policies to each agent
            policies = {"policy_{}".format(i): gen_policy(i) for i in range(len(agents))}

            # Policies mapping function: the n-th agent gets the n-th policy
            def policy_mapping_fn(agent_id, *args, **kwargs):
                policy_ids = list(policies.keys())
                return policy_ids[agents.index(agent_id)]

            # env name
            print("env_name", env_name)

            # config
            config = (
                config.multi_agent(
                    policy_mapping_fn=policy_mapping_fn,
                    policies=policies
                )
                .callbacks(lambda: CustomCallback(env_name, 'Independent'))
            )

        # print config
        print('config', config.to_dict())

        # TUNING
        tuner = tune.Tuner(
            "PPO",
            param_space=config.to_dict(),
            run_config=air.RunConfig(stop=stop, verbose=1),
        )

        # RESULTS
        results = tuner.fit()
        print(results)

    # Optionally, the previous run approach
    # run(
    #   'PPO',
    #   config=config,
    #   checkpoint_freq=10,
    #   checkpoint_at_end=True,
    #   verbose=3,
    #   stop={"episodes_total": 10000},
    #   reuse_actors=True,
    # )

  gym.logger.warn(
  gym.logger.warn(
  logger.warn(
  logger.warn(f"{pre} is not within the observation space.")
2024-10-15 14:33:46,637	INFO tensorboardx.py:193 -- pip install "ray[tune]" to see TensorBoard files.


params  parameters Hover mlp True
config {'extra_python_environs_for_driver': {}, 'extra_python_environs_for_worker': {}, 'placement_strategy': 'PACK', 'num_gpus': 0, '_fake_gpus': False, 'num_cpus_for_main_process': 1, 'eager_tracing': True, 'eager_max_retraces': 20, 'tf_session_args': {'intra_op_parallelism_threads': 2, 'inter_op_parallelism_threads': 2, 'gpu_options': {'allow_growth': True}, 'log_device_placement': False, 'device_count': {'CPU': 1}, 'allow_soft_placement': True}, 'local_tf_session_args': {'intra_op_parallelism_threads': 8, 'inter_op_parallelism_threads': 8}, 'torch_compile_learner': False, 'torch_compile_learner_what_to_compile': <TorchCompileWhatToCompile.FORWARD_TRAIN: 'forward_train'>, 'torch_compile_learner_dynamo_backend': 'inductor', 'torch_compile_learner_dynamo_mode': None, 'torch_compile_worker': False, 'torch_compile_worker_dynamo_backend': 'onnxrt', 'torch_compile_worker_dynamo_mode': None, 'torch_ddp_kwargs': {}, 'torch_skip_nan_gradients': False, 'enabl

:job_id:01000000
:task_name:bundle_reservation_check_func
:actor_name:PPO
:actor_name:RolloutWorker


:actor_name:RolloutWorker
Hover
:actor_name:RolloutWorker
Hover
:actor_name:RolloutWorker
Hover
:actor_name:RolloutWorker
Hover
:actor_name:RolloutWorker
Hover


:actor_name:RolloutWorker
:actor_name:RolloutWorker
:actor_name:RolloutWorker
:actor_name:RolloutWorker


:actor_name:RolloutWorker
Hover
:actor_name:RolloutWorker
Hover
:actor_name:RolloutWorker
Hover
:actor_name:RolloutWorker
Hover
:actor_name:RolloutWorker


:actor_name:RolloutWorker
:actor_name:RolloutWorker
:actor_name:RolloutWorker
:actor_name:RolloutWorker
:actor_name:RolloutWorker


Hover
:actor_name:PPO

Trial PPO_Hover_7c7a3_00000 started with configuration:
+---------------------------------------------------------------------------+
| Trial PPO_Hover_7c7a3_00000 config                                        |
+---------------------------------------------------------------------------+
| _AlgorithmConfig__prior_exploration_config                                |
| _disable_action_flattening                                          False |
| _disable_execution_plan_api                                            -1 |
| _disable_initialize_loss_from_dummy_batch                           False |
| _disable_preprocessor_api                                           False |
| _dont_auto_sync_env_runner_states                                   False |
| _enable_rl_module_api                                                  -1 |
| _env_to_module_connector                                                  |
| _evaluation_parallel_to_training_wo_thread                   

:actor_name:PPO


BASE-ENV Hover
Logged time for episode 1
Logged episode number 1
Logged reward for episode 1: -6.171397539920196
BASE-ENV Hover
Logged time for episode 2
Logged episode number 2
Logged reward for episode 2: -4.652869980297892
BASE-ENV Hover
Logged time for episode 3
Logged episode number 3
Logged reward for episode 3: -5.407157445082735
BASE-ENV Hover
Logged time for episode 1
Logged episode number 1
Logged reward for episode 1: -8.893116090807537
BASE-ENV Hover
Logged time for episode 2
Logged episode number 2
Logged reward for episode 2: -4.650318109455629
BASE-ENV Hover
Logged time for episode 3
Logged episode number 3
Logged reward for episode 3: -8.347770587164892
BASE-ENV Hover
Logged time for episode 1
Logged episode number 1
Logged reward for episode 1: -4.036171563635468
BASE-ENV Hover
Logged time for episode 2
Logged episode number 2
Logged reward for episode 2: -8.65700025769169
BASE-ENV Hover
Logged time for episode 3
Logged episode number 3
Logged reward for episode 3: -5.

2024-10-15 14:36:17,410	INFO storage.py:556 -- Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/PPO_2024-10-15_14-33-46/PPO_Hover_7c7a3_00000_0_2024-10-15_14-33-46/checkpoint_000000)
2024-10-15 14:36:17,555	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/root/ray_results/PPO_2024-10-15_14-33-46' in 0.0177s.
:task_name:bundle_reservation_check_func



Trial PPO_Hover_7c7a3_00000 completed after 10 iterations at 2024-10-15 14:36:17. Total running time: 2min 30s
+-------------------------------------------------+
| Trial PPO_Hover_7c7a3_00000 result              |
+-------------------------------------------------+
| env_runners/episode_len_mean                200 |
| env_runners/episode_return_mean        -3.69995 |
| num_env_steps_sampled_lifetime            42000 |
+-------------------------------------------------+

Trial status: 1 TERMINATED
Current time: 2024-10-15 14:36:17. Total running time: 2min 30s
Logical resource usage: 0/12 CPUs, 0/1 GPUs (0.0/1.0 accelerator_type:L4)
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name              status         iter     total time (s)      ts     num_healthy_workers     ...async_sample_reqs     ...e_worker_restarts     ...ent_steps_sampled |
+---------

:actor_name:PPO
:actor_name:RolloutWorker
:actor_name:RolloutWorker


:actor_name:PPO
:actor_name:RolloutWorker
Circle
:actor_name:RolloutWorker
Circle
:actor_name:RolloutWorker
Circle


:actor_name:RolloutWorker
:actor_name:RolloutWorker
:actor_name:RolloutWorker
:actor_name:RolloutWorker
:actor_name:RolloutWorker


:actor_name:RolloutWorker
Circle
:actor_name:RolloutWorker
Circle
:actor_name:RolloutWorker
Circle
:actor_name:RolloutWorker
Circle
:actor_name:RolloutWorker
Circle
:actor_name:RolloutWorker


:actor_name:RolloutWorker
:actor_name:RolloutWorker
:actor_name:RolloutWorker
:actor_name:PPO


Circle
:actor_name:RolloutWorker
Circle
:actor_name:PPO

Trial PPO_Circle_d6733_00000 started with configuration:
+---------------------------------------------------------------------------+
| Trial PPO_Circle_d6733_00000 config                                       |
+---------------------------------------------------------------------------+
| _AlgorithmConfig__prior_exploration_config                                |
| _disable_action_flattening                                          False |
| _disable_execution_plan_api                                            -1 |
| _disable_initialize_loss_from_dummy_batch                           False |
| _disable_preprocessor_api                                           False |
| _dont_auto_sync_env_runner_states                                   False |
| _enable_rl_module_api                                                  -1 |
| _env_to_module_connector                                                  |
| _evaluation_parallel_to_tr

2024-10-15 14:38:48,532	INFO storage.py:556 -- Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/PPO_2024-10-15_14-36-17/PPO_Circle_d6733_00000_0_2024-10-15_14-36-17/checkpoint_000000)


Trial status: 1 RUNNING
Current time: 2024-10-15 14:38:48. Total running time: 2min 30s
Logical resource usage: 11.0/12 CPUs, 0/1 GPUs (0.0/1.0 accelerator_type:L4)
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status       iter     total time (s)      ts     num_healthy_workers     ...async_sample_reqs     ...e_worker_restarts     ...ent_steps_sampled |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| PPO_Circle_d6733_00000   RUNNING         9            135.062   37800                      10                        0                        0                    75600 |
+----------------------------------------------------------------------------------------------------------------------------------------------

2024-10-15 14:38:49,097	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/root/ray_results/PPO_2024-10-15_14-36-17' in 0.0187s.
:task_name:bundle_reservation_check_func
:actor_name:PPO



Trial status: 1 TERMINATED
Current time: 2024-10-15 14:38:49. Total running time: 2min 31s
Logical resource usage: 0/12 CPUs, 0/1 GPUs (0.0/1.0 accelerator_type:L4)
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status         iter     total time (s)      ts     num_healthy_workers     ...async_sample_reqs     ...e_worker_restarts     ...ent_steps_sampled |
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| PPO_Circle_d6733_00000   TERMINATED       10            150.075   42000                      10                        0                        0                    84000 |
+-------------------------------------------------------------------------------------------------------------------------------------

:actor_name:RolloutWorker
:actor_name:RolloutWorker
:actor_name:RolloutWorker
:actor_name:RolloutWorker


:actor_name:RolloutWorker
Hover
:actor_name:RolloutWorker
Hover
:actor_name:RolloutWorker
Hover
:actor_name:RolloutWorker
Hover


:actor_name:RolloutWorker
:actor_name:RolloutWorker
:actor_name:RolloutWorker
:actor_name:RolloutWorker
:actor_name:RolloutWorker
:actor_name:RolloutWorker


:actor_name:RolloutWorker
Hover
:actor_name:RolloutWorker
Hover
:actor_name:PPO

Trial PPO_Hover_30c71_00000 started with configuration:
+---------------------------------------------------------------------------+
| Trial PPO_Hover_30c71_00000 config                                        |
+---------------------------------------------------------------------------+
| _AlgorithmConfig__prior_exploration_config                                |
| _disable_action_flattening                                          False |
| _disable_execution_plan_api                                            -1 |
| _disable_initialize_loss_from_dummy_batch                           False |
| _disable_preprocessor_api                                           False |
| _dont_auto_sync_env_runner_states                                   False |
| _enable_rl_module_api                                                  -1 |
| _env_to_module_connector                                                  |
| _ev

:actor_name:PPO


BASE-ENV Hover
Logged time for episode 1
Logged episode number 1
Logged reward for episode 1: -9.35929773577639
BASE-ENV Hover
Logged time for episode 2
Logged episode number 2
Logged reward for episode 2: -9.312312171194225
BASE-ENV Hover
Logged time for episode 3
Logged episode number 3
Logged reward for episode 3: -9.52882495624354
BASE-ENV Hover
Logged time for episode 1
Logged episode number 1
Logged reward for episode 1: -7.281205074787733
BASE-ENV Hover
Logged time for episode 2
Logged episode number 2
Logged reward for episode 2: -8.099248139666301
BASE-ENV Hover
Logged time for episode 3
Logged episode number 3
Logged reward for episode 3: -10.808043304284658
BASE-ENV Hover
Logged time for episode 1
Logged episode number 1
Logged reward for episode 1: -5.992286909079894
BASE-ENV Hover
Logged time for episode 2
Logged episode number 2
Logged reward for episode 2: -9.235920665278009
BASE-ENV Hover
Logged time for episode 3
Logged episode number 3
Logged reward for episode 3: -7.

2024-10-15 14:42:34,693	INFO storage.py:556 -- Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/PPO_2024-10-15_14-38-49/PPO_Hover_30c71_00000_0_2024-10-15_14-38-49/checkpoint_000000)
2024-10-15 14:42:34,806	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/root/ray_results/PPO_2024-10-15_14-38-49' in 0.0214s.
:task_name:bundle_reservation_check_func



Trial PPO_Hover_30c71_00000 completed after 10 iterations at 2024-10-15 14:42:34. Total running time: 3min 45s
+-------------------------------------------------+
| Trial PPO_Hover_30c71_00000 result              |
+-------------------------------------------------+
| env_runners/episode_len_mean                200 |
| env_runners/episode_return_mean        -3.68267 |
| num_env_steps_sampled_lifetime            42000 |
+-------------------------------------------------+

Trial status: 1 TERMINATED
Current time: 2024-10-15 14:42:34. Total running time: 3min 45s
Logical resource usage: 0/12 CPUs, 0/1 GPUs (0.0/1.0 accelerator_type:L4)
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name              status         iter     total time (s)      ts     num_healthy_workers     ...async_sample_reqs     ...e_worker_restarts     ...ent_steps_sampled |
+---------

:actor_name:PPO
:actor_name:RolloutWorker
:actor_name:RolloutWorker


:actor_name:PPO
:actor_name:RolloutWorker
Circle
:actor_name:RolloutWorker
Circle
:actor_name:RolloutWorker


:actor_name:RolloutWorker
:actor_name:RolloutWorker
:actor_name:RolloutWorker
:actor_name:RolloutWorker


Circle
:actor_name:RolloutWorker
Circle
:actor_name:RolloutWorker
Circle
:actor_name:RolloutWorker
Circle
:actor_name:RolloutWorker


:actor_name:RolloutWorker
:actor_name:RolloutWorker
:actor_name:RolloutWorker
:actor_name:RolloutWorker


Circle
:actor_name:RolloutWorker
Circle
:actor_name:RolloutWorker
Circle
:actor_name:RolloutWorker
Circle


:actor_name:PPO


:actor_name:PPO

Trial PPO_Circle_b74f1_00000 started with configuration:
+---------------------------------------------------------------------------+
| Trial PPO_Circle_b74f1_00000 config                                       |
+---------------------------------------------------------------------------+
| _AlgorithmConfig__prior_exploration_config                                |
| _disable_action_flattening                                          False |
| _disable_execution_plan_api                                            -1 |
| _disable_initialize_loss_from_dummy_batch                           False |
| _disable_preprocessor_api                                           False |
| _dont_auto_sync_env_runner_states                                   False |
| _enable_rl_module_api                                                  -1 |
| _env_to_module_connector                                                  |
| _evaluation_parallel_to_training_wo_thread                        

2024-10-15 14:46:16,899	INFO storage.py:556 -- Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/PPO_2024-10-15_14-42-34/PPO_Circle_b74f1_00000_0_2024-10-15_14-42-34/checkpoint_000000)



Trial PPO_Circle_b74f1_00000 completed after 10 iterations at 2024-10-15 14:46:16. Total running time: 3min 42s
+-------------------------------------------------+
| Trial PPO_Circle_b74f1_00000 result             |
+-------------------------------------------------+
| env_runners/episode_len_mean                200 |
| env_runners/episode_return_mean         6.00269 |
| num_env_steps_sampled_lifetime            42000 |
+-------------------------------------------------+


2024-10-15 14:46:17,328	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/root/ray_results/PPO_2024-10-15_14-42-34' in 0.0215s.



Trial status: 1 TERMINATED
Current time: 2024-10-15 14:46:17. Total running time: 3min 42s
Logical resource usage: 0/12 CPUs, 0/1 GPUs (0.0/1.0 accelerator_type:L4)
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name               status         iter     total time (s)      ts     num_healthy_workers     ...async_sample_reqs     ...e_worker_restarts     ...ent_steps_sampled |
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| PPO_Circle_b74f1_00000   TERMINATED       10            221.019   42000                      10                        0                        0                    84000 |
+-------------------------------------------------------------------------------------------------------------------------------------