# Sequencial

In [13]:
from pettingzoo.butterfly import pistonball_v6

# Creates env
env = pistonball_v6.env(render_mode="human")
env.reset(seed=42)


for agent in env.agent_iter():
    # Get observation and reward of the agent
    observation, reward, termination, truncation, info = env.last()

    if termination or truncation:
        action = None
    else:
        # Randomly selected action from the action space
        action = env.action_space(agent).sample()

    env.step(action)
env.close()

# Parallel

In [4]:
from pettingzoo.butterfly import pistonball_v6

env = pistonball_v6.parallel_env(render_mode="human", n_pistons = 20)
observations, infos = env.reset()



while env.agents:
    # this is where you would insert your policy
    actions = {agent: env.action_space(agent).sample() for agent in env.agents}

    observations, rewards, terminations, truncations, infos = env.step(actions)

env.close()

# Wrapping Environment for Fully Centralized Control

In [6]:
import gymnasium
from gymnasium import spaces
import numpy as np



class SingleAgentWrapperEnv(gymnasium.Env) :
    """
    This wrapper permits to create a gymnasium env where the action space is the cartesian product of agents' action space
    and the observation space is the cartesian product of agents' observation space.

    This permits to train a single "super-agent" which will receive all observations and distribute actions that has to be made by all "sub-agents".

    WARNING : USE THIS WRAPPER ONLY IF YOU FOLLOW ALL ASSUMPTIONS BELOW !!!
     - The env is a pettingzoo env with the parallel API (ParallelEnv) (can be wrapped)
     - All agents of the env has a Box action space and a Box observation space
     - All agents of the env must have the same bounds, dtypes and a shape of (1,) for the Box representing their action space
     - All agents of the env must have the same bounds, dtypes and a 1D shape of same size for the Box space representing their observation space


    Do not forget to use env.reset() before creating SingleAgentWrapperEnv(env) to properly initialize all attributes of the env
    """

    def __init__(self, env, options = None) :
        super(SingleAgentWrapperEnv, self).__init__()

        self.env = env
        self.agents = self.env.unwrapped.agents
        self.nb_agent = len(self.agents)


        # Creating observation_space
        temp_space = self.env.observation_space(self.agents[0])
        low_bound = min(temp_space.low) # Not optimized this using min instead of taking the lowest value for each element grows the observation_space
        high_bound = max(temp_space.high) # Same here with the max
        dtype = temp_space.dtype

        shape = (self.nb_agent * temp_space.shape[0],)
        self.observation_space = spaces.Box(np.full(shape, low_bound), np.full(shape, high_bound), shape, dtype)

        
        # Creating action_space
        temp_space = self.env.action_space(self.agents[0])
        low_bound = temp_space.low # Lowest value an action of an agent can take
        high_bound = temp_space.high # Highest value an action of an agent can take
        dtype = temp_space.dtype # The exact type of an action
        
        shape = (self.nb_agent,) # A vector container at index i the action made by self.env.agents[i]
        self.action_space = spaces.Box(np.full(shape, low_bound), np.full(shape, high_bound), shape, dtype)


    def reset(self, seed = None, options = None) :
        super().reset(seed = seed)
        observations, infos = self.env.reset(seed, options)

        obs = np.array([], dtype = self.observation_space.dtype)
        for i in range(self.nb_agent) :
            obs = np.append(obs, observations[self.agents[i]]) # Concatenation of all observations
        return obs, infos


    def step(self, action) :
        """
        The action in input is a vector containing actions of each agents
        """
        dict_actions = {self.agents[i]: np.array(object = [action[i]], dtype=np.float32) for i in range(self.nb_agent)}
        
        observations, rewards, terminations, truncations, infos = self.env.step(dict_actions)

        
        obs = np.array([], dtype = self.observation_space.dtype)
        for i in range(self.nb_agent) :
            obs = np.append(obs, observations[self.agents[i]]) # Concatenation of all observations

        
        reward = 0
        termination = False
        truncation = False
        for agent in self.agents :
            # Reward is the mean of all agents' reward
            reward += rewards[agent]
            reward = reward / self.nb_agent

            # Episode ends as soon as it ends for one agent
            if terminations[agent] :
                termination = True

            if truncations[agent] :
                truncation = True

        return obs, reward, termination or len(self.agents) != self.nb_agent, truncation, infos


### Use Grey Scale Image as observation and flatten it

In [2]:
from pettingzoo.butterfly import pistonball_v6

env = pistonball_v6.parallel_env()
env.reset()


from supersuit import color_reduction_v0
from supersuit import resize_v1
from supersuit import flatten_v0

grey_scale_env = color_reduction_v0(env, mode='full') # This changes observations as grey scale images
grey_scale_env = resize_v1(grey_scale_env, 114, 30) # Divide by 4 quality of each image observed by each agent
grey_scale_env = flatten_v0(grey_scale_env) # Flatten to 1D to make it compatible with our wrapper

env.close()

  from pkg_resources import resource_stream, resource_exists


### Testing the wrapped environment

In [3]:
from pettingzoo.butterfly import pistonball_v6
from stable_baselines3.common.env_checker import check_env
import supersuit

env = pistonball_v6.parallel_env()
env.reset()

grey_scale_env = supersuit.color_reduction_v0(env, mode='full') # This changes observations as grey scale images
grey_scale_env = supersuit.resize_v1(grey_scale_env, 57, 15) # Divide by 8 size of each image observed by each agent
grey_scale_env = supersuit.flatten_v0(grey_scale_env) # Flatten to 1D to make it compatible with our wrapper
grey_scale_env = supersuit.frame_stack_v1(grey_scale_env, 3) # Observations are now the past 3 observations (so pistons can observe in which direction the ball moves)


wrappedEnv = SingleAgentWrapperEnv(grey_scale_env)

check_env(wrappedEnv)

while env.aec_env.agents :
    # this is where you would insert your policy
    actions = [env.action_space(env.agents[0]).sample()[0] for agent in env.aec_env.agents]
    
    observations, rewards, terminations, truncations, infos = wrappedEnv.step(actions)

env.close()
print(wrappedEnv.observation_space)
print("Test of the Wrapped Environment is done.")

Box(0, 255, (51300,), uint8)
Test of the Wrapped Environment is done.


### Initializing PPO agent on wrapped environment

In [7]:
from pettingzoo.butterfly import pistonball_v6
import supersuit

# Creating env
env = pistonball_v6.parallel_env(n_pistons = 10)
env.reset()


grey_scale_env = supersuit.color_reduction_v0(env, mode='full') # This changes observations as grey scale images
grey_scale_env = supersuit.resize_v1(grey_scale_env, 57, 15) # Divide by 8 size of each image observed by each agent
grey_scale_env = supersuit.flatten_v0(grey_scale_env) # Flatten to 1D to make it compatible with our wrapper
grey_scale_env = supersuit.frame_stack_v1(grey_scale_env, 3) # Observations are now the past 3 observations (so pistons can observe in which direction the ball moves)


wrappedEnv = SingleAgentWrapperEnv(grey_scale_env)


# Initializing PPO
from stable_baselines3 import PPO

model = PPO("MlpPolicy", wrappedEnv)
print("The PPO model has been initialized.")

The PPO model has been initialized.


In [3]:
print("Neural networks used by model.")
print(model.policy)

Neural networks used by model.
ActorCriticPolicy(
  (features_extractor): FlattenExtractor(
    (flatten): Flatten(start_dim=1, end_dim=-1)
  )
  (pi_features_extractor): FlattenExtractor(
    (flatten): Flatten(start_dim=1, end_dim=-1)
  )
  (vf_features_extractor): FlattenExtractor(
    (flatten): Flatten(start_dim=1, end_dim=-1)
  )
  (mlp_extractor): MlpExtractor(
    (policy_net): Sequential(
      (0): Linear(in_features=51300, out_features=64, bias=True)
      (1): Tanh()
      (2): Linear(in_features=64, out_features=64, bias=True)
      (3): Tanh()
    )
    (value_net): Sequential(
      (0): Linear(in_features=51300, out_features=64, bias=True)
      (1): Tanh()
      (2): Linear(in_features=64, out_features=64, bias=True)
      (3): Tanh()
    )
  )
  (action_net): Linear(in_features=64, out_features=20, bias=True)
  (value_net): Linear(in_features=64, out_features=1, bias=True)
)


### Train PPO agent

In [None]:
model.learn(total_timesteps=100000)

env.close()

### Save PPO agent

In [13]:
model.save("./models/crossProductPPO.zip")



### Import PPO agent

In [8]:
model = model.load("./models/crossProductPPO.zip")

### Test PPO agent

In [15]:
from stable_baselines3.common.env_util import make_vec_env
from pettingzoo.butterfly import pistonball_v6
import supersuit

# Creating env
env = pistonball_v6.parallel_env(render_mode="human", n_pistons=10)
env.reset()


grey_scale_env = supersuit.color_reduction_v0(env, mode='full') # This changes observations as grey scale images
grey_scale_env = supersuit.resize_v1(grey_scale_env, 57, 15) # Divide by 8 size of each image observed by each agent
grey_scale_env = supersuit.flatten_v0(grey_scale_env) # Flatten to 1D to make it compatible with our wrapper
grey_scale_env = supersuit.frame_stack_v1(grey_scale_env, 3) # Observations are now the past 3 observations (so pistons can observe in which direction the ball moves)


wrappedEnv = SingleAgentWrapperEnv(grey_scale_env)

vec_env = make_vec_env(lambda : wrappedEnv, n_envs=1)


# Get first observation
obs = vec_env.reset()

done = False
while not done :
    action, _states = model.predict(obs, deterministic = True)
    obs, reward, done, info = vec_env.step(action)


env.close()

# Parameter Sharing

All pistons have the same goal, same rewards and same observations.

Instead of training each pistons or training a single agent controlling all pistons, we could consider all pistons being the same one.

In this case we would use all observations and rewards from all pistons to train only a single piston.
And then treat all pistons as being a copy of this single piston.


In fact parameter sharing cannot work here based on how rewards are distributed in this environment : same reward for each agent. Thus useless pistons (and actions they made) will have the same impact (even a bigger impact since they are more than usefull ones) as usefull pistons which make the ball going to the left wall.

### Initialize PPO agent

In [2]:
from pettingzoo.butterfly import pistonball_v6
import numpy as np
import supersuit

from stable_baselines3 import PPO
from stable_baselines3.common.env_checker import check_env


env = pistonball_v6.parallel_env(render_mode = None, n_pistons=10) # We use AEC env
env.reset()


env = supersuit.color_reduction_v0(env, mode="full")
env = supersuit.resize_v1(env, 114, 30)
env = supersuit.reshape_v0(env, env.observation_space(env.unwrapped.agents[0]).shape + (1,))
env = supersuit.frame_stack_v1(env, 5)

env = supersuit.pettingzoo_env_to_vec_env_v1(env)
env = supersuit.concat_vec_envs_v1(env, 1, num_cpus=1, base_class="stable_baselines3")

print(env.observation_space)


#model = PPO("CnnPolicy", env)
model = PPO("MlpPolicy", env)
print("The PPO model has been initialized.")

Box(0, 255, (30, 114, 5), uint8)




The PPO model has been initialized.


In [3]:
print("Neural networks used by model.")
print(model.policy)

Neural networks used by model.
ActorCriticPolicy(
  (features_extractor): FlattenExtractor(
    (flatten): Flatten(start_dim=1, end_dim=-1)
  )
  (pi_features_extractor): FlattenExtractor(
    (flatten): Flatten(start_dim=1, end_dim=-1)
  )
  (vf_features_extractor): FlattenExtractor(
    (flatten): Flatten(start_dim=1, end_dim=-1)
  )
  (mlp_extractor): MlpExtractor(
    (policy_net): Sequential(
      (0): Linear(in_features=17100, out_features=64, bias=True)
      (1): Tanh()
      (2): Linear(in_features=64, out_features=64, bias=True)
      (3): Tanh()
    )
    (value_net): Sequential(
      (0): Linear(in_features=17100, out_features=64, bias=True)
      (1): Tanh()
      (2): Linear(in_features=64, out_features=64, bias=True)
      (3): Tanh()
    )
  )
  (action_net): Linear(in_features=64, out_features=1, bias=True)
  (value_net): Linear(in_features=64, out_features=1, bias=True)
)


### Train PPO agent

In [4]:
model.learn(total_timesteps=1000)

env.close()

### Test PPO Agent

In [6]:
from stable_baselines3.common.env_util import make_vec_env
import numpy as np

import supersuit


# Creating env
env = pistonball_v6.parallel_env(render_mode = "human", n_pistons=10) # We use AEC env
env.reset()


env = supersuit.color_reduction_v0(env, mode="full")
env = supersuit.resize_v1(env, 114, 30)
env = supersuit.reshape_v0(env, env.observation_space(env.unwrapped.agents[0]).shape + (1,))
env = supersuit.frame_stack_v1(env, 5)

env = supersuit.pettingzoo_env_to_vec_env_v1(env)
vec_env = supersuit.concat_vec_envs_v1(env, 1, num_cpus=1, base_class="stable_baselines3")


#vec_env = make_vec_env(lambda : env, n_envs=1)


# Get first observation
obs = vec_env.reset()

done = np.array([False])
while not done.any() :
    action, _states = model.predict(obs, deterministic = True)
    obs, reward, done, info = vec_env.step(action)


env.close()

# RLlib

### Register Env

In [75]:
import ray
from pettingzoo.butterfly import pistonball_v6
from ray.tune.registry import register_env
from ray.rllib.env.wrappers.pettingzoo_env import ParallelPettingZooEnv
import supersuit

ray.init()


env_name = "pistonball_v6"

def env_creator(args) :
    env = pistonball_v6.parallel_env()
    env = supersuit.color_reduction_v0(env, mode='full') # image to black and white
    env = supersuit.resize_v1(env, 84, 84) # Reduce size of image
    env = supersuit.reshape_v0(env, env.observation_space('piston_0').shape + (1,))
    return env


register_env(env_name, lambda config: ParallelPettingZooEnv(env_creator(config)))

2025-12-12 16:55:55,258	INFO worker.py:2012 -- Started a local Ray instance.
[36m(MultiAgentEnvRunner pid=95900)[0m   from pkg_resources import resource_stream, resource_exists
[36m(MultiAgentEnvRunner pid=95900)[0m 2025-12-12 16:56:19,906	ERROR multi_agent_env_runner.py:855 -- Your environment (<ParallelPettingZooEnv<rllib-multi-agent-env-v0>>) does not abide to the new gymnasium-style API!
[36m(MultiAgentEnvRunner pid=95900)[0m From Ray 2.3 on, RLlib only supports the new (gym>=0.26 or gymnasium) Env APIs.
[36m(MultiAgentEnvRunner pid=95900)[0m In particular, the `reset()` method seems to be faulty.
[36m(MultiAgentEnvRunner pid=95900)[0m Learn more about the most important changes here:
[36m(MultiAgentEnvRunner pid=95900)[0m https://github.com/openai/gym and here: https://github.com/Farama-Foundation/Gymnasium
[36m(MultiAgentEnvRunner pid=95900)[0m 
[36m(MultiAgentEnvRunner pid=95900)[0m In order to fix this problem, do the following:
[36m(MultiAgentEnvRunner pid=959

[33m(raylet)[0m A worker died or was killed while executing a task by an unexpected system error. To troubleshoot the problem, check the logs for the dead worker. Lease ID: 00000000c88bc639e58c24e80025e6b1954369df1ce4548ffc14a27a585d4083 Worker ID: ea40a3baa2383065ac6b5b7c705682c6b078822d036cd7371d8a9a3d Node ID: fa72a426818fb64be56d548a3101fda9e34e3f8765cbfb934af04e40 Worker IP address: 134.206.154.215 Worker port: 43595 Worker PID: 95900 Worker exit type: SYSTEM_ERROR Worker exit detail: Worker exits unexpectedly. Worker exits with an exit code 1.


[36m(MultiAgentEnvRunner pid=95893)[0m   from pkg_resources import resource_stream, resource_exists
[36m(MultiAgentEnvRunner pid=95896)[0m 2025-12-12 16:57:23,907	ERROR multi_agent_env_runner.py:855 -- Your environment (<ParallelPettingZooEnv<rllib-multi-agent-env-v0>>) does not abide to the new gymnasium-style API!
[36m(MultiAgentEnvRunner pid=95896)[0m From Ray 2.3 on, RLlib only supports the new (gym>=0.26 or gymnasium) Env APIs.
[36m(MultiAgentEnvRunner pid=95896)[0m In particular, the `reset()` method seems to be faulty.
[36m(MultiAgentEnvRunner pid=95896)[0m Learn more about the most important changes here:
[36m(MultiAgentEnvRunner pid=95896)[0m https://github.com/openai/gym and here: https://github.com/Farama-Foundation/Gymnasium
[36m(MultiAgentEnvRunner pid=95896)[0m 
[36m(MultiAgentEnvRunner pid=95896)[0m In order to fix this problem, do the following:
[36m(MultiAgentEnvRunner pid=95896)[0m 
[36m(MultiAgentEnvRunner pid=95896)[0m 1) Run `pip install gymnasi

In [2]:
env = env_creator("")
env.reset()
print(env.observation_space('piston_0').shape)

(84, 84, 1)


### Initialize Model

One policy per agent

In [96]:
from ray.rllib.algorithms.ppo import PPOConfig
from ray.rllib.core.rl_module.multi_rl_module import MultiRLModuleSpec
from ray.rllib.core.rl_module.rl_module import RLModuleSpec
from ray.rllib.policy.policy import Policy, PolicySpec


config = (
    PPOConfig()
    .environment(env=pistonball_v6.parallel_env)
    .multi_agent(
        policies = {elem for elem in env.unwrapped.agents}, 
        policy_mapping_fn = lambda agent_id, episode, **kwargs : agent_id
    )
)

config.framework("torch")
config.validate()
print(config.is_multi_agent)




"""
    .rl_module(
        rl_module_spec = MultiRLModuleSpec(rl_module_specs={
            agent_id : RLModuleSpec() for agent_id in env.unwrapped.agents
        })
    )
"""

True


'\n    .rl_module(\n        rl_module_spec = MultiRLModuleSpec(rl_module_specs={\n            agent_id : RLModuleSpec() for agent_id in env.unwrapped.agents\n        })\n    )\n'

### Train Model

With Tune

In [57]:
from ray import tune

tuner = tune.Tuner(
    config.algo_class,
    param_space = config,
)


results = tuner.fit()

0,1
Current time:,2025-12-12 16:17:22
Running for:,00:00:11.46
Memory:,9.2/15.6 GiB

Trial name,# failures,error file
PPO_pistonball_v6_a1b00_00000,1,/tmp/ray/session_2025-12-12_16-17-05_089833_74177/artifacts/2025-12-12_16-17-11/PPO_2025-12-12_16-17-05/driver_artifacts/PPO_pistonball_v6_a1b00_00000_0_2025-12-12_16-17-11/error.txt

Trial name,status,loc
PPO_pistonball_v6_a1b00_00000,ERROR,


[36m(PPO pid=92554)[0m [2025-12-12 16:17:17,093 E 92554 92554] core_worker.cc:2200: Actor with class name: 'MultiAgentEnvRunner' and ID: '7e1c5e350b1caf7af32fa84501000000' has constructor arguments in the object store and max_restarts > 0. If the arguments in the object store go out of scope or are lost, the actor restart will fail. See https://github.com/ray-project/ray/issues/53727 for more details.
[36m(MultiAgentEnvRunner pid=92621)[0m Exception raised in creation task: The actor died because of an error raised in its creation task, [36mray::MultiAgentEnvRunner.__init__()[39m (pid=92621, ip=134.206.154.215, actor_id=7e1c5e350b1caf7af32fa84501000000, repr=<ray.rllib.env.multi_agent_env_runner.MultiAgentEnvRunner object at 0x7b7c37a234d0>)
[36m(MultiAgentEnvRunner pid=92621)[0m           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[36m(MultiAgentEnvRunner pid=92621)[0m   File "/home/enzo/Documents/Depots/customEnv/lib/python3.12/site-packages/gymnasium/envs/registration.py", li

Without Tune

In [77]:
algo = config.build_algo()

algo.train()

[2025-12-12 16:56:28,856 E 74177 95892] core_worker_process.cc:825: Failed to establish connection to the metrics exporter agent. Metrics will not be exported. Exporter agent status: RpcError: Running out of retries to initialize the metrics agent. rpc_code: 14
2025-12-12 16:57:22,872	ERROR actor_manager.py:973 -- Ray error (The actor 359aa17652c39a73e7f38a0a01000000 is unavailable: The actor is temporarily unavailable: IOError: The actor was restarted. The task may or may not have been executed on the actor.), taking actor 1 out of service.
2025-12-12 16:57:22,873	ERROR actor_manager.py:973 -- Ray error (The actor 8950e7b3ab1d209b48849ce301000000 is unavailable: The actor is temporarily unavailable: IOError: The actor was restarted. The task may or may not have been executed on the actor.), taking actor 2 out of service.
2025-12-12 16:57:22,873	ERROR actor_manager.py:771 -- The actor 359aa17652c39a73e7f38a0a01000000 is unavailable: The actor is temporarily unavailable: IOError: The ac

{'timers': {'training_iteration': 60.01145464298315,
  'restore_env_runners': 1.7073936760425568e-05,
  'training_step': 60.01118877506815,
  'env_runner_sampling_timer': 60.011092095053755},
 'num_training_step_calls_per_iteration': 1,
 'num_env_steps_sampled_lifetime': 0,
 'fault_tolerance': {'num_healthy_workers': 0,
  'num_remote_worker_restarts': 0},
 'env_runner_group': {'actor_manager_num_outstanding_async_reqs': 0},
 'done': False,
 'training_iteration': 1,
 'trial_id': 'default',
 'date': '2025-12-12_16-57-22',
 'timestamp': 1765555042,
 'time_this_iter_s': 61.84901475906372,
 'time_total_s': 61.84901475906372,
 'pid': 74177,
 'hostname': 'smac-Precision-Tower-3620',
 'node_ip': '134.206.154.215',
 'config': {'exploration_config': {},
  'extra_python_environs_for_driver': {},
  'extra_python_environs_for_worker': {},
  'placement_strategy': 'PACK',
  'num_gpus': 0,
  '_fake_gpus': False,
  'num_cpus_for_main_process': 1,
  'eager_tracing': True,
  'eager_max_retraces': 20,
  '

In [63]:
algo.evaluate()

2025-12-12 16:20:40,811	ERROR actor_manager.py:973 -- Ray error (The actor 674a12fded5e55e8a65cbefe01000000 is unavailable: The actor is temporarily unavailable: UnexpectedSystemExit: Worker exits with an exit code 1.. The task may or may not have been executed on the actor.), taking actor 1 out of service.
2025-12-12 16:20:40,812	ERROR actor_manager.py:771 -- The actor 674a12fded5e55e8a65cbefe01000000 is unavailable: The actor is temporarily unavailable: UnexpectedSystemExit: Worker exits with an exit code 1.. The task may or may not have been executed on the actor.
NoneType: None


{}

In [95]:
algo.stop()

ray.shutdown()

# Test

In [99]:
print(config.observation_space)
print(config.get_multi_agent_setup(env = gym.make('pistonball_v6')))


ModuleNotFoundError: No module named 'gym'