# Sequential

In [13]:
from pettingzoo.butterfly import pistonball_v6

# Build the turn-based (AEC) environment with on-screen rendering and seed the first episode
env = pistonball_v6.env(render_mode="human")
env.reset(seed=42)

# agent_iter() yields the agent whose turn it is, one agent at a time
for agent in env.agent_iter():
    # What the acting agent currently sees, plus its reward and end-of-episode flags
    observation, reward, termination, truncation, info = env.last()

    # A finished agent steps with None; any other agent draws a random action from its own space
    action = None if (termination or truncation) else env.action_space(agent).sample()

    env.step(action)
env.close()

# Parallel

In [4]:
from pettingzoo.butterfly import pistonball_v6

# Parallel API: all agents act at the same time, one dictionary of actions per step
env = pistonball_v6.parallel_env(n_pistons=20, render_mode="human")
observations, infos = env.reset()

# env.agents is the list of agents still in the episode; the loop stops once it is empty
while env.agents:
    # this is where you would insert your policy
    actions = dict((agent, env.action_space(agent).sample()) for agent in env.agents)

    observations, rewards, terminations, truncations, infos = env.step(actions)

env.close()

# Wrapping Environment for Fully Centralized Control

In [6]:
import gymnasium
from gymnasium import spaces
import numpy as np



class SingleAgentWrapperEnv(gymnasium.Env) :
    """
    Expose a multi-agent PettingZoo parallel environment as a single-agent gymnasium environment.

    The action space is the cartesian product of the agents' action spaces and the observation
    space is the cartesian product of the agents' observation spaces. A single "super-agent"
    therefore receives every observation and outputs the action of every "sub-agent".

    WARNING : USE THIS WRAPPER ONLY IF ALL THE ASSUMPTIONS BELOW HOLD !!!
     - The env is a pettingzoo env with the parallel API (ParallelEnv) (it can be wrapped)
     - All agents of the env have a Box action space and a Box observation space
     - All agents share the same bounds and dtype, with a shape of (1,), for their Box action space
     - All agents share the same bounds and dtype, with a 1D shape of the same size, for their Box observation space

    Do not forget to call env.reset() before creating SingleAgentWrapperEnv(env): the list of
    agents is read from the environment when the wrapper is built.
    """

    def __init__(self, env, options = None) :
        """
        env     : the (possibly wrapped) parallel environment to control centrally.
        options : unused; kept so existing calls that pass it keep working.

        Raises ValueError when the environment exposes no agent (typically because it was not reset).
        """
        super(SingleAgentWrapperEnv, self).__init__()

        self.env = env
        # Snapshot (copy) of the agent list: the underlying env may mutate or replace its own
        # list while an episode runs, and the order of this list defines the layout of the
        # flattened observation and action vectors.
        self.agents = list(self.env.unwrapped.agents)
        self.nb_agent = len(self.agents)

        if self.nb_agent == 0 :
            raise ValueError("The environment has no agent: call env.reset() before creating SingleAgentWrapperEnv(env).")


        # Creating observation_space
        agent_obs_space = self.env.observation_space(self.agents[0])
        # One global bound for every element: simpler than per-element bounds, at the cost of a
        # possibly larger observation space than strictly necessary.
        low_bound = agent_obs_space.low.min()
        high_bound = agent_obs_space.high.max()

        shape = (self.nb_agent * agent_obs_space.shape[0],)
        self.observation_space = spaces.Box(np.full(shape, low_bound), np.full(shape, high_bound), shape, agent_obs_space.dtype)


        # Creating action_space
        agent_act_space = self.env.action_space(self.agents[0])
        low_bound = agent_act_space.low # Lowest value an action of an agent can take
        high_bound = agent_act_space.high # Highest value an action of an agent can take

        shape = (self.nb_agent,) # Index i of the vector holds the action of self.agents[i]
        self.action_space = spaces.Box(np.full(shape, low_bound), np.full(shape, high_bound), shape, agent_act_space.dtype)


    def _concat_observations(self, observations) :
        """
        Concatenate the per-agent observations, in self.agents order, into one 1D vector
        using the dtype of self.observation_space.
        """
        flat_observations = [np.ravel(observations[agent]) for agent in self.agents]
        return np.concatenate(flat_observations).astype(self.observation_space.dtype, copy = False)


    def reset(self, seed = None, options = None) :
        """
        Reset the wrapped environment.

        Returns (obs, infos): the concatenated observation vector and the infos returned by the
        wrapped environment.
        """
        super().reset(seed = seed)
        observations, infos = self.env.reset(seed = seed, options = options)

        return self._concat_observations(observations), infos


    def step(self, action) :
        """
        The action in input is a vector containing the action of each agent
        (action[i] is applied to self.agents[i]).

        Returns (obs, reward, termination, truncation, infos) where
         - obs is the concatenation of all agents' observations,
         - reward is the mean of all agents' rewards,
         - termination / truncation are True as soon as they are True for one agent,
         - infos are the infos returned by the wrapped environment.
        """
        # Each agent expects an action of shape (1,) in the dtype of the action space
        dict_actions = {
            agent: np.array([action[i]], dtype = self.action_space.dtype)
            for i, agent in enumerate(self.agents)
        }

        observations, rewards, terminations, truncations, infos = self.env.step(dict_actions)

        obs = self._concat_observations(observations)

        # Mean of all agents' rewards: sum first, divide once
        reward = float(sum(rewards[agent] for agent in self.agents)) / self.nb_agent

        # Episode ends as soon as it ends for one agent
        termination = any(terminations[agent] for agent in self.agents)
        truncation = any(truncations[agent] for agent in self.agents)

        return obs, reward, termination, truncation, infos


### Use Grey Scale Image as observation and flatten it

In [2]:
from pettingzoo.butterfly import pistonball_v6

# Base multi-agent environment (Parallel API), reset once before it is wrapped
env = pistonball_v6.parallel_env()
env.reset()


from supersuit import color_reduction_v0
from supersuit import resize_v1
from supersuit import flatten_v0

# Preprocessing chain: each wrapper takes the previous environment and returns a new one
grey_scale_env = color_reduction_v0(env, mode='full') # Turns observations into grey-scale images
grey_scale_env = resize_v1(grey_scale_env, 114, 30) # Resizes each agent's image with arguments (114, 30); the original note says this divides the image size by 4 - TODO confirm against the native resolution
grey_scale_env = flatten_v0(grey_scale_env) # Flatten to 1D to make it compatible with our wrapper

env.close()

  from pkg_resources import resource_stream, resource_exists


### Testing the wrapped environment

In [3]:
from pettingzoo.butterfly import pistonball_v6
from stable_baselines3.common.env_checker import check_env
import supersuit

# Base multi-agent environment; it must be reset before SingleAgentWrapperEnv is built
env = pistonball_v6.parallel_env()
env.reset()

grey_scale_env = supersuit.color_reduction_v0(env, mode='full') # This changes observations as grey scale images
grey_scale_env = supersuit.resize_v1(grey_scale_env, 57, 15) # Resize each agent's image with arguments (57, 15)
grey_scale_env = supersuit.flatten_v0(grey_scale_env) # Flatten to 1D to make it compatible with our wrapper
grey_scale_env = supersuit.frame_stack_v1(grey_scale_env, 3) # Observations are now the past 3 observations (so pistons can observe in which direction the ball moves)


wrappedEnv = SingleAgentWrapperEnv(grey_scale_env)

# Static and dynamic conformity checks of the wrapper against the gymnasium API
check_env(wrappedEnv)

# Run one full random episode through the wrapper's own API: start from a fresh episode
# (check_env has already stepped the environment) and stop on the flags returned by step(),
# instead of inspecting internal attributes of the underlying environment.
observations, infos = wrappedEnv.reset()
terminated = False
truncated = False
while not (terminated or truncated) :
    # this is where you would insert your policy
    actions = wrappedEnv.action_space.sample()

    observations, rewards, terminated, truncated, infos = wrappedEnv.step(actions)

env.close()
print(wrappedEnv.observation_space)
print("Test of the Wrapped Environment is done.")

Box(0, 255, (51300,), uint8)
Test of the Wrapped Environment is done.


### Initializing PPO agent on wrapped environment

In [7]:
from pettingzoo.butterfly import pistonball_v6
import supersuit

# Creating env (10 pistons); it is reset before being wrapped because SingleAgentWrapperEnv
# reads the agent list at construction time
env = pistonball_v6.parallel_env(n_pistons = 10)
env.reset()


grey_scale_env = supersuit.color_reduction_v0(env, mode='full') # This changes observations as grey scale images
grey_scale_env = supersuit.resize_v1(grey_scale_env, 57, 15) # Resize each agent's image with arguments (57, 15); the original note says this divides the image size by 8 - TODO confirm
grey_scale_env = supersuit.flatten_v0(grey_scale_env) # Flatten to 1D to make it compatible with our wrapper
grey_scale_env = supersuit.frame_stack_v1(grey_scale_env, 3) # Observations are now the past 3 observations (so pistons can observe in which direction the ball moves)


# Single-agent view of the multi-agent environment (fully centralized control)
wrappedEnv = SingleAgentWrapperEnv(grey_scale_env)


# Initializing PPO
from stable_baselines3 import PPO

# "MlpPolicy" is used because the wrapper exposes a flat 1D observation vector
model = PPO("MlpPolicy", wrappedEnv)
print("The PPO model has been initialized.")

The PPO model has been initialized.


In [3]:
# Show the networks of the policy held by `model` (defined in the initialization cell).
# NOTE(review): the recorded output shows 20 action outputs while the initialization cell
# uses n_pistons = 10 - presumably the output comes from an earlier run; re-run to refresh.
print("Neural networks used by model.")
print(model.policy)

Neural networks used by model.
ActorCriticPolicy(
  (features_extractor): FlattenExtractor(
    (flatten): Flatten(start_dim=1, end_dim=-1)
  )
  (pi_features_extractor): FlattenExtractor(
    (flatten): Flatten(start_dim=1, end_dim=-1)
  )
  (vf_features_extractor): FlattenExtractor(
    (flatten): Flatten(start_dim=1, end_dim=-1)
  )
  (mlp_extractor): MlpExtractor(
    (policy_net): Sequential(
      (0): Linear(in_features=51300, out_features=64, bias=True)
      (1): Tanh()
      (2): Linear(in_features=64, out_features=64, bias=True)
      (3): Tanh()
    )
    (value_net): Sequential(
      (0): Linear(in_features=51300, out_features=64, bias=True)
      (1): Tanh()
      (2): Linear(in_features=64, out_features=64, bias=True)
      (3): Tanh()
    )
  )
  (action_net): Linear(in_features=64, out_features=20, bias=True)
  (value_net): Linear(in_features=64, out_features=1, bias=True)
)


### Train PPO agent

In [None]:
# Train the centralized PPO agent for 100000 environment steps
model.learn(total_timesteps=100000)

# Training is over: release the base environment created in the initialization cell
env.close()

### Save PPO agent

In [13]:
# Persist the trained agent; the path is relative to the notebook's working directory
model.save("./models/crossProductPPO.zip")



### Import PPO agent

In [8]:
# Load through the class rather than through an existing `model` instance, so this cell also
# works on a fresh kernel where no model has been created yet.
from stable_baselines3 import PPO

model = PPO.load("./models/crossProductPPO.zip")

### Test PPO agent

In [15]:
from stable_baselines3.common.env_util import make_vec_env
from pettingzoo.butterfly import pistonball_v6
import supersuit

# Same environment set-up as for training, with on-screen rendering enabled
env = pistonball_v6.parallel_env(n_pistons=10, render_mode="human")
env.reset()


# Preprocessing chain (must match the one used to train the model)
grey_scale_env = supersuit.color_reduction_v0(env, mode='full') # grey-scale observations
grey_scale_env = supersuit.resize_v1(grey_scale_env, 57, 15) # resize each agent's image with arguments (57, 15)
grey_scale_env = supersuit.flatten_v0(grey_scale_env) # 1D observations, as required by the wrapper
grey_scale_env = supersuit.frame_stack_v1(grey_scale_env, 3) # keep the past 3 observations so the ball's direction can be observed


wrappedEnv = SingleAgentWrapperEnv(grey_scale_env)

# The model is queried through a vectorized environment holding this single wrapped env
vec_env = make_vec_env(lambda : wrappedEnv, n_envs=1)


# First observation of the episode
obs = vec_env.reset()

# Play until the (single) environment reports the end of its episode
while True :
    action, _states = model.predict(obs, deterministic = True)
    obs, reward, done, info = vec_env.step(action)
    if done :
        break


env.close()

# Parameter Sharing

All pistons have the same goal, same rewards and same observations.

Instead of training each piston separately, or training a single agent that controls all pistons, we could consider all pistons to be the same agent.

In this case we would use all observations and rewards from all pistons to train only a single piston.
And then treat all pistons as being a copy of this single piston.


In fact, parameter sharing cannot work here because of how rewards are distributed in this environment: every agent receives the same reward. Thus useless pistons (and the actions they take) have the same impact on training as the useful pistons that actually push the ball towards the left wall, and even a bigger one, since they outnumber the useful ones.

### Initialize PPO agent

In [2]:
from pettingzoo.butterfly import pistonball_v6
import numpy as np
import supersuit

from stable_baselines3 import PPO
from stable_baselines3.common.env_checker import check_env


# NOTE: parallel_env builds the Parallel-API environment (the original comment said "AEC")
env = pistonball_v6.parallel_env(render_mode = None, n_pistons=10) # Parallel env, no rendering
env.reset()


# Image preprocessing: grey scale, resize, add a trailing channel axis, then stack 5 frames
env = supersuit.color_reduction_v0(env, mode="full")
env = supersuit.resize_v1(env, 114, 30)
# Append a trailing axis of size 1 to the per-agent observation shape
env = supersuit.reshape_v0(env, env.observation_space(env.unwrapped.agents[0]).shape + (1,))
env = supersuit.frame_stack_v1(env, 5)

# Convert the multi-agent env to a vectorized env usable by stable_baselines3
# (presumably one sub-environment per piston, all driven by the same policy - TODO confirm)
env = supersuit.pettingzoo_env_to_vec_env_v1(env)
env = supersuit.concat_vec_envs_v1(env, 1, num_cpus=1, base_class="stable_baselines3")

print(env.observation_space)


# Alternative kept by the author: a CNN policy instead of the MLP policy
#model = PPO("CnnPolicy", env)
model = PPO("MlpPolicy", env)
print("The PPO model has been initialized.")

Box(0, 255, (30, 114, 5), uint8)




The PPO model has been initialized.


In [3]:
# Show the networks of the parameter-sharing policy held by `model`
print("Neural networks used by model.")
print(model.policy)

Neural networks used by model.
ActorCriticPolicy(
  (features_extractor): FlattenExtractor(
    (flatten): Flatten(start_dim=1, end_dim=-1)
  )
  (pi_features_extractor): FlattenExtractor(
    (flatten): Flatten(start_dim=1, end_dim=-1)
  )
  (vf_features_extractor): FlattenExtractor(
    (flatten): Flatten(start_dim=1, end_dim=-1)
  )
  (mlp_extractor): MlpExtractor(
    (policy_net): Sequential(
      (0): Linear(in_features=17100, out_features=64, bias=True)
      (1): Tanh()
      (2): Linear(in_features=64, out_features=64, bias=True)
      (3): Tanh()
    )
    (value_net): Sequential(
      (0): Linear(in_features=17100, out_features=64, bias=True)
      (1): Tanh()
      (2): Linear(in_features=64, out_features=64, bias=True)
      (3): Tanh()
    )
  )
  (action_net): Linear(in_features=64, out_features=1, bias=True)
  (value_net): Linear(in_features=64, out_features=1, bias=True)
)


### Train PPO agent

In [4]:
# Short training run (1000 timesteps) of the parameter-sharing PPO agent
model.learn(total_timesteps=1000)

# `env` is the vectorized environment built in the initialization cell
env.close()

### Test PPO Agent

In [6]:
from stable_baselines3.common.env_util import make_vec_env
import numpy as np

import supersuit


# Creating env: Parallel-API environment with on-screen rendering
env = pistonball_v6.parallel_env(render_mode = "human", n_pistons=10)
env.reset()


# Same preprocessing chain as the one used to initialize and train the model
env = supersuit.color_reduction_v0(env, mode="full")
env = supersuit.resize_v1(env, 114, 30)
env = supersuit.reshape_v0(env, env.observation_space(env.unwrapped.agents[0]).shape + (1,))
env = supersuit.frame_stack_v1(env, 5)

env = supersuit.pettingzoo_env_to_vec_env_v1(env)
vec_env = supersuit.concat_vec_envs_v1(env, 1, num_cpus=1, base_class="stable_baselines3")


# Get first observation
obs = vec_env.reset()

# `done` holds one flag per sub-environment; stop as soon as any of them is set
done = np.array([False])
while not done.any() :
    action, _states = model.predict(obs, deterministic = True)
    obs, reward, done, info = vec_env.step(action)


# Close the vectorized environment that was actually stepped, then the environment it was
# built from (`env` is only the input given to concat_vec_envs_v1).
vec_env.close()
env.close()

# RLlib

### Register Env

In [1]:
import ray
import numpy as np
from pettingzoo.butterfly import pistonball_v6
from ray.tune.registry import register_env
from ray.rllib.env.wrappers.pettingzoo_env import ParallelPettingZooEnv
import supersuit

# Start Ray for this notebook session (the recorded log reports a local Ray instance)
ray.init()


# Name under which the environment creator is registered
env_name = "pistonball_v6"

def env_creator(args) :
    """
    Build the preprocessed pistonball environment wrapped for RLlib.

    args : dict-like configuration; only the optional 'render_mode' key is read and, when
           present, forwarded to pistonball_v6.parallel_env.

    Returns the preprocessed environment wrapped in ParallelPettingZooEnv.
    """
    # Forward render_mode only when the caller supplied it
    creation_kwargs = {'render_mode': args['render_mode']} if 'render_mode' in args.keys() else {}
    env = pistonball_v6.parallel_env(**creation_kwargs)

    # Grey scale: per the original note, observations go from (x, y, 3) to (x, y)
    env = supersuit.color_reduction_v0(env, mode='full')
    # Resize to (10, 10); the original note says the default size has no default model
    env = supersuit.resize_v1(env, 10, 10)
    # Stack 3 frames, which gives the (10, 10, 3) shape printed by the inspection cell
    env = supersuit.frame_stack_v1(env, 3)
    # Alternatives kept by the author (disabled):
    #env = supersuit.flatten_v0(env) # Make the observation 1 dimensional to be processed by default model
    #env = supersuit.reshape_v0(env, env.observation_space('piston_0').shape + (1,)) # (84, 84) won't be detected as an image, we need (84, 84, 1)  TO ACTIVATE UNCOMMENT THIS AND COMMENT FRAME STACKING
    # Conv2d needs input to be float32
    env = supersuit.dtype_v0(env, np.float32)

    return ParallelPettingZooEnv(env)


# Register the creator under env_name so configurations can refer to the environment by name
register_env(env_name, lambda config: env_creator(config))

  from pkg_resources import resource_stream, resource_exists
2025-12-19 17:21:11,940	INFO worker.py:2012 -- Started a local Ray instance.
[36m(MultiAgentEnvRunner pid=75548)[0m   from pkg_resources import resource_stream, resource_exists
[36m(pid=gcs_server)[0m [2025-12-19 17:21:40,114 E 75392 75392] (gcs_server) gcs_server.cc:302: Failed to establish connection to the event+metrics exporter agent. Events and metrics will not be exported. Exporter agent status: RpcError: Running out of retries to initialize the metrics agent. rpc_code: 14
[33m(raylet)[0m [2025-12-19 17:21:41,872 E 75489 75489] (raylet) main.cc:975: Failed to establish connection to the metrics exporter agent. Metrics will not be exported. Exporter agent status: RpcError: Running out of retries to initialize the metrics agent. rpc_code: 14
[36m(MultiAgentEnvRunner pid=75547)[0m [2025-12-19 17:21:45,500 E 75547 75663] core_worker_process.cc:825: Failed to establish connection to the metrics exporter agent. Metric

In [2]:
# Build one environment locally (no render_mode given) to inspect an agent's observation space
env = env_creator({})
env.reset()
print(env.observation_space['piston_0'].shape)
print(env.observation_space['piston_0'].dtype)

(10, 10, 3)
float32


### Initialize Model

#### One policy per agent

In [7]:
from ray.rllib.algorithms.ppo import PPOConfig
from ray.rllib.core.rl_module.multi_rl_module import MultiRLModuleSpec
from ray.rllib.core.rl_module.rl_module import RLModuleSpec
from ray.rllib.policy.policy import Policy, PolicySpec


# Policy mapping: every agent is driven by a policy that carries the agent's own ID.
# The same function is reused later when an episode is replayed by hand.
func_map_agentID_policyID = lambda agent_id, episode, **kwargs : agent_id


# `env` is the environment instance built in the inspection cell; its agent list provides
# one policy ID per agent.
config = (
    PPOConfig()
    .environment(env="pistonball_v6")
    .env_runners(num_env_runners=1)
    .multi_agent(
        policies = set(env.unwrapped.agents),
        policy_mapping_fn = func_map_agentID_policyID,
    )
    .framework("torch")
)

config.validate()
print(config.is_multi_agent)

True


#### Same policy for all agents

In [3]:
from ray.rllib.algorithms.ppo import PPOConfig
from ray.rllib.core.rl_module.multi_rl_module import MultiRLModuleSpec
from ray.rllib.core.rl_module.rl_module import RLModuleSpec
from ray.rllib.policy.policy import Policy, PolicySpec

# Policy mapping: every agent, whatever its ID, is driven by the single 'shared_policy'
func_map_agentID_policyID = lambda agent_id, episode, **kwargs : 'shared_policy'


config = (
    PPOConfig()
    .environment(
        env="pistonball_v6"
    )

    .env_runners(
        # Tuning knobs left disabled by the author; only the sampling timeout is set
        #num_env_runners=1,
        #num_envs_per_env_runner=1,
        sample_timeout_s=600
        #rollout_fragment_length = 32
    )

    .multi_agent(
        policies = {'shared_policy'}, 
        policy_mapping_fn = func_map_agentID_policyID
    )
)

# Select the torch framework, then validate the configuration before it is used
config.framework("torch")
config.validate()
print(config.is_multi_agent)



True


### Train Model

#### With Tune

In [4]:
from ray import tune

# Train through Ray Tune: the algorithm class and its parameters both come from `config`
tuner = tune.Tuner(
    config.algo_class,
    param_space = config,
)


# Runs the experiment; the recorded log shows where result files were written
results = tuner.fit()

0,1
Current time:,2025-12-18 14:05:27
Running for:,00:03:26.78
Memory:,8.9/15.6 GiB

Trial name,status,loc,iter,total time (s),num_training_step_ca lls_per_iteration,num_env_steps_sample d_lifetime
PPO_pistonball_v6_bdf19_00000,RUNNING,134.206.154.215:151621,2,140.015,1,0


[2025-12-18 14:02:13,443 E 150994 151182] core_worker_process.cc:825: Failed to establish connection to the metrics exporter agent. Metrics will not be exported. Exporter agent status: RpcError: Running out of retries to initialize the metrics agent. rpc_code: 14
2025-12-18 14:05:27,582	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/home/enzo/ray_results/PPO_2025-12-18_14-02-00' in 0.0022s.
2025-12-18 14:05:37,589	INFO tune.py:1041 -- Total run time: 216.82 seconds (206.78 seconds for the tuning loop).
Resume experiment with: Tuner.restore(path="/home/enzo/ray_results/PPO_2025-12-18_14-02-00", trainable=...)


#### Without Tune

Build algo

In [4]:
# Build the algorithm object from the configuration defined above
algo = config.build_algo()

`UnifiedLogger` will be removed in Ray 2.7.
  return UnifiedLogger(config, logdir, loggers=None)
The `JsonLogger interface is deprecated in favor of the `ray.tune.json.JsonLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `CSVLogger interface is deprecated in favor of the `ray.tune.csv.CSVLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `TBXLogger interface is deprecated in favor of the `ray.tune.tensorboardx.TBXLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
[2025-12-19 17:21:24,221 E 75355 75355] core_worker.cc:2200: Actor with class name: 'MultiAgentEnvRunner' and ID: '3f4b88d3ed2edc46d5e4ce7901000000' has constructor arguments in the object store and max_restarts > 0. If the arguments in the object store go out of scope or are lost, the actor restart wi

Train

In [6]:
# Run one call to train(); the returned dictionary of metrics is displayed as the cell output
algo.train()

{'timers': {'training_iteration': 231.49328594362905,
  'restore_env_runners': 3.098200755630387e-05,
  'training_step': 231.49278652649932,
  'env_runner_sampling_timer': 66.49747035070845,
  'learner_update_timer': 164.98578238518996,
  'synch_weights': 0.002247075340274023,
  'synch_env_connectors': 0.001954883999133017},
 'env_runners': {'episode_len_max': 125,
  'num_agent_steps_sampled_lifetime': {'piston_18': 8000.0,
   'piston_10': 8000.0,
   'piston_2': 8000.0,
   'piston_14': 8000.0,
   'piston_19': 8000.0,
   'piston_13': 8000.0,
   'piston_15': 8000.0,
   'piston_5': 8000.0,
   'piston_7': 8000.0,
   'piston_16': 8000.0,
   'piston_8': 8000.0,
   'piston_12': 8000.0,
   'piston_17': 8000.0,
   'piston_0': 8000.0,
   'piston_1': 8000.0,
   'piston_11': 8000.0,
   'piston_9': 8000.0,
   'piston_3': 8000.0,
   'piston_6': 8000.0,
   'piston_4': 8000.0},
  'agent_steps': {'piston_19': 125.0,
   'piston_13': 125.0,
   'piston_15': 125.0,
   'piston_5': 125.0,
   'piston_7': 125.

### View one episode

In [9]:
import torch as th
import numpy as np
import time


# Environment with on-screen rendering, built by the same creator as the training environment
env = env_creator({'render_mode': 'human'})
obs, info = env.reset()

# Hard cap on the number of steps so that an episode that never ends cannot hang the cell
max_iter = 1000
step_count = 0
done = False

while step_count < max_iter and not done:
    actions = {}
    for agent_ID in env.unwrapped.agents :

        # Module (network) of the policy this agent is mapped to
        policy_ID = func_map_agentID_policyID(agent_ID, 0)
        module = algo.get_module(policy_ID)

        # Numpy observation -> float32 tensor with a leading axis added by unsqueeze(0).
        # NOTE(review): the author flagged that forward_inference expecting a batch
        # ([obs1, obs2, ...]) still needs to be verified.
        obs_batch = th.from_numpy(obs[agent_ID]).unsqueeze(0).float()

        model_outputs = module.forward_inference({'obs': obs_batch})

        # The outputs hold either 'actions' or 'action_dist_inputs'; the latter is read here
        # (presumably the parameters of the action distribution - TODO confirm)
        action_logits = model_outputs["action_dist_inputs"][0]

        # Action distribution of this agent, built from those inputs
        action_distrib = module.get_inference_action_dist_cls().from_logits(action_logits)

        # The action is sampled, not greedy; per the original note, to_deterministic()
        # would give the most likely action instead
        sampled_action = action_distrib.sample().numpy()

        actions[agent_ID] = np.array(sampled_action)


    # Apply all agents' actions at once
    obs, reward, terminated, truncated, info = env.step(actions)

    # The episode is over once it is terminated or truncated for all agents
    done = terminated['__all__'] or truncated['__all__']
    step_count += 1

    # Short pause between two steps
    time.sleep(0.001)

env.close()




### Evaluation

In [75]:
# NOTE(review): in the recorded run this call raised
# "ValueError: ... MultiAgentEnvRunner ... doesn't have an env! Can't call `sample()` on it."
# Presumably evaluation must be configured on `config` before the algorithm is built - TODO confirm.
algo.evaluate()

ValueError: <ray.rllib.env.multi_agent_env_runner.MultiAgentEnvRunner object at 0x7323c7fe3f20> doesn't have an env! Can't call `sample()` on it.

In [94]:
# Stop the algorithm first, then shut Ray down
algo.stop()

ray.shutdown()