# Sequential

In [1]:
from pettingzoo.butterfly import pistonball_v6

# Build the turn-based (AEC) environment with an on-screen window
env = pistonball_v6.env(render_mode="human")
env.reset(seed=42)

for agent in env.agent_iter():
    # env.last() gives the observation, reward and end-of-episode flags of the current agent
    observation, reward, termination, truncation, info = env.last()

    # A finished agent is stepped with None; any other agent gets a random action
    action = None if (termination or truncation) else env.action_space(agent).sample()

    env.step(action)
env.close()

  from pkg_resources import resource_stream, resource_exists


# Parallel

In [4]:
from pettingzoo.butterfly import pistonball_v6

# Parallel API: one dict of actions is submitted for all agents at each step
env = pistonball_v6.parallel_env(render_mode="human", n_pistons=20)
observations, infos = env.reset()

# Keep stepping for as long as env.agents is non-empty
while env.agents:
    # this is where you would insert your policy (random actions are used here)
    actions = dict(
        (agent, env.action_space(agent).sample())
        for agent in env.agents
    )

    observations, rewards, terminations, truncations, infos = env.step(actions)

env.close()

# Wrapping Environment for Fully Centralized Control

In [1]:
import gymnasium
from gymnasium import spaces
import numpy as np



class SingleAgentWrapperEnv(gymnasium.Env) :
    """
    Expose a PettingZoo parallel environment as a single-agent gymnasium environment.

    The action space is the cartesian product of the agents' action spaces and the
    observation space is the cartesian product of the agents' observation spaces.
    This makes it possible to train a single "super-agent" that receives every
    observation and distributes the actions to be taken by all "sub-agents".

    WARNING : USE THIS WRAPPER ONLY IF ALL THE ASSUMPTIONS BELOW HOLD !!!
     - The env is a pettingzoo env with the parallel API (ParallelEnv) (it can be wrapped)
     - Every agent of the env has a Box action space and a Box observation space
     - All agents share the same bounds and dtype, and a shape of (1,), for their Box action space
     - All agents share the same bounds and dtype, and a 1D shape of the same size, for their Box observation space
     - Every agent is present in the dictionaries returned by env.reset() and env.step()

    Do not forget to call env.reset() before creating SingleAgentWrapperEnv(env), so that
    the agent list of the env is populated.
    """

    def __init__(self, env, options = None) :
        """
        Build the joint observation and action spaces from the first agent's spaces.

        Args:
            env: the (possibly wrapped) parallel env to control; must already be reset.
            options: unused, kept so existing callers keep working.

        Raises:
            ValueError: if the env exposes no agent (typically env.reset() was not called).
        """
        super(SingleAgentWrapperEnv, self).__init__()

        self.env = env
        # Snapshot the agent list: the wrapper indexes it by position on every step, so it
        # must not shrink if the underlying env edits its own list in place.
        self.agents = list(self.env.unwrapped.agents)
        self.nb_agent = len(self.agents)

        if self.nb_agent == 0 :
            raise ValueError("The wrapped env has no agent: call env.reset() before creating SingleAgentWrapperEnv(env).")


        # Creating observation_space
        agent_obs_space = self.env.observation_space(self.agents[0])
        # The global min / max is used for every element instead of per-element bounds:
        # simpler, but it can make the joint observation space larger than necessary.
        low_bound = np.min(agent_obs_space.low)
        high_bound = np.max(agent_obs_space.high)
        dtype = agent_obs_space.dtype

        shape = (self.nb_agent * agent_obs_space.shape[0],)
        self.observation_space = spaces.Box(np.full(shape, low_bound, dtype = dtype), np.full(shape, high_bound, dtype = dtype), shape = shape, dtype = dtype)


        # Creating action_space
        agent_act_space = self.env.action_space(self.agents[0])
        low_bound = agent_act_space.low # Lowest value an action of an agent can take
        high_bound = agent_act_space.high # Highest value an action of an agent can take
        dtype = agent_act_space.dtype # The exact type of an action

        shape = (self.nb_agent,) # A vector holding at index i the action taken by self.agents[i]
        self.action_space = spaces.Box(np.full(shape, low_bound, dtype = dtype), np.full(shape, high_bound, dtype = dtype), shape = shape, dtype = dtype)


    def _concat_observations(self, observations) :
        """
        Concatenate the per-agent observations, in self.agents order, into one 1D vector
        cast to the dtype of the joint observation space.
        """
        per_agent = [np.asarray(observations[agent]).ravel() for agent in self.agents]
        return np.concatenate(per_agent).astype(self.observation_space.dtype, copy = False)


    def reset(self, seed = None, options = None) :
        """
        Reset the wrapped env and return (joint observation, infos).

        seed and options are forwarded to the wrapped env's reset; infos is returned
        exactly as the wrapped env produced it.
        """
        super().reset(seed = seed)
        observations, infos = self.env.reset(seed = seed, options = options)

        return self._concat_observations(observations), infos


    def step(self, action) :
        """
        Apply one joint action, where action[i] is the action of self.agents[i].

        Returns:
            (obs, reward, termination, truncation, infos) where obs is the concatenation
            of all observations, reward is the mean of the agents' rewards, and
            termination / truncation are True as soon as the flag is set for one agent.
        """
        action_dtype = self.action_space.dtype
        # Each sub-agent expects an array of shape (1,) in the dtype of its action space
        dict_actions = {agent: np.array([action[i]], dtype = action_dtype) for i, agent in enumerate(self.agents)}

        observations, rewards, terminations, truncations, infos = self.env.step(dict_actions)

        obs = self._concat_observations(observations)

        # Reward is the mean of all agents' rewards: sum first, divide once.
        reward = float(sum(rewards[agent] for agent in self.agents) / self.nb_agent)

        # Episode ends as soon as it ends for one agent
        termination = any(terminations[agent] for agent in self.agents)
        truncation = any(truncations[agent] for agent in self.agents)

        return obs, reward, termination, truncation, infos


### Use Grey Scale Image as observation and flatten it

In [2]:
from pettingzoo.butterfly import pistonball_v6

from supersuit import color_reduction_v0, flatten_v0, resize_v1

env = pistonball_v6.parallel_env()
env.reset()

# Wrappers are applied from the inside out:
#   1. color_reduction_v0 : observations become grey scale images
#   2. resize_v1          : each agent's image is resized to 114 x 30
#   3. flatten_v0         : flatten to 1D to make it compatible with our wrapper
grey_scale_env = flatten_v0(
    resize_v1(
        color_reduction_v0(env, mode='full'),
        114,
        30,
    )
)

env.close()

  from pkg_resources import resource_stream, resource_exists


### Testing the wrapped environment

In [3]:
from pettingzoo.butterfly import pistonball_v6
from stable_baselines3.common.env_checker import check_env
import supersuit

env = pistonball_v6.parallel_env()
env.reset()

grey_scale_env = supersuit.color_reduction_v0(env, mode='full') # This changes observations as grey scale images
grey_scale_env = supersuit.resize_v1(grey_scale_env, 57, 15) # Resize each image observed by each agent to 57 x 15
grey_scale_env = supersuit.flatten_v0(grey_scale_env) # Flatten to 1D to make it compatible with our wrapper
grey_scale_env = supersuit.frame_stack_v1(grey_scale_env, 3) # Observations are now the past 3 observations (so pistons can observe in which direction the ball moves)


wrappedEnv = SingleAgentWrapperEnv(grey_scale_env)

check_env(wrappedEnv)

# Start a fresh episode explicitly instead of continuing from whatever state
# check_env left the environment in.
observations, infos = wrappedEnv.reset()

terminations = truncations = False
while not (terminations or truncations) :
    # this is where you would insert your policy
    # Sample from the wrapper's own joint action space (one value per piston)
    actions = wrappedEnv.action_space.sample()

    observations, rewards, terminations, truncations, infos = wrappedEnv.step(actions)

env.close()
print(wrappedEnv.observation_space)
print("Test of the Wrapped Environment is done.")

Box(0, 255, (51300,), uint8)
Test of the Wrapped Environment is done.


### Initializing PPO agent on wrapped environment

In [2]:
from pettingzoo.butterfly import pistonball_v6
import supersuit

from stable_baselines3 import PPO

# Creating env
env = pistonball_v6.parallel_env(n_pistons=10)
env.reset()

# Observation pipeline, applied from the inside out
grey_scale_env = supersuit.frame_stack_v1(  # stack the past 3 observations (so pistons can observe in which direction the ball moves)
    supersuit.flatten_v0(  # flatten to 1D to make it compatible with our wrapper
        supersuit.resize_v1(  # resize each agent's image to 57 x 15
            supersuit.color_reduction_v0(env, mode='full'),  # grey scale images
            57,
            15,
        )
    ),
    3,
)

wrappedEnv = SingleAgentWrapperEnv(grey_scale_env)

# Initializing PPO
model = PPO("MlpPolicy", wrappedEnv)
print("The PPO model has been initialized.")

  from pkg_resources import resource_stream, resource_exists


The PPO model has been initialized.


In [3]:
# Header line followed by the textual description of the policy networks
print("Neural networks used by model.", model.policy, sep="\n")

Neural networks used by model.
ActorCriticPolicy(
  (features_extractor): FlattenExtractor(
    (flatten): Flatten(start_dim=1, end_dim=-1)
  )
  (pi_features_extractor): FlattenExtractor(
    (flatten): Flatten(start_dim=1, end_dim=-1)
  )
  (vf_features_extractor): FlattenExtractor(
    (flatten): Flatten(start_dim=1, end_dim=-1)
  )
  (mlp_extractor): MlpExtractor(
    (policy_net): Sequential(
      (0): Linear(in_features=51300, out_features=64, bias=True)
      (1): Tanh()
      (2): Linear(in_features=64, out_features=64, bias=True)
      (3): Tanh()
    )
    (value_net): Sequential(
      (0): Linear(in_features=51300, out_features=64, bias=True)
      (1): Tanh()
      (2): Linear(in_features=64, out_features=64, bias=True)
      (3): Tanh()
    )
  )
  (action_net): Linear(in_features=64, out_features=20, bias=True)
  (value_net): Linear(in_features=64, out_features=1, bias=True)
)


### Train PPO agent

In [4]:
# Close the env even when training is stopped early (e.g. interrupting the
# kernel raises KeyboardInterrupt inside learn()).
try:
    model.learn(total_timesteps=30000)
finally:
    env.close()

KeyboardInterrupt: 

### Save PPO agent

In [13]:
# Write the trained model to disk as a zip archive
model.save(path="./models/crossProductPPO.zip")



### Import PPO agent

In [9]:
# Load through the PPO class rather than through an existing instance, so that
# this cell also runs on a fresh kernel where no `model` has been created yet.
from stable_baselines3 import PPO

model = PPO.load("./models/crossProductPPO.zip")

### Test PPO agent

In [11]:
from stable_baselines3.common.env_util import make_vec_env
from pettingzoo.butterfly import pistonball_v6
import supersuit

# Creating env (rendered on screen this time)
env = pistonball_v6.parallel_env(render_mode="human", n_pistons=10)
env.reset()

# Same observation pipeline as the one used to build the training env
grey_scale_env = supersuit.frame_stack_v1(  # stack the past 3 observations
    supersuit.flatten_v0(  # flatten to 1D to make it compatible with our wrapper
        supersuit.resize_v1(  # resize each agent's image to 57 x 15
            supersuit.color_reduction_v0(env, mode='full'),  # grey scale images
            57,
            15,
        )
    ),
    3,
)

wrappedEnv = SingleAgentWrapperEnv(grey_scale_env)

vec_env = make_vec_env(lambda : wrappedEnv, n_envs=1)

# Get first observation
obs = vec_env.reset()

# Run the trained policy until the (single) vectorized env reports done
while True :
    action, _states = model.predict(obs, deterministic = True)
    obs, reward, done, info = vec_env.step(action)
    if done :
        break

env.close()

# Parameter Sharing

All pistons have the same goal, same rewards and same observations.

Instead of training each piston separately, or training a single agent that controls all pistons, we could treat all pistons as one and the same agent.

In this case we would use the observations and rewards from all pistons to train a single piston policy,
and then treat every piston as a copy of this single piston.


In fact, parameter sharing cannot work here because of how rewards are distributed in this environment: every agent receives the same reward. Thus useless pistons (and the actions they take) have the same impact on training as the useful pistons that actually push the ball towards the left wall, and even a bigger impact overall, since there are more useless pistons than useful ones.

### Initialize PPO agent

In [2]:
from pettingzoo.butterfly import pistonball_v6
import numpy as np
import supersuit

from stable_baselines3 import PPO
from stable_baselines3.common.env_checker import check_env


# parallel_env() builds the parallel-API environment (not the AEC one); no rendering here
env = pistonball_v6.parallel_env(render_mode=None, n_pistons=10)
env.reset()

env = supersuit.color_reduction_v0(env, mode="full")
env = supersuit.resize_v1(env, 114, 30)

# Append a trailing axis of size 1 to the per-agent observation shape
first_agent = env.unwrapped.agents[0]
obs_shape_with_channel = env.observation_space(first_agent).shape + (1,)
env = supersuit.reshape_v0(env, obs_shape_with_channel)
env = supersuit.frame_stack_v1(env, 5)

# Convert to a vector env usable by stable_baselines3
env = supersuit.concat_vec_envs_v1(
    supersuit.pettingzoo_env_to_vec_env_v1(env),
    1,
    num_cpus=1,
    base_class="stable_baselines3",
)

print(env.observation_space)

# "MlpPolicy" is used here; "CnnPolicy" was the alternative tried in the original notebook
model = PPO("MlpPolicy", env)
print("The PPO model has been initialized.")

Box(0, 255, (30, 114, 5), uint8)




The PPO model has been initialized.


In [3]:
# Header line followed by the textual description of the policy networks
print("Neural networks used by model.", model.policy, sep="\n")

Neural networks used by model.
ActorCriticPolicy(
  (features_extractor): FlattenExtractor(
    (flatten): Flatten(start_dim=1, end_dim=-1)
  )
  (pi_features_extractor): FlattenExtractor(
    (flatten): Flatten(start_dim=1, end_dim=-1)
  )
  (vf_features_extractor): FlattenExtractor(
    (flatten): Flatten(start_dim=1, end_dim=-1)
  )
  (mlp_extractor): MlpExtractor(
    (policy_net): Sequential(
      (0): Linear(in_features=17100, out_features=64, bias=True)
      (1): Tanh()
      (2): Linear(in_features=64, out_features=64, bias=True)
      (3): Tanh()
    )
    (value_net): Sequential(
      (0): Linear(in_features=17100, out_features=64, bias=True)
      (1): Tanh()
      (2): Linear(in_features=64, out_features=64, bias=True)
      (3): Tanh()
    )
  )
  (action_net): Linear(in_features=64, out_features=1, bias=True)
  (value_net): Linear(in_features=64, out_features=1, bias=True)
)


### Train PPO agent

In [4]:
# Close the env even when training is stopped early (e.g. interrupting the
# kernel raises KeyboardInterrupt inside learn()).
try:
    model.learn(total_timesteps=1000)
finally:
    env.close()

### Test PPO Agent

In [6]:
from stable_baselines3.common.env_util import make_vec_env
import numpy as np

import supersuit


# Creating env: parallel-API environment, rendered on screen
env = pistonball_v6.parallel_env(render_mode="human", n_pistons=10)
env.reset()

# Same preprocessing as the one used to build the training env
env = supersuit.color_reduction_v0(env, mode="full")
env = supersuit.resize_v1(env, 114, 30)
first_agent = env.unwrapped.agents[0]
obs_shape_with_channel = env.observation_space(first_agent).shape + (1,)
env = supersuit.reshape_v0(env, obs_shape_with_channel)
env = supersuit.frame_stack_v1(env, 5)

env = supersuit.pettingzoo_env_to_vec_env_v1(env)
vec_env = supersuit.concat_vec_envs_v1(
    env,
    1,
    num_cpus=1,
    base_class="stable_baselines3",
)

# Get first observation
obs = vec_env.reset()

# Stop as soon as at least one entry of the done vector is True
done = np.array([False])
while not np.any(done) :
    action, _states = model.predict(obs, deterministic = True)
    obs, reward, done, info = vec_env.step(action)

env.close()