In [15]:
import time

import numpy as np
import gym # need to install "shimmy>=0.2.1"
from gym import spaces

# pip install lbforaging
import lbforaging  # need to install "pyglet<2" (source: https://github.com/semitable/lb-foraging/issues/23)

from stable_baselines3 import DQN, PPO
from stable_baselines3.common.env_util import DummyVecEnv, SubprocVecEnv

## 1. Explore environment

In [3]:
# Initialize the environment
# Build the environment registered under the id 'Foraging-8x8-4p-8f-v2'
# (presumably registered as a side effect of `import lbforaging` — TODO confirm).
env = gym.make('Foraging-8x8-4p-8f-v2')
# Keep the initial observations; later cells index this structure per agent.
obs = env.reset()

  logger.warn(


In [4]:
# Draw one random joint action from the environment's action space.
actions = env.action_space.sample()
# Bare last expression: the notebook displays the sampled value (a 4-tuple in the recorded output).
actions

(4, 0, 2, 2)

In [5]:
# Apply the sampled joint action. The old-style gym API returns four values:
# observations, rewards, done flags and an info dict.
nobs, nreward, ndone, ninfo = env.step(actions)
# Bare last expression: display the step result. In the recorded output the first
# three entries hold one element per agent and the info dict is empty.
nobs, nreward, ndone, ninfo

((array([ 1.,  2.,  2.,  2.,  6.,  2.,  3.,  1.,  2.,  4.,  3.,  1.,  6.,
          1.,  1.,  6.,  6.,  1., -1., -1.,  0., -1., -1.,  0.,  6.,  5.,
          1.,  6.,  0.,  1.,  7.,  3.,  1.,  7.,  2.,  1.], dtype=float32),
  array([ 1.,  2.,  2.,  2.,  6.,  2.,  3.,  1.,  2.,  4.,  3.,  1.,  6.,
          1.,  1.,  6.,  6.,  1., -1., -1.,  0., -1., -1.,  0.,  6.,  0.,
          1.,  6.,  5.,  1.,  7.,  3.,  1.,  7.,  2.,  1.], dtype=float32),
  array([ 1.,  2.,  2.,  2.,  6.,  2.,  3.,  1.,  2.,  4.,  3.,  1.,  6.,
          1.,  1.,  6.,  6.,  1., -1., -1.,  0., -1., -1.,  0.,  7.,  3.,
          1.,  6.,  5.,  1.,  6.,  0.,  1.,  7.,  2.,  1.], dtype=float32),
  array([ 1.,  2.,  2.,  2.,  6.,  2.,  3.,  1.,  2.,  4.,  3.,  1.,  6.,
          1.,  1.,  6.,  6.,  1., -1., -1.,  0., -1., -1.,  0.,  7.,  2.,
          1.,  6.,  5.,  1.,  6.,  0.,  1.,  7.,  3.,  1.], dtype=float32)),
 [0, 0, 0, 0],
 [False, False, False, False],
 {})

In [68]:
# Take one more random joint step, then draw the current state of the environment.
actions = env.action_space.sample()
nobs, nreward, ndone, ninfo = env.step(actions)
# render() is the last expression, so its return value is what the cell displays.
env.render()

True

## 2. Decentralized Learning

In [5]:
class SingleAgentEnv(gym.Env):
    """Single-agent view onto one player of a shared multi-agent environment.

    The wrapper exposes only the chosen agent's observation/action spaces and
    returns only that agent's slice of every step result. All other agents
    submit action 0 on every step (what action 0 means is defined by the
    wrapped environment — presumably a no-op; TODO confirm).
    """

    def __init__(self, env, agent_id):
        """Store the shared env and pick out the spaces of ``agent_id``.

        Args:
            env: Multi-agent environment with ``n_agents`` and indexable
                ``observation_space`` / ``action_space``.
            agent_id: Index of the agent this wrapper controls.
        """
        super().__init__()
        self.env, self.agent_id = env, agent_id
        self.observation_space = self.env.observation_space[agent_id]
        self.action_space = self.env.action_space[agent_id]

    def step(self, action):
        """Advance the shared env with ``action`` for this agent and 0 for the rest.

        Returns:
            This agent's observation, reward and done flag, plus the shared info.
        """
        joint_action = [0 for _ in range(self.env.n_agents)]
        joint_action[self.agent_id] = action
        all_obs, all_rewards, all_dones, info = self.env.step(joint_action)
        me = self.agent_id
        return all_obs[me], all_rewards[me], all_dones[me], info

    def reset(self):
        """Reset the shared env and return this agent's initial observation."""
        return self.env.reset()[self.agent_id]

    def render(self, mode='human'):
        """Delegate rendering to the wrapped environment."""
        return self.env.render(mode=mode)

    def close(self):
        """Delegate closing to the wrapped environment."""
        return self.env.close()

In [8]:
# Initialize the shared two-player foraging environment
env = gym.make('Foraging-8x8-2p-4f-v2', max_episode_steps=500, normalize_reward=False)
num_agents = env.n_agents

# Create one single-agent view per player; every view wraps the SAME `env` object
agent_envs = [SingleAgentEnv(env, i) for i in range(num_agents)]
# Bind each wrapper as a default argument: a plain `lambda: agent_env` looks the
# name up only when called, so it is correct only while the factory is invoked
# immediately inside this comprehension.
vec_envs = [DummyVecEnv([lambda agent_env=agent_env: agent_env]) for agent_env in agent_envs]

# Define one independent DQN learner per agent
agents = [DQN('MlpPolicy', vec_env, verbose=1) for vec_env in vec_envs]

# Train the agents one after another; while one agent trains, SingleAgentEnv
# makes every other agent submit action 0.
num_timesteps = int(1e5)
for i, agent in enumerate(agents):
    print(f"Training agent {i + 1}/{num_agents}")
    agent.learn(total_timesteps=num_timesteps)

Using cpu device
Using cpu device
Training agent 1/2
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.856    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 9537     |
|    time_elapsed     | 0        |
|    total_timesteps  | 1516     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.666    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 9790     |
|    time_elapsed     | 0        |
|    total_timesteps  | 3516     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.476    |
| time/               |          |
|    episodes         | 12       |
|    fps              | 9727     |
|    time_elapsed     | 0        |
|    total_timesteps  | 5516     |
----------------------------------
--

In [9]:
# Evaluate the trained agents
def evaluate_agents(env, agents, num_episodes=10):
    """Roll out independently trained agents together and sum their rewards.

    Args:
        env: Multi-agent environment whose ``reset()`` returns per-agent
            observations and whose ``step(actions)`` returns per-agent
            observations, rewards and done flags plus an info object.
        agents: Sequence of policies; ``agents[i].predict(obs_i)[0]`` is used as
            the action of agent ``i``.
        num_episodes: Number of episodes to play.

    Returns:
        Array of shape ``(num_episodes, len(agents))`` holding each agent's
        cumulative reward per episode.
    """
    # Derive the agent count from the argument rather than a notebook-level
    # global, so the function also works on a fresh kernel.
    n_agents = len(agents)
    episode_rewards = np.zeros((num_episodes, n_agents))
    for episode in range(num_episodes):
        obs = env.reset()
        done = [False] * n_agents
        # The episode is over only once every agent reports done.
        while not all(done):
            actions = [agents[i].predict(obs[i])[0] for i in range(n_agents)]
            obs, rewards, done, _ = env.step(actions)
            for i in range(n_agents):
                episode_rewards[episode, i] += rewards[i]
    return episode_rewards

# Score the independently trained agents over 100 episodes
episode_rewards = evaluate_agents(env=env, agents=agents, num_episodes=100)
# Averaging over the episode axis leaves one mean per agent
print(f"Average rewards per episode: {episode_rewards.mean(axis=0)}")

Average rewards per episode: [2.17 2.05]


In [12]:
# Render one episode of the trained agents in a fresh environment
env = gym.make('Foraging-8x8-2p-4f-v2', max_episode_steps=1000, normalize_reward=False)
obs = env.reset()
# Size the loop from the environment being rendered instead of the
# `num_agents` global left behind by the training cell.
done = [False] * env.n_agents

while not all(done):
    actions = [agents[i].predict(obs[i])[0] for i in range(env.n_agents)]
    obs, rewards, done, _ = env.step(actions)
    env.render()
    # Short pause so the rendered frames are watchable
    time.sleep(0.01)
# NOTE(review): closing was left disabled in the original notebook — confirm
# whether the render window should be closed here.
# env.close()

## 3. Centralized Learning

In [13]:
class CentralizedEnv(gym.Env):
    """Present a multi-agent environment as a single-agent one with joint spaces.

    The observations of all agents are concatenated along axis 0, the action is
    a MultiDiscrete vector with one entry per agent, the reward is the sum of
    the per-agent rewards, and the episode ends once every agent reports done.
    """

    def __init__(self, env):
        """Build the joint observation and action spaces from ``env``.

        Args:
            env: Multi-agent environment exposing ``n_agents`` and indexable
                ``observation_space`` / ``action_space`` (one entry per agent).
        """
        super().__init__()
        self.env = env
        n_agents = self.env.n_agents

        # Concatenate the per-agent bounds in the same order that reset()/step()
        # concatenate the per-agent observations. np.repeat would interleave the
        # bounds element-wise ([a, a, b, b]) and no longer match the
        # observation layout ([a, b, a, b]).
        agent_spaces = [self.env.observation_space[i] for i in range(n_agents)]
        low = np.concatenate([space.low for space in agent_spaces], axis=0)
        high = np.concatenate([space.high for space in agent_spaces], axis=0)
        self.observation_space = spaces.Box(low=low, high=high, dtype=agent_spaces[0].dtype)

        # One discrete sub-action per agent
        self.action_space = spaces.MultiDiscrete([self.env.action_space[i].n for i in range(n_agents)])

    def step(self, actions):
        """Apply a joint action vector and aggregate the per-agent results.

        Args:
            actions: Array-like holding exactly one discrete action per agent.

        Returns:
            Concatenated observation, summed reward (float), overall done flag
            (bool) and the info object of the wrapped environment.

        Raises:
            ValueError: If ``actions`` does not contain one entry per agent.
        """
        flat_actions = np.asarray(actions).reshape(-1)
        if flat_actions.size != self.env.n_agents:
            raise ValueError(
                f"Expected {self.env.n_agents} actions, got {flat_actions.size}"
            )
        # Hand plain Python ints to the wrapped env rather than length-1 arrays.
        per_agent_actions = [int(a) for a in flat_actions]
        obs, rewards, dones, info = self.env.step(per_agent_actions)
        obs = np.concatenate(obs, axis=0)
        # Return builtin scalars instead of numpy scalar types.
        return obs, float(np.sum(rewards)), bool(np.all(dones)), info

    def reset(self):
        """Reset the wrapped env and return the concatenated initial observation."""
        obs = self.env.reset()
        return np.concatenate(obs, axis=0)

    def render(self, mode='human'):
        """Delegate rendering to the wrapped environment."""
        return self.env.render(mode=mode)

    def close(self):
        """Delegate closing to the wrapped environment."""
        return self.env.close()

In [16]:
# Training budget for the centralized learner
num_timesteps = 100_000

# Wrap a fresh two-player foraging env so that a single policy drives all players
env = gym.make('Foraging-8x8-2p-4f-v2', max_episode_steps=500, normalize_reward=False)
centralized_env = CentralizedEnv(env)
vec_env = DummyVecEnv([lambda: centralized_env])

# One PPO learner acting on the joint observation/action space
centralized_agent = PPO(policy='MlpPolicy', env=vec_env, verbose=1)

# Kept as the last expression, so the cell still displays the returned object
centralized_agent.learn(total_timesteps=num_timesteps)

Using cpu device
-----------------------------
| time/              |      |
|    fps             | 2552 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 2001        |
|    iterations           | 2           |
|    time_elapsed         | 2           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.011027468 |
|    clip_fraction        | 0.124       |
|    clip_range           | 0.2         |
|    entropy_loss         | -3.58       |
|    explained_variance   | 0.16        |
|    learning_rate        | 0.0003      |
|    loss                 | 0.433       |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.02       |
|    value_loss           | 0.385       |
-----------------------------------------
-----------------

<stable_baselines3.ppo.ppo.PPO at 0x330a82a60>

In [17]:
# Evaluate the trained centralized agent
def evaluate_centralized_agent(env, agent, num_episodes=10):
    """Play ``num_episodes`` episodes with ``agent`` and collect the returns.

    Args:
        env: Environment whose ``reset()`` returns an observation and whose
            ``step(action)`` returns ``(obs, reward, done, info)``.
        agent: Policy whose ``predict(obs)`` returns ``(action, state)``.
        num_episodes: Number of episodes to play.

    Returns:
        List with the summed reward of each episode, in play order.
    """
    returns = []
    for _episode in range(num_episodes):
        observation = env.reset()
        episode_return = 0
        # Always take at least one step, then stop as soon as the env says done.
        while True:
            action, _state = agent.predict(observation)
            observation, reward, finished, _info = env.step(action)
            episode_return += reward
            if finished:
                break
        returns.append(episode_return)
    return returns

# Score the centralized policy over 100 episodes
episode_rewards = evaluate_centralized_agent(
    env=centralized_env,
    agent=centralized_agent,
    num_episodes=100,
)
print(f"Average rewards per episode: {np.mean(episode_rewards)}")

Average rewards per episode: 11.13


In [19]:
# Watch the trained centralized policy play a single episode
done = False
obs = centralized_env.reset()

while not done:
    # predict() returns (action, state); only the action is needed here
    action, _ = centralized_agent.predict(obs)
    obs, reward, done, _ = centralized_env.step(action)
    centralized_env.render()
# Closing is left disabled, as in the original cell:
# centralized_env.close()