In [1]:
import time

import numpy as np
import gym # need to install "shimmy>=0.2.1"
from gym import spaces

# pip install rware
import rware

from stable_baselines3 import DQN, PPO
from stable_baselines3.common.env_util import DummyVecEnv, SubprocVecEnv

## 1. Explore Environment

In [2]:
# Create the registered "rware-tiny-2ag-v1" environment with its default settings.
# Optional keyword overrides that were tried here: sensor_range=3, request_queue_size=6
env = gym.make("rware-tiny-2ag-v1")

In [3]:
# Number of agents the environment reports
env.n_agents

2

In [4]:
# Joint action space: a tuple holding one space per agent
env.action_space
# env.observation_space is the analogous per-agent tuple: Tuple(Box(XX,), Box(XX,))

Tuple(Discrete(5), Discrete(5))

In [5]:
# Reset the environment, then render its current state
env.reset()
env.render()

2024-05-18 07:23:24.315 Python[21324:989235] ApplePersistenceIgnoreState: Existing state will not be touched. New state will be written to /var/folders/rd/xnkdmhqx0c19zv0s0z9zcc9c0000gn/T/org.python.python.savedState


True

In [48]:
# Draw one random joint action (one entry per agent) from the action space
actions = env.action_space.sample()
# Old-style gym step: returns a 4-tuple of observations, rewards, dones and info
nobs, nreward, ndone, ninfo = env.step(actions)
print(actions, nreward)
env.render()

(4, 4) [0.0, 0.0]


True

## 2. Learning Joint Action Learners (JAL)
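- Treat PPO as a joint action learner: one centralized policy receives the concatenated observations of all agents and outputs one action per agent.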

In [50]:
class CentralizedEnv(gym.Env):
    """Single-agent view of a multi-agent environment for joint-action learning.

    The wrapped environment must expose ``n_agents``, an iterable of per-agent
    observation spaces with ``low``/``high`` bounds, and an indexable
    collection of per-agent discrete action spaces with an ``n`` attribute.

    * Observation: all per-agent observations flattened and concatenated.
    * Action: a ``MultiDiscrete`` vector with one entry per agent.
    * Reward: the sum of the per-agent rewards.
    * Done: ``True`` only once every agent reports done.
    """

    def __init__(self, env):
        """Build the joint observation and action spaces from ``env``.

        Args:
            env: The multi-agent environment to wrap.
        """
        super().__init__()
        self.env = env
        self.num_agents = self.env.n_agents

        # Combine the observation spaces of all agents into one flat Box
        obs_spaces = self.env.observation_space
        obs_low = np.concatenate([obs_space.low for obs_space in obs_spaces], axis=None)
        obs_high = np.concatenate([obs_space.high for obs_space in obs_spaces], axis=None)
        self.observation_space = spaces.Box(low=obs_low, high=obs_high, dtype=np.float32)

        # Combine the action spaces of all agents: one discrete choice per agent
        self.action_space = spaces.MultiDiscrete(
            [self.env.action_space[i].n for i in range(self.num_agents)]
        )

    def _flatten_obs(self, obs):
        """Concatenate per-agent observations into one vector of the space's dtype."""
        joint_obs = np.concatenate(obs, axis=None)
        # Keep returned observations consistent with the declared observation space.
        return joint_obs.astype(self.observation_space.dtype, copy=False)

    def step(self, actions):
        """Apply one joint action and return the centralized transition.

        Args:
            actions: Array-like holding exactly one discrete action index per
                agent (what ``self.action_space`` produces).

        Returns:
            Tuple ``(obs, total_reward, done, info)`` where ``obs`` is the
            concatenated observation, ``total_reward`` the summed per-agent
            reward as a float, ``done`` whether all agents are done, and
            ``info`` whatever the wrapped environment returned.

        Raises:
            ValueError: If ``actions`` does not hold one entry per agent.
        """
        flat_actions = np.asarray(actions).reshape(-1)
        if flat_actions.size != self.num_agents:
            raise ValueError(
                f"expected {self.num_agents} actions (one per agent), "
                f"got {flat_actions.size}"
            )
        # Hand the wrapped env plain Python ints, the same form its own
        # action_space.sample() yields, instead of the 1-element arrays that
        # np.split would produce.
        joint_action = [int(agent_action) for agent_action in flat_actions]

        obs, rewards, dones, info = self.env.step(joint_action)
        total_reward = float(sum(rewards))
        done = all(dones)
        return self._flatten_obs(obs), total_reward, done, info

    def reset(self):
        """Reset the wrapped environment and return the concatenated observation."""
        obs = self.env.reset()
        return self._flatten_obs(obs)

    def render(self, mode='human'):
        """Render the wrapped environment, forwarding ``mode`` unchanged."""
        return self.env.render(mode=mode)

    def close(self):
        """Close the wrapped environment."""
        return self.env.close()

In [63]:
# Number of timesteps handed to PPO.learn() below
num_timesteps = 1_000_000

# Custom warehouse map, passed to the environment through the `layout` keyword
layout = """
.......
...x...
..x.x..
.x...x.
..x.x..
...x...
.g...g.
"""

# Build the two-agent environment on that map and wrap it so that a single
# policy sees one joint observation and emits one joint action.
# (Alternative without a custom map: gym.make('rware-tiny-2ag-v1'))
env = gym.make("rware:rware-tiny-2ag-v1", layout=layout)
centralized_env = CentralizedEnv(env)
vec_env = DummyVecEnv([lambda: centralized_env])

# The centralized policy: one PPO learner over the joint spaces
centralized_agent = PPO(policy='MlpPolicy', env=vec_env, verbose=1)

# Train the centralized agent
centralized_agent.learn(total_timesteps=num_timesteps)

Using cpu device
-----------------------------
| time/              |      |
|    fps             | 2106 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 1728        |
|    iterations           | 2           |
|    time_elapsed         | 2           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.008204583 |
|    clip_fraction        | 0.0604      |
|    clip_range           | 0.2         |
|    entropy_loss         | -3.21       |
|    explained_variance   | -0.645      |
|    learning_rate        | 0.0003      |
|    loss                 | 0.00206     |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0112     |
|    value_loss           | 0.0127      |
-----------------------------------------
-----------------

KeyboardInterrupt: 

In [64]:
# Evaluate the trained centralized agent
def evaluate_centralized_agent(env, agent, num_episodes=10):
    """Roll out ``agent`` in ``env`` and collect the return of every episode.

    Args:
        env: Environment whose ``reset()`` returns an observation and whose
            ``step(action)`` returns ``(obs, reward, done, info)``.
        agent: Object whose ``predict(obs)`` returns ``(action, state)``.
        num_episodes: How many episodes to play.

    Returns:
        A list with one accumulated reward per episode, in playing order.
    """
    returns = []
    for _episode in range(num_episodes):
        observation, finished, episode_return = env.reset(), False, 0
        # Play until the environment signals the end of the episode
        while not finished:
            chosen_action, _state = agent.predict(observation)
            observation, step_reward, finished, _info = env.step(chosen_action)
            episode_return += step_reward
        returns.append(episode_return)
    return returns

# Play 100 evaluation episodes with the trained agent and report the mean return
episode_rewards = evaluate_centralized_agent(
    env=centralized_env,
    agent=centralized_agent,
    num_episodes=100,
)
print(f"Average rewards per episode: {np.mean(episode_rewards)}")

Average rewards per episode: 0.66


In [65]:
# Watch a single episode played by the trained centralized agent
obs, done = centralized_env.reset(), False

while not done:
    # Query the policy for a joint action, advance the environment, redraw
    action, _ = centralized_agent.predict(obs)
    obs, reward, done, _ = centralized_env.step(action)
    centralized_env.render()
# centralized_env.close() is intentionally not called here; call it when finished viewing