In [1]:
import numpy as np
import gymnasium as gym
import time
import numpy as np
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
import rware.warehouse as rware

In [2]:
layout = """
g......
...x...
..x.x..
.x...x.
..x.x..
...x...
......g
"""
env = gym.make("rware:rware-tiny-2ag-v2", layout=layout)

In [3]:
#os.environ['MKL_THREADING_LAYER'] = 'GNU'
#env.unwrapped.n_agents
#env.action_space
#env.observation_space

In [13]:
class JointActionSpaceWrapper(gym.Env):
    def __init__(self, env):
        super(JointActionSpaceWrapper, self).__init__()
        self.env = env
        self.n_agents = env.n_agents

        # Define the joint action space (multi-discrete actions for each agent)
        # The action_space of the environment is a Tuple, so we iterate through it
        self.action_space = gym.spaces.MultiDiscrete(
            np.concatenate([space.nvec for space in env.action_space], axis=0)
        )
        
        # Define the joint observation space
        obs_shape = (self.n_agents * env.observation_space[0].shape[0],)
        self.observation_space = gym.spaces.Box(
            low=-np.inf, 
            high=np.inf, 
            shape=obs_shape, 
            dtype=np.float32
        )

    # Modified reset function to accept seed
    def reset(self, seed=None, **kwargs):
        obss, info = self.env.reset(seed=seed, **kwargs)  # Reset with seed
        # Handle scalar observations
        obss = [np.array([o]) if np.isscalar(o) else o for o in obss]
        return np.concatenate(obss), info  # Return concatenated observations and info

    def step(self, actions):
        # 将 actions 拆分为各个智能体的动作
        actions = [actions[i] for i in range(self.n_agents)]
        
        # 从环境中获取返回值，env.step() 应该返回 5 个值：obss, rewards, done, truncated, info
        obss, rewards, done, truncated, info = self.env.step(actions)
        
        # 合并多个智能体的观测值
        obss = [np.array([o]) if np.isscalar(o) else o for o in obss]
        joint_obs = np.concatenate(obss)
        
        # 将多个智能体的奖励相加
        joint_reward = np.sum(rewards)
        
        # 处理 done 和 truncated 的情况，合并它们
        joint_done = done or truncated
        
        # 返回 5 个值：观测值、奖励、done、truncated 和 info
        return joint_obs, joint_reward, joint_done, truncated, info



# Create and wrap the Warehouse environment
env = rware.Warehouse(
    shelf_columns=5,           # Example value
    column_height=3,           # Example value
    shelf_rows=3,              # Example value
    n_agents=2,                # 2 agents
    msg_bits=2,                # Communication bits
    sensor_range=2,            # Sensor range
    request_queue_size=5,      # Example value
    max_inactivity_steps=50,   # Example value
    max_steps=1000,            # Maximum steps per episode
    reward_type='dense'        # Example reward type ('dense' or 'sparse')
)

wrapped_env = JointActionSpaceWrapper(env)
vec_env = DummyVecEnv([lambda: wrapped_env])

# Initialize PPO model
model = PPO('MlpPolicy', vec_env, verbose=1)

# Train the model
model.learn(total_timesteps=50000)

# Save the trained model
model.save("ppo_rware_joint_policy")


Using cpu device


IndexError: invalid index to scalar variable.

In [None]:
model = PPO.load("ppo_rware_joint_policy")
obs = vec_env.reset()
done = False

env.render()

In [None]:
for step in range(500):
    actions, _states = model.predict(obs)
    observations, rewards, done, truncated, info = env.step(actions) 
    
    print(rewards)
    print(done)
   
    env.render()
    time.sleep(0.1)
    if done or truncated:
        observations = env.reset()

In [None]:
env.close()