In [1]:
!git clone https://github.com/openai/multiagent-particle-envs.git

Cloning into 'multiagent-particle-envs'...
remote: Enumerating objects: 234, done.[K
remote: Total 234 (delta 0), reused 0 (delta 0), pack-reused 234[K
Receiving objects: 100% (234/234), 100.83 KiB | 2.80 MiB/s, done.
Resolving deltas: 100% (127/127), done.


In [2]:
%cd multiagent-particle-envs
!ls
!pip install -e .

/content/multiagent-particle-envs
bin  LICENSE.txt  make_env.py  multiagent  README.md  setup.py
Obtaining file:///content/multiagent-particle-envs
Collecting numpy-stl
[?25l  Downloading https://files.pythonhosted.org/packages/ef/08/2d8533798a08e1878120a1bf4970eb8ee50f6860cd50db917c9defe5dda2/numpy-stl-2.11.2.tar.gz (484kB)
[K     |████████████████████████████████| 491kB 2.8MB/s 
Building wheels for collected packages: numpy-stl
  Building wheel for numpy-stl (setup.py) ... [?25l[?25hdone
  Created wheel for numpy-stl: filename=numpy_stl-2.11.2-cp36-cp36m-linux_x86_64.whl size=134859 sha256=cf0e5bbb9af469ebfe8fd3f6527810d2e6f5070bc213534afd65bbdca3a3e0c2
  Stored in directory: /root/.cache/pip/wheels/bd/c8/18/436f6b7a2601408d9e5f8c20afb4f5cac5ef0dabe222becbf4
Successfully built numpy-stl
Installing collected packages: numpy-stl, multiagent
  Running setup.py develop for multiagent
Successfully installed multiagent numpy-stl-2.11.2


In [3]:
!pip install gym==0.10.5

Collecting gym==0.10.5
[?25l  Downloading https://files.pythonhosted.org/packages/9b/50/ed4a03d2be47ffd043be2ee514f329ce45d98a30fe2d1b9c61dea5a9d861/gym-0.10.5.tar.gz (1.5MB)
[K     |▏                               | 10kB 18.4MB/s eta 0:00:01[K     |▍                               | 20kB 1.7MB/s eta 0:00:01[K     |▋                               | 30kB 2.3MB/s eta 0:00:01[K     |▉                               | 40kB 2.6MB/s eta 0:00:01[K     |█                               | 51kB 2.0MB/s eta 0:00:01[K     |█▎                              | 61kB 2.3MB/s eta 0:00:01[K     |█▌                              | 71kB 2.5MB/s eta 0:00:01[K     |█▊                              | 81kB 2.7MB/s eta 0:00:01[K     |██                              | 92kB 2.9MB/s eta 0:00:01[K     |██▏                             | 102kB 2.8MB/s eta 0:00:01[K     |██▍                             | 112kB 2.8MB/s eta 0:00:01[K     |██▋                             | 122kB 2.8MB/s eta 0:00:01[K

# **Model: ACTOR AND CRITIC**

In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.autograd as autograd

In [0]:
class CentralizedCritic(nn.Module):

    def __init__(self, obs_dim, action_dim):
        super(CentralizedCritic, self).__init__()

        # obs_dim = n_agents * local_obs_dim
        self.obs_dim = obs_dim
        self.action_dim = action_dim

        self.linear1 = nn.Linear(self.obs_dim, 1024)
        self.linear2 = nn.Linear(1024 + self.action_dim, 512)
        self.linear3 = nn.Linear(512, 300)
        self.linear4 = nn.Linear(300, 1)

    def forward(self, x, a):
        x = F.relu(self.linear1(x))
        xa_cat = torch.cat([x,a], 1)
        xa = F.relu(self.linear2(xa_cat))
        xa = F.relu(self.linear3(xa))
        qval = self.linear4(xa)

        return qval

In [0]:
class Actor(nn.Module):

    def __init__(self, obs_dim, action_dim):
        super(Actor, self).__init__()

        self.obs_dim = obs_dim
        self.action_dim = action_dim

        self.linear1 = nn.Linear(self.obs_dim, 512)
        self.linear2 = nn.Linear(512, 128)
        self.linear3 = nn.Linear(128, self.action_dim)

    def forward(self, obs):
        x = F.relu(self.linear1(obs))
        x = F.relu(self.linear2(x))
        x = torch.tanh(self.linear3(x))

        return x

# **Agent**

In [0]:
import numpy as np
import torch 
import torch.nn as nn
import torch.optim as optim
import torch.autograd as autograd

In [0]:
class DDPGAgent:

    def __init__(self, env, agent_id, actor_lr=1e-4, critic_lr=1e-3, gamma=0.99, tau=1e-2):
        self.env = env
        self.agent_id = agent_id
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.gamma = gamma
        self.tau = tau

        self.device = "cpu"
        self.use_cuda = torch.cuda.is_available()
        if self.use_cuda:
            self.device = "cuda"

        self.obs_dim = self.env.observation_space[agent_id].shape[0]
        self.action_dim = self.env.action_space[agent_id].n
        self.num_agents = self.env.n

        self.critic_input_dim = int(np.sum([env.observation_space[agent].shape[0] for agent in range(env.n)]))
        self.actor_input_dim = self.obs_dim

        self.critic = CentralizedCritic(self.critic_input_dim, self.action_dim * self.num_agents).to(self.device)
        self.critic_target = CentralizedCritic(self.critic_input_dim, self.action_dim * self.num_agents).to(self.device)
        self.actor = Actor(self.actor_input_dim, self.action_dim).to(self.device)
        self.actor_target = Actor(self.actor_input_dim, self.action_dim).to(self.device)

        for target_param, param in zip(self.critic_target.parameters(), self.critic.parameters()):
            target_param.data.copy_(param.data)
        
        for target_param, param in zip(self.actor_target.parameters(), self.actor.parameters()):
            target_param.data.copy_(param.data)
        
        self.MSELoss = nn.MSELoss()
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=critic_lr)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_lr)

    def get_action(self, state):
        state = autograd.Variable(torch.from_numpy(state).float().squeeze(0)).to(self.device)
        action = self.actor.forward(state)
        action = self.onehot_from_logits(action)

        return action
    
    def onehot_from_logits(self, logits, eps=0.0):
        # get best (according to current policy) actions in one-hot form
        argmax_acs = (logits == logits.max(0, keepdim=True)[0]).float()
        if eps == 0.0:
            return argmax_acs
        # get random actions in one-hot form
        rand_acs = Variable(torch.eye(logits.shape[1])[[np.random.choice(
            range(logits.shape[1]), size=logits.shape[0])]], requires_grad=False)
        # chooses between best and random actions using epsilon greedy
        return torch.stack([argmax_acs[i] if r > eps else rand_acs[i] for i, r in
                            enumerate(torch.rand(logits.shape[0]))])
    
    def update(self, indiv_reward_batch, indiv_obs_batch, global_state_batch, global_actions_batch, global_next_state_batch, next_global_actions):
        """
        indiv_reward_batch      : only rewards of agent i
        indiv_obs_batch         : only observations of agent i
        global_state_batch      : observations of all agents are concatenated
        global actions_batch    : actions of all agents are concatenated
        global_next_state_batch : observations of all agents are concatenated
        next_global_actions     : actions of all agents are concatenated
        """
        indiv_reward_batch = torch.FloatTensor(indiv_reward_batch).to(self.device)
        indiv_reward_batch = indiv_reward_batch.view(indiv_reward_batch.size(0), 1).to(self.device) 
        indiv_obs_batch = torch.FloatTensor(indiv_obs_batch).to(self.device)          
        global_state_batch = torch.FloatTensor(global_state_batch).to(self.device)    
        global_actions_batch = torch.stack(global_actions_batch).to(self.device)      
        global_next_state_batch = torch.FloatTensor(global_next_state_batch).to(self.device)
        next_global_actions = next_global_actions

        # update critic        
        self.critic_optimizer.zero_grad()
        
        curr_Q = self.critic.forward(global_state_batch, global_actions_batch)
        next_Q = self.critic_target.forward(global_next_state_batch, next_global_actions)
        estimated_Q = indiv_reward_batch + self.gamma * next_Q
        
        critic_loss = self.MSELoss(curr_Q, estimated_Q.detach())
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 0.5)
        self.critic_optimizer.step()

        # update actor
        self.actor_optimizer.zero_grad()

        policy_loss = -self.critic.forward(global_state_batch, global_actions_batch).mean()
        curr_pol_out = self.actor.forward(indiv_obs_batch)
        policy_loss += -(curr_pol_out**2).mean() * 1e-3 
        policy_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 0.5)
        self.actor_optimizer.step()
    
    def target_update(self):
        for target_param, param in zip(self.actor_target.parameters(), self.actor.parameters()):
            target_param.data.copy_(param.data)

        for target_param, param in zip(self.critic_target.parameters(), self.critic.parameters()):
            target_param.data.copy_(param.data * self.tau + target_param.data * (1.0 - self.tau))

# **Replay Buffer**

In [0]:
import numpy as np
import random
from collections import deque
import torch

In [0]:
class MultiAgentReplayBuffer:
    
    def __init__(self, num_agents, max_size):
        self.max_size = max_size
        self.num_agents = num_agents
        self.buffer = deque(maxlen=max_size)
    
    def push(self, state, action, reward, next_state, done):
        experience = (state, action, np.array(reward), next_state, done)
        self.buffer.append(experience)
    
    def sample(self, batch_size):
        obs_batch = [[] for _ in range(self.num_agents)]  # [ [states of agent 1], ... ,[states of agent n] ]    ]
        indiv_action_batch = [[] for _ in range(self.num_agents)] # [ [actions of agent 1], ... , [actions of agent n]]
        indiv_reward_batch = [[] for _ in range(self.num_agents)]
        next_obs_batch = [[] for _ in range(self.num_agents)]

        global_state_batch = []
        global_next_state_batch = []
        global_actions_batch = []
        done_batch = []

        batch = random.sample(self.buffer, batch_size)


        for experience in batch:
            state, action, reward, next_state, done = experience
            
            for i in range(self.num_agents):
                obs_i = state[i]
                action_i = action[i]
                reward_i = reward[i]
                next_obs_i = next_state[i]
            
                obs_batch[i].append(obs_i)
                indiv_action_batch[i].append(action_i)
                indiv_reward_batch[i].append(reward_i)
                next_obs_batch[i].append(next_obs_i)

            global_state_batch.append(np.concatenate(state))
            global_actions_batch.append(torch.cat(action))
            global_next_state_batch.append(np.concatenate(next_state))
            done_batch.append(done)
        
        return obs_batch, indiv_action_batch, indiv_reward_batch, next_obs_batch, global_state_batch, global_actions_batch, global_next_state_batch, done_batch

    def __len__(self):
        return len(self.buffer)

# **Multi Agent Deep Deterministic Policy Gradient -  MADDPG**

In [0]:
import torch 
import numpy as np

In [0]:
class MADDPG:

    def __init__(self, env, buffer_maxlen):
        self.env = env
        self.num_agents = env.n
        self.replay_buffer = MultiAgentReplayBuffer(self.num_agents, buffer_maxlen)
        self.agents = [DDPGAgent(self.env, i) for i in range(self.num_agents)]

    def get_actions(self, states):
        actions = []
        for i in range(self.num_agents):
            action = self.agents[i].get_action(states[i])
            actions.append(action)
        return actions

    def update(self, batch_size):
        obs_batch, indiv_action_batch, indiv_reward_batch, next_obs_batch, \
            global_state_batch, global_actions_batch, global_next_state_batch, done_batch = self.replay_buffer.sample(batch_size)
        
        for i in range(self.num_agents):
            obs_batch_i = obs_batch[i]
            indiv_action_batch_i = indiv_action_batch[i]
            indiv_reward_batch_i = indiv_reward_batch[i]
            next_obs_batch_i = next_obs_batch[i]

            next_global_actions = []
            for agent in self.agents:
                next_obs_batch_i = torch.FloatTensor(next_obs_batch_i)
                indiv_next_action = agent.actor.forward(next_obs_batch_i)
                indiv_next_action = [agent.onehot_from_logits(indiv_next_action_j) for indiv_next_action_j in indiv_next_action]
                indiv_next_action = torch.stack(indiv_next_action)
                next_global_actions.append(indiv_next_action)
            next_global_actions = torch.cat([next_actions_i for next_actions_i in next_global_actions], 1)

            self.agents[i].update(indiv_reward_batch_i, obs_batch_i, global_state_batch, global_actions_batch, global_next_state_batch, next_global_actions)
            self.agents[i].target_update()
    
    def run(self, max_episode, max_steps, batch_size):
        episode_rewards = []
        for episode in range(max_episode):
            states = self.env.reset()
            episode_reward = 0
            for step in range(max_steps):
                actions = self.get_actions(states)
                next_states, rewards, dones, _ = self.env.step(actions)
                episode_reward += np.mean(rewards)
        
                if all(dones) or step == max_steps - 1:
                    dones = [1 for _ in range(self.num_agents)]
                    self.replay_buffer.push(states, actions, rewards, next_states, dones)
                    episode_rewards.append(episode_reward)
                    print("episode: {}  |  reward: {}  \n".format(episode, np.round(episode_reward, decimals=4)))
                    break
                else:
                    dones = [0 for _ in range(self.num_agents)]
                    self.replay_buffer.push(states, actions, rewards, next_states, dones)
                    states = next_states 

                    if len(self.replay_buffer) > batch_size:
                        self.update(batch_size)

# **Spread Environment**

In [0]:
from multiagent.environment import MultiAgentEnv
import multiagent.scenarios as scenarios
import torch 
import numpy as np

In [0]:
def make_env(scenario_name, benchmark=False):
    # load scenario from script
    scenario = scenarios.load(scenario_name + ".py").Scenario()
    # create world
    world = scenario.make_world()
    # create multiagent environment
    if benchmark:
        env = MultiAgentEnv(world, scenario.reset_world, scenario.reward, scenario.observation, scenario.benchmark_data)
    else:
        env = MultiAgentEnv(world, scenario.reset_world, scenario.reward, scenario.observation)
    return env


In [0]:
env = make_env(scenario_name="simple_spread")

ma_controller = MADDPG(env,1000000)
ma_controller.run(500,300,32)

episode: 0  |  reward: -21595.2177  

episode: 1  |  reward: -18577.4276  

episode: 2  |  reward: -3423.1299  

episode: 3  |  reward: -7592.8313  

episode: 4  |  reward: -3700.3921  

episode: 5  |  reward: -5478.0207  

episode: 6  |  reward: -4667.8869  

episode: 7  |  reward: -8766.3405  

episode: 8  |  reward: -4248.9941  

episode: 9  |  reward: -5295.7437  

episode: 10  |  reward: -2591.3975  

episode: 11  |  reward: -3178.2135  

episode: 12  |  reward: -6445.7617  

episode: 13  |  reward: -5258.8974  

episode: 14  |  reward: -5533.5524  

episode: 15  |  reward: -5308.5812  

episode: 16  |  reward: -5588.9711  

episode: 17  |  reward: -3295.8217  

episode: 18  |  reward: -4077.646  

episode: 19  |  reward: -2478.0453  

episode: 20  |  reward: -3928.5667  

episode: 21  |  reward: -5452.3531  

episode: 22  |  reward: -3197.169  

episode: 23  |  reward: -4466.73  

episode: 24  |  reward: -3353.5174  

episode: 25  |  reward: -5352.8482  

episode: 26  |  reward: 