# 1.引入必要模块

In [None]:
%git clone https://github.com/openai/multiagent-particle-envs

Cloning into 'multiagent-particle-envs'...
remote: Enumerating objects: 281, done.[K
remote: Counting objects: 100% (123/123), done.[K
remote: Compressing objects: 100% (35/35), done.[K
remote: Total 281 (delta 104), reused 88 (delta 88), pack-reused 158[K
Receiving objects: 100% (281/281), 109.97 KiB | 18.33 MiB/s, done.
Resolving deltas: 100% (158/158), done.


In [None]:
%pip install gym==0.10.5 --upgrade

Collecting gym==0.10.5
  Downloading gym-0.10.5.tar.gz (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m52.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pyglet>=1.2.0 (from gym==0.10.5)
  Downloading pyglet-2.0.8-py3-none-any.whl (853 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m853.1/853.1 kB[0m [31m68.1 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: gym
  Building wheel for gym (setup.py) ... [?25l[?25hdone
  Created wheel for gym: filename=gym-0.10.5-py3-none-any.whl size=1581280 sha256=00b72a9237f35910427f2f04ec4e395c86281d5565dba5c2f5ad0080735e90db
  Stored in directory: /root/.cache/pip/wheels/f6/e4/3b/b3b32d8cdedd0e70545cc0a9139f3d66f8fd5d0c95d828d38e
Successfully built gym
Installing collected packages: pyglet, gym
  Attempting uninstall: gym
    Found existing installation: gym 0.25.2
    Uninstalling gym-0.25.2:
      Successfully

In [None]:
# 将mpeEnv添加到环境变量里，便于import
# 如果是通过git下载的mpe环境的话，那么需要运行这一部分，如果不是，直接把multiagent文件夹与ipynb放在同级目录下即可
import sys
sys.path.append("multiagent-particle-envs")


In [9]:
import threading
import os
from tqdm import tqdm
import numpy as np
import inspect
import argparse
import functools
import torch
import torch.nn as nn
import torch.nn.functional as F
from multiagent.environment import MultiAgentEnv
import multiagent.scenarios as scenarios
import matplotlib.pyplot as plt

# 2.构建必要的utils

In [10]:
# 经验回放池
class Buffer:
    """Fixed-capacity replay buffer that stores one transition per slot for every agent.

    For each agent ``i`` in ``range(args.n_agents)`` four arrays are kept under the keys
    ``o_i`` (observation), ``u_i`` (action), ``r_i`` (reward) and ``o_next_i``
    (next observation). Once the buffer is full, new transitions overwrite a
    randomly chosen slot (not the oldest one).

    Args:
        args: namespace providing ``buffer_size``, ``n_agents``, ``obs_shape`` and
            ``action_shape`` (the last two are indexed per agent).
    """

    def __init__(self, args):
        self.size = args.buffer_size
        self.args = args
        # number of slots that currently hold valid data
        self.current_size = 0
        # per-agent storage arrays, allocated up front
        self.buffer = dict()
        for i in range(self.args.n_agents):
            self.buffer['o_%d' % i] = np.empty([self.size, self.args.obs_shape[i]])
            self.buffer['u_%d' % i] = np.empty([self.size, self.args.action_shape[i]])
            self.buffer['r_%d' % i] = np.empty([self.size])
            self.buffer['o_next_%d' % i] = np.empty([self.size, self.args.obs_shape[i]])
        # guards slot allocation and the reads/writes of the arrays
        self.lock = threading.Lock()

    def store_episode(self, o, u, r, o_next):
        """Store a single transition (despite the name, not a whole episode).

        Args:
            o, u, r, o_next: sequences indexed by agent id holding that agent's
                observation, action, reward and next observation.
        """
        # Allocate the slot and write every agent's data under one lock so that
        # two concurrent writers can neither receive the same slot nor interleave
        # their per-agent writes.
        with self.lock:
            idxs = self._get_storage_idx(inc=1)
            for i in range(self.args.n_agents):
                self.buffer['o_%d' % i][idxs] = o[i]
                self.buffer['u_%d' % i][idxs] = u[i]
                self.buffer['r_%d' % i][idxs] = r[i]
                self.buffer['o_next_%d' % i][idxs] = o_next[i]

    def sample(self, batch_size):
        """Return ``batch_size`` transitions drawn uniformly with replacement.

        Returns:
            dict mapping every buffer key to an array whose first axis has
            length ``batch_size``.

        Raises:
            ValueError: if nothing has been stored yet.
        """
        with self.lock:
            if self.current_size == 0:
                raise ValueError("cannot sample from an empty replay buffer")
            idx = np.random.randint(0, self.current_size, batch_size)
            # fancy indexing copies, so the batch is independent of later writes
            return {key: values[idx] for key, values in self.buffer.items()}

    def _get_storage_idx(self, inc=None):
        """Reserve ``inc`` slots (default 1) and return their index/indices.

        Must be called with ``self.lock`` held because it updates ``current_size``.
        """
        inc = inc or 1
        if self.current_size + inc <= self.size:
            # enough free space: use the next contiguous slots
            idx = np.arange(self.current_size, self.current_size + inc)
        elif self.current_size < self.size:
            # partially free: fill the tail, overwrite random used slots for the rest
            overflow = inc - (self.size - self.current_size)
            idx_a = np.arange(self.current_size, self.size)
            idx_b = np.random.randint(0, self.current_size, overflow)
            idx = np.concatenate([idx_a, idx_b])
        else:
            # full: overwrite random slots
            idx = np.random.randint(0, self.size, inc)
        self.current_size = min(self.size, self.current_size + inc)
        if inc == 1:
            idx = idx[0]
        return idx


In [11]:
# 配置参数
def _str2bool(value):
    """Convert a command-line string to a bool.

    ``type=bool`` cannot be used with argparse: ``bool("False")`` is ``True``
    because any non-empty string is truthy.
    """
    if isinstance(value, bool):
        return value
    lowered = value.strip().lower()
    if lowered in ("1", "true", "t", "yes", "y", "on"):
        return True
    if lowered in ("0", "false", "f", "no", "n", "off", ""):
        return False
    raise argparse.ArgumentTypeError("expected a boolean value, got %r" % value)


def get_args(argv=None):
    """Build the experiment configuration.

    Args:
        argv: optional list of argument strings. When ``None`` (the default)
            ``sys.argv[1:]`` is parsed, as before.

    Returns:
        argparse.Namespace with the environment, training, checkpointing and
        evaluation settings defined below.

    Unrecognised arguments are ignored rather than treated as errors, because
    inside a notebook ``sys.argv`` carries kernel-launcher options (for example
    ``-f <connection file>``) that have nothing to do with the experiment.
    """
    parser = argparse.ArgumentParser("Reinforcement Learning experiments for multiagent environments")
    # Environment
    parser.add_argument("--scenario-name", type=str, default="simple_tag", help="name of the scenario script")
    parser.add_argument("--max-episode-len", type=int, default=100, help="maximum episode length")
    parser.add_argument("--time-steps", type=int, default=2000000, help="number of time steps")
    # make_env controls env.n - num_adversaries players; the remaining ones are scripted
    parser.add_argument("--num-adversaries", type=int, default=1, help="number of adversaries")
    # Core training parameters
    parser.add_argument("--lr-actor", type=float, default=1e-4, help="learning rate of actor")
    parser.add_argument("--lr-critic", type=float, default=1e-3, help="learning rate of critic")
    parser.add_argument("--epsilon", type=float, default=0.1, help="epsilon greedy")
    parser.add_argument("--noise_rate", type=float, default=0.1, help="noise rate for sampling from a standard normal distribution ")
    parser.add_argument("--gamma", type=float, default=0.95, help="discount factor")
    parser.add_argument("--tau", type=float, default=0.01, help="parameter for updating the target network")
    parser.add_argument("--buffer-size", type=int, default=int(5e5), help="number of transitions can be stored in buffer")
    parser.add_argument("--batch-size", type=int, default=256, help="number of episodes to optimize at the same time")
    # Checkpointing
    parser.add_argument("--save-dir", type=str, default="./model", help="directory in which training state and model should be saved")
    parser.add_argument("--save-rate", type=int, default=2000, help="save model once every time this many episodes are completed")
    parser.add_argument("--model-dir", type=str, default="", help="directory in which training state and model are loaded")

    # Evaluate
    parser.add_argument("--evaluate-episodes", type=int, default=10, help="number of episodes for evaluating")
    parser.add_argument("--evaluate-episode-len", type=int, default=100, help="length of episodes for evaluating")
    parser.add_argument("--evaluate", type=_str2bool, default=False, help="whether to evaluate the model")
    parser.add_argument("--evaluate-rate", type=int, default=1000, help="how often to evaluate model")

    # Kernel-launcher options seen when running under VS Code. They are kept
    # so the returned namespace still exposes the same attributes; with
    # parse_known_args below they are no longer required for parsing to succeed.
    parser.add_argument("--ip")
    parser.add_argument("--stdin")
    parser.add_argument("--control")
    parser.add_argument("--hb")
    parser.add_argument("--Session.signature_scheme")
    parser.add_argument("--Session.key")
    parser.add_argument("--shell")
    parser.add_argument("--transport")
    parser.add_argument("--iopub")
    parser.add_argument("--f")
    # Tolerate whatever else the hosting front-end put on the command line
    # (e.g. Colab / classic Jupyter pass "-f <file>").
    args, _unknown = parser.parse_known_args(argv)

    return args

In [12]:
def store_args(method):
    """Decorator that copies a method's call arguments onto the instance.

    Before the wrapped method runs, every argument value (declared defaults
    first, then positional values, then keyword values) is written into
    ``self.__dict__`` under the parameter's name.
    """
    spec = inspect.getfullargspec(method)

    # Positional defaults line up with the trailing entries of spec.args.
    positional_defaults = spec.defaults if spec.defaults is not None else ()
    first_defaulted = len(spec.args) - len(positional_defaults)
    default_values = dict(zip(spec.args[first_defaulted:], positional_defaults))
    default_values.update(spec.kwonlydefaults or {})

    # Skip the leading ``self`` parameter.
    param_names = spec.args[1:]

    @functools.wraps(method)
    def wrapper(*positional_args, **keyword_args):
        instance = positional_args[0]
        # Later sources win: defaults < positional values < keyword values.
        bound = {
            **default_values,
            **dict(zip(param_names, positional_args[1:])),
            **keyword_args,
        }
        instance.__dict__.update(bound)
        return method(*positional_args, **keyword_args)

    return wrapper


def make_env(args):
    """Create the multi-agent environment for ``args.scenario_name``.

    The following fields are written onto ``args``: ``n_players``, ``n_agents``,
    ``obs_shape``, ``action_shape``, ``high_action`` and ``low_action``.

    Returns:
        tuple ``(env, args)``.
    """
    # Load the scenario script and build its world.
    scenario_module = scenarios.load(args.scenario_name + ".py")
    scenario = scenario_module.Scenario()
    world = scenario.make_world()
    env = MultiAgentEnv(world, scenario.reset_world, scenario.reward, scenario.observation)

    # Total number of players versus the number driven by learned policies.
    args.n_players = env.n
    args.n_agents = env.n - args.num_adversaries

    # Per-agent observation sizes for the controlled players.
    obs_dims = []
    for agent_idx in range(args.n_agents):
        obs_dims.append(env.observation_space[agent_idx].shape[0])
    args.obs_shape = obs_dims

    # Per-agent action sizes, truncated to the controlled players.
    args.action_shape = [space.n for space in env.action_space][:args.n_agents]

    # Bounds used when sampling and clipping actions.
    args.high_action = 1
    args.low_action = -1
    return env, args


In [13]:
# Testing block: builds the simple_tag environment by hand (the same steps as
# make_env) and prints the action sizes of the controlled agents.
scenario = scenarios.load("simple_tag.py").Scenario()

# create world
world = scenario.make_world()
# create multiagent environment
env = MultiAgentEnv(world, scenario.reset_world, scenario.reward, scenario.observation)
args=get_args()
# env = MultiAgentEnv(world)
args.n_players = env.n  # total number of players, adversaries included
args.n_agents = env.n - args.num_adversaries  # number of players we control; the others could be controlled too, but training both sides would need a different algorithm
args.obs_shape = [env.observation_space[i].shape[0] for i in range(args.n_agents)]  # entry i is the observation size of agent i
action_shape = []
for content in env.action_space:
    action_shape.append(content.n)
args.action_shape = action_shape[:args.n_agents]  # entry i is the action size of agent i
print(args.action_shape)
args.high_action = 1
args.low_action = -1

[5, 5, 5]


# 3.定义MADDPG网络

In [14]:
# define the actor network
class Actor(nn.Module):
    """Policy network for one agent: observation in, bounded action out.

    Three 64-unit ReLU layers followed by a tanh head scaled by
    ``args.high_action``.
    """

    def __init__(self, args, agent_id):
        super().__init__()
        obs_dim = args.obs_shape[agent_id]
        act_dim = args.action_shape[agent_id]
        self.max_action = args.high_action
        # Attribute names double as state_dict keys, so they are kept stable.
        self.fc1 = nn.Linear(obs_dim, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, 64)
        self.action_out = nn.Linear(64, act_dim)

    def forward(self, x):
        """Map observations ``x`` to actions in ``[-max_action, max_action]``."""
        hidden = x
        for layer in (self.fc1, self.fc2, self.fc3):
            hidden = F.relu(layer(hidden))
        return self.max_action * torch.tanh(self.action_out(hidden))


class Critic(nn.Module):
    """Centralised Q-network scoring the joint observation/action of all controlled agents.

    The input is the concatenation of every agent's observation and every
    agent's action (actions are divided by ``args.high_action`` first); the
    output is one Q-value per batch row.
    """

    def __init__(self, args):
        super(Critic, self).__init__()
        self.max_action = args.high_action
        self.fc1 = nn.Linear(sum(args.obs_shape) + sum(args.action_shape), 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, 64)
        self.q_out = nn.Linear(64, 1)

    def forward(self, state, action):
        """Compute Q(state, action).

        Args:
            state: list with one observation tensor per agent, concatenated on dim 1.
            action: list with one action tensor per agent, concatenated on dim 1.

        Returns:
            Tensor produced by the final ``Linear(64, 1)`` layer.
        """
        state = torch.cat(state, dim=1)
        # Normalise out of place. The previous in-place `action[i] /= max_action`
        # rescaled the caller's tensors, so evaluating the critic twice on the
        # same action list (as the training step does) scaled them twice
        # whenever max_action != 1.
        action = torch.cat([a / self.max_action for a in action], dim=1)
        x = torch.cat([state, action], dim=1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        q_value = self.q_out(x)
        return q_value


In [15]:
class MADDPG:
    """Actor/critic pair, their target copies and optimisers for one agent.

    Args:
        args: namespace providing the network sizes consumed by ``Actor`` /
            ``Critic`` plus ``lr_actor``, ``lr_critic``, ``tau``, ``gamma``,
            ``n_agents``, ``save_dir``, ``save_rate`` and ``scenario_name``.
        agent_id: index of the agent this policy belongs to.
    """

    def __init__(self, args, agent_id):
        self.args = args
        self.agent_id = agent_id
        self.train_step = 0

        # create the network
        self.actor_network = Actor(args, agent_id)
        self.critic_network = Critic(args)

        # build up the target network
        self.actor_target_network = Actor(args, agent_id)
        self.critic_target_network = Critic(args)

        # directory that holds this agent's checkpoints:
        #   <save_dir>/<scenario_name>/agent_<id>
        self.model_path = self.args.save_dir + '/' + self.args.scenario_name + '/' + 'agent_%d' % agent_id
        os.makedirs(self.model_path, exist_ok=True)

        # Load existing weights if present. Note that save_model() writes files
        # with a numeric prefix, so only a manually placed/renamed
        # 'actor_params.pkl' is picked up here.
        # NOTE(review): torch.load unpickles the file; only load checkpoints
        # from a trusted source.
        actor_file = self.model_path + '/actor_params.pkl'
        critic_file = self.model_path + '/critic_params.pkl'
        if os.path.exists(actor_file):
            # map_location keeps a checkpoint saved on a GPU loadable on a
            # CPU-only machine; the networks in this class are never moved off the CPU.
            self.actor_network.load_state_dict(torch.load(actor_file, map_location='cpu'))
            self.critic_network.load_state_dict(torch.load(critic_file, map_location='cpu'))
            print('Agent {} successfully loaded actor_network: {}'.format(self.agent_id,
                                                                          self.model_path + '/actor_params.pkl'))
            print('Agent {} successfully loaded critic_network: {}'.format(self.agent_id,
                                                                           self.model_path + '/critic_params.pkl'))

        # Copy the online weights into the targets AFTER the optional load.
        # Doing it before (as previously) left the targets at their random
        # initialisation whenever a checkpoint was restored.
        self.actor_target_network.load_state_dict(self.actor_network.state_dict())
        self.critic_target_network.load_state_dict(self.critic_network.state_dict())

        # create the optimizer
        self.actor_optim = torch.optim.Adam(self.actor_network.parameters(), lr=self.args.lr_actor)
        self.critic_optim = torch.optim.Adam(self.critic_network.parameters(), lr=self.args.lr_critic)

    @staticmethod
    def _as_float_tensor(value):
        """Return an independent float32 tensor copy of ``value`` (array-like or tensor)."""
        if torch.is_tensor(value):
            return value.detach().clone().to(dtype=torch.float32)
        return torch.tensor(value, dtype=torch.float32)

    # soft update
    def _soft_update_target_network(self):
        """Polyak-average the online weights into the targets with rate ``args.tau``."""
        for target_param, param in zip(self.actor_target_network.parameters(), self.actor_network.parameters()):
            target_param.data.copy_((1 - self.args.tau) * target_param.data + self.args.tau * param.data)

        for target_param, param in zip(self.critic_target_network.parameters(), self.critic_network.parameters()):
            target_param.data.copy_((1 - self.args.tau) * target_param.data + self.args.tau * param.data)

    # update the network
    def train(self, transitions, other_agents):
        """Run one actor and one critic update from a sampled batch.

        Args:
            transitions: dict with keys ``o_i``, ``u_i``, ``r_i``, ``o_next_i``
                for every agent ``i``. It is not modified.
            other_agents: the remaining agents in ascending id order; each must
                expose ``policy.actor_target_network``.
        """
        # Convert into a private dict. Writing tensors back into `transitions`
        # meant the next agent trained on the same batch re-wrapped tensors
        # (torch.tensor(tensor) warns) and saw whatever had been done to them.
        batch = {key: self._as_float_tensor(value) for key, value in transitions.items()}
        r = batch['r_%d' % self.agent_id]  # only this agent's reward is needed
        o, u, o_next = [], [], []  # per-agent observation / action / next observation
        for agent_id in range(self.args.n_agents):
            o.append(batch['o_%d' % agent_id])
            u.append(batch['u_%d' % agent_id])
            o_next.append(batch['o_next_%d' % agent_id])

        # calculate the target Q value function
        u_next = []
        with torch.no_grad():
            # next actions come from every agent's target actor
            index = 0
            for agent_id in range(self.args.n_agents):
                if agent_id == self.agent_id:
                    u_next.append(self.actor_target_network(o_next[agent_id]))
                else:
                    # other_agents is one shorter than the full list (it skips
                    # this agent), so it needs its own running index
                    u_next.append(other_agents[index].policy.actor_target_network(o_next[agent_id]))
                    index += 1
            q_next = self.critic_target_network(o_next, u_next)

            target_q = r.unsqueeze(1) + self.args.gamma * q_next

        # the q loss
        q_value = self.critic_network(o, u)
        critic_loss = (target_q - q_value).pow(2).mean()

        # the actor loss
        # re-select only this agent's action; the other agents' actions stay as sampled
        u[self.agent_id] = self.actor_network(o[self.agent_id])
        actor_loss = - self.critic_network(o, u).mean()
        # update the network
        self.actor_optim.zero_grad()
        actor_loss.backward()
        self.actor_optim.step()
        # zero_grad also discards the critic gradients produced by actor_loss.backward()
        self.critic_optim.zero_grad()
        critic_loss.backward()
        self.critic_optim.step()

        self._soft_update_target_network()
        # Automatic checkpointing is disabled while debugging:
        # if self.train_step > 0 and self.train_step % self.args.save_rate == 0:
        #     self.save_model(self.train_step)
        # self.train_step += 1

    def save_model(self, train_step):
        """Save actor and critic weights as ``<train_step // save_rate>_{actor,critic}_params.pkl``."""
        num = str(train_step // self.args.save_rate)
        model_path = os.path.join(self.args.save_dir, self.args.scenario_name)
        model_path = os.path.join(model_path, 'agent_%d' % self.agent_id)
        os.makedirs(model_path, exist_ok=True)
        torch.save(self.actor_network.state_dict(), model_path + '/' + num + '_actor_params.pkl')
        torch.save(self.critic_network.state_dict(),  model_path + '/' + num + '_critic_params.pkl')




# 4.创建Agent类

In [16]:
class Agent:
    """One controlled player: wraps its MADDPG policy and action selection."""

    def __init__(self, agent_id, args):
        self.args = args
        self.agent_id = agent_id
        self.policy = MADDPG(args, agent_id)

    def select_action(self, o, noise_rate, epsilon):
        """Choose an action for observation ``o``.

        With probability ``epsilon`` a uniformly random action in
        ``[-high_action, high_action]`` is returned. Otherwise the actor's
        output is perturbed with Gaussian noise scaled by
        ``noise_rate * high_action`` and clipped to the same range.

        Returns:
            A fresh NumPy array of length ``args.action_shape[agent_id]``.
        """
        if np.random.uniform() < epsilon:
            u = np.random.uniform(-self.args.high_action, self.args.high_action, self.args.action_shape[self.agent_id])
        else:
            inputs = torch.tensor(o, dtype=torch.float32).unsqueeze(0)
            # Acting needs no autograd graph. Disabling it here (instead of
            # relying on every caller to wrap the call) also prevents the
            # RuntimeError that .numpy() raises on a grad-tracking tensor.
            with torch.no_grad():
                pi = self.policy.actor_network(inputs).squeeze(0)
            u = pi.cpu().numpy()
            noise = noise_rate * self.args.high_action * np.random.randn(*u.shape)  # gaussian noise
            u += noise
            u = np.clip(u, -self.args.high_action, self.args.high_action)
        return u.copy()

    def learn(self, transitions, other_agents):
        """Delegate one training step to the underlying policy."""
        self.policy.train(transitions, other_agents)



# 5.构建Runner类用于集成

In [8]:
class Runner:
    """Ties environment, agents and replay buffer together for training and evaluation.

    Args:
        args: experiment namespace (see ``get_args`` / ``make_env``).
        env: environment exposing ``reset()``, ``step(actions)`` and ``render()``.
    """

    def __init__(self, args, env):
        self.args = args
        self.noise = args.noise_rate
        self.epsilon = args.epsilon
        self.episode_limit = args.max_episode_len
        self.env = env
        self.agents = self._init_agents()
        self.buffer = Buffer(args)
        self.save_path = self.args.save_dir + '/' + self.args.scenario_name
        os.makedirs(self.save_path, exist_ok=True)

    def _init_agents(self):
        """Create one ``Agent`` per controlled player."""
        return [Agent(i, self.args) for i in range(self.args.n_agents)]

    @staticmethod
    def _scripted_action():
        """Random 5-element action used for the players that are not learned.

        Entries 1 and 3 are uniform in [-1, 1); the rest are zero.
        """
        return [0, np.random.rand() * 2 - 1, 0, np.random.rand() * 2 - 1, 0]

    def run(self):
        """Train for ``args.time_steps`` environment steps.

        Episodes have a fixed length (``args.max_episode_len``); the ``done``
        flags returned by the environment are not used. Every
        ``args.evaluate_rate`` steps the agents are evaluated and the list of
        average returns is written with ``np.save`` to
        ``<save_path>/returns.pkl`` (NumPy appends ``.npy``).
        """
        returns = []
        returns_file = self.save_path + '/returns.pkl'
        for time_step in tqdm(range(self.args.time_steps)):
            # reset the environment
            if time_step % self.episode_limit == 0:
                s = self.env.reset()
            u = []
            actions = []
            with torch.no_grad():
                for agent_id, agent in enumerate(self.agents):
                    action = agent.select_action(s[agent_id], self.noise, self.epsilon)
                    u.append(action)
                    actions.append(action)
            for i in range(self.args.n_agents, self.args.n_players):
                actions.append(self._scripted_action())
            s_next, r, done, info = self.env.step(actions)
            self.buffer.store_episode(s[:self.args.n_agents], u, r[:self.args.n_agents], s_next[:self.args.n_agents])
            s = s_next
            if self.buffer.current_size >= self.args.batch_size:
                transitions = self.buffer.sample(self.args.batch_size)
                for agent in self.agents:
                    other_agents = self.agents.copy()
                    other_agents.remove(agent)
                    agent.learn(transitions, other_agents)
            if time_step > 0 and time_step % self.args.evaluate_rate == 0:
                returns.append(self.evaluate())
                # Persist only when the list changed; saving on every time
                # step rewrote the same file millions of times.
                np.save(returns_file, returns)
                # evaluate() resets and steps the shared environment, so the
                # observation held here no longer matches the environment
                # state. Start a fresh episode instead of storing a transition
                # that mixes two unrelated states.
                s = self.env.reset()
            self.noise = max(0.05, self.noise - 0.0000005)
            self.epsilon = max(0.05, self.epsilon - 0.0000005)
        # make sure the file exists even if no evaluation was triggered
        np.save(returns_file, returns)

    def evaluate(self,isRender=False):
        """Run ``args.evaluate_episodes`` noise-free episodes.

        Args:
            isRender: when true, ``env.render()`` is called before every step.

        Returns:
            The mean over episodes of the summed reward of player 0.
        """
        returns = []
        for episode in range(self.args.evaluate_episodes):
            # reset the environment
            s = self.env.reset()
            rewards = 0
            for time_step in range(self.args.evaluate_episode_len):
                if isRender:
                    self.env.render()
                actions = []
                with torch.no_grad():
                    for agent_id, agent in enumerate(self.agents):
                        # no exploration noise and no epsilon-greedy during evaluation
                        action = agent.select_action(s[agent_id], 0, 0)
                        actions.append(action)
                for i in range(self.args.n_agents, self.args.n_players):
                    actions.append(self._scripted_action())
                s_next, r, done, info = self.env.step(actions)
                rewards += r[0]
                s = s_next
            returns.append(rewards)
            print('Returns is', rewards)
        return sum(returns) / self.args.evaluate_episodes


# 6.构建默认参数

In [9]:
args = get_args()

# 7.构建环境

In [10]:
env, args = make_env(args)

# 8.构建基本Runner类并运行训练

In [11]:
runner = Runner(args, env)

Agent 0 successfully loaded actor_network: ./model/simple_tag/agent_0/actor_params.pkl
Agent 0 successfully loaded critic_network: ./model/simple_tag/agent_0/critic_params.pkl
Agent 1 successfully loaded actor_network: ./model/simple_tag/agent_1/actor_params.pkl
Agent 1 successfully loaded critic_network: ./model/simple_tag/agent_1/critic_params.pkl
Agent 2 successfully loaded actor_network: ./model/simple_tag/agent_2/actor_params.pkl
Agent 2 successfully loaded critic_network: ./model/simple_tag/agent_2/critic_params.pkl


In [12]:
# Training: uncomment the line below to train on MPE
# runner.run()

# Evaluation (Colab has no display, so rendering is not possible there; on a regular computer it works)
runner.evaluate(True)

agent 1 to agent 0: _   agent 2 to agent 0: _   agent 3 to agent 0: _   agent 0 to agent 1: _   agent 2 to agent 1: _   agent 3 to agent 1: _   agent 0 to agent 2: _   agent 1 to agent 2: _   agent 3 to agent 2: _   agent 0 to agent 3: _   agent 1 to agent 3: _   agent 2 to agent 3: _   


agent 1 to agent 0: _   agent 2 to agent 0: _   agent 3 to agent 0: _   agent 0 to agent 1: _   agent 2 to agent 1: _   agent 3 to agent 1: _   agent 0 to agent 2: _   agent 1 to agent 2: _   agent 3 to agent 2: _   agent 0 to agent 3: _   agent 1 to agent 3: _   agent 2 to agent 3: _   
agent 1 to agent 0: _   agent 2 to agent 0: _   agent 3 to agent 0: _   agent 0 to agent 1: _   agent 2 to agent 1: _   agent 3 to agent 1: _   agent 0 to agent 2: _   agent 1 to agent 2: _   agent 3 to agent 2: _   agent 0 to agent 3: _   agent 1 to agent 3: _   agent 2 to agent 3: _   
agent 1 to agent 0: _   agent 2 to agent 0: _   agent 3 to agent 0: _   agent 0 to agent 1: _   agent 2 to agent 1: _   agent 3 to agent 1: _   agent 0 to agent 2: _   agent 1 to agent 2: _   agent 3 to agent 2: _   agent 0 to agent 3: _   agent 1 to agent 3: _   agent 2 to agent 3: _   
agent 1 to agent 0: _   agent 2 to agent 0: _   agent 3 to agent 0: _   agent 0 to agent 1: _   agent 2 to agent 1: _   agent 3 to ag

1338.0

## 创新一下训练方式，原本的训练方式我不太喜欢

```py
# 参考的预期训练部分代码
for i in range(10):
    with tqdm(total=int(num_episodes / 10), desc='Iteration %d' % i) as pbar:
        for i_episode in range(int(num_episodes / 10)):
            episode_return = 0
            state = env.reset()
            done = False
            while not done:
                action = agent.take_action(state)
                next_state, reward, done, _ = env.step(action)
                replay_buffer.add(state, action, reward, next_state, done)
                state = next_state
                episode_return += reward
                # 当buffer数据的数量超过一定值后,才进行Q网络训练
                if replay_buffer.size() > minimal_size:
                    b_s, b_a, b_r, b_ns, b_d = replay_buffer.sample(batch_size)
                    transition_dict = {
                        'states': b_s,
                        'actions': b_a,
                        'next_states': b_ns,
                        'rewards': b_r,
                        'dones': b_d
                    }
                    agent.update(transition_dict)
            return_list.append(episode_return)
            if (i_episode + 1) % 10 == 0:
                pbar.set_postfix({
                    'episode':
                    '%d' % (num_episodes / 10 * i + i_episode + 1),
                    'return':
                    '%.3f' % np.mean(return_list[-10:])
                })
            pbar.update(1)
```

In [17]:
args = get_args()
env, args = make_env(args)

In [18]:
agents = []
for i in range(args.n_agents):
    agent = Agent(i, args)
    agents.append(agent)

agents

Agent 0 successfully loaded actor_network: ./model/simple_tag/agent_0/actor_params.pkl
Agent 0 successfully loaded critic_network: ./model/simple_tag/agent_0/critic_params.pkl
Agent 1 successfully loaded actor_network: ./model/simple_tag/agent_1/actor_params.pkl
Agent 1 successfully loaded critic_network: ./model/simple_tag/agent_1/critic_params.pkl
Agent 2 successfully loaded actor_network: ./model/simple_tag/agent_2/actor_params.pkl
Agent 2 successfully loaded critic_network: ./model/simple_tag/agent_2/critic_params.pkl


[<__main__.Agent at 0x1b3a90ef550>,
 <__main__.Agent at 0x1b3a90cfba8>,
 <__main__.Agent at 0x1b3a90f02b0>]

In [None]:
# Walk through ONE environment step by hand: act, step, store, (maybe) learn.
s=env.reset()
buffer = Buffer(args)

# code for a single time step
u=[]
actions=[]

# choose an action for every controlled agent
with torch.no_grad():
    for agent_id,agent in enumerate(agents):
        action=agent.select_action(s[agent_id],args.noise_rate,args.epsilon)
        u.append(action)
        actions.append(action)

# choose an action for every player that is not a controlled agent
for i in range(args.n_agents,args.n_players):
    # these players only change state through random movement
    actions.append([0,np.random.rand()*2-1,0,np.random.rand()*2-1,0])

# get the environment's feedback
s_next,r,done,info=env.step(actions)

# store the transition in the replay buffer (controlled agents only)
buffer.store_episode(s[:args.n_agents],u,r[:args.n_agents],s_next[:args.n_agents])

# advance the state
s=s_next

# learn only once the buffer holds more than batch_size transitions
if buffer.current_size>args.batch_size:
    transitions=buffer.sample(args.batch_size)
    for agent in agents:
        other_agents=agents.copy()
        other_agents.remove(agent)
        # the network forward pass and the update all happen inside this call
        agent.learn(transitions,other_agents)


## 新的运行环境！

In [1]:
import sys
sys.path.append("../")

from utils import *
from tqdm import tqdm
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from multiagent.environment import MultiAgentEnv
import multiagent.scenarios as scenarios
import matplotlib.pyplot as plt


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def make_env(args):
    """Build the multi-agent environment named by ``args.scenario_name``.

    Besides creating the environment, this fills in ``args.n_players``,
    ``args.n_agents``, ``args.obs_shape``, ``args.action_shape``,
    ``args.high_action`` and ``args.low_action``.

    Returns:
        tuple ``(env, args)``.
    """
    # Scenario script -> world -> environment.
    scenario = scenarios.load(args.scenario_name + ".py").Scenario()
    world = scenario.make_world()
    env = MultiAgentEnv(
        world,
        scenario.reset_world,
        scenario.reward,
        scenario.observation,
    )

    # All players versus the subset driven by learned policies.
    total_players = env.n
    controlled = total_players - args.num_adversaries
    args.n_players = total_players
    args.n_agents = controlled

    # Observation size of each controlled agent.
    args.obs_shape = [env.observation_space[idx].shape[0] for idx in range(controlled)]

    # Action size of each controlled agent (computed for all, then truncated).
    all_action_dims = [space.n for space in env.action_space]
    args.action_shape = all_action_dims[:controlled]

    # Action bounds.
    args.high_action = 1
    args.low_action = -1
    return env, args


In [3]:
# Fetch the arguments and build the environment
args = get_args()
env, args = make_env(args)
print("Env创建完毕!")

# Create the controlled agents and load their saved weights
agents = []
for i in range(args.n_agents):
    agent = Agent(i, args)
    agents.append(agent)
    agent.policy.load_model()
print("agents创建完毕!")

# Total number of training episodes, split into equally sized progress-bar iterations
num_episodes=5000
num_iterations = 10
episodes_per_iteration = int(num_episodes / num_iterations)

# Per-episode returns (one entry per player), used for the running averages below
return_list=[]

# Replay buffer for experience replay
buffer=Buffer(args)

# Column of the first player that is NOT controlled by a learned policy
# (index 3 with the default simple_tag configuration).
scripted_idx = min(args.n_agents, args.n_players - 1)

# Training starts here
for i in range(num_iterations):
    with tqdm(total=episodes_per_iteration, desc='Iteration %d' % i) as pbar:
        for i_episode in range(episodes_per_iteration):
            episode_return = [0 for _ in range(args.n_players)]
            s = env.reset()
            # Episodes are bounded by a step budget. `r` is indexed per player
            # below and `done` is returned alongside it, presumably also as a
            # per-player sequence; the old `while not done` then stopped after
            # a single step, because a non-empty list is always truthy.
            # NOTE(review): assumes the imported get_args provides
            # max_episode_len like the get_args shown earlier in this notebook.
            for _step in range(args.max_episode_len):
                # code for a single time step
                u=[]
                actions=[]

                # choose an action for every controlled agent
                with torch.no_grad():
                    for agent_id,agent in enumerate(agents):
                        action=agent.select_action(s[agent_id],args.noise_rate,args.epsilon)
                        u.append(action)
                        actions.append(action)

                # choose an action for every remaining player; the loop variable
                # must not be `i`, which is the outer iteration index
                for _player in range(args.n_agents,args.n_players):
                    # these players only change state through random movement
                    actions.append([0,np.random.rand()*2-1,0,np.random.rand()*2-1,0])

                # get the environment's feedback
                s_next,r,done,info=env.step(actions)

                # store the transition in the replay buffer
                buffer.store_episode(s[:args.n_agents],u,r[:args.n_agents],s_next[:args.n_agents])

                # advance the state
                s=s_next

                episode_return = [r[p]+episode_return[p] for p in range(args.n_players)]
                # train only once the buffer holds more than batch_size transitions
                if buffer.current_size>args.batch_size:
                    transitions=buffer.sample(args.batch_size)
                    for agent in agents:
                        other_agents=agents.copy()
                        other_agents.remove(agent)
                        # the network forward pass and the update happen inside this call
                        agent.learn(transitions,other_agents)

                # stop early if the environment reports that every player is done
                if np.all(done):
                    break

            return_list.append(episode_return)
            if (i_episode + 1) % 10 == 0:
                # only the last ten episodes are needed for the running average
                recent = np.array(return_list[-10:])
                pbar.set_postfix({
                    'episode':
                    '%d' % (num_episodes / num_iterations * i + i_episode + 1),
                    'adv_0 reward':
                    '%.3f' % np.mean(recent[:,0]),
                    'agent_0 reward':
                    '%.3f' % np.mean(recent[:,scripted_idx])
                })
            pbar.update(1)

Env创建完毕!
Agent 0 successfully loaded actor_network: ./model/simple_tag/agent_0/actor_params.pkl
Agent 0 successfully loaded critic_network: ./model/simple_tag/agent_0/critic_params.pkl
Agent 0 successfully loaded actor_network: ./model/simple_tag/agent_0/bst_actor_params.pkl
Agent 0 successfully loaded critic_network: ./model/simple_tag/agent_0/bst_critic_params.pkl
Agent 1 successfully loaded actor_network: ./model/simple_tag/agent_1/actor_params.pkl
Agent 1 successfully loaded critic_network: ./model/simple_tag/agent_1/critic_params.pkl
Agent 1 successfully loaded actor_network: ./model/simple_tag/agent_1/bst_actor_params.pkl
Agent 1 successfully loaded critic_network: ./model/simple_tag/agent_1/bst_critic_params.pkl
Agent 2 successfully loaded actor_network: ./model/simple_tag/agent_2/actor_params.pkl
Agent 2 successfully loaded critic_network: ./model/simple_tag/agent_2/critic_params.pkl
Agent 2 successfully loaded actor_network: ./model/simple_tag/agent_2/bst_actor_params.pkl
Agen

Iteration 0:   0%|          | 0/500 [00:00<?, ?it/s]


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument mat1 in method wrapper_addmm)