In [3]:
import sys
sys.path.append(".")

from utils import *
from tqdm import tqdm
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from multiagent.environment import MultiAgentEnv
import multiagent.scenarios as scenarios
import matplotlib.pyplot as plt


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# 配置参数
def config_args():
    parser = argparse.ArgumentParser("Reinforcement Learning experiments for multiagent environments")
    # Environment
    parser.add_argument("--scenario-name", type=str, default="simple_speaker_listener", help="name of the scenario script")
    parser.add_argument("--max-episode-len", type=int, default=100, help="maximum episode length")
    parser.add_argument("--time-steps", type=int, default=2000000, help="number of time steps")
    # 一个地图最多env.n个agents，用户可以定义min(env.n,num-adversaries)个敌人，剩下的是好的agent
    parser.add_argument("--num-adversaries", type=int, default=1, help="number of adversaries")
    # Core training parameters
    parser.add_argument("--lr-actor", type=float, default=1e-4, help="learning rate of actor")
    parser.add_argument("--lr-critic", type=float, default=1e-3, help="learning rate of critic")
    parser.add_argument("--epsilon", type=float, default=0.1, help="epsilon greedy")
    parser.add_argument("--noise_rate", type=float, default=0.1, help="noise rate for sampling from a standard normal distribution ")
    parser.add_argument("--gamma", type=float, default=0.95, help="discount factor")
    parser.add_argument("--tau", type=float, default=0.01, help="parameter for updating the target network")
    parser.add_argument("--buffer-size", type=int, default=int(5e5), help="number of transitions can be stored in buffer")
    parser.add_argument("--batch-size", type=int, default=256, help="number of episodes to optimize at the same time")
    # Checkpointing
    parser.add_argument("--save-dir", type=str, default="./model", help="directory in which training state and model should be saved")
    parser.add_argument("--save-rate", type=int, default=2000, help="save model once every time this many episodes are completed")
    parser.add_argument("--model-dir", type=str, default="", help="directory in which training state and model are loaded")

    # Evaluate
    parser.add_argument("--evaluate-episodes", type=int, default=10, help="number of episodes for evaluating")
    parser.add_argument("--evaluate-episode-len", type=int, default=100, help="length of episodes for evaluating")
    parser.add_argument("--evaluate", type=bool, default=False, help="whether to evaluate the model")
    parser.add_argument("--evaluate-rate", type=int, default=1000, help="how often to evaluate model")

    # Fix Bug
    # 如果是在colab需要解注下面这个参数
    # parser.add_argument('-f')

    # 如果在vscode需要解注下列参数
    parser.add_argument("--ip")
    parser.add_argument("--stdin")
    parser.add_argument("--control")
    parser.add_argument("--hb")
    parser.add_argument("--Session.signature_scheme")
    parser.add_argument("--Session.key")
    parser.add_argument("--shell")
    parser.add_argument("--transport")
    parser.add_argument("--iopub")
    parser.add_argument("--f")
    args = parser.parse_args()

    return args


In [5]:
args = config_args()
# args.scenario_name:'simple_speaker_listener'

scenario = scenarios.load(args.scenario_name + ".py").Scenario()

# create world
world = scenario.make_world()
# create multiagent environment
env = MultiAgentEnv(world, scenario.reset_world, scenario.reward, scenario.observation)

args.obs_shape = [env.observation_space[i].shape[0] for i in range(env.n)]
args.n_agents=env.n
args.action_shape=[content.n for content in env.action_space]

args.high_action = 1
args.low_action = -1

# 智能体更新
agents = []
for i in range(args.n_agents):
    agent = Agent(i, args)
    agents.append(agent)
print("agents创建完毕!")


agents创建完毕!


In [6]:
# 创建一个经验缓冲区，用于经验回放
buffer=Buffer(args)

train_episodes=10
train_episodes_len=1000

In [7]:
# 训练代码1：
# 训练的结果是一塌糊涂！
for i in range(train_episodes):    
    rewards_l = [0 for i in range(args.n_agents)]
    with tqdm(total=int(train_episodes_len), desc='Iteration %d' % i) as pbar:
        for i_episode in range(train_episodes_len):                        
            s = env.reset()
            done = False
            while not done:
                u=[]
                actions=[]
                # 单步步长移动对应的代码
                with torch.no_grad():
                    for agent_id,agent in enumerate(agents):
                        action=agent.select_action(s[agent_id],args.noise_rate,args.epsilon)
                        u.append(action)
                        actions.append(action)

                # 获取环境反馈
                s_next,r,done,info=env.step(actions)

                # 存放至缓冲区
                buffer.store_episode(s[:args.n_agents],u,r[:args.n_agents],s_next[:args.n_agents])

                # 状态更新
                s=s_next

                # 当buffer数据的数量超过一定值后,才进行Q网络训练
                if buffer.current_size>args.batch_size:
                    transitions=buffer.sample(args.batch_size)
                    for agent in agents:
                        other_agents=agents.copy()
                        other_agents.remove(agent)
                        # 送入网络以及训练等众多事宜都是从这个函数内部进行的
                        agent.learn(transitions,other_agents)

                rewards_l =[r[i]+rewards_l[i] for i in range(args.n_agents)]
            if (i_episode + 1) % 10 == 0:
                pbar.set_postfix({
                    'episode':
                    '%d' % (i*train_episodes_len+i_episode),
                    'speaker reward':
                    '%.3f' % rewards_l[0],
                    'listener reward':
                    '%.3f' % rewards_l[1]
                })
            pbar.update(1)

Iteration 0:   0%|          | 0/1000 [00:00<?, ?it/s]

  transitions[key] = torch.tensor(transitions[key], dtype=torch.float32).to(device)
Iteration 0: 100%|██████████| 1000/1000 [00:17<00:00, 58.65it/s, episode=999, speaker reward=-2802.531, listener reward=-2802.531]
Iteration 1: 100%|██████████| 1000/1000 [00:18<00:00, 53.14it/s, episode=1999, speaker reward=-2748.935, listener reward=-2748.935]
Iteration 2: 100%|██████████| 1000/1000 [00:19<00:00, 52.11it/s, episode=2999, speaker reward=-2857.602, listener reward=-2857.602]
Iteration 3: 100%|██████████| 1000/1000 [00:18<00:00, 53.53it/s, episode=3999, speaker reward=-2735.878, listener reward=-2735.878]
Iteration 4: 100%|██████████| 1000/1000 [00:19<00:00, 51.29it/s, episode=4999, speaker reward=-2780.805, listener reward=-2780.805]
Iteration 5: 100%|██████████| 1000/1000 [00:19<00:00, 51.52it/s, episode=5999, speaker reward=-2699.542, listener reward=-2699.542]
Iteration 6: 100%|██████████| 1000/1000 [00:18<00:00, 53.07it/s, episode=6999, speaker reward=-2665.211, listener reward=-266

In [8]:
evaluate_episodes=10
evaluate_episodes_len=100

# 回报列表，用于进行reward均值图像的绘制
return_list=[]
for episode in range(evaluate_episodes):
    # reset the environment
    s = env.reset()
    reward=[0 for i in range(len(agents))]
    for i in range(evaluate_episodes_len):
        env.render()
        # 单步步长移动对应的代码
        u=[]
        actions=[]

        # 为每个智能体确定动作
        with torch.no_grad():
            for agent_id,agent in enumerate(agents):
                action=agent.select_action(s[agent_id],args.noise_rate,args.epsilon)
                actions.append(action)

        # 获取环境反馈
        s_next,r,_,_=env.step(actions)
        
        # 状态更新
        s=s_next

        # 第0个adv的奖励总和
        # 每轮的reward
        reward=[r[i]+reward[i] for i in range(len(reward))]

    return_list.append(reward)

return_list=np.array(return_list)

for i in range(len(return_list[0])):
    print(sum(return_list[:,i])/evaluate_episodes)


agent 1 to agent 0: _   agent 0 to agent 1: _   
agent 1 to agent 0: _   agent 0 to agent 1: B   
agent 1 to agent 0: _   agent 0 to agent 1: B   
agent 1 to agent 0: _   agent 0 to agent 1: B   
agent 1 to agent 0: _   agent 0 to agent 1: B   
agent 1 to agent 0: _   agent 0 to agent 1: B   
agent 1 to agent 0: _   agent 0 to agent 1: A   
agent 1 to agent 0: _   agent 0 to agent 1: B   
agent 1 to agent 0: _   agent 0 to agent 1: B   
agent 1 to agent 0: _   agent 0 to agent 1: B   
agent 1 to agent 0: _   agent 0 to agent 1: B   
agent 1 to agent 0: _   agent 0 to agent 1: A   
agent 1 to agent 0: _   agent 0 to agent 1: B   
agent 1 to agent 0: _   agent 0 to agent 1: B   
agent 1 to agent 0: _   agent 0 to agent 1: B   
agent 1 to agent 0: _   agent 0 to agent 1: C   
agent 1 to agent 0: _   agent 0 to agent 1: A   
agent 1 to agent 0: _   agent 0 to agent 1: A   
agent 1 to agent 0: _   agent 0 to agent 1: A   
agent 1 to agent 0: _   agent 0 to agent 1: C   
agent 1 to agent 0: 

: 