In [1]:
from pettingzoo.mpe import simple_tag_v3
from utils import *

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
# Parse the run configuration, then build the simple_tag environment
# (human rendering, continuous actions) and reset it so env.agents is populated.
args = get_args()
env = simple_tag_v3.env(
    render_mode='human',
    max_cycles=10000,
    continuous_actions=True,
)
env.reset()

In [14]:
# env.agents for this scenario:
# ['adversary_0', 'adversary_1', 'adversary_2', 'agent_0']
#
# Fields written onto `args` by this cell:
#   n_players    - total number of agents in the environment
#   n_agents     - number of adversaries (the learning agents); computed as
#                  n_players - 1, i.e. it assumes a single non-adversary that
#                  is listed last in env.agents
#   obs_shape    - observation dimension of each adversary
#   action_shape - action dimension of each adversary
args.n_players = len(env.agents)
args.n_agents = args.n_players - 1
args.obs_shape = []
args.action_shape = []
for cur_agent in env.agents[:args.n_agents]:
    # Use the observation_space()/action_space() accessors: the
    # observation_spaces/action_spaces dictionaries are deprecated and emit a
    # warning on every access.
    args.obs_shape.append(env.observation_space(cur_agent).shape[0])

    # For a discrete action space, use env.action_space(cur_agent).n instead.
    args.action_shape.append(env.action_space(cur_agent).shape[0])

# Action bounds stored on args for the agents.
args.high_action = 1
args.low_action = 0

  "The `observation_spaces` dictionary is deprecated. Use the `observation_space` function instead."
  "The `action_spaces` dictionary is deprecated. Use the `action_space` function instead."


In [15]:
# 智能体更新
agents = []
for i in range(args.n_agents):
    agent = Agent(i, args)
    agents.append(agent)
    agent.policy.load_model()
print("agents创建完毕!")

Agent 0 successfully loaded actor_network: ./model/simple_tag/agent_0/bst_actor_params.pkl
Agent 0 successfully loaded critic_network: ./model/simple_tag/agent_0/bst_critic_params.pkl
Agent 1 successfully loaded actor_network: ./model/simple_tag/agent_1/bst_actor_params.pkl
Agent 1 successfully loaded critic_network: ./model/simple_tag/agent_1/bst_critic_params.pkl
Agent 2 successfully loaded actor_network: ./model/simple_tag/agent_2/bst_actor_params.pkl
Agent 2 successfully loaded critic_network: ./model/simple_tag/agent_2/bst_critic_params.pkl
agents创建完毕!


In [19]:
# Evaluation settings: number of episodes and steps per episode.
evaluate_episodes, evaluate_episodes_len = 10, 100

# List of returns, used for plotting the mean reward.
return_list = list()

# Experience buffer used for experience replay.
buffer = Buffer(args)



## Train部分

In [11]:
train_episodes = 10
train_episodes_len = 100

# Return of adversary 0 for each training episode (kept for plotting).
return_list = []
for episode in tqdm(range(train_episodes)):
    env.reset()
    rws = 0
    for step in range(train_episodes_len):
        # Snapshot the agent order and every agent's observation BEFORE anyone
        # acts. In PettingZoo's AEC MPE environments the world is only advanced
        # once the last agent of the cycle has stepped, so these are the
        # pre-transition states for the whole cycle.
        agent_names = list(env.agents)
        s = [env.observe(name) for name in agent_names]

        u = []        # adversary actions only (what goes into the replay buffer)
        actions = []  # actions for every agent, in env.agents order
        with torch.no_grad():
            for agent_id, agent in enumerate(agents):
                # select_action yields x in [-1, 1]; the environment expects
                # y in [0, 1]: y = (x + 1) / 2, x = y * 2 - 1.
                action = agent.select_action(s[agent_id], args.noise_rate, args.epsilon)
                action = (action + 1) / 2
                actions.append(action)
                u.append(action)

        # The remaining (non-learning) agents move randomly.
        for _ in range(args.n_agents, args.n_players):
            actions.append([0, np.random.rand(), 0, np.random.rand(), 0])

        # Submit one action per agent to complete a full cycle. The agent count
        # comes from the action list rather than a hard-coded 4.
        for action in actions:
            env.step(np.array(action).astype(np.float32))

        # Read next observations and rewards only AFTER the full cycle.
        # Observing right after an agent's own step returns the unchanged
        # state, and env.last() before stepping returns the reward of the
        # previous world step; both would misalign the stored transition.
        s_next = [env.observe(name) for name in agent_names]
        r = [env.rewards[name] for name in agent_names]

        buffer.store_episode(s[:args.n_agents], u, r[:args.n_agents], s_next[:args.n_agents])

        if buffer.current_size >= args.batch_size:
            transitions = buffer.sample(args.batch_size)
            for agent in agents:
                other_agents = [other for other in agents if other is not agent]
                agent.learn(transitions, other_agents)

        rws += r[0]

        # Decay exploration. The result must be written back to noise_rate
        # (the value passed to select_action); writing it to a different
        # attribute leaves the exploration noise constant.
        args.noise_rate = max(0.05, args.noise_rate - 0.0000005)
        args.epsilon = max(0.05, args.epsilon - 0.0000005)
    return_list.append(rws)
    print(rws)

  transitions[key] = torch.tensor(transitions[key], dtype=torch.float32).to(device)
 10%|█         | 1/10 [00:03<00:35,  3.98s/it]

20.0


 20%|██        | 2/10 [00:07<00:30,  3.78s/it]

0.0


 30%|███       | 3/10 [00:11<00:26,  3.79s/it]

0.0


 40%|████      | 4/10 [00:15<00:22,  3.76s/it]

0.0


 50%|█████     | 5/10 [00:18<00:18,  3.71s/it]

10.0


 60%|██████    | 6/10 [00:22<00:15,  3.79s/it]

10.0


 70%|███████   | 7/10 [00:27<00:12,  4.01s/it]

0.0


 80%|████████  | 8/10 [00:31<00:08,  4.12s/it]

0.0


 90%|█████████ | 9/10 [00:35<00:04,  4.19s/it]

0.0


100%|██████████| 10/10 [00:40<00:00,  4.02s/it]

0.0





## Evaluate部分

In [20]:
# Per-episode cumulative reward of every agent, one row per episode.
return_list = []
for episode in range(evaluate_episodes):
    # Reset the environment at the start of each evaluation episode.
    env.reset()

    rewards = [0 for _ in range(args.n_players)]
    for step in range(evaluate_episodes_len):
        actions = []

        # Choose an action for every learning agent (adversary).
        # NOTE(review): evaluation still passes the training exploration
        # settings (args.noise_rate / args.epsilon); if select_action uses them
        # to inject randomness, consider passing 0 for both here - confirm
        # against Agent.select_action.
        with torch.no_grad():
            for agent_id, agent in enumerate(agents):
                action = agent.select_action(env.observe(env.agents[agent_id]), args.noise_rate, args.epsilon)
                # Map x in [-1, 1] to y in [0, 1]: y = (x + 1) / 2.
                action = (action + 1) / 2
                actions.append(action)

        # The non-learning agents only move randomly.
        for _ in range(args.n_agents, args.n_players):
            actions.append([0, np.random.rand(), 0, np.random.rand(), 0])

        # Submit one action per agent to complete a full cycle. The agent count
        # comes from the action list rather than a hard-coded 4.
        agent_names = list(env.agents)
        for action in actions:
            env.step(np.array(action).astype(np.float32))

        # Read this step's rewards after the whole cycle has been applied.
        # env.last() before stepping would return the previous step's reward
        # and drop the reward of the final step of the episode.
        r = [env.rewards[name] for name in agent_names]

        # Accumulate the per-agent episode return.
        rewards = [total + step_reward for total, step_reward in zip(rewards, r)]

    return_list.append(rewards)

return_list = np.array(return_list)

# Mean episode return of each agent, in env.agents order.
for player_idx in range(len(return_list[0])):
    print(sum(return_list[:, player_idx]) / evaluate_episodes)

5.0
5.0
5.0
-1541.4595537035675
