# Comparison of PPO and A2C in Multi-Agent Environment With Continuous Action Space

In [48]:
# utility imports
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#env imports
import gym
from pettingzoo.sisl import multiwalker_v6
import supersuit as ss

#Algorithm Imports
from stable_baselines3 import PPO, A2C
from stable_baselines3.ppo import MlpPolicy as ppo_mlp_policy
from stable_baselines3.a2c import MlpPolicy as a2c_mlp_policy
import torch as th

In [49]:
# Multiwalker scenario settings, collected in one mapping so they are easy to scan.
walker_settings = dict(
    n_walkers=2,
    position_noise=1e-3,
    angle_noise=1e-3,
    local_ratio=1.0,
    forward_reward=1.0,
    terminate_reward=-100.0,
    fall_reward=-10.0,
    terminate_on_fall=True,
    remove_on_fall=True,
    max_cycles=1000,
)

agents = ['walker_0', 'walker_1']  # name agents

# Frame stacking is left disabled:
#env = ss.frame_stack_v1(env, 4)#stack 4 frames together to see velocity/direction

# Create the parallel env, convert it to a vec env, then concatenate a single
# copy using the Stable-Baselines3 base class.
env = ss.concat_vec_envs_v0(
    ss.pettingzoo_env_to_vec_env_v0(multiwalker_v6.parallel_env(**walker_settings)),
    1,
    num_cpus=8,
    base_class='stable_baselines3',
)

In [50]:
# PPO hyperparameters, one per line so individual settings are easy to diff.
ppo_settings = dict(
    learning_rate=0.0003,
    n_steps=5,
    batch_size=10,
    n_epochs=10,
    gamma=0.99,
    gae_lambda=0.95,
    clip_range=0.2,
    clip_range_vf=1,
    ent_coef=0.0,
    vf_coef=0.5,
    max_grad_norm=0.5,
    use_sde=False,
    sde_sample_freq=-1,
    target_kl=None,
    tensorboard_log=None,
    create_eval_env=False,
    policy_kwargs=dict(optimizer_class=th.optim.Adam),
    verbose=0,
    seed=314,
    device='auto',
    _init_setup_model=True,
)
ppo_model = PPO(ppo_mlp_policy, env, **ppo_settings)

# Show which optimizer class the policy uses, then the network layout.
ppo_policy = ppo_model.policy
print(f'PPO Model Optimizer: {ppo_policy.optimizer_class}')
print('*' * 40)
print(f'PPO Model Policy Network: {ppo_policy}')

PPO Model Optimizer: <class 'torch.optim.adam.Adam'>
****************************************
PPO Model Policy Network: ActorCriticPolicy(
  (features_extractor): FlattenExtractor(
    (flatten): Flatten(start_dim=1, end_dim=-1)
  )
  (mlp_extractor): MlpExtractor(
    (shared_net): Sequential()
    (policy_net): Sequential(
      (0): Linear(in_features=31, out_features=64, bias=True)
      (1): Tanh()
      (2): Linear(in_features=64, out_features=64, bias=True)
      (3): Tanh()
    )
    (value_net): Sequential(
      (0): Linear(in_features=31, out_features=64, bias=True)
      (1): Tanh()
      (2): Linear(in_features=64, out_features=64, bias=True)
      (3): Tanh()
    )
  )
  (action_net): Linear(in_features=64, out_features=4, bias=True)
  (value_net): Linear(in_features=64, out_features=1, bias=True)
)


In [51]:
# A2C hyperparameters, one per line so individual settings are easy to diff.
# use_rms_prop is off and Adam is passed explicitly through policy_kwargs.
a2c_settings = dict(
    learning_rate=0.0003,
    n_steps=5,
    gamma=0.99,
    gae_lambda=1.0,
    ent_coef=0.0,
    vf_coef=0.5,
    max_grad_norm=0.5,
    use_rms_prop=False,
    use_sde=False,
    sde_sample_freq=-1,
    normalize_advantage=False,
    tensorboard_log=None,
    create_eval_env=False,
    policy_kwargs=dict(optimizer_class=th.optim.Adam),
    verbose=0,
    seed=314,
    device='auto',
    _init_setup_model=True,
)
a2c_model = A2C(a2c_mlp_policy, env, **a2c_settings)

# Show which optimizer class the policy uses, then the network layout.
a2c_policy = a2c_model.policy
print(f'A2C Model Optimizer: {a2c_policy.optimizer_class}')
print('*' * 40)
print(f'A2C Model Policy Network: {a2c_policy}')

A2C Model Optimizer: <class 'torch.optim.adam.Adam'>
****************************************
A2C Model Policy Network: ActorCriticPolicy(
  (features_extractor): FlattenExtractor(
    (flatten): Flatten(start_dim=1, end_dim=-1)
  )
  (mlp_extractor): MlpExtractor(
    (shared_net): Sequential()
    (policy_net): Sequential(
      (0): Linear(in_features=31, out_features=64, bias=True)
      (1): Tanh()
      (2): Linear(in_features=64, out_features=64, bias=True)
      (3): Tanh()
    )
    (value_net): Sequential(
      (0): Linear(in_features=31, out_features=64, bias=True)
      (1): Tanh()
      (2): Linear(in_features=64, out_features=64, bias=True)
      (3): Tanh()
    )
  )
  (action_net): Linear(in_features=64, out_features=4, bias=True)
  (value_net): Linear(in_features=64, out_features=1, bias=True)
)


In [67]:
# Checkpoint schedule: 1,386,500 up to (but excluding) 2,000,000 in strides of 100,000.
steps = list(range(1386500, 2000000, 100000))

In [68]:
def training_a2c(training_steps):
    """Train the global ``a2c_model`` up to each checkpoint and save it there.

    Every entry of ``training_steps`` is treated as a *cumulative* timestep
    target. For each target the model is trained only for the timesteps it is
    still missing, then saved under the target's decimal string, so the file
    name reflects the amount of training the checkpoint has received.

    Previously each iteration called ``learn(total_timesteps=step)``, which
    trains for ``step`` further timesteps every time; checkpoints after the
    first were therefore trained far longer than their names stated.

    Parameters
    ----------
    training_steps : iterable of int
        Cumulative timestep targets, expected in increasing order.
    """
    for step in training_steps:
        # Only train the gap between what the model has seen and the target.
        remaining = step - a2c_model.num_timesteps
        if remaining <= 0:
            # The model is already past this target; saving it under this name
            # would mislabel the checkpoint, so skip it.
            print(str(step) + ' skipped (model already at '
                  + str(a2c_model.num_timesteps) + ' timesteps)')
            continue

        # reset_num_timesteps=False keeps the model's internal counter running
        # across calls instead of restarting from zero.
        a2c_model.learn(total_timesteps=remaining, reset_num_timesteps=False)

        policy = str(step)
        a2c_model.save(policy)
        print(policy + ' complete!')

In [54]:
def training(mod, algo):
    """Train ``mod`` in stages, saving one checkpoint per entry of global ``steps``.

    The first stage trains the model passed in for 500 timesteps. Every later
    stage reloads the checkpoint written by the previous stage (via
    ``algo.load`` bound to the global ``env``) and trains it for 10000
    timesteps. Each stage saves under ``str(steps[i])`` and prints a
    completion line.

    NOTE(review): the training budgets are fixed (500 / 10000) regardless of
    the values in ``steps``; confirm the checkpoint names are meant to match.

    Parameters
    ----------
    mod : model exposing ``learn`` and ``save``
        Model used for the first stage.
    algo : class exposing ``load``
        Used to reload the previous checkpoint for later stages.
    """
    for idx, checkpoint in enumerate(steps):
        if idx > 0:
            # Resume from the file saved in the previous iteration.
            mod = algo.load(str(steps[idx - 1]), env=env)
            budget = 10000
        else:
            budget = 500
        mod.learn(total_timesteps=budget)

        policy = str(checkpoint)
        mod.save(policy)
        print(policy + ' complete!')

In [69]:
# Run A2C training from inside its checkpoint folder so the saved files land
# there. try/finally restores the working directory even when training raises
# or is interrupted; otherwise later cells would resolve relative paths from
# the wrong folder.
notebook_dir = os.getcwd()
os.chdir('cont_a2c_log')
try:
    training_a2c(steps)
finally:
    os.chdir(notebook_dir)

1386500 complete!
1486500 complete!
1586500 complete!
1686500 complete!
1786500 complete!
1886500 complete!
1986500 complete!


In [56]:
#os.chdir('cont_ppo_log')
#training(ppo_model, PPO)
#os.chdir('..')

In [71]:
# Evaluation environment, built with the same multiwalker settings as `env`.
eval_walker_settings = {
    'n_walkers': 2,
    'position_noise': 1e-3,
    'angle_noise': 1e-3,
    'local_ratio': 1.0,
    'forward_reward': 1.0,
    'terminate_reward': -100.0,
    'fall_reward': -10.0,
    'terminate_on_fall': True,
    'remove_on_fall': True,
    'max_cycles': 1000,
}
raw_eval_env = multiwalker_v6.parallel_env(**eval_walker_settings)  # create env

agents = ['walker_0', 'walker_1']  # name agents

# Frame stacking is left disabled:
#env = ss.frame_stack_v1(env, 4)#stack 4 frames together to see velocity/direction

vec_eval_env = ss.pettingzoo_env_to_vec_env_v0(raw_eval_env)  # convert to vec env

env1 = ss.concat_vec_envs_v0(
    vec_eval_env,
    1,
    num_cpus=8,
    base_class='stable_baselines3',
)  # parallelize

In [72]:
def testing(model, algo, num_episodes, results_path='results_a2c_cont.csv'):
    """Evaluate every ``.zip`` checkpoint in the current working directory.

    Each checkpoint is loaded against the global ``env1`` and rolled out for
    ``num_episodes`` episodes. For every episode the mean per-step reward of
    the first entry of the reward vector (``reward[0]``) is computed; the
    checkpoint's score is the mean of those episode means. After each
    checkpoint the results so far are written to ``results_path`` so that an
    interrupted run still leaves a partial file.

    Parameters
    ----------
    model : model instance
        Kept for backward compatibility; only used to load checkpoints when
        ``algo`` is ``None``.
    algo : class exposing ``load``
        Algorithm class used to load each checkpoint (e.g. ``A2C``).
    num_episodes : int
        Episodes to run per checkpoint. With 0 episodes the score is NaN.
    results_path : str, optional
        CSV file, relative to the current working directory, that receives the
        running results. Defaults to the path that was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        One row per checkpoint with columns ``num_training_steps`` (parsed
        from the file name) and ``avg_reward``, in directory-listing order.
    """
    columns = ['num_training_steps', 'avg_reward']
    # Rows are collected in a list and the frame is rebuilt from it, because
    # DataFrame.append no longer exists in pandas >= 2.0.
    records = []
    results = pd.DataFrame(records, columns=columns)

    # `load` is a class-level constructor; prefer the class the caller passed.
    loader = algo if algo is not None else model

    for filename in os.listdir():
        if not filename.endswith('.zip'):
            continue

        print(filename + ' beginning')
        checkpoint = filename.split('.')[0]
        mod = loader.load(checkpoint, env=env1)
        rewards_per_ep = []

        for _ in range(num_episodes):
            rewards = []
            obs = env1.reset()
            done = np.array([0, 0])

            # Step until every entry of the done vector is set.
            while not all(done):
                action, _states = mod.predict(obs)
                obs, reward, done, info = env1.step(action)
                # Only the first entry of the reward vector is tracked
                # (presumably walker_0 - confirm against the env wrapper).
                rewards.append(reward[0])

            rewards_per_ep.append(sum(rewards) / len(rewards))
        print(filename + ' complete!')

        if rewards_per_ep:
            avg_reward = sum(rewards_per_ep) / len(rewards_per_ep)
        else:
            # num_episodes == 0: nothing to average, record NaN instead of
            # failing with a ZeroDivisionError.
            avg_reward = float('nan')

        records.append({'num_training_steps': int(checkpoint), 'avg_reward': avg_reward})
        results = pd.DataFrame(records, columns=columns)
        # Write progress after every checkpoint.
        results.to_csv(results_path, index=False)

    return results

In [73]:
# Evaluate the A2C checkpoints from inside their folder. try/finally restores
# the working directory even if evaluation raises or is interrupted.
notebook_dir = os.getcwd()
os.chdir('cont_a2c_log')
try:
    results_a2c_continuous = testing(a2c_model, A2C, 100)
finally:
    os.chdir(notebook_dir)

# sort_values returns a new frame, so its result must be assigned; previously
# it was discarded and the CSV was written unsorted.
results_a2c_continuous = results_a2c_continuous.sort_values(by=['num_training_steps'])
results_a2c_continuous.to_csv('results_a2c_cont.csv', index=False)


1155500.zip beginning
1155500.zip complete!
1287500.zip beginning
1287500.zip complete!
215000.zip beginning
215000.zip complete!
1040000.zip beginning
1040000.zip complete!
264500.zip beginning
264500.zip complete!
479000.zip beginning
479000.zip complete!
20500.zip beginning
20500.zip complete!
165500.zip beginning
165500.zip complete!
1304000.zip beginning
1304000.zip complete!
1337000.zip beginning
1337000.zip complete!
40500.zip beginning
40500.zip complete!
1370000.zip beginning
1370000.zip complete!
413000.zip beginning
413000.zip complete!
1238000.zip beginning
1238000.zip complete!
1386500.zip beginning
1386500.zip complete!
693500.zip beginning
693500.zip complete!
297500.zip beginning
297500.zip complete!
512000.zip beginning
512000.zip complete!
957500.zip beginning
957500.zip complete!
83000.zip beginning
83000.zip complete!
495500.zip beginning
495500.zip complete!
182000.zip beginning
182000.zip complete!
1007000.zip beginning
1007000.zip complete!
1686500.zip beginning


In [None]:
#os.chdir('cont_ppo_log')
#results_ppo_continuous = testing(ppo_model, PPO, 100)
#results_ppo_continuous.sort_values(by=['num_training_steps'])
#os.chdir('..')
#results_ppo_continuous.to_csv('results_ppo_cont.csv', index=False)


In [74]:
# Load each algorithm's evaluation table and tag its rows with the algorithm name.
results_ppo_continuous = pd.read_csv('results_ppo_cont.csv').assign(algorithm='PPO')
results_a2c_continuous = pd.read_csv('results_a2c_cont.csv').assign(algorithm='A2C')

# Stack both tables (PPO rows first) and save the combined comparison data.
results_continuous = pd.concat([results_ppo_continuous, results_a2c_continuous])
results_continuous.to_csv('results_continuous.csv', index=False)