In [26]:
# utility imports
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#env imports
import gym
from pettingzoo.butterfly import cooperative_pong_v2
import supersuit as ss
from gym import spaces

#Algorithm Imports
from stable_baselines3 import PPO, A2C
from stable_baselines3.ppo import CnnPolicy as ppo_cnn_policy
from stable_baselines3.a2c import CnnPolicy as a2c_cnn_policy
import torch as th

In [27]:
# Names of the two agents.
agents = ['paddle_0', 'paddle_1']

# Training environment: cooperative pong created through the parallel API.
env = cooperative_pong_v2.parallel_env(
    ball_speed=9,
    left_paddle_speed=12,
    right_paddle_speed=12,
    cake_paddle=False,
    max_cycles=2048,
    bounce_randomness=False,
)

# Observation preprocessing; each wrapper consumes the output of the previous one.
env = ss.color_reduction_v0(env, mode='B')     # reduce colour (mode 'B') for less computation
env = ss.resize_v0(env, x_size=84, y_size=84)  # resize frames to 84x84
env = ss.frame_stack_v1(env, 4)                # stack 4 frames together to see velocity/direction

# Adapt the multi-agent env to the vectorised interface used by Stable-Baselines3.
env = ss.pettingzoo_env_to_vec_env_v0(env)     # convert to vec env
env = ss.concat_vec_envs_v0(                   # concatenate a single copy of the vec env
    env,
    1,
    num_cpus=8,
    base_class='stable_baselines3',
)

In [28]:
# Build the PPO learner on the training env; every hyper-parameter is listed
# explicitly (one per line) so the configuration is easy to audit and diff.
ppo_model = PPO(
    ppo_cnn_policy,
    env,
    learning_rate=0.0003,
    n_steps=2048,
    batch_size=64,
    n_epochs=10,
    gamma=0.99,
    gae_lambda=0.95,
    clip_range=0.2,
    clip_range_vf=1,
    ent_coef=0.0,
    vf_coef=0.5,
    max_grad_norm=0.5,
    use_sde=False,
    sde_sample_freq=-1,
    target_kl=None,
    tensorboard_log=None,
    create_eval_env=False,
    policy_kwargs={'optimizer_class': th.optim.Adam},
    verbose=0,
    seed=314,
    device='auto',
    _init_setup_model=True,
)

# Show which optimizer and network architecture the policy ended up with.
print(f'PPO Model Optimizer: {ppo_model.policy.optimizer_class}')
print('*' * 40)
print(f'PPO Model Policy Network: {ppo_model.policy}')

PPO Model Optimizer: <class 'torch.optim.adam.Adam'>
****************************************
PPO Model Policy Network: ActorCriticCnnPolicy(
  (features_extractor): NatureCNN(
    (cnn): Sequential(
      (0): Conv2d(4, 32, kernel_size=(8, 8), stride=(4, 4))
      (1): ReLU()
      (2): Conv2d(32, 64, kernel_size=(4, 4), stride=(2, 2))
      (3): ReLU()
      (4): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
      (5): ReLU()
      (6): Flatten(start_dim=1, end_dim=-1)
    )
    (linear): Sequential(
      (0): Linear(in_features=3136, out_features=512, bias=True)
      (1): ReLU()
    )
  )
  (mlp_extractor): MlpExtractor(
    (shared_net): Sequential()
    (policy_net): Sequential()
    (value_net): Sequential()
  )
  (action_net): Linear(in_features=512, out_features=3, bias=True)
  (value_net): Linear(in_features=512, out_features=1, bias=True)
)


In [29]:
# Build the A2C learner on the training env; every hyper-parameter is listed
# explicitly (one per line) so the configuration is easy to audit and diff.
# NOTE(review): n_steps=2058 differs from the 2048 used for the PPO model and
# for max_cycles -- confirm whether this value is intentional.
a2c_model = A2C(
    a2c_cnn_policy,
    env,
    learning_rate=0.0003,
    n_steps=2058,
    gamma=0.99,
    gae_lambda=1.0,
    ent_coef=0.0,
    vf_coef=0.5,
    max_grad_norm=0.5,
    use_rms_prop=False,  # Adam is supplied through policy_kwargs instead
    use_sde=False,
    sde_sample_freq=-1,
    normalize_advantage=False,
    tensorboard_log=None,
    create_eval_env=False,
    policy_kwargs={'optimizer_class': th.optim.Adam},
    verbose=0,
    seed=314,
    device='auto',
    _init_setup_model=True,
)

# Show which optimizer and network architecture the policy ended up with.
print(f'A2C Model Optimizer: {a2c_model.policy.optimizer_class}')
print('*' * 40)
print(f'A2C Model Policy Network: {a2c_model.policy}')

A2C Model Optimizer: <class 'torch.optim.adam.Adam'>
****************************************
A2C Model Policy Network: ActorCriticCnnPolicy(
  (features_extractor): NatureCNN(
    (cnn): Sequential(
      (0): Conv2d(4, 32, kernel_size=(8, 8), stride=(4, 4))
      (1): ReLU()
      (2): Conv2d(32, 64, kernel_size=(4, 4), stride=(2, 2))
      (3): ReLU()
      (4): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
      (5): ReLU()
      (6): Flatten(start_dim=1, end_dim=-1)
    )
    (linear): Sequential(
      (0): Linear(in_features=3136, out_features=512, bias=True)
      (1): ReLU()
    )
  )
  (mlp_extractor): MlpExtractor(
    (shared_net): Sequential()
    (policy_net): Sequential()
    (value_net): Sequential()
  )
  (action_net): Linear(in_features=512, out_features=3, bias=True)
  (value_net): Linear(in_features=512, out_features=1, bias=True)
)


In [30]:
# Checkpoint labels: a first short checkpoint at 4096, followed by every
# multiple of 16384 from 16384 up to (but excluding) 16384 * 124.
steps = [4096, *range(16384, 16384 * 124, 16384)]

In [31]:
def training(mod, algo):
    """Train in stages, saving one checkpoint per entry of the global ``steps``.

    The first stage trains the model that was passed in for 4096 timesteps.
    Every later stage reloads the previous stage's checkpoint from disk with
    ``algo.load`` (bound to the global ``env``) and trains it for another
    16384 timesteps.  Each stage is saved under ``str(steps[i])`` in the
    current working directory and a completion line is printed.

    Parameters
    ----------
    mod : model exposing ``learn`` and ``save``; used for the first stage only.
    algo : class exposing ``load(path, env=...)``; used to restore checkpoints.
    """
    for stage, checkpoint in enumerate(steps):
        policy = str(checkpoint)

        if stage > 0:
            # Resume from the checkpoint written by the previous stage.
            mod = algo.load(str(steps[stage - 1]), env=env)
            mod.learn(total_timesteps=16384)
        else:
            mod.learn(total_timesteps=4096)

        mod.save(policy)
        print(policy + ' complete!')

In [32]:
#os.chdir('a2c_log')
#training(a2c_model, A2C)
#os.chdir('..')

In [33]:
#os.chdir('ppo_log')
#training(ppo_model, PPO)
#os.chdir('..')

In [34]:
# Evaluation environment: built with the same settings and wrappers as the
# training environment, but kept as a separate instance (env1).

# Names of the two agents.
agents = ['paddle_0', 'paddle_1']

env1 = cooperative_pong_v2.parallel_env(
    ball_speed=9,
    left_paddle_speed=12,
    right_paddle_speed=12,
    cake_paddle=False,
    max_cycles=2048,
    bounce_randomness=False,
)

# Observation preprocessing; each wrapper consumes the output of the previous one.
env1 = ss.color_reduction_v0(env1, mode='B')     # reduce colour (mode 'B') for less computation
env1 = ss.resize_v0(env1, x_size=84, y_size=84)  # resize frames to 84x84
env1 = ss.frame_stack_v1(env1, 4)                # stack 4 frames together to see velocity/direction

# Adapt the multi-agent env to the vectorised interface used by Stable-Baselines3.
env1 = ss.pettingzoo_env_to_vec_env_v0(env1)     # convert to vec env
env1 = ss.concat_vec_envs_v0(                    # only 1 env copy
    env1,
    1,
    num_cpus=8,
    base_class='stable_baselines3',
)

In [35]:
def testing(model, algo, num_episodes):
    
    results = pd.DataFrame(columns = ['num_training_steps', 'avg_reward'])
    
    for count, filename in enumerate(os.listdir()):
        
        if filename.endswith('.zip'):
            print(filename + ' beginning!')
            mod = model.load(filename.split('.')[0], env=env1)
            rewards_per_ep = []
            
            for ep in range(num_episodes):
                rewards = []
                steps = 0
                obs = env1.reset()
                done = np.array([0,0])
                
                while all(done) != 1:
                    action, _states = mod.predict(obs)
                    obs, reward, done, info = env1.step(action)
                    steps+=1
                    rewards.append(reward[0])
                    
                rewards_per_ep.append(sum(rewards)/len(rewards))
            print(filename + ' complete!')
                
            results = results.append({'num_training_steps' : int(filename.split('.')[0]), 'avg_reward' : sum(rewards_per_ep)/len(rewards_per_ep)}, ignore_index = True)
            results.to_csv('results_a2c_cont.csv', index=False)
        
        
        
        
    return results

In [36]:
# Evaluate the A2C checkpoints stored in a2c_log/.  The directory change is
# paired with a try/finally so the working directory is always restored, even
# if evaluation fails part-way (previously only the chdir('..') was active,
# which left the notebook one directory too high on a fresh run).
os.chdir('a2c_log')
try:
    results_a2c_discrete = testing(a2c_model, A2C, 100)
finally:
    os.chdir('..')

# sort_values returns a new frame; keep it so the saved CSV is actually sorted.
results_a2c_discrete = results_a2c_discrete.sort_values(by=['num_training_steps'])
results_a2c_discrete.to_csv('results_a2c_discrete.csv', index=False)


704512.zip beginning!
704512.zip complete!
376832.zip beginning!
376832.zip complete!
1687552.zip beginning!
1687552.zip complete!
1228800.zip beginning!
1228800.zip complete!
1310720.zip beginning!
1310720.zip complete!
163840.zip beginning!
163840.zip complete!
4096.zip beginning!
4096.zip complete!
393216.zip beginning!
393216.zip complete!
573440.zip beginning!
573440.zip complete!
868352.zip beginning!
868352.zip complete!
1753088.zip beginning!
1753088.zip complete!
950272.zip beginning!
950272.zip complete!
737280.zip beginning!
737280.zip complete!
442368.zip beginning!
442368.zip complete!
786432.zip beginning!
786432.zip complete!
1802240.zip beginning!
1802240.zip complete!
655360.zip beginning!
655360.zip complete!
294912.zip beginning!
294912.zip complete!
1359872.zip beginning!
1359872.zip complete!
1114112.zip beginning!
1114112.zip complete!
425984.zip beginning!
425984.zip complete!
278528.zip beginning!
278528.zip complete!
1867776.zip beginning!
1867776.zip complete!

In [37]:
# Evaluate the PPO checkpoints stored in ppo_log/.  try/finally guarantees the
# working directory is restored even if evaluation fails part-way.
os.chdir('ppo_log')
try:
    results_ppo_discrete = testing(ppo_model, PPO, 100)
finally:
    os.chdir('..')

# sort_values returns a new frame; keep it so the saved CSV is actually sorted.
results_ppo_discrete = results_ppo_discrete.sort_values(by=['num_training_steps'])
results_ppo_discrete.to_csv('results_ppo_discrete.csv', index=False)


704512.zip beginning!
704512.zip complete!
376832.zip beginning!
376832.zip complete!
1687552.zip beginning!
1687552.zip complete!
1228800.zip beginning!
1228800.zip complete!
1310720.zip beginning!
1310720.zip complete!
163840.zip beginning!
163840.zip complete!
4096.zip beginning!
4096.zip complete!
393216.zip beginning!
393216.zip complete!
573440.zip beginning!
573440.zip complete!
868352.zip beginning!
868352.zip complete!
1753088.zip beginning!
1753088.zip complete!
950272.zip beginning!
950272.zip complete!
737280.zip beginning!
737280.zip complete!
442368.zip beginning!
442368.zip complete!
786432.zip beginning!
786432.zip complete!
1802240.zip beginning!
1802240.zip complete!
655360.zip beginning!
655360.zip complete!
294912.zip beginning!
294912.zip complete!
1359872.zip beginning!
1359872.zip complete!
1114112.zip beginning!
1114112.zip complete!
425984.zip beginning!
425984.zip complete!
278528.zip beginning!
278528.zip complete!
1867776.zip beginning!
1867776.zip complete!

In [38]:

# Reload the per-algorithm evaluation results and tag each row with the
# algorithm that produced it.
results_ppo_discrete = pd.read_csv('results_ppo_discrete.csv').assign(algorithm='PPO')
results_a2c_discrete = pd.read_csv('results_a2c_discrete.csv').assign(algorithm='A2C')

# Stack both tables (PPO rows first) and persist the combined table.
results_discrete = pd.concat([results_ppo_discrete, results_a2c_discrete])
results_discrete.to_csv('results_discrete.csv', index=False)