In [1]:
import gym
import os
from pettingzoo.butterfly import cooperative_pong_v2
import supersuit as ss
import matplotlib
import numpy as np
import pandas as pd


from stable_baselines3.ppo import CnnPolicy
from stable_baselines3 import PPO

from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.results_plotter import load_results, ts2xy

Matplotlib created a temporary config/cache directory at /tmp/matplotlib-lb6gqxzo because the default path (/home/jovyan/.cache/matplotlib) is not a writable directory; it is highly recommended to set the MPLCONFIGDIR environment variable to a writable directory, in particular to speed up the import of Matplotlib and to better support multiprocessing.


In [2]:
env = cooperative_pong_v2.parallel_env(ball_speed=9, left_paddle_speed=12, right_paddle_speed=12, cake_paddle=False, max_cycles=2000, bounce_randomness=False) #create env
agents= ['paddle_0', 'paddle_1'] #name agents
env = ss.color_reduction_v0(env, mode='B') #convert to grayscale for less computation
env = ss.resize_v0(env, x_size=84, y_size=84) #resive
env = ss.frame_stack_v1(env, 4)#stack 4 frames together to see velocity/direction
env = ss.pettingzoo_env_to_vec_env_v0(env) #convert to vec env
env = ss.concat_vec_envs_v0(env, 8, num_cpus=1, base_class='stable_baselines3') #parallelize

In [3]:
model = PPO(CnnPolicy, env,  learning_rate=0.0003, n_steps=2048, batch_size=64, n_epochs=10, gamma=0.99, gae_lambda=0.95, clip_range=0.2,
            clip_range_vf=1, ent_coef=0.0, vf_coef=0.5, max_grad_norm=0.5, use_sde=False, sde_sample_freq=- 1, target_kl=None, 
            tensorboard_log=None, create_eval_env=False, policy_kwargs=None, verbose=1, seed=314, device='auto', _init_setup_model=True)

Using cpu device
Wrapping the env in a VecTransposeImage.


In [407]:
for steps in range(2048,(20480*5),2048):
    model.learn(total_timesteps = steps)
    
    policy = str(steps)
    model.save(policy)
    print(policy + ' complete!')

---------------------------------
| time/              |          |
|    fps             | 687      |
|    iterations      | 1        |
|    time_elapsed    | 5        |
|    total_timesteps | 4096     |
| train/             |          |
|    learning_rate   | 0.0003   |
---------------------------------




log/2048 complete!
-------------------------------------------
| time/                   |               |
|    fps                  | 650           |
|    iterations           | 1             |
|    time_elapsed         | 6             |
|    total_timesteps      | 4096          |
| train/                  |               |
|    approx_kl            | -0.0033580007 |
|    clip_fraction        | 0.121         |
|    clip_range           | 0.2           |
|    clip_range_vf        | 1             |
|    entropy_loss         | -1.09         |
|    explained_variance   | -0.000185     |
|    learning_rate        | 0.0003        |
|    loss                 | 104           |
|    n_updates            | 10            |
|    policy_gradient_loss | -0.0189       |
|    value_loss           | 289           |
-------------------------------------------
log/4096 complete!
------------------------------------------
| time/                   |              |
|    fps                  | 695         

In [408]:
#set up test env
env1 = cooperative_pong_v2.parallel_env(ball_speed=9, left_paddle_speed=12, right_paddle_speed=12, cake_paddle=False, max_cycles=2000, bounce_randomness=False) #create env
agents= ['paddle_0', 'paddle_1'] #name agents
env1 = ss.color_reduction_v0(env1, mode='B') #convert to grayscale for less computation
env1 = ss.resize_v0(env1, x_size=84, y_size=84) #resive
env1 = ss.frame_stack_v1(env1, 4)#stack 4 frames together to see velocity/direction
env1 = ss.pettingzoo_env_to_vec_env_v0(env1) #convert to vec env
env1 = ss.concat_vec_envs_v0(env1, 1, num_cpus=0, base_class='stable_baselines3') #parallelize, only 1 env

In [409]:
results = pd.DataFrame(columns = ['num_training_steps', 'avg_reward'])

In [4]:
os.chdir('log')

In [416]:
num_episodes = range(100)

for count, filename in enumerate(os.listdir()):
    if filename.endswith(".zip"):
        model = PPO.load(filename.split('.')[0], env=env1)
        reward_per_ep = []

        for ep in num_episodes:

            rewards = []
            steps = 0


            obs = env1.reset()
            done = np.array([0,0])

            while all(done) != 1:
                action, _states = model.predict(obs)
                obs, reward, done, info = env1.step(action)
                steps+=1
                rewards.append(reward[0])

            reward_per_ep.append(sum(rewards)/len(rewards))


        results = results.append({'num_training_steps' : int(filename.split('.')[0]), 'avg_reward' : sum(reward_per_ep)/len(reward_per_ep)}, ignore_index = True)

Wrapping the env in a VecTransposeImage.
Wrapping the env in a VecTransposeImage.
Wrapping the env in a VecTransposeImage.
Wrapping the env in a VecTransposeImage.
Wrapping the env in a VecTransposeImage.
Wrapping the env in a VecTransposeImage.
Wrapping the env in a VecTransposeImage.
Wrapping the env in a VecTransposeImage.
Wrapping the env in a VecTransposeImage.


In [417]:
results.sort_values(by=['num_training_steps'])
results.to_csv('results_')

Unnamed: 0,num_training_steps,avg_reward
8,2048.0,-1.127084
17,2048.0,-1.182053
10,4096.0,-1.11793
1,4096.0,-0.951497
9,6144.0,-1.085327
0,6144.0,-1.309513
6,8192.0,-0.966854
15,8192.0,-0.985863
3,10240.0,-0.896531
12,10240.0,-1.064802
