# Setup Colab

In [0]:
from google.colab import drive 

def run_setup():
    """Mount Google Drive, enter the project folder and install pinned dependencies.

    The body uses IPython magics (`%cd`) and shell escapes (`!...`), so it
    only runs inside an IPython/Colab kernel, not as plain Python.
    """
    # Mount Google Drive at /content/gdrive.
    drive.mount('/content/gdrive')
    # Change into the project folder, then print the working directory and its contents.
    %cd "/content/gdrive/My Drive/Colab Notebooks/PBT_MARL_RockPaperScissorsEnv/"
    !pwd
    !ls -l

    # Install the exact library versions this notebook is pinned to.
    !pip install tensorflow==2.2.0
    !pip install ray[rllib]==0.8.5   

    # Show the installed package metadata for both libraries.
    !pip show tensorflow
    !pip show ray


# Project folder on the mounted Google Drive; checkpoint and log paths are built from it.
g_drive_path = "/content/gdrive/My Drive/Colab Notebooks/PBT_MARL_RockPaperScissorsEnv/"
# Setup is disabled by default; uncomment the call below to mount Drive and install dependencies.
#run_setup()

# Checkpoint/restore & log paths

In [0]:
# Directory under the project folder where checkpoints live.
local_dir = g_drive_path + "chkpt/"
# Intended checkpoint interval in training iterations (checkpoint saving is currently disabled in go_train).
chkpt_freq = 10
# Checkpoint number used to build restore_path.
chkpt = 150
# Resolves to <local_dir>checkpoint_<chkpt>/checkpoint-<chkpt>.
restore_path = "{}checkpoint_{}/checkpoint-{}".format(local_dir, chkpt, chkpt)
# When True, go_train restores the trainer from restore_path before training.
is_restore = False

# Directory under the project folder passed to the trainer as config["output"].
log_dir = g_drive_path + "ray_results/"

# Imports

In [0]:
from collections import defaultdict
from typing import Dict
import random
import numpy as np

from gym.spaces import Discrete

import ray
from ray import tune

from ray.tune.registry import register_env
from ray.rllib.models import ModelCatalog

from ray.rllib.policy import Policy

from ray.rllib.agents.ppo import ppo
from ray.rllib.agents.ppo.ppo import PPOTrainer
from ray.rllib.agents.ppo import appo
from ray.rllib.agents.ppo.appo import APPOTrainer
from ray.rllib.agents.ppo import ddppo
from ray.rllib.agents.ppo.ddppo import DDPPOTrainer

from ray.rllib.env import BaseEnv
from ray.rllib.env.multi_agent_env import MultiAgentEnv

from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.evaluation import MultiAgentEpisode, RolloutWorker
from ray.rllib.agents.callbacks import DefaultCallbacks

from ray.rllib.utils import try_import_tf
from ray.tune.logger import pretty_print

# Import TensorFlow through RLlib's helper (presumably returns the module or None if unavailable — confirm for the pinned ray version).
tf = try_import_tf()

# Integer encoding of the three moves; used as keys of the payoff table in RockPaperScissorsEnv.step.
ROCK = 0
PAPER = 1
SCISSORS = 2

# The environment: RockPaperScissorsEnv class

In [0]:
class RockPaperScissorsEnv(MultiAgentEnv):
    """Two-player rock-paper-scissors environment.

    Each player's observation is the move its opponent made on the previous
    step. The ids of the two players are fetched from the named Ray actor
    "g_helper" whenever the environment is reset.
    """

    # Payoff table: (move of player A, move of player B) -> (reward A, reward B).
    _PAYOFFS = {
        (ROCK, ROCK): (0, 0),
        (ROCK, PAPER): (-1, 1),
        (ROCK, SCISSORS): (1, -1),
        (PAPER, ROCK): (1, -1),
        (PAPER, PAPER): (0, 0),
        (PAPER, SCISSORS): (-1, 1),
        (SCISSORS, ROCK): (-1, 1),
        (SCISSORS, PAPER): (1, -1),
        (SCISSORS, SCISSORS): (0, 0),
    }

    def __init__(self, _, population_size):
        # The first positional argument is accepted but never used.
        self.population_size = population_size
        self.action_space = Discrete(3)
        self.observation_space = Discrete(3)
        # Player ids stay unset until reset() asks the helper actor for a pair.
        self.player_A = None
        self.player_B = None
        self.last_move = None
        self.num_moves = 0

    def _opponent_view(self):
        """Return the observation dict: each player sees the other's last move."""
        move_a, move_b = self.last_move[0], self.last_move[1]
        return {
            self.player_A: move_b,
            self.player_B: move_a,
        }

    def reset(self):
        """Start a new episode with the pair currently held by "g_helper"."""
        helper = ray.util.get_actor("g_helper")
        first_agent, second_agent = ray.get(helper.get_pair.remote())
        self.player_A = first_agent
        self.player_B = second_agent
        # Both players start from a (ROCK, ROCK) "previous move".
        self.last_move = (0, 0)
        self.num_moves = 0
        return self._opponent_view()

    def step(self, action_dict):
        """Play one round.

        Returns the (obs, reward, done, info) tuple; the episode is flagged
        done for all agents once 10 moves have been played.
        """
        move_a = action_dict[self.player_A]
        move_b = action_dict[self.player_B]
        self.last_move = (move_a, move_b)
        obs = self._opponent_view()

        reward_a, reward_b = self._PAYOFFS[move_a, move_b]
        rew = {
            self.player_A: reward_a,
            self.player_B: reward_b,
        }

        self.num_moves += 1
        done = {"__all__": self.num_moves >= 10}

        return obs, rew, done, {}

# PBT_MARL class

In [0]:
class PBT_MARL:
    """Population-based training (PBT) helpers for a population of RLlib policies.

    Agent i is stored under the key "agt_<i>" in the statistics store and its
    policy is registered with the trainer as "p_<i>". One PBT round may copy
    weights and hyperparameters from another agent (inherit) and then perturb
    or resample the hyperparameters (mutate).
    """

    def __init__(self, population_size,
                 K, T_select,
                 binomial_n, inherit_prob,
                 perturb_prob, perturb_val):
        """
        Args:
            population_size: number of agents to choose from.
            K: step size of the Elo rating update for one match result.
            T_select: threshold on the Elo expected score used when
                selecting agt_j.
            binomial_n: number of trials of the binomial draw used when
                inheriting (n=1 makes it a Bernoulli draw).
            inherit_prob: success probability of that binomial draw.
            perturb_prob: probability that a mutation resamples the
                hyperparameters instead of perturbing them.
            perturb_val: pair (lower, upper) of multiplicative factors
                applied to the learning rate when perturbing.
        """
        self.population_size = population_size
        self.K = K
        self.T_select = T_select
        # inherit variables
        self.binomial_n = binomial_n
        self.inherit_prob = inherit_prob
        # mutation variables
        self.perturb_prob = perturb_prob
        self.perturb_val = perturb_val

    def _is_eligible(self, agt_i_key):
        """
        Intended check: agt_i completed more than a threshold of training
        steps since its last evolution. Currently a stub that always
        returns True.
        """
        return True

    def _is_parent(self, agt_j_key):
        """
        Intended check: agt_j is allowed to act as a parent (source of
        weights/hyperparameters). Currently a stub that always returns True.
        """
        return True

    def _s_elo(self, rating_i, rating_j):
        """Elo expected score of the agent rated `rating_i` against `rating_j`."""
        return 1 / (1 + 10**((rating_j - rating_i) / 400))

    def compute_rating(self, prev_rating_i, prev_rating_j, score_i, score_j):
        """Return the updated Elo ratings (rating_i, rating_j) after one match.

        The match outcome `s` is 1 if agent i scored more, 0.5 on a tie and
        0 if it scored less. The update is zero-sum: what agent i gains,
        agent j loses.
        """
        s = (np.sign(score_i - score_j) + 1) / 2
        s_elo_val = self._s_elo(prev_rating_i, prev_rating_j)
        delta = self.K * (s - s_elo_val)
        rating_i = prev_rating_i + delta
        # The opponent moves by the same amount in the opposite direction.
        rating_j = prev_rating_j - delta

        return rating_i, rating_j

    def _select_agt_j(self, pol_i_id, population_size, store, T_select):
        """Pick a random agent j != i and return its id if it passes the
        Elo threshold test, otherwise None.

        Returns None immediately when fewer than two agents exist, since no
        distinct partner can be drawn.
        """
        if population_size < 2:
            return None

        pol_j_id = np.random.randint(low=0, high=population_size, size=None)
        while pol_i_id == pol_j_id:
            pol_j_id = np.random.randint(low=0, high=population_size, size=None)

        agt_i_key = "agt_{}".format(str(pol_i_id))
        agt_j_key = "agt_{}".format(str(pol_j_id))
        # Use the most recent rating of each agent.
        rating_i = store[agt_i_key]["rating"][-1]
        rating_j = store[agt_j_key]["rating"][-1]

        # NOTE(review): this is the expected score of j against i. The
        # published PBT-MARL selection rule appears to use the expected score
        # of i against j (i.e. _s_elo(rating_i, rating_j)) — confirm whether
        # the argument order and T_select are intentional.
        s_elo_val = self._s_elo(rating_j, rating_i)
        print("s_elo_val:", s_elo_val)

        if s_elo_val < T_select:
            return pol_j_id
        return None

    def _inherit(self, trainer, pol_i_id, pol_j_id):
        """Copy policy j's weights into policy i and blend their hyperparameters."""
        pol_i = "p_" + str(pol_i_id)
        pol_j = "p_" + str(pol_j_id)
        print("{}_vs_{}".format(pol_i, pol_j))

        # copy param_j to param_i
        self._cp_weight(trainer, pol_j, pol_i)

        # m is the weight given to agent i's own hyperparameters (1-m goes to
        # agent j's). With binomial_n == 1 this is a 0/1 coin flip.
        m = np.random.binomial(self.binomial_n, self.inherit_prob, size=1)[0]
        self._inherit_hyperparameters(trainer, pol_j, pol_i, m)

    def _cp_weight(self, trainer, src, dest):
        """
        Copy weights of source policy to destination policy.

        Source values are matched to destination variable names by position,
        then an assertion verifies that both policies hold equal weights.
        """
        dest_weights = trainer.get_policy(dest).get_weights()
        src_weights = trainer.get_policy(src).get_weights()
        renamed = dict(zip(dest_weights.keys(), src_weights.values()))

        trainer.set_weights({dest: renamed,
                             src: trainer.get_policy(src).get_weights()})

        for (k, v), (k2, v2) in zip(trainer.get_policy(dest).get_weights().items(),
                                    trainer.get_policy(src).get_weights().items()):
            assert (v == v2).all()

    def _inherit_hyperparameters(self, trainer, src, dest, m):
        """Set dest's lr and gamma to m * dest + (1 - m) * src.

        NOTE(review): only the policy's config dict is changed here; whether
        the running optimizer picks up the new values is not visible in this
        file — confirm against RLlib.
        """
        src_pol = trainer.get_policy(src)
        print("src_pol.config['lr']", src_pol.config["lr"])

        dest_pol = trainer.get_policy(dest)
        print("dest_pol.config['lr']", dest_pol.config["lr"])

        dest_pol.config["lr"] = m * dest_pol.config["lr"] + (1-m) * src_pol.config["lr"]
        dest_pol.config["gamma"] = m * dest_pol.config["gamma"] + (1-m) * src_pol.config["gamma"]
        print("src_pol.config['lr']", src_pol.config["lr"])
        print("dest_pol.config['lr']", dest_pol.config["lr"])

    def _mutate(self, trainer, pol_i_id, store):
        """
        Resample lr and gamma with probability `perturb_prob`; otherwise
        scale lr by one of the two `perturb_val` factors with equal chance.
        Gamma is never perturbed, only resampled. The resulting values are
        appended to the agent's history in `store`.
        """
        pol_i = "p_" + str(pol_i_id)
        pol = trainer.get_policy(pol_i)

        if random.random() < self.perturb_prob:     # resample
            pol.config["lr"] = np.random.uniform(low=0.00001, high=0.1, size=None)
            pol.config["gamma"] = np.random.uniform(low=0.9, high=0.999, size=None)
        elif random.random() < 0.5:     # scale by the lower factor
            pol.config["lr"] = pol.config["lr"] * self.perturb_val[0]
        else:     # scale by the upper factor
            pol.config["lr"] = pol.config["lr"] * self.perturb_val[1]

        # update hyperparameters in storage
        key = "agt_" + str(pol_i_id)
        store[key]["hyperparameters"]["lr"].append(pol.config["lr"])
        store[key]["hyperparameters"]["gamma"].append(pol.config["gamma"])

    def PBT(self, trainer, store):
        """
        For every agent i in the population: if i is eligible, select an
        agent j (i != j); if j qualifies as a parent, inherit from it
        (exploit) and then mutate (explore: perturb/resample).
        """
        for pol_i_id in range(self.population_size):
            if not self._is_eligible(pol_i_id):
                continue
            pol_j_id = self._select_agt_j(pol_i_id, self.population_size, store, self.T_select)
            if pol_j_id is None:
                continue
            if self._is_parent(pol_j_id):
                self._inherit(trainer, pol_i_id, pol_j_id)
                self._mutate(trainer, pol_i_id, store)

# Helper class

In [0]:
@ray.remote(num_cpus=0.25, num_gpus=0)
class Helper:
    """Ray actor holding shared state: the current agent pair and per-agent stats."""

    def __init__(self, population_size, policies):
        self.population_size = population_size
        self.agt_i, self.agt_j = None, None
        self.policies = policies
        self.agt_store = self._create_agt_store(population_size, policies)

    def set_pair(self):
        """Draw two distinct agent indices at random and store their keys."""
        first, second = np.random.randint(low=0, high=self.population_size, size=2)
        # Redraw the second index until it differs from the first.
        while first == second:
            second = np.random.randint(low=0, high=self.population_size, size=None)

        self.agt_i = "agt_{}".format(first)
        self.agt_j = "agt_{}".format(second)

    def get_pair(self):
        """Return the (agt_i, agt_j) keys chosen by the last set_pair() call."""
        return self.agt_i, self.agt_j

    def _create_agt_store(self, population_size, policies):
        """
        Build the per-agent statistics store, keyed "agt_<i>", and seed it
        with each agent's initial values.
        """
        empty_store = {
            "agt_{}".format(str(idx)): {
                "hyperparameters": {"lr": [], "gamma": []},
                "score": [],
                "rating": [],
                "step": [],      # Steps since last evolved.
            }
            for idx in range(0, population_size)
        }
        return self._init_hyperparameters(empty_store, policies)

    def _init_hyperparameters(self, store, policies):
        """
        Append the starting entry to every history list: lr and gamma come
        from the matching policy spec, score/rating/step start at zero.
        """
        for agt_key, stats in store.items():
            _, idx_str = agt_key.split("_")
            # Element 3 of a policy spec tuple is its config dict.
            pol_config = policies["p_" + idx_str][3]

            stats["hyperparameters"]["lr"].append(pol_config["lr"])
            stats["hyperparameters"]["gamma"].append(pol_config["gamma"])
            stats["score"].append(0)
            stats["rating"].append(0.0)
            stats["step"].append(0)

        return store

    def get_agt_store(self):
        """Return the whole statistics store."""
        return self.agt_store

    def update_rating(self, agt_i_key, agt_j_key, rating_i, rating_j, score_i, score_j):
        """Append the latest score and rating of both agents to their histories."""
        stats_i = self.agt_store[agt_i_key]
        stats_j = self.agt_store[agt_j_key]
        stats_i["score"].append(score_i)
        stats_j["score"].append(score_j)
        stats_i["rating"].append(rating_i)
        stats_j["rating"].append(rating_j)

    def get_rating(self, agt_key):
        """Return the full rating history (a list) of the given agent."""
        return self.agt_store[agt_key]["rating"]

# Callbacks

In [0]:
class MyCallbacks(DefaultCallbacks):
    """RLlib callbacks that keep Elo ratings up to date and run PBT after each training iteration."""

    def on_episode_start(self, worker: RolloutWorker, base_env: BaseEnv,
                         policies: Dict[str, Policy],
                         episode: MultiAgentEpisode, **kwargs):
        """Log the new episode and create an empty "episode_id" history list."""
        print("on_episode_start {}, _agent_to_policy {}".format(episode.episode_id, episode._agent_to_policy))
        episode.hist_data["episode_id"] = []

    def on_episode_step(self, worker: RolloutWorker, base_env: BaseEnv,
                        episode: MultiAgentEpisode, **kwargs):
        """No per-step processing is done."""
        pass

    def on_episode_end(self, worker: RolloutWorker, base_env: BaseEnv,
                       policies: Dict[str, Policy], episode: MultiAgentEpisode,
                       **kwargs):
        """Update the Elo ratings of the two agents that played this episode.

        The first two entries of `episode.agent_rewards` are used; element 1
        of each key is taken as the policy id "p_<i>", which maps to the
        store key "agt_<i>".
        """
        print("on_episode_end {}, episode.agent_rewards {}".format(episode.episode_id, episode.agent_rewards))

        player_policy = []
        score = []
        for k, v in episode.agent_rewards.items():
            player_policy.append(k)
            score.append(v)

        pol_i_key = player_policy[0][1]
        pol_j_key = player_policy[1][1]
        _, str_i = pol_i_key.split("_")
        _, str_j = pol_j_key.split("_")
        agt_i_key = "agt_" + str_i
        agt_j_key = "agt_" + str_j

        g_helper = ray.util.get_actor("g_helper")     # get global object
        prev_rating_i = ray.get(g_helper.get_rating.remote(agt_i_key))
        prev_rating_j = ray.get(g_helper.get_rating.remote(agt_j_key))
        score_i = score[0]
        score_j = score[1]
        # get_rating returns the whole rating history; update from the latest
        # entry ([-1]), not the initial one ([0]), so ratings accumulate
        # across episodes (PBT_MARL._select_agt_j also reads [-1]).
        rating_i, rating_j = l_PBT_MARL.compute_rating(prev_rating_i[-1], prev_rating_j[-1], score_i, score_j)
        ray.get(g_helper.update_rating.remote(agt_i_key, agt_j_key, rating_i, rating_j, score_i, score_j))
        print("on_episode_end ray.get(g_helper.get_agt_store.remote())", ray.get(g_helper.get_agt_store.remote()))

    def on_sample_end(self, worker: RolloutWorker, samples: SampleBatch,
                      **kwargs):
        """Log the size of the sample batch that was just collected."""
        print("on_sample_end returned sample batch of size {}".format(samples.count))

    def on_train_result(self, trainer, result: dict, **kwargs):
        """Log the training result, run one PBT round and pick the next agent pair."""
        print("trainer.train() result: {} -> {} episodes".format(trainer, result["episodes_this_iter"]))
        # you can mutate the result dict to add new fields to return
        result["callback_ok"] = True
        print("on_train_result result", result)

        g_helper = ray.util.get_actor("g_helper")     # get global object
        agt_store = ray.get(g_helper.get_agt_store.remote())
        # NOTE(review): agt_store is presumably a local copy of the actor's
        # state, so hyperparameter history appended during PBT may not reach
        # the helper actor — confirm.
        l_PBT_MARL.PBT(trainer, agt_store)     # perform PBT
        print("on_train_result agt_store", agt_store)
        ray.get(g_helper.set_pair.remote())     # set the latest pair
        print("on_train_result g_helper.get_pair.remote()", ray.get(g_helper.get_pair.remote()))

    def on_postprocess_trajectory(
            self, worker: RolloutWorker, episode: MultiAgentEpisode,
            agent_id: str, policy_id: str, policies: Dict[str, Policy],
            postprocessed_batch: SampleBatch,
            original_batches: Dict[str, SampleBatch], **kwargs):
        """Log which agent/policy batch was post-processed and its step count."""
        print("postprocessed {}, {}, {}, {} steps".format(episode, agent_id, policy_id, postprocessed_batch.count))

# Policy

In [0]:
def init_policies(population_size, obs_space, act_space, use_lstm, hyperparameters_range):
    """
    Build one policy spec per agent, keyed "p_<i>".

    Each spec is the tuple (None, obs_space, act_space, config) where the
    config carries the model's "use_lstm" flag plus an "lr" and a "gamma"
    drawn uniformly from the [low, high] pairs in `hyperparameters_range`.
    """
    def _draw(name):
        # Uniform sample between the two bounds listed for this hyperparameter.
        bounds = hyperparameters_range[name]
        return np.random.uniform(low=bounds[0], high=bounds[1], size=None)

    specs = {}
    for idx in range(population_size):
        # Draw order matters for reproducibility: lr first, then gamma.
        sampled_lr = _draw("lr")
        sampled_gamma = _draw("gamma")
        pol_config = {
            "model": {"use_lstm": use_lstm},
            "lr": sampled_lr,
            "gamma": sampled_gamma,
        }
        specs["p_{}".format(idx)] = (None, obs_space, act_space, pol_config)
    return specs

def train_policies(population_size):
    """
    Return the ids of the policies to train: ["p_0", ..., "p_<population_size - 1>"].

    An empty list is returned when population_size is 0.
    """
    # Return the freshly built list of ids rather than any module-level object.
    return ["p_" + str(i) for i in range(population_size)]

def select_policy(agent_id):
    """Map an agent id of the form "<prefix>_<i>" to its policy id "p_<i>" and log the mapping."""
    # Exactly one underscore is expected; anything else fails the unpacking.
    _prefix, index = agent_id.split("_")
    policy_id = "p_{}".format(index)
    print("select_policy {} {}".format(agent_id, policy_id))
    return policy_id

# Global variables

In [9]:
# PBT / Elo settings passed to PBT_MARL below.
population_size = 6
K = 0.1
T_select = 0.6 #0.47
binomial_n = 1
inherit_prob = 0.5
perturb_prob = 0.1
perturb_val = [0.8, 1.2]
# [low, high] bounds used to sample each policy's initial hyperparameters.
hyperparameters_range = {"lr": [0.00001, 0.01],
                         "gamma": [0.9, 0.999]}

# Register RockPaperScissorsEnv with RLlib under a string name.
register_env("RockPaperScissorsEnv", lambda env_config: RockPaperScissorsEnv(env_config, population_size))
# Get obs & act spaces from a throwaway env instance. The env ignores its
# first argument, so pass None explicitly instead of relying on IPython's
# `_` (last cell output) variable.
dummy_env = RockPaperScissorsEnv(None, population_size=0)
obs_space = dummy_env.observation_space
act_space = dummy_env.action_space

use_lstm=False
policies = init_policies(population_size, obs_space, act_space, use_lstm, hyperparameters_range)
# NOTE: this rebinds the name `train_policies` from the function to its
# result (get_config reads the result under this name), so re-running this
# cell requires re-running the cell that defines the function first.
train_policies = train_policies(population_size)

l_PBT_MARL = PBT_MARL(population_size,
                      K, T_select,
                      binomial_n, inherit_prob,
                      perturb_prob, perturb_val)

# Restart Ray so the cell can be re-run against a clean runtime.
ray.shutdown()
#ray.init(ignore_reinit_error=True, log_to_driver=True, webui_host='127.0.0.1', num_cpus=2)      #start ray
ray.init(ignore_reinit_error=True, log_to_driver=True, webui_host='127.0.0.1', num_cpus=2, num_gpus=1)      #start ray
print("ray.nodes()", ray.nodes())

# Named actor looked up elsewhere via ray.util.get_actor("g_helper").
g_helper = Helper.options(name="g_helper").remote(population_size, policies)
# Choose the first pair of agents before any environment is reset.
ray.get(g_helper.set_pair.remote())

num_iters = 5     # num of main training loop

2020-06-10 04:01:21,167	INFO resource_spec.py:212 -- Starting Ray with 7.13 GiB memory available for workers and up to 3.58 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).
2020-06-10 04:01:21,580	INFO services.py:1170 -- View the Ray dashboard at [1m[32m127.0.0.1:8265[39m[22m


ray.nodes() [{'NodeID': '921753200830eebae7bcc7f8461a0e8a90e4bf19', 'Alive': True, 'NodeManagerAddress': '172.28.0.2', 'NodeManagerHostname': '685afc1c1019', 'NodeManagerPort': 57387, 'ObjectManagerPort': 45665, 'ObjectStoreSocketName': '/tmp/ray/session_2020-06-10_04-01-21_166152_4535/sockets/plasma_store', 'RayletSocketName': '/tmp/ray/session_2020-06-10_04-01-21_166152_4535/sockets/raylet', 'Resources': {'node:172.28.0.2': 1.0, 'GPU': 1.0, 'object_store_memory': 50.0, 'memory': 146.0, 'CPU': 2.0}, 'alive': True}]


# Config

In [0]:
def get_config():
    """Return a DD-PPO trainer config for the multi-agent rock-paper-scissors setup.

    Starts from a copy of `ddppo.DEFAULT_CONFIG` and overrides the
    environment, multi-agent mapping, resource, sampling, SGD, callback and
    logging entries.
    """
    config = ddppo.DEFAULT_CONFIG.copy()

    overrides = {
        "env": RockPaperScissorsEnv,
        # One policy per agent; agent ids are mapped to policy ids by select_policy.
        "multiagent": {"policies_to_train": train_policies,
                       "policies": policies,
                       "policy_mapping_fn": select_policy},
        # Fractional per-worker resources so two workers fit the 2-CPU / 1-GPU Ray runtime.
        "num_cpus_per_worker": 0.25,
        "num_gpus_per_worker": 0.125,
        "num_workers": 2,
        # Several env copies per worker (vectorized sampling); see
        # https://docs.ray.io/en/master/rllib-env.html#vectorized
        "num_envs_per_worker": 3,
        # 30 steps per rollout fragment (presumably 3 envs x 10-move episodes — confirm).
        "rollout_fragment_length": 30,
        # -1: presumably lets DD-PPO derive the batch size from the rollout settings — confirm against RLlib docs.
        "train_batch_size": -1,
        # Smaller than the RLlib defaults noted by the original author (128 and 30).
        "sgd_minibatch_size": 10,
        "num_sgd_iter": 3,
        "callbacks": MyCallbacks,
        "log_level": "WARN",                            # WARN/INFO/DEBUG
        "output": log_dir,
    }
    config.update(overrides)

    return config

# Train

In [11]:
def go_train(config):
    """Create a DD-PPO trainer on "RockPaperScissorsEnv" and run `num_iters` training iterations.

    When the module-level `is_restore` flag is set, the trainer is first
    restored from `restore_path`. Each iteration's result is pretty-printed.
    Checkpoint saving is not performed.
    """
    trainer = ddppo.DDPPOTrainer(config=config, env="RockPaperScissorsEnv")

    if is_restore == True:
        trainer.restore(restore_path)

    for iteration in range(1, num_iters + 1):
        result = trainer.train()
        print("training loop = {} of {}".format(iteration, num_iters))
        print(pretty_print(result))     # includes result["custom_metrics"]

# Run everything: build the trainer config and execute the training loop.
go_train(get_config())    

# Shut Ray down once training has finished.
ray.shutdown()

2020-06-10 04:01:24,079	INFO trainer.py:421 -- Tip: set 'eager': true or the --eager flag to enable TensorFlow eager execution
2020-06-10 04:01:24,124	INFO trainer.py:580 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
2020-06-10 04:01:36,322	INFO trainable.py:180 -- _setup took 12.200 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.
2020-06-10 04:01:36,323	INFO trainable.py:217 -- Getting current IP.


[2m[36m(pid=4591)[0m on_episode_start 1574408856, _agent_to_policy {}
[2m[36m(pid=4591)[0m select_policy agt_0 p_0
[2m[36m(pid=4591)[0m select_policy agt_2 p_2
[2m[36m(pid=4591)[0m on_episode_start 221929406, _agent_to_policy {}
[2m[36m(pid=4591)[0m select_policy agt_0 p_0
[2m[36m(pid=4591)[0m select_policy agt_2 p_2
[2m[36m(pid=4591)[0m on_episode_start 651246668, _agent_to_policy {}
[2m[36m(pid=4591)[0m select_policy agt_0 p_0
[2m[36m(pid=4591)[0m select_policy agt_2 p_2
[2m[36m(pid=4644)[0m on_episode_start 472991024, _agent_to_policy {}
[2m[36m(pid=4644)[0m select_policy agt_0 p_0
[2m[36m(pid=4644)[0m select_policy agt_2 p_2
[2m[36m(pid=4644)[0m on_episode_start 642268686, _agent_to_policy {}
[2m[36m(pid=4644)[0m select_policy agt_0 p_0
[2m[36m(pid=4644)[0m select_policy agt_2 p_2
[2m[36m(pid=4644)[0m on_episode_start 812934918, _agent_to_policy {}
[2m[36m(pid=4644)[0m select_policy agt_0 p_0
[2m[36m(pid=4644)[0m select_policy a



trainer.train() result: <ray.rllib.agents.trainer_template.DDPPO object at 0x7f87490afe10> -> 18 episodes
on_train_result result {'episode_reward_max': 0.0, 'episode_reward_min': 0.0, 'episode_reward_mean': 0.0, 'episode_len_mean': 10.0, 'episodes_this_iter': 18, 'policy_reward_min': {'p_0': -2.0, 'p_2': -5.0}, 'policy_reward_max': {'p_0': 5.0, 'p_2': 2.0}, 'policy_reward_mean': {'p_0': 0.3888888888888889, 'p_2': -0.3888888888888889}, 'custom_metrics': {}, 'hist_stats': {'episode_id': [], 'episode_reward': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], 'episode_lengths': [10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10], 'policy_p_0_reward': [-2.0, -2.0, -1.0, 2.0, 0.0, 2.0, 1.0, 1.0, -2.0, 2.0, -1.0, -2.0, 3.0, 1.0, 5.0, -1.0, 0.0, 1.0], 'policy_p_2_reward': [2.0, 2.0, 1.0, -2.0, 0.0, -2.0, -1.0, -1.0, 2.0, -2.0, 1.0, 2.0, -3.0, -1.0, -5.0, 1.0, 0.0, -1.0]}, 'sampler_perf': {'mean_env_wait_ms': 0.2979155509702621, 'me