In [1]:
# Needed to switch directory in Google drive so as to import MARL env.
from google.colab import drive 
drive.mount('/content/gdrive')
%cd "/content/gdrive/My Drive/Colab Notebooks/gym-continuousDoubleAuction/"
!pwd
!pip install -r requirements.txt
!pip show tensorflow
!pip show ray

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
/content/gdrive/My Drive/Colab Notebooks/gym-continuousDoubleAuction
/content/gdrive/My Drive/Colab Notebooks/gym-continuousDoubleAuction
Obtaining gym_continuousDoubleAuction from git+https://github.com/ChuaCheowHuan/gym-continuousDoubleAuction.git@c897137cbcc93ca71cbd51c27e683c3298f6562d#egg=gym_continuousDoubleAuction (from -r requirements.txt (line 24))
  Skipping because already up-to-date.
Installing collected packages: gym-continuousDoubleAuction
  Found existing installation: gym-continuousDoubleAuction 0.0.1
    Can't uninstall 'gym-continuousDoubleAuction'. No files were found to uninstall.
  Running setup.py develop for gym-continuousDoubleAuction
Successfully installed gym-continuousDoubleAuction
Name: tensorflow
Version: 2.1.0
Summary: TensorFlow is an open source machine learning framework for everyone.
Home-page: https://www.tensorflow.org/
A

In [0]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
os.environ['RAY_DEBUG_DISABLE_MEMORY_MONITOR'] = "True"

import argparse
import gym
import random
import numpy as np

import ray
from ray import tune
from ray.rllib.utils import try_import_tf
from ray.tune.registry import register_env
from ray.rllib.models.tf.tf_modelv2 import TFModelV2
from ray.rllib.models.tf.fcnet_v2 import FullyConnectedNetwork
from ray.rllib.models import Model, ModelCatalog
from ray.rllib.policy.policy import Policy
from ray.rllib.agents.ppo import ppo
from ray.rllib.agents.ppo.ppo import PPOTrainer
from ray.rllib.agents.ppo.ppo_tf_policy import PPOTFPolicy
from ray.tune.logger import pretty_print


import sys
if "../" not in sys.path:
    sys.path.append("../")

from gym_continuousDoubleAuction.envs.continuousDoubleAuction_env import continuousDoubleAuctionEnv

tf = try_import_tf()

In [0]:
class CustomModel_1(Model):
    def _lstm(self, Inputs, cell_size):
        s = tf.expand_dims(Inputs, axis=1, name='time_major')  # [time_step, feature] => [time_step, batch, feature]
        lstm_cell = tf.nn.rnn_cell.LSTMCell(cell_size)
        self.init_state = lstm_cell.zero_state(batch_size=1, dtype=tf.float32)
        # time_major means [time_step, batch, feature] while batch major means [batch, time_step, feature]
        outputs, self.final_state = tf.nn.dynamic_rnn(cell=lstm_cell, inputs=s, initial_state=self.init_state, time_major=True)
        lstm_out = tf.reshape(outputs, [-1, cell_size], name='flatten_rnn_outputs')  # joined state representation
        return lstm_out

    def _build_layers_v2(self, input_dict, num_outputs, options):
        hidden = 512
        cell_size = 256
        #S = input_dict["obs"]
        S = tf.layers.flatten(input_dict["obs"])
        with tf.variable_scope(tf.VariableScope(tf.AUTO_REUSE, "shared"),
                               reuse=tf.AUTO_REUSE,
                               auxiliary_name_scope=False):
            last_layer = tf.layers.dense(S, hidden, activation=tf.nn.relu, name="fc1")
        last_layer = tf.layers.dense(last_layer, hidden, activation=tf.nn.relu, name="fc2")
        last_layer = tf.layers.dense(last_layer, hidden, activation=tf.nn.relu, name="fc3")

        last_layer = self._lstm(last_layer, cell_size)

        output = tf.layers.dense(last_layer, num_outputs, activation=tf.nn.softmax, name="mu")

        return output, last_layer

In [0]:
def make_RandomPolicy(_seed):

    # a hand-coded policy that acts at random in the env (doesn't learn)
    class RandomPolicy(Policy):
        """Hand-coded policy that returns random actions."""
        def __init__(self, observation_space, action_space, config):
            self.observation_space = observation_space
            self.action_space = action_space
            self.action_space.seed(_seed)

        def compute_actions(self,
                            obs_batch,
                            state_batches,
                            prev_action_batch=None,
                            prev_reward_batch=None,
                            info_batch=None,
                            episodes=None,
                            **kwargs):
            """Compute actions on a batch of observations."""
            return [self.action_space.sample() for _ in obs_batch], [], {}

        def learn_on_batch(self, samples):
            """No learning."""
            #return {}
            pass

        def get_weights(self):
            pass

        def set_weights(self, weights):
            pass

    return RandomPolicy

In [5]:
ray.init(ignore_reinit_error=True, log_to_driver=False, webui_host='127.0.0.1', num_cpus=2)

2020-03-19 07:55:30,681	INFO resource_spec.py:212 -- Starting Ray with 6.74 GiB memory available for workers and up to 3.37 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).
2020-03-19 07:55:31,198	INFO services.py:1078 -- View the Ray dashboard at [1m[32m127.0.0.1:8265[39m[22m


{'node_ip_address': '172.28.0.2',
 'object_store_address': '/tmp/ray/session_2020-03-19_07-55-30_679121_5591/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2020-03-19_07-55-30_679121_5591/sockets/raylet',
 'redis_address': '172.28.0.2:38993',
 'session_dir': '/tmp/ray/session_2020-03-19_07-55-30_679121_5591',
 'webui_url': '127.0.0.1:8265'}

In [0]:
num_agents = 4
num_policies = num_agents
num_iters = 3
simple = False #store_true
num_of_traders = num_agents
tape_display_length = 10 
tick_size = 1
init_cash = 1000000
max_step = 700 # per episode 
episode = 5 

In [7]:
single_CDA_env = continuousDoubleAuctionEnv(num_of_traders, init_cash, tick_size, tape_display_length, max_step)
obs_space = single_CDA_env.observation_space
act_space = single_CDA_env.action_space
register_env("continuousDoubleAuction-v0", lambda _: continuousDoubleAuctionEnv(num_of_traders, init_cash, tick_size, tape_display_length, max_step))
ModelCatalog.register_custom_model("model_disc", CustomModel_1)



In [0]:
# Each policy can have a different configuration (including custom model)
def gen_policy(i):
    config = {"model": {"custom_model": "model_disc"},
              "gamma": 0.99,}
    return (None, obs_space, act_space, config)

In [0]:
def policy_mapper(agent_id):
    for i in range(num_agents):
        if agent_id == i:
            return "policy_{}".format(i)

In [0]:
# Dictionary of policies
policies = {"policy_{}".format(i): gen_policy(i) for i in range(num_policies)}

In [11]:
# override policy with random policy

def set_RandomPolicy(policies):
    """
    Set 1st policy as PPO & override all other policies as RandomPolicy with
    different seed.
    """

    for i in range(num_agents):
        if i == num_agents-1:
            break
        x = i + 1
        policies["policy_{}".format(num_policies-x)] = (make_RandomPolicy(num_policies-x), obs_space, act_space, {})

    print('policies:', policies)
    return 0

set_RandomPolicy(policies)

policy_ids = list(policies.keys())

policies: {'policy_0': (None, Box(4, 10), Tuple(Discrete(3), Discrete(4), Box(1,), Box(1,), Discrete(12)), {'model': {'custom_model': 'model_disc'}, 'gamma': 0.99}), 'policy_1': (<class '__main__.make_RandomPolicy.<locals>.RandomPolicy'>, Box(4, 10), Tuple(Discrete(3), Discrete(4), Box(1,), Box(1,), Discrete(12)), {}), 'policy_2': (<class '__main__.make_RandomPolicy.<locals>.RandomPolicy'>, Box(4, 10), Tuple(Discrete(3), Discrete(4), Box(1,), Box(1,), Discrete(12)), {}), 'policy_3': (<class '__main__.make_RandomPolicy.<locals>.RandomPolicy'>, Box(4, 10), Tuple(Discrete(3), Discrete(4), Box(1,), Box(1,), Discrete(12)), {})}


In [12]:
def main(args):    
    config = ppo.DEFAULT_CONFIG.copy()
    config["log_level"] = "DEBUG"
    config["multiagent"] = {"policies_to_train": ["policy_0"],
                            "policies": policies,
                            "policy_mapping_fn": policy_mapper,
                           }
    config["num_workers"] = 1
    config["num_envs_per_worker"] = 4   
    config["train_batch_size"] = 128
    config["batch_mode"] = "complete_episodes"
    config["sample_batch_size"] = 32

    local_dir="/content/gdrive/My Drive/Colab Notebooks/gym-continuousDoubleAuction/chkpt/" # dir where your chkpts are saved
    chkpt = 59 # set the restore chkpt
    restore_path = "{}checkpoint_{}/checkpoint-{}".format(local_dir, chkpt, chkpt)

    trainer = ppo.PPOTrainer(config=config, 
                             env="continuousDoubleAuction-v0")    
    #trainer.restore(restore_path) # uncomment this to restore from chkpt

    for i in range(30):
        # Perform one iteration of training the policy with PPO
        result = trainer.train()
        print(pretty_print(result))
     
        if i % 3 == 0:
            checkpoint = trainer.save(local_dir)
            print("checkpoint saved at", checkpoint)

    checkpoint = trainer.save(local_dir)
    print("checkpoint saved at", checkpoint)


# run main
main(None)            

2020-03-19 07:55:32,298	INFO trainer.py:420 -- Tip: set 'eager': true or the --eager flag to enable TensorFlow eager execution
2020-03-19 07:55:32,477	INFO ppo.py:165 -- In multi-agent mode, policies will be optimized sequentially by the multi-GPU optimizer. Consider setting simple_optimizer=True if this doesn't work for you.
2020-03-19 07:55:32,539	DEBUG worker_set.py:179 -- Creating TF session {'intra_op_parallelism_threads': 8, 'inter_op_parallelism_threads': 8, 'gpu_options': {'allow_growth': True}, 'log_device_placement': False, 'device_count': {'CPU': 1}, 'allow_soft_placement': True}
2020-03-19 07:55:32,619	DEBUG rollout_worker.py:791 -- Creating policy for policy_0
2020-03-19 07:55:32,640	DEBUG catalog.py:406 -- Created preprocessor <ray.rllib.models.preprocessors.NoPreprocessor object at 0x7fdac77f9e48>: Box(4, 10) -> (4, 10)
2020-03-19 07:55:32,668	DEBUG catalog.py:532 -- Using custom model model_disc
2020-03-19 07:55:33,549	DEBUG catalog.py:522 -- Created model <__main__.Cus

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unexpected EOF while parsing (<unknown>, line 5)
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unexpected EOF while parsing (<unknown>, line 5)


2020-03-19 07:55:35,484	DEBUG catalog.py:532 -- Using custom model model_disc
2020-03-19 07:55:35,594	DEBUG catalog.py:522 -- Created model <__main__.CustomModel_1 object at 0x7fdab6c5dda0>: ({'obs': <tf.Tensor 'policy_0/observation:0' shape=(?, 4, 10) dtype=float32>, 'prev_actions': <tf.Tensor 'policy_0/prev_action:0' shape=(?, 5) dtype=float32>, 'prev_rewards': <tf.Tensor 'policy_0/prev_reward:0' shape=(?,) dtype=float32>, 'is_training': <tf.Tensor 'policy_0/is_training:0' shape=() dtype=bool>} of Box(4, 10), Tuple(Discrete(3), Discrete(4), Box(1,), Box(1,), Discrete(12)), None, None) -> Tensor("policy_0/default_model_2/value_function/mu/Softmax:0", shape=(?, 1), dtype=float32), []
2020-03-19 07:55:35,671	DEBUG catalog.py:532 -- Using custom model model_disc
2020-03-19 07:55:35,756	DEBUG catalog.py:522 -- Created model <__main__.CustomModel_1 object at 0x7fdab6bac5f8>: ({'obs': <tf.Tensor 'policy_0/packed:0' shape=(1, 4, 10) dtype=float32>, 'prev_actions': <tf.Tensor 'policy_0/packed

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unexpected EOF while parsing (<unknown>, line 5)
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unexpected EOF while parsing (<unknown>, line 5)


2020-03-19 07:55:38,673	DEBUG catalog.py:522 -- Created model <__main__.CustomModel_1 object at 0x7fdab5c8d3c8>: ({'obs': <tf.Tensor 'policy_0/observation:0' shape=(?, 4, 10) dtype=float32>, 'prev_actions': <tf.Tensor 'policy_0/prev_action:0' shape=(?, 5) dtype=float32>, 'prev_rewards': <tf.Tensor 'policy_0/prev_reward:0' shape=(?,) dtype=float32>, 'is_training': <tf.Tensor 'policy_0_1/tower/is_training:0' shape=() dtype=bool>} of Box(4, 10), Tuple(Discrete(3), Discrete(4), Box(1,), Box(1,), Discrete(12)), None, None) -> Tensor("policy_0_1/tower/default_model_1/value_function/mu/Softmax:0", shape=(?, 1), dtype=float32), []
2020-03-19 07:55:38,677	DEBUG catalog.py:532 -- Using custom model model_disc
2020-03-19 07:55:38,790	DEBUG catalog.py:522 -- Created model <__main__.CustomModel_1 object at 0x7fdab5da7c88>: ({'obs': <tf.Tensor 'policy_0/observation:0' shape=(?, 4, 10) dtype=float32>, 'is_training': True, 'prev_actions': <tf.Tensor 'policy_0/prev_action:0' shape=(?, 5) dtype=float32>

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unexpected EOF while parsing (<unknown>, line 5)
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unexpected EOF while parsing (<unknown>, line 5)


2020-03-19 07:55:41,807	DEBUG catalog.py:522 -- Created model <__main__.CustomModel_1 object at 0x7fdab55b9fd0>: ({'obs': <tf.Tensor 'policy_0_1/tower_1/Slice_7:0' shape=(?, 4, 10) dtype=float32>, 'prev_actions': <tf.Tensor 'policy_0_1/tower_1/Slice_8:0' shape=(?, 5) dtype=float32>, 'prev_rewards': <tf.Tensor 'policy_0_1/tower_1/Slice_9:0' shape=(?,) dtype=float32>, 'is_training': <tf.Tensor 'policy_0_1/tower_1/is_training:0' shape=() dtype=bool>} of Box(4, 10), Tuple(Discrete(3), Discrete(4), Box(1,), Box(1,), Discrete(12)), None, None) -> Tensor("policy_0_1/tower_1/default_model_1/value_function/mu/Softmax:0", shape=(?, 1), dtype=float32, device=/device:CPU:0), []
2020-03-19 07:55:41,810	DEBUG catalog.py:532 -- Using custom model model_disc
2020-03-19 07:55:41,930	DEBUG catalog.py:522 -- Created model <__main__.CustomModel_1 object at 0x7fdab5572908>: ({'obs': <tf.Tensor 'policy_0_1/tower_1/Slice_7:0' shape=(?, 4, 10) dtype=float32>, 'is_training': True, 'prev_actions': <tf.Tensor 'p

custom_metrics: {}
date: 2020-03-19_07-58-27
done: false
episode_len_mean: 701.0
episode_reward_max: 0.0
episode_reward_mean: 0.0
episode_reward_min: 0.0
episodes_this_iter: 1
episodes_total: 30
experiment_id: b6d1c176f8974af2bfd45911d6888178
hostname: 91380f02bba8
info:
  grad_time_ms: 78841.899
  learner:
    policy_0:
      cur_kl_coeff: 0.20000000298023224
      cur_lr: 4.999999873689376e-05
      entropy: 7.936832427978516
      entropy_coeff: 0.0
      kl: 0.023239200934767723
      policy_loss: -0.06438010931015015
      total_loss: 109154048.0
      vf_explained_var: 0.0
      vf_loss: 109154048.0
  load_time_ms: 178.916
  num_steps_sampled: 21030
  num_steps_trained: 19200
  sample_time_ms: 77889.172
  update_time_ms: 1990.241
iterations_since_restore: 1
node_ip: 172.28.0.2
num_healthy_workers: 1
off_policy_estimator: {}
perf:
  cpu_util_percent: 76.99649122807018
  ram_util_percent: 16.387719298245617
pid: 5591
policy_reward_max:
  policy_0: 13713.0
  policy_1: 15304.0
  poli

2020-03-19 07:58:30,104	DEBUG multi_gpu_optimizer.py:205 -- 0 {'cur_kl_coeff': 0.30000001192092896, 'cur_lr': 4.999999873689376e-05, 'total_loss': 44014260.0, 'policy_loss': 0.01968052, 'vf_loss': 44014260.0, 'vf_explained_var': 0.0, 'kl': 0.026977161, 'entropy': 7.9116945, 'entropy_coeff': 0.0}
2020-03-19 07:58:32,717	DEBUG multi_gpu_optimizer.py:205 -- 1 {'cur_kl_coeff': 0.30000001192092896, 'cur_lr': 4.999999873689376e-05, 'total_loss': 44014260.0, 'policy_loss': -0.009047096, 'vf_loss': 44014260.0, 'vf_explained_var': 0.0, 'kl': 0.020211179, 'entropy': 7.8842125, 'entropy_coeff': 0.0}
2020-03-19 07:58:35,313	DEBUG multi_gpu_optimizer.py:205 -- 2 {'cur_kl_coeff': 0.30000001192092896, 'cur_lr': 4.999999873689376e-05, 'total_loss': 44014260.0, 'policy_loss': -0.021288987, 'vf_loss': 44014260.0, 'vf_explained_var': 0.0, 'kl': 0.019801695, 'entropy': 7.871657, 'entropy_coeff': 0.0}
2020-03-19 07:58:37,878	DEBUG multi_gpu_optimizer.py:205 -- 3 {'cur_kl_coeff': 0.30000001192092896, 'cur_l

custom_metrics: {}
date: 2020-03-19_07-59-44
done: false
episode_len_mean: 701.0
episode_reward_max: 0.0
episode_reward_mean: 0.0
episode_reward_min: 0.0
episodes_this_iter: 1
episodes_total: 31
experiment_id: b6d1c176f8974af2bfd45911d6888178
hostname: 91380f02bba8
info:
  grad_time_ms: 78018.323
  learner:
    policy_0:
      cur_kl_coeff: 0.30000001192092896
      cur_lr: 4.999999873689376e-05
      entropy: 7.867575645446777
      entropy_coeff: 0.0
      kl: 0.02151726558804512
      policy_loss: -0.08853279054164886
      total_loss: 44014260.0
      vf_explained_var: 0.0
      vf_loss: 44014260.0
  load_time_ms: 92.757
  num_steps_sampled: 21731
  num_steps_trained: 19840
  sample_time_ms: 38953.712
  update_time_ms: 1004.438
iterations_since_restore: 2
node_ip: 172.28.0.2
num_healthy_workers: 1
off_policy_estimator: {}
perf:
  cpu_util_percent: 97.46666666666665
  ram_util_percent: 16.59999999999999
pid: 5591
policy_reward_max:
  policy_0: 13713.0
  policy_1: 15304.0
  policy_2:

2020-03-19 07:59:47,387	DEBUG multi_gpu_optimizer.py:205 -- 0 {'cur_kl_coeff': 0.44999998807907104, 'cur_lr': 4.999999873689376e-05, 'total_loss': 386820100.0, 'policy_loss': 0.0202053, 'vf_loss': 386820100.0, 'vf_explained_var': 0.0, 'kl': 0.019114908, 'entropy': 7.8755827, 'entropy_coeff': 0.0}
2020-03-19 07:59:49,958	DEBUG multi_gpu_optimizer.py:205 -- 1 {'cur_kl_coeff': 0.44999998807907104, 'cur_lr': 4.999999873689376e-05, 'total_loss': 386820100.0, 'policy_loss': -0.0009257272, 'vf_loss': 386820100.0, 'vf_explained_var': 0.0, 'kl': 0.018304283, 'entropy': 7.875163, 'entropy_coeff': 0.0}
2020-03-19 07:59:52,503	DEBUG multi_gpu_optimizer.py:205 -- 2 {'cur_kl_coeff': 0.44999998807907104, 'cur_lr': 4.999999873689376e-05, 'total_loss': 386820100.0, 'policy_loss': -0.014823252, 'vf_loss': 386820100.0, 'vf_explained_var': 0.0, 'kl': 0.017951032, 'entropy': 7.896291, 'entropy_coeff': 0.0}
2020-03-19 07:59:55,055	DEBUG multi_gpu_optimizer.py:205 -- 3 {'cur_kl_coeff': 0.44999998807907104, '

custom_metrics: {}
date: 2020-03-19_08-01-02
done: false
episode_len_mean: 701.0
episode_reward_max: 0.0
episode_reward_mean: 0.0
episode_reward_min: 0.0
episodes_this_iter: 1
episodes_total: 32
experiment_id: b6d1c176f8974af2bfd45911d6888178
hostname: 91380f02bba8
info:
  grad_time_ms: 78020.689
  learner:
    policy_0:
      cur_kl_coeff: 0.44999998807907104
      cur_lr: 4.999999873689376e-05
      entropy: 7.923636436462402
      entropy_coeff: 0.0
      kl: 0.026573460549116135
      policy_loss: -0.11607272922992706
      total_loss: 386820096.0
      vf_explained_var: 0.0
      vf_loss: 386820096.0
  load_time_ms: 72.731
  num_steps_sampled: 22432
  num_steps_trained: 20480
  sample_time_ms: 25975.872
  update_time_ms: 676.315
iterations_since_restore: 3
node_ip: 172.28.0.2
num_healthy_workers: 1
off_policy_estimator: {}
perf:
  cpu_util_percent: 97.70720720720722
  ram_util_percent: 16.7
pid: 5591
policy_reward_max:
  policy_0: 128607.0
  policy_1: 15304.0
  policy_2: 7681.0
  

2020-03-19 08:01:05,565	DEBUG multi_gpu_optimizer.py:205 -- 0 {'cur_kl_coeff': 0.675000011920929, 'cur_lr': 4.999999873689376e-05, 'total_loss': 19377280.0, 'policy_loss': -0.019454587, 'vf_loss': 19377280.0, 'vf_explained_var': 0.0, 'kl': 0.026360055, 'entropy': 7.93011, 'entropy_coeff': 0.0}
2020-03-19 08:01:08,166	DEBUG multi_gpu_optimizer.py:205 -- 1 {'cur_kl_coeff': 0.675000011920929, 'cur_lr': 4.999999873689376e-05, 'total_loss': 19377280.0, 'policy_loss': -0.037374903, 'vf_loss': 19377280.0, 'vf_explained_var': 0.0, 'kl': 0.022316437, 'entropy': 7.90038, 'entropy_coeff': 0.0}
2020-03-19 08:01:10,810	DEBUG multi_gpu_optimizer.py:205 -- 2 {'cur_kl_coeff': 0.675000011920929, 'cur_lr': 4.999999873689376e-05, 'total_loss': 19377280.0, 'policy_loss': -0.044678174, 'vf_loss': 19377280.0, 'vf_explained_var': 0.0, 'kl': 0.020008132, 'entropy': 7.8777285, 'entropy_coeff': 0.0}
2020-03-19 08:01:13,387	DEBUG multi_gpu_optimizer.py:205 -- 3 {'cur_kl_coeff': 0.675000011920929, 'cur_lr': 4.999

custom_metrics: {}
date: 2020-03-19_08-02-21
done: false
episode_len_mean: 701.0
episode_reward_max: 0.0
episode_reward_mean: 0.0
episode_reward_min: 0.0
episodes_this_iter: 1
episodes_total: 33
experiment_id: b6d1c176f8974af2bfd45911d6888178
hostname: 91380f02bba8
info:
  grad_time_ms: 78226.561
  learner:
    policy_0:
      cur_kl_coeff: 0.675000011920929
      cur_lr: 4.999999873689376e-05
      entropy: 7.8822340965271
      entropy_coeff: 0.0
      kl: 0.01272624172270298
      policy_loss: -0.08991803973913193
      total_loss: 19377280.0
      vf_explained_var: 0.0
      vf_loss: 19377280.0
  load_time_ms: 57.414
  num_steps_sampled: 23133
  num_steps_trained: 21120
  sample_time_ms: 19485.868
  update_time_ms: 512.079
iterations_since_restore: 4
node_ip: 172.28.0.2
num_healthy_workers: 1
off_policy_estimator: {}
perf:
  cpu_util_percent: 97.79823008849559
  ram_util_percent: 16.800000000000004
pid: 5591
policy_reward_max:
  policy_0: 128607.0
  policy_1: 15304.0
  policy_2: 76

2020-03-19 08:03:41,625	INFO multi_gpu_optimizer.py:143 -- Collected more training samples than expected (actual=701, train_batch_size=128). This may be because you have many workers or long episodes in 'complete_episodes' batch mode.
2020-03-19 08:03:41,627	INFO tf_policy.py:395 -- Optimizing variable <tf.Variable 'policy_0/default_model/fc2/kernel:0' shape=(512, 512) dtype=float32_ref>
2020-03-19 08:03:41,631	INFO tf_policy.py:395 -- Optimizing variable <tf.Variable 'policy_0/default_model/fc2/bias:0' shape=(512,) dtype=float32_ref>
2020-03-19 08:03:41,633	INFO tf_policy.py:395 -- Optimizing variable <tf.Variable 'policy_0/default_model/fc3/kernel:0' shape=(512, 512) dtype=float32_ref>
2020-03-19 08:03:41,639	INFO tf_policy.py:395 -- Optimizing variable <tf.Variable 'policy_0/default_model/fc3/bias:0' shape=(512,) dtype=float32_ref>
2020-03-19 08:03:41,640	INFO tf_policy.py:395 -- Optimizing variable <tf.Variable 'policy_0/default_model/rnn/lstm_cell/kernel:0' shape=(768, 1024) dtype

custom_metrics: {}
date: 2020-03-19_08-05-00
done: false
episode_len_mean: 701.0
episode_reward_max: 0.0
episode_reward_mean: 0.0
episode_reward_min: 0.0
episodes_this_iter: 1
episodes_total: 34
experiment_id: b6d1c176f8974af2bfd45911d6888178
hostname: 91380f02bba8
info:
  grad_time_ms: 78343.883
  learner:
    policy_0:
      cur_kl_coeff: 0.675000011920929
      cur_lr: 4.999999873689376e-05
      entropy: 7.843354225158691
      entropy_coeff: 0.0
      kl: 0.01177929900586605
      policy_loss: -0.09506279230117798
      total_loss: 66049736.0
      vf_explained_var: 0.0
      vf_loss: 66049736.0
  load_time_ms: 53.205
  num_steps_sampled: 23834
  num_steps_trained: 21760
  sample_time_ms: 31529.595
  update_time_ms: 413.121
iterations_since_restore: 5
node_ip: 172.28.0.2
num_healthy_workers: 1
off_policy_estimator: {}
perf:
  cpu_util_percent: 77.41592920353982
  ram_util_percent: 17.09911504424778
pid: 5591
policy_reward_max:
  policy_0: 128607.0
  policy_1: 15304.0
  policy_2: 7

2020-03-19 08:05:03,216	DEBUG multi_gpu_optimizer.py:205 -- 0 {'cur_kl_coeff': 0.675000011920929, 'cur_lr': 4.999999873689376e-05, 'total_loss': 2016908.6, 'policy_loss': 0.0036339473, 'vf_loss': 2016908.4, 'vf_explained_var': 0.0, 'kl': 0.012216893, 'entropy': 7.8294554, 'entropy_coeff': 0.0}
2020-03-19 08:05:05,875	DEBUG multi_gpu_optimizer.py:205 -- 1 {'cur_kl_coeff': 0.675000011920929, 'cur_lr': 4.999999873689376e-05, 'total_loss': 2016908.4, 'policy_loss': -0.013700339, 'vf_loss': 2016908.4, 'vf_explained_var': 0.0, 'kl': 0.0072521544, 'entropy': 7.7860136, 'entropy_coeff': 0.0}
2020-03-19 08:05:08,536	DEBUG multi_gpu_optimizer.py:205 -- 2 {'cur_kl_coeff': 0.675000011920929, 'cur_lr': 4.999999873689376e-05, 'total_loss': 2016908.6, 'policy_loss': -0.022161266, 'vf_loss': 2016908.4, 'vf_explained_var': 0.0, 'kl': 0.0065895454, 'entropy': 7.7798223, 'entropy_coeff': 0.0}
2020-03-19 08:05:11,205	DEBUG multi_gpu_optimizer.py:205 -- 3 {'cur_kl_coeff': 0.675000011920929, 'cur_lr': 4.999

custom_metrics: {}
date: 2020-03-19_08-06-19
done: false
episode_len_mean: 701.0
episode_reward_max: 0.0
episode_reward_mean: 0.0
episode_reward_min: 0.0
episodes_this_iter: 1
episodes_total: 35
experiment_id: b6d1c176f8974af2bfd45911d6888178
hostname: 91380f02bba8
info:
  grad_time_ms: 78515.328
  learner:
    policy_0:
      cur_kl_coeff: 0.675000011920929
      cur_lr: 4.999999873689376e-05
      entropy: 7.802001953125
      entropy_coeff: 0.0
      kl: 0.010201130993664265
      policy_loss: -0.07592214643955231
      total_loss: 2016908.375
      vf_explained_var: 0.0
      vf_loss: 2016908.375
  load_time_ms: 46.066
  num_steps_sampled: 24535
  num_steps_trained: 22400
  sample_time_ms: 26277.395
  update_time_ms: 347.462
iterations_since_restore: 6
node_ip: 172.28.0.2
num_healthy_workers: 1
off_policy_estimator: {}
perf:
  cpu_util_percent: 97.9300884955752
  ram_util_percent: 17.230088495575217
pid: 5591
policy_reward_max:
  policy_0: 128607.0
  policy_1: 15304.0
  policy_2: 7

2020-03-19 08:06:22,748	DEBUG multi_gpu_optimizer.py:205 -- 0 {'cur_kl_coeff': 0.675000011920929, 'cur_lr': 4.999999873689376e-05, 'total_loss': 7977289.0, 'policy_loss': -0.0073422743, 'vf_loss': 7977289.0, 'vf_explained_var': 0.0, 'kl': 0.009117768, 'entropy': 7.793579, 'entropy_coeff': 0.0}
2020-03-19 08:06:25,419	DEBUG multi_gpu_optimizer.py:205 -- 1 {'cur_kl_coeff': 0.675000011920929, 'cur_lr': 4.999999873689376e-05, 'total_loss': 7977289.5, 'policy_loss': -0.018997353, 'vf_loss': 7977289.0, 'vf_explained_var': 0.0, 'kl': 0.008179818, 'entropy': 7.7844415, 'entropy_coeff': 0.0}
2020-03-19 08:06:28,010	DEBUG multi_gpu_optimizer.py:205 -- 2 {'cur_kl_coeff': 0.675000011920929, 'cur_lr': 4.999999873689376e-05, 'total_loss': 7977289.5, 'policy_loss': -0.028338963, 'vf_loss': 7977289.5, 'vf_explained_var': 0.0, 'kl': 0.008176775, 'entropy': 7.783399, 'entropy_coeff': 0.0}
2020-03-19 08:06:30,638	DEBUG multi_gpu_optimizer.py:205 -- 3 {'cur_kl_coeff': 0.675000011920929, 'cur_lr': 4.999999

custom_metrics: {}
date: 2020-03-19_08-07-39
done: false
episode_len_mean: 701.0
episode_reward_max: 0.0
episode_reward_mean: 0.0
episode_reward_min: 0.0
episodes_this_iter: 1
episodes_total: 36
experiment_id: b6d1c176f8974af2bfd45911d6888178
hostname: 91380f02bba8
info:
  grad_time_ms: 78663.347
  learner:
    policy_0:
      cur_kl_coeff: 0.675000011920929
      cur_lr: 4.999999873689376e-05
      entropy: 7.804590702056885
      entropy_coeff: 0.0
      kl: 0.012971621938049793
      policy_loss: -0.0802493542432785
      total_loss: 7977289.5
      vf_explained_var: 0.0
      vf_loss: 7977289.5
  load_time_ms: 44.356
  num_steps_sampled: 25236
  num_steps_trained: 23040
  sample_time_ms: 22526.259
  update_time_ms: 300.811
iterations_since_restore: 7
node_ip: 172.28.0.2
num_healthy_workers: 1
off_policy_estimator: {}
perf:
  cpu_util_percent: 97.77456140350877
  ram_util_percent: 17.300000000000004
pid: 5591
policy_reward_max:
  policy_0: 128607.0
  policy_1: 15304.0
  policy_2: 76

2020-03-19 08:07:42,481	DEBUG multi_gpu_optimizer.py:205 -- 0 {'cur_kl_coeff': 0.675000011920929, 'cur_lr': 4.999999873689376e-05, 'total_loss': 465797120.0, 'policy_loss': 0.010329889, 'vf_loss': 465797120.0, 'vf_explained_var': 0.0, 'kl': 0.011420257, 'entropy': 7.8162575, 'entropy_coeff': 0.0}
2020-03-19 08:07:45,194	DEBUG multi_gpu_optimizer.py:205 -- 1 {'cur_kl_coeff': 0.675000011920929, 'cur_lr': 4.999999873689376e-05, 'total_loss': 465797120.0, 'policy_loss': -0.0075361123, 'vf_loss': 465797120.0, 'vf_explained_var': 0.0, 'kl': 0.006000401, 'entropy': 7.7978578, 'entropy_coeff': 0.0}
2020-03-19 08:07:47,894	DEBUG multi_gpu_optimizer.py:205 -- 2 {'cur_kl_coeff': 0.675000011920929, 'cur_lr': 4.999999873689376e-05, 'total_loss': 465797120.0, 'policy_loss': -0.018078428, 'vf_loss': 465797120.0, 'vf_explained_var': 0.0, 'kl': 0.0046164775, 'entropy': 7.776094, 'entropy_coeff': 0.0}
2020-03-19 08:07:50,577	DEBUG multi_gpu_optimizer.py:205 -- 3 {'cur_kl_coeff': 0.675000011920929, 'cur_

custom_metrics: {}
date: 2020-03-19_08-09-00
done: false
episode_len_mean: 701.0
episode_reward_max: 0.0
episode_reward_mean: 0.0
episode_reward_min: 0.0
episodes_this_iter: 1
episodes_total: 37
experiment_id: b6d1c176f8974af2bfd45911d6888178
hostname: 91380f02bba8
info:
  grad_time_ms: 78905.681
  learner:
    policy_0:
      cur_kl_coeff: 0.675000011920929
      cur_lr: 4.999999873689376e-05
      entropy: 7.796273708343506
      entropy_coeff: 0.0
      kl: 0.00965329259634018
      policy_loss: -0.0700482726097107
      total_loss: 465797120.0
      vf_explained_var: 0.0
      vf_loss: 465797120.0
  load_time_ms: 39.791
  num_steps_sampled: 25937
  num_steps_trained: 23680
  sample_time_ms: 19712.517
  update_time_ms: 265.936
iterations_since_restore: 8
node_ip: 172.28.0.2
num_healthy_workers: 1
off_policy_estimator: {}
perf:
  cpu_util_percent: 97.76695652173913
  ram_util_percent: 17.400000000000006
pid: 5591
policy_reward_max:
  policy_0: 128607.0
  policy_1: 40708.0
  policy_2:

2020-03-19 08:10:19,574	INFO multi_gpu_optimizer.py:143 -- Collected more training samples than expected (actual=701, train_batch_size=128). This may be because you have many workers or long episodes in 'complete_episodes' batch mode.
2020-03-19 08:10:19,576	INFO tf_policy.py:395 -- Optimizing variable <tf.Variable 'policy_0/default_model/fc2/kernel:0' shape=(512, 512) dtype=float32_ref>
2020-03-19 08:10:19,577	INFO tf_policy.py:395 -- Optimizing variable <tf.Variable 'policy_0/default_model/fc2/bias:0' shape=(512,) dtype=float32_ref>
2020-03-19 08:10:19,578	INFO tf_policy.py:395 -- Optimizing variable <tf.Variable 'policy_0/default_model/fc3/kernel:0' shape=(512, 512) dtype=float32_ref>
2020-03-19 08:10:19,578	INFO tf_policy.py:395 -- Optimizing variable <tf.Variable 'policy_0/default_model/fc3/bias:0' shape=(512,) dtype=float32_ref>
2020-03-19 08:10:19,579	INFO tf_policy.py:395 -- Optimizing variable <tf.Variable 'policy_0/default_model/rnn/lstm_cell/kernel:0' shape=(768, 1024) dtype

custom_metrics: {}
date: 2020-03-19_08-11-39
done: false
episode_len_mean: 701.0
episode_reward_max: 0.0
episode_reward_mean: 0.0
episode_reward_min: 0.0
episodes_this_iter: 1
episodes_total: 38
experiment_id: b6d1c176f8974af2bfd45911d6888178
hostname: 91380f02bba8
info:
  grad_time_ms: 78982.782
  learner:
    policy_0:
      cur_kl_coeff: 0.675000011920929
      cur_lr: 4.999999873689376e-05
      entropy: 7.8046555519104
      entropy_coeff: 0.0
      kl: 0.008046374656260014
      policy_loss: -0.07860682159662247
      total_loss: 5800195.5
      vf_explained_var: 0.0
      vf_loss: 5800195.5
  load_time_ms: 37.342
  num_steps_sampled: 26638
  num_steps_trained: 24320
  sample_time_ms: 26315.255
  update_time_ms: 238.677
iterations_since_restore: 9
node_ip: 172.28.0.2
num_healthy_workers: 1
off_policy_estimator: {}
perf:
  cpu_util_percent: 77.49030837004406
  ram_util_percent: 17.762114537444933
pid: 5591
policy_reward_max:
  policy_0: 128607.0
  policy_1: 40708.0
  policy_2: 117

2020-03-19 08:11:41,937	DEBUG multi_gpu_optimizer.py:205 -- 0 {'cur_kl_coeff': 0.675000011920929, 'cur_lr': 4.999999873689376e-05, 'total_loss': 14453640.0, 'policy_loss': -0.003817226, 'vf_loss': 14453638.0, 'vf_explained_var': 0.0, 'kl': 0.008973213, 'entropy': 7.8121734, 'entropy_coeff': 0.0}
2020-03-19 08:11:44,621	DEBUG multi_gpu_optimizer.py:205 -- 1 {'cur_kl_coeff': 0.675000011920929, 'cur_lr': 4.999999873689376e-05, 'total_loss': 14453638.0, 'policy_loss': -0.01525601, 'vf_loss': 14453638.0, 'vf_explained_var': 0.0, 'kl': 0.005230055, 'entropy': 7.783909, 'entropy_coeff': 0.0}
2020-03-19 08:11:47,268	DEBUG multi_gpu_optimizer.py:205 -- 2 {'cur_kl_coeff': 0.675000011920929, 'cur_lr': 4.999999873689376e-05, 'total_loss': 14453638.0, 'policy_loss': -0.01898318, 'vf_loss': 14453638.0, 'vf_explained_var': 0.0, 'kl': 0.003899961, 'entropy': 7.7694106, 'entropy_coeff': 0.0}
2020-03-19 08:11:49,885	DEBUG multi_gpu_optimizer.py:205 -- 3 {'cur_kl_coeff': 0.675000011920929, 'cur_lr': 4.99

custom_metrics: {}
date: 2020-03-19_08-12-58
done: false
episode_len_mean: 701.0
episode_reward_max: 0.0
episode_reward_mean: 0.0
episode_reward_min: 0.0
episodes_this_iter: 1
episodes_total: 39
experiment_id: b6d1c176f8974af2bfd45911d6888178
hostname: 91380f02bba8
info:
  grad_time_ms: 79010.709
  learner:
    policy_0:
      cur_kl_coeff: 0.675000011920929
      cur_lr: 4.999999873689376e-05
      entropy: 7.8010406494140625
      entropy_coeff: 0.0
      kl: 0.01150750182569027
      policy_loss: -0.07276133447885513
      total_loss: 14453638.0
      vf_explained_var: 0.0
      vf_loss: 14453638.0
  load_time_ms: 34.482
  num_steps_sampled: 27339
  num_steps_trained: 24960
  sample_time_ms: 23685.367
  update_time_ms: 217.201
iterations_since_restore: 10
node_ip: 172.28.0.2
num_healthy_workers: 1
off_policy_estimator: {}
perf:
  cpu_util_percent: 97.68407079646018
  ram_util_percent: 17.90088495575222
pid: 5591
policy_reward_max:
  policy_0: 128607.0
  policy_1: 40708.0
  policy_2:

2020-03-19 08:13:01,391	DEBUG multi_gpu_optimizer.py:205 -- 0 {'cur_kl_coeff': 0.675000011920929, 'cur_lr': 4.999999873689376e-05, 'total_loss': 145525140.0, 'policy_loss': 0.01271175, 'vf_loss': 145525140.0, 'vf_explained_var': 0.0, 'kl': 0.008063286, 'entropy': 7.7931366, 'entropy_coeff': 0.0}
2020-03-19 08:13:04,098	DEBUG multi_gpu_optimizer.py:205 -- 1 {'cur_kl_coeff': 0.675000011920929, 'cur_lr': 4.999999873689376e-05, 'total_loss': 145525140.0, 'policy_loss': -0.008268451, 'vf_loss': 145525140.0, 'vf_explained_var': 0.0, 'kl': 0.0031562995, 'entropy': 7.7639527, 'entropy_coeff': 0.0}
2020-03-19 08:13:06,828	DEBUG multi_gpu_optimizer.py:205 -- 2 {'cur_kl_coeff': 0.675000011920929, 'cur_lr': 4.999999873689376e-05, 'total_loss': 145525140.0, 'policy_loss': -0.012903045, 'vf_loss': 145525140.0, 'vf_explained_var': 0.0, 'kl': 0.0017027734, 'entropy': 7.7436905, 'entropy_coeff': 0.0}
2020-03-19 08:13:09,546	DEBUG multi_gpu_optimizer.py:205 -- 3 {'cur_kl_coeff': 0.675000011920929, 'cur_

custom_metrics: {}
date: 2020-03-19_08-14-17
done: false
episode_len_mean: 701.0
episode_reward_max: 0.0
episode_reward_mean: 0.0
episode_reward_min: 0.0
episodes_this_iter: 1
episodes_total: 40
experiment_id: b6d1c176f8974af2bfd45911d6888178
hostname: 91380f02bba8
info:
  grad_time_ms: 79019.941
  learner:
    policy_0:
      cur_kl_coeff: 0.675000011920929
      cur_lr: 4.999999873689376e-05
      entropy: 7.759302616119385
      entropy_coeff: 0.0
      kl: 0.004250717349350452
      policy_loss: -0.0466962493956089
      total_loss: 145525136.0
      vf_explained_var: 0.0
      vf_loss: 145525136.0
  load_time_ms: 18.365
  num_steps_sampled: 28040
  num_steps_trained: 25600
  sample_time_ms: 15898.687
  update_time_ms: 20.034
iterations_since_restore: 11
node_ip: 172.28.0.2
num_healthy_workers: 1
off_policy_estimator: {}
perf:
  cpu_util_percent: 97.58230088495574
  ram_util_percent: 18.0
pid: 5591
policy_reward_max:
  policy_0: 128607.0
  policy_1: 77936.0
  policy_2: 67115.0
  po

2020-03-19 08:14:20,353	DEBUG multi_gpu_optimizer.py:205 -- 0 {'cur_kl_coeff': 0.3375000059604645, 'cur_lr': 4.999999873689376e-05, 'total_loss': 35650436.0, 'policy_loss': 0.010938791, 'vf_loss': 35650430.0, 'vf_explained_var': 0.0, 'kl': 0.0031302269, 'entropy': 7.7521334, 'entropy_coeff': 0.0}
2020-03-19 08:14:22,954	DEBUG multi_gpu_optimizer.py:205 -- 1 {'cur_kl_coeff': 0.3375000059604645, 'cur_lr': 4.999999873689376e-05, 'total_loss': 35650430.0, 'policy_loss': 0.0022052466, 'vf_loss': 35650430.0, 'vf_explained_var': 0.0, 'kl': 0.0031652835, 'entropy': 7.751616, 'entropy_coeff': 0.0}
2020-03-19 08:14:25,576	DEBUG multi_gpu_optimizer.py:205 -- 2 {'cur_kl_coeff': 0.3375000059604645, 'cur_lr': 4.999999873689376e-05, 'total_loss': 35650436.0, 'policy_loss': -0.00259652, 'vf_loss': 35650436.0, 'vf_explained_var': 0.0, 'kl': 0.0036345695, 'entropy': 7.7560577, 'entropy_coeff': 0.0}
2020-03-19 08:14:28,189	DEBUG multi_gpu_optimizer.py:205 -- 3 {'cur_kl_coeff': 0.3375000059604645, 'cur_lr

custom_metrics: {}
date: 2020-03-19_08-15-35
done: false
episode_len_mean: 701.0
episode_reward_max: 0.0
episode_reward_mean: 0.0
episode_reward_min: 0.0
episodes_this_iter: 1
episodes_total: 41
experiment_id: b6d1c176f8974af2bfd45911d6888178
hostname: 91380f02bba8
info:
  grad_time_ms: 79063.921
  learner:
    policy_0:
      cur_kl_coeff: 0.3375000059604645
      cur_lr: 4.999999873689376e-05
      entropy: 7.7909345626831055
      entropy_coeff: 0.0
      kl: 0.017055654898285866
      policy_loss: -0.06792061775922775
      total_loss: 35650432.0
      vf_explained_var: 0.0
      vf_loss: 35650432.0
  load_time_ms: 18.537
  num_steps_sampled: 28741
  num_steps_trained: 26240
  sample_time_ms: 15898.448
  update_time_ms: 20.091
iterations_since_restore: 12
node_ip: 172.28.0.2
num_healthy_workers: 1
off_policy_estimator: {}
perf:
  cpu_util_percent: 97.73153153153154
  ram_util_percent: 18.09999999999999
pid: 5591
policy_reward_max:
  policy_0: 128607.0
  policy_1: 77936.0
  policy_2

2020-03-19 08:16:52,511	INFO multi_gpu_optimizer.py:143 -- Collected more training samples than expected (actual=701, train_batch_size=128). This may be because you have many workers or long episodes in 'complete_episodes' batch mode.
2020-03-19 08:16:52,512	INFO tf_policy.py:395 -- Optimizing variable <tf.Variable 'policy_0/default_model/fc2/kernel:0' shape=(512, 512) dtype=float32_ref>
2020-03-19 08:16:52,513	INFO tf_policy.py:395 -- Optimizing variable <tf.Variable 'policy_0/default_model/fc2/bias:0' shape=(512,) dtype=float32_ref>
2020-03-19 08:16:52,514	INFO tf_policy.py:395 -- Optimizing variable <tf.Variable 'policy_0/default_model/fc3/kernel:0' shape=(512, 512) dtype=float32_ref>
2020-03-19 08:16:52,515	INFO tf_policy.py:395 -- Optimizing variable <tf.Variable 'policy_0/default_model/fc3/bias:0' shape=(512,) dtype=float32_ref>
2020-03-19 08:16:52,516	INFO tf_policy.py:395 -- Optimizing variable <tf.Variable 'policy_0/default_model/rnn/lstm_cell/kernel:0' shape=(768, 1024) dtype

custom_metrics: {}
date: 2020-03-19_08-18-09
done: false
episode_len_mean: 701.0
episode_reward_max: 0.0
episode_reward_mean: 0.0
episode_reward_min: 0.0
episodes_this_iter: 1
episodes_total: 42
experiment_id: b6d1c176f8974af2bfd45911d6888178
hostname: 91380f02bba8
info:
  grad_time_ms: 78977.74
  learner:
    policy_0:
      cur_kl_coeff: 0.3375000059604645
      cur_lr: 4.999999873689376e-05
      entropy: 7.819113731384277
      entropy_coeff: 0.0
      kl: 0.015107600018382072
      policy_loss: -0.05937281250953674
      total_loss: 562950464.0
      vf_explained_var: 0.0
      vf_loss: 562950464.0
  load_time_ms: 16.911
  num_steps_sampled: 29442
  num_steps_trained: 26880
  sample_time_ms: 23604.176
  update_time_ms: 20.217
iterations_since_restore: 13
node_ip: 172.28.0.2
num_healthy_workers: 1
off_policy_estimator: {}
perf:
  cpu_util_percent: 77.50818181818182
  ram_util_percent: 18.35045454545455
pid: 5591
policy_reward_max:
  policy_0: 128607.0
  policy_1: 77936.0
  policy_2

2020-03-19 08:18:12,406	DEBUG multi_gpu_optimizer.py:205 -- 0 {'cur_kl_coeff': 0.3375000059604645, 'cur_lr': 4.999999873689376e-05, 'total_loss': 15586542.0, 'policy_loss': -0.0020583495, 'vf_loss': 15586542.0, 'vf_explained_var': 0.0, 'kl': 0.013948739, 'entropy': 7.8011045, 'entropy_coeff': 0.0}
2020-03-19 08:18:15,094	DEBUG multi_gpu_optimizer.py:205 -- 1 {'cur_kl_coeff': 0.3375000059604645, 'cur_lr': 4.999999873689376e-05, 'total_loss': 15586542.0, 'policy_loss': -0.009202952, 'vf_loss': 15586542.0, 'vf_explained_var': 0.0, 'kl': 0.007260305, 'entropy': 7.774273, 'entropy_coeff': 0.0}
2020-03-19 08:18:17,729	DEBUG multi_gpu_optimizer.py:205 -- 2 {'cur_kl_coeff': 0.3375000059604645, 'cur_lr': 4.999999873689376e-05, 'total_loss': 15586542.0, 'policy_loss': -0.015531704, 'vf_loss': 15586542.0, 'vf_explained_var': 0.0, 'kl': 0.0052677365, 'entropy': 7.760051, 'entropy_coeff': 0.0}
2020-03-19 08:18:20,327	DEBUG multi_gpu_optimizer.py:205 -- 3 {'cur_kl_coeff': 0.3375000059604645, 'cur_lr

custom_metrics: {}
date: 2020-03-19_08-19-27
done: false
episode_len_mean: 701.0
episode_reward_max: 0.0
episode_reward_mean: 0.0
episode_reward_min: 0.0
episodes_this_iter: 1
episodes_total: 43
experiment_id: b6d1c176f8974af2bfd45911d6888178
hostname: 91380f02bba8
info:
  grad_time_ms: 78854.345
  learner:
    policy_0:
      cur_kl_coeff: 0.3375000059604645
      cur_lr: 4.999999873689376e-05
      entropy: 7.772514343261719
      entropy_coeff: 0.0
      kl: 0.012002800591289997
      policy_loss: -0.05799741670489311
      total_loss: 15586544.0
      vf_explained_var: 0.0
      vf_loss: 15586544.0
  load_time_ms: 16.559
  num_steps_sampled: 30143
  num_steps_trained: 27520
  sample_time_ms: 23604.176
  update_time_ms: 20.246
iterations_since_restore: 14
node_ip: 172.28.0.2
num_healthy_workers: 1
off_policy_estimator: {}
perf:
  cpu_util_percent: 97.57747747747749
  ram_util_percent: 18.5
pid: 5591
policy_reward_max:
  policy_0: 128607.0
  policy_1: 77936.0
  policy_2: 67115.0
  po

2020-03-19 08:19:30,191	DEBUG multi_gpu_optimizer.py:205 -- 0 {'cur_kl_coeff': 0.3375000059604645, 'cur_lr': 4.999999873689376e-05, 'total_loss': 128997030.0, 'policy_loss': 0.005563992, 'vf_loss': 128997030.0, 'vf_explained_var': 0.0, 'kl': 0.007120122, 'entropy': 7.756196, 'entropy_coeff': 0.0}
2020-03-19 08:19:32,757	DEBUG multi_gpu_optimizer.py:205 -- 1 {'cur_kl_coeff': 0.3375000059604645, 'cur_lr': 4.999999873689376e-05, 'total_loss': 128997030.0, 'policy_loss': -0.00546828, 'vf_loss': 128997030.0, 'vf_explained_var': 0.0, 'kl': 0.003955153, 'entropy': 7.742877, 'entropy_coeff': 0.0}
2020-03-19 08:19:35,336	DEBUG multi_gpu_optimizer.py:205 -- 2 {'cur_kl_coeff': 0.3375000059604645, 'cur_lr': 4.999999873689376e-05, 'total_loss': 128997030.0, 'policy_loss': -0.006851062, 'vf_loss': 128997030.0, 'vf_explained_var': 0.0, 'kl': 0.0019204989, 'entropy': 7.7286897, 'entropy_coeff': 0.0}
2020-03-19 08:19:37,864	DEBUG multi_gpu_optimizer.py:205 -- 3 {'cur_kl_coeff': 0.3375000059604645, 'cur

custom_metrics: {}
date: 2020-03-19_08-20-44
done: false
episode_len_mean: 701.0
episode_reward_max: 0.0
episode_reward_mean: 0.0
episode_reward_min: 0.0
episodes_this_iter: 1
episodes_total: 44
experiment_id: b6d1c176f8974af2bfd45911d6888178
hostname: 91380f02bba8
info:
  grad_time_ms: 78675.863
  learner:
    policy_0:
      cur_kl_coeff: 0.3375000059604645
      cur_lr: 4.999999873689376e-05
      entropy: 7.798672676086426
      entropy_coeff: 0.0
      kl: 0.015593928284943104
      policy_loss: -0.0682244524359703
      total_loss: 128997032.0
      vf_explained_var: 0.0
      vf_loss: 128997032.0
  load_time_ms: 15.245
  num_steps_sampled: 30844
  num_steps_trained: 28160
  sample_time_ms: 15635.979
  update_time_ms: 20.883
iterations_since_restore: 15
node_ip: 172.28.0.2
num_healthy_workers: 1
off_policy_estimator: {}
perf:
  cpu_util_percent: 97.58545454545458
  ram_util_percent: 18.59999999999999
pid: 5591
policy_reward_max:
  policy_0: 128607.0
  policy_1: 77936.0
  policy_2

2020-03-19 08:20:47,267	DEBUG multi_gpu_optimizer.py:205 -- 0 {'cur_kl_coeff': 0.3375000059604645, 'cur_lr': 4.999999873689376e-05, 'total_loss': 70376880.0, 'policy_loss': 0.015809702, 'vf_loss': 70376890.0, 'vf_explained_var': 0.0, 'kl': 0.010867467, 'entropy': 7.790448, 'entropy_coeff': 0.0}
2020-03-19 08:20:49,802	DEBUG multi_gpu_optimizer.py:205 -- 1 {'cur_kl_coeff': 0.3375000059604645, 'cur_lr': 4.999999873689376e-05, 'total_loss': 70376880.0, 'policy_loss': 0.004633525, 'vf_loss': 70376890.0, 'vf_explained_var': 0.0, 'kl': 0.007119385, 'entropy': 7.768338, 'entropy_coeff': 0.0}
2020-03-19 08:20:52,358	DEBUG multi_gpu_optimizer.py:205 -- 2 {'cur_kl_coeff': 0.3375000059604645, 'cur_lr': 4.999999873689376e-05, 'total_loss': 70376880.0, 'policy_loss': -0.0026842714, 'vf_loss': 70376890.0, 'vf_explained_var': 0.0, 'kl': 0.007861075, 'entropy': 7.7682467, 'entropy_coeff': 0.0}
2020-03-19 08:20:54,932	DEBUG multi_gpu_optimizer.py:205 -- 3 {'cur_kl_coeff': 0.3375000059604645, 'cur_lr': 

custom_metrics: {}
date: 2020-03-19_08-22-01
done: false
episode_len_mean: 701.0
episode_reward_max: 0.0
episode_reward_mean: 0.0
episode_reward_min: 0.0
episodes_this_iter: 1
episodes_total: 45
experiment_id: b6d1c176f8974af2bfd45911d6888178
hostname: 91380f02bba8
info:
  grad_time_ms: 78449.093
  learner:
    policy_0:
      cur_kl_coeff: 0.3375000059604645
      cur_lr: 4.999999873689376e-05
      entropy: 7.792583465576172
      entropy_coeff: 0.0
      kl: 0.016021009534597397
      policy_loss: -0.055099379271268845
      total_loss: 70376888.0
      vf_explained_var: 0.0
      vf_loss: 70376888.0
  load_time_ms: 15.281
  num_steps_sampled: 31545
  num_steps_trained: 28800
  sample_time_ms: 15636.056
  update_time_ms: 20.855
iterations_since_restore: 16
node_ip: 172.28.0.2
num_healthy_workers: 1
off_policy_estimator: {}
perf:
  cpu_util_percent: 97.64272727272729
  ram_util_percent: 18.652727272727265
pid: 5591
policy_reward_max:
  policy_0: 128607.0
  policy_1: 77936.0
  policy_

2020-03-19 08:23:21,890	INFO multi_gpu_optimizer.py:143 -- Collected more training samples than expected (actual=701, train_batch_size=128). This may be because you have many workers or long episodes in 'complete_episodes' batch mode.
2020-03-19 08:23:21,892	INFO tf_policy.py:395 -- Optimizing variable <tf.Variable 'policy_0/default_model/fc2/kernel:0' shape=(512, 512) dtype=float32_ref>
2020-03-19 08:23:21,898	INFO tf_policy.py:395 -- Optimizing variable <tf.Variable 'policy_0/default_model/fc2/bias:0' shape=(512,) dtype=float32_ref>
2020-03-19 08:23:21,899	INFO tf_policy.py:395 -- Optimizing variable <tf.Variable 'policy_0/default_model/fc3/kernel:0' shape=(512, 512) dtype=float32_ref>
2020-03-19 08:23:21,905	INFO tf_policy.py:395 -- Optimizing variable <tf.Variable 'policy_0/default_model/fc3/bias:0' shape=(512,) dtype=float32_ref>
2020-03-19 08:23:21,906	INFO tf_policy.py:395 -- Optimizing variable <tf.Variable 'policy_0/default_model/rnn/lstm_cell/kernel:0' shape=(768, 1024) dtype

custom_metrics: {}
date: 2020-03-19_08-24-45
done: false
episode_len_mean: 701.0
episode_reward_max: 0.0
episode_reward_mean: 0.0
episode_reward_min: 0.0
episodes_this_iter: 1
episodes_total: 46
experiment_id: b6d1c176f8974af2bfd45911d6888178
hostname: 91380f02bba8
info:
  grad_time_ms: 78803.44
  learner:
    policy_0:
      cur_kl_coeff: 0.3375000059604645
      cur_lr: 4.999999873689376e-05
      entropy: 7.843637943267822
      entropy_coeff: 0.0
      kl: 0.020146071910858154
      policy_loss: -0.06826046109199524
      total_loss: 71951728.0
      vf_explained_var: 0.0
      vf_loss: 71951728.0
  load_time_ms: 15.88
  num_steps_sampled: 32246
  num_steps_trained: 29440
  sample_time_ms: 23631.496
  update_time_ms: 20.863
iterations_since_restore: 17
node_ip: 172.28.0.2
num_healthy_workers: 1
off_policy_estimator: {}
perf:
  cpu_util_percent: 78.03347639484979
  ram_util_percent: 18.968240343347638
pid: 5591
policy_reward_max:
  policy_0: 128607.0
  policy_1: 77936.0
  policy_2: 

2020-03-19 08:24:47,868	DEBUG multi_gpu_optimizer.py:205 -- 0 {'cur_kl_coeff': 0.5062500238418579, 'cur_lr': 4.999999873689376e-05, 'total_loss': 846541950.0, 'policy_loss': -0.005519976, 'vf_loss': 846541950.0, 'vf_explained_var': 0.0, 'kl': 0.016443033, 'entropy': 7.8103547, 'entropy_coeff': 0.0}
2020-03-19 08:24:50,628	DEBUG multi_gpu_optimizer.py:205 -- 1 {'cur_kl_coeff': 0.5062500238418579, 'cur_lr': 4.999999873689376e-05, 'total_loss': 846541950.0, 'policy_loss': -0.023771089, 'vf_loss': 846541950.0, 'vf_explained_var': 0.0, 'kl': 0.0069947233, 'entropy': 7.7706757, 'entropy_coeff': 0.0}
2020-03-19 08:24:53,363	DEBUG multi_gpu_optimizer.py:205 -- 2 {'cur_kl_coeff': 0.5062500238418579, 'cur_lr': 4.999999873689376e-05, 'total_loss': 846541900.0, 'policy_loss': -0.029785141, 'vf_loss': 846541900.0, 'vf_explained_var': 0.0, 'kl': 0.004067362, 'entropy': 7.7505937, 'entropy_coeff': 0.0}
2020-03-19 08:24:56,086	DEBUG multi_gpu_optimizer.py:205 -- 3 {'cur_kl_coeff': 0.5062500238418579, 

custom_metrics: {}
date: 2020-03-19_08-26-07
done: false
episode_len_mean: 701.0
episode_reward_max: 0.0
episode_reward_mean: 0.0
episode_reward_min: 0.0
episodes_this_iter: 1
episodes_total: 47
experiment_id: b6d1c176f8974af2bfd45911d6888178
hostname: 91380f02bba8
info:
  grad_time_ms: 78965.005
  learner:
    policy_0:
      cur_kl_coeff: 0.5062500238418579
      cur_lr: 4.999999873689376e-05
      entropy: 7.768510341644287
      entropy_coeff: 0.0
      kl: 0.012102310545742512
      policy_loss: -0.07724713534116745
      total_loss: 846541952.0
      vf_explained_var: 0.0
      vf_loss: 846541952.0
  load_time_ms: 15.801
  num_steps_sampled: 32947
  num_steps_trained: 30080
  sample_time_ms: 23631.543
  update_time_ms: 20.793
iterations_since_restore: 18
node_ip: 172.28.0.2
num_healthy_workers: 1
off_policy_estimator: {}
perf:
  cpu_util_percent: 97.70854700854701
  ram_util_percent: 19.106837606837598
pid: 5591
policy_reward_max:
  policy_0: 128607.0
  policy_1: 77936.0
  policy

2020-03-19 08:26:10,167	DEBUG multi_gpu_optimizer.py:205 -- 0 {'cur_kl_coeff': 0.5062500238418579, 'cur_lr': 4.999999873689376e-05, 'total_loss': 56873730.0, 'policy_loss': 0.008222054, 'vf_loss': 56873730.0, 'vf_explained_var': 0.0, 'kl': 0.0052274303, 'entropy': 7.7595816, 'entropy_coeff': 0.0}
2020-03-19 08:26:12,896	DEBUG multi_gpu_optimizer.py:205 -- 1 {'cur_kl_coeff': 0.5062500238418579, 'cur_lr': 4.999999873689376e-05, 'total_loss': 56873730.0, 'policy_loss': 0.004718813, 'vf_loss': 56873730.0, 'vf_explained_var': 0.0, 'kl': 0.003118026, 'entropy': 7.746263, 'entropy_coeff': 0.0}
2020-03-19 08:26:15,630	DEBUG multi_gpu_optimizer.py:205 -- 2 {'cur_kl_coeff': 0.5062500238418579, 'cur_lr': 4.999999873689376e-05, 'total_loss': 56873730.0, 'policy_loss': 0.002694133, 'vf_loss': 56873730.0, 'vf_explained_var': 0.0, 'kl': 0.0022615306, 'entropy': 7.738916, 'entropy_coeff': 0.0}
2020-03-19 08:26:18,346	DEBUG multi_gpu_optimizer.py:205 -- 3 {'cur_kl_coeff': 0.5062500238418579, 'cur_lr': 

custom_metrics: {}
date: 2020-03-19_08-27-27
done: false
episode_len_mean: 701.0
episode_reward_max: 0.0
episode_reward_mean: 0.0
episode_reward_min: 0.0
episodes_this_iter: 1
episodes_total: 48
experiment_id: b6d1c176f8974af2bfd45911d6888178
hostname: 91380f02bba8
info:
  grad_time_ms: 79021.06
  learner:
    policy_0:
      cur_kl_coeff: 0.5062500238418579
      cur_lr: 4.999999873689376e-05
      entropy: 7.7787981033325195
      entropy_coeff: 0.0
      kl: 0.009931285865604877
      policy_loss: -0.031003784388303757
      total_loss: 56873728.0
      vf_explained_var: 0.0
      vf_loss: 56873728.0
  load_time_ms: 16.123
  num_steps_sampled: 33648
  num_steps_trained: 30720
  sample_time_ms: 15720.113
  update_time_ms: 21.014
iterations_since_restore: 19
node_ip: 172.28.0.2
num_healthy_workers: 1
off_policy_estimator: {}
perf:
  cpu_util_percent: 97.81999999999998
  ram_util_percent: 19.199999999999992
pid: 5591
policy_reward_max:
  policy_0: 128607.0
  policy_1: 77936.0
  policy_

2020-03-19 08:27:30,380	DEBUG multi_gpu_optimizer.py:205 -- 0 {'cur_kl_coeff': 0.5062500238418579, 'cur_lr': 4.999999873689376e-05, 'total_loss': 120554024.0, 'policy_loss': 0.021937968, 'vf_loss': 120554024.0, 'vf_explained_var': 0.0, 'kl': 0.0035199926, 'entropy': 7.751046, 'entropy_coeff': 0.0}
2020-03-19 08:27:32,959	DEBUG multi_gpu_optimizer.py:205 -- 1 {'cur_kl_coeff': 0.5062500238418579, 'cur_lr': 4.999999873689376e-05, 'total_loss': 120554024.0, 'policy_loss': 0.014660841, 'vf_loss': 120554024.0, 'vf_explained_var': 0.0, 'kl': 0.0023605912, 'entropy': 7.7389803, 'entropy_coeff': 0.0}
2020-03-19 08:27:35,596	DEBUG multi_gpu_optimizer.py:205 -- 2 {'cur_kl_coeff': 0.5062500238418579, 'cur_lr': 4.999999873689376e-05, 'total_loss': 120554010.0, 'policy_loss': 0.009995611, 'vf_loss': 120554010.0, 'vf_explained_var': 0.0, 'kl': 0.002131133, 'entropy': 7.732291, 'entropy_coeff': 0.0}
2020-03-19 08:27:38,201	DEBUG multi_gpu_optimizer.py:205 -- 3 {'cur_kl_coeff': 0.5062500238418579, 'cur

custom_metrics: {}
date: 2020-03-19_08-28-45
done: false
episode_len_mean: 701.0
episode_reward_max: 0.0
episode_reward_mean: 0.0
episode_reward_min: 0.0
episodes_this_iter: 1
episodes_total: 49
experiment_id: b6d1c176f8974af2bfd45911d6888178
hostname: 91380f02bba8
info:
  grad_time_ms: 78831.302
  learner:
    policy_0:
      cur_kl_coeff: 0.5062500238418579
      cur_lr: 4.999999873689376e-05
      entropy: 7.743892669677734
      entropy_coeff: 0.0
      kl: 0.006777608301490545
      policy_loss: -0.027484914287924767
      total_loss: 120554024.0
      vf_explained_var: 0.0
      vf_loss: 120554024.0
  load_time_ms: 15.749
  num_steps_sampled: 34349
  num_steps_trained: 31360
  sample_time_ms: 15720.54
  update_time_ms: 21.099
iterations_since_restore: 20
node_ip: 172.28.0.2
num_healthy_workers: 1
off_policy_estimator: {}
perf:
  cpu_util_percent: 97.57363636363635
  ram_util_percent: 19.300000000000008
pid: 5591
policy_reward_max:
  policy_0: 128607.0
  policy_1: 77936.0
  policy

2020-03-19 08:30:01,310	INFO multi_gpu_optimizer.py:143 -- Collected more training samples than expected (actual=701, train_batch_size=128). This may be because you have many workers or long episodes in 'complete_episodes' batch mode.
2020-03-19 08:30:01,316	INFO tf_policy.py:395 -- Optimizing variable <tf.Variable 'policy_0/default_model/fc2/kernel:0' shape=(512, 512) dtype=float32_ref>
2020-03-19 08:30:01,317	INFO tf_policy.py:395 -- Optimizing variable <tf.Variable 'policy_0/default_model/fc2/bias:0' shape=(512,) dtype=float32_ref>
2020-03-19 08:30:01,324	INFO tf_policy.py:395 -- Optimizing variable <tf.Variable 'policy_0/default_model/fc3/kernel:0' shape=(512, 512) dtype=float32_ref>
2020-03-19 08:30:01,327	INFO tf_policy.py:395 -- Optimizing variable <tf.Variable 'policy_0/default_model/fc3/bias:0' shape=(512,) dtype=float32_ref>
2020-03-19 08:30:01,330	INFO tf_policy.py:395 -- Optimizing variable <tf.Variable 'policy_0/default_model/rnn/lstm_cell/kernel:0' shape=(768, 1024) dtype

custom_metrics: {}
date: 2020-03-19_08-31-17
done: false
episode_len_mean: 701.0
episode_reward_max: 0.0
episode_reward_mean: 0.0
episode_reward_min: 0.0
episodes_this_iter: 1
episodes_total: 50
experiment_id: b6d1c176f8974af2bfd45911d6888178
hostname: 91380f02bba8
info:
  grad_time_ms: 78590.069
  learner:
    policy_0:
      cur_kl_coeff: 0.5062500238418579
      cur_lr: 4.999999873689376e-05
      entropy: 7.809530735015869
      entropy_coeff: 0.0
      kl: 0.013568231835961342
      policy_loss: -0.08325065672397614
      total_loss: 53369036.0
      vf_explained_var: 0.0
      vf_loss: 53369036.0
  load_time_ms: 16.963
  num_steps_sampled: 35050
  num_steps_trained: 32000
  sample_time_ms: 23329.035
  update_time_ms: 21.683
iterations_since_restore: 21
node_ip: 172.28.0.2
num_healthy_workers: 1
off_policy_estimator: {}
perf:
  cpu_util_percent: 77.35917431192662
  ram_util_percent: 19.60550458715596
pid: 5591
policy_reward_max:
  policy_0: 128607.0
  policy_1: 77936.0
  policy_2:

2020-03-19 08:31:20,545	DEBUG multi_gpu_optimizer.py:205 -- 0 {'cur_kl_coeff': 0.5062500238418579, 'cur_lr': 4.999999873689376e-05, 'total_loss': 456133300.0, 'policy_loss': -0.0071105435, 'vf_loss': 456133300.0, 'vf_explained_var': 0.0, 'kl': 0.018279476, 'entropy': 7.8219275, 'entropy_coeff': 0.0}
2020-03-19 08:31:23,118	DEBUG multi_gpu_optimizer.py:205 -- 1 {'cur_kl_coeff': 0.5062500238418579, 'cur_lr': 4.999999873689376e-05, 'total_loss': 456133300.0, 'policy_loss': -0.018189598, 'vf_loss': 456133300.0, 'vf_explained_var': 0.0, 'kl': 0.00861659, 'entropy': 7.781405, 'entropy_coeff': 0.0}
2020-03-19 08:31:25,646	DEBUG multi_gpu_optimizer.py:205 -- 2 {'cur_kl_coeff': 0.5062500238418579, 'cur_lr': 4.999999873689376e-05, 'total_loss': 456133300.0, 'policy_loss': -0.02243932, 'vf_loss': 456133300.0, 'vf_explained_var': 0.0, 'kl': 0.0060540186, 'entropy': 7.762108, 'entropy_coeff': 0.0}
2020-03-19 08:31:28,177	DEBUG multi_gpu_optimizer.py:205 -- 3 {'cur_kl_coeff': 0.5062500238418579, 'cu

custom_metrics: {}
date: 2020-03-19_08-32-35
done: false
episode_len_mean: 701.0
episode_reward_max: 0.0
episode_reward_mean: 0.0
episode_reward_min: 0.0
episodes_this_iter: 1
episodes_total: 51
experiment_id: b6d1c176f8974af2bfd45911d6888178
hostname: 91380f02bba8
info:
  grad_time_ms: 78603.155
  learner:
    policy_0:
      cur_kl_coeff: 0.5062500238418579
      cur_lr: 4.999999873689376e-05
      entropy: 7.794765472412109
      entropy_coeff: 0.0
      kl: 0.016187306493520737
      policy_loss: -0.08043887466192245
      total_loss: 456133312.0
      vf_explained_var: 0.0
      vf_loss: 456133312.0
  load_time_ms: 16.92
  num_steps_sampled: 35751
  num_steps_trained: 32640
  sample_time_ms: 23329.181
  update_time_ms: 22.457
iterations_since_restore: 22
node_ip: 172.28.0.2
num_healthy_workers: 1
off_policy_estimator: {}
perf:
  cpu_util_percent: 97.618018018018
  ram_util_percent: 19.741441441441427
pid: 5591
policy_reward_max:
  policy_0: 128607.0
  policy_1: 77936.0
  policy_2:

2020-03-19 08:32:38,578	DEBUG multi_gpu_optimizer.py:205 -- 0 {'cur_kl_coeff': 0.5062500238418579, 'cur_lr': 4.999999873689376e-05, 'total_loss': 267654880.0, 'policy_loss': 0.0025431155, 'vf_loss': 267654880.0, 'vf_explained_var': 0.0, 'kl': 0.010113177, 'entropy': 7.7882957, 'entropy_coeff': 0.0}
2020-03-19 08:32:41,175	DEBUG multi_gpu_optimizer.py:205 -- 1 {'cur_kl_coeff': 0.5062500238418579, 'cur_lr': 4.999999873689376e-05, 'total_loss': 267654860.0, 'policy_loss': -0.0036355257, 'vf_loss': 267654860.0, 'vf_explained_var': 0.0, 'kl': 0.006930546, 'entropy': 7.7720094, 'entropy_coeff': 0.0}
2020-03-19 08:32:43,886	DEBUG multi_gpu_optimizer.py:205 -- 2 {'cur_kl_coeff': 0.5062500238418579, 'cur_lr': 4.999999873689376e-05, 'total_loss': 267654860.0, 'policy_loss': -0.0065078186, 'vf_loss': 267654860.0, 'vf_explained_var': 0.0, 'kl': 0.004380462, 'entropy': 7.758085, 'entropy_coeff': 0.0}
2020-03-19 08:32:46,493	DEBUG multi_gpu_optimizer.py:205 -- 3 {'cur_kl_coeff': 0.5062500238418579, 

custom_metrics: {}
date: 2020-03-19_08-33-54
done: false
episode_len_mean: 701.0
episode_reward_max: 0.0
episode_reward_mean: 0.0
episode_reward_min: 0.0
episodes_this_iter: 1
episodes_total: 52
experiment_id: b6d1c176f8974af2bfd45911d6888178
hostname: 91380f02bba8
info:
  grad_time_ms: 78735.244
  learner:
    policy_0:
      cur_kl_coeff: 0.5062500238418579
      cur_lr: 4.999999873689376e-05
      entropy: 7.799253940582275
      entropy_coeff: 0.0
      kl: 0.010191596113145351
      policy_loss: -0.05303140729665756
      total_loss: 267654864.0
      vf_explained_var: 0.0
      vf_loss: 267654864.0
  load_time_ms: 16.97
  num_steps_sampled: 36452
  num_steps_trained: 33280
  sample_time_ms: 15623.65
  update_time_ms: 22.861
iterations_since_restore: 23
node_ip: 172.28.0.2
num_healthy_workers: 1
off_policy_estimator: {}
perf:
  cpu_util_percent: 97.65357142857144
  ram_util_percent: 19.811607142857145
pid: 5591
policy_reward_max:
  policy_0: 128607.0
  policy_1: 77936.0
  policy_2

2020-03-19 08:33:57,190	DEBUG multi_gpu_optimizer.py:205 -- 0 {'cur_kl_coeff': 0.5062500238418579, 'cur_lr': 4.999999873689376e-05, 'total_loss': 51427024.0, 'policy_loss': 0.004306498, 'vf_loss': 51427024.0, 'vf_explained_var': 0.0, 'kl': 0.0072600865, 'entropy': 7.789557, 'entropy_coeff': 0.0}
2020-03-19 08:33:59,837	DEBUG multi_gpu_optimizer.py:205 -- 1 {'cur_kl_coeff': 0.5062500238418579, 'cur_lr': 4.999999873689376e-05, 'total_loss': 51427020.0, 'policy_loss': -0.0053658546, 'vf_loss': 51427020.0, 'vf_explained_var': 0.0, 'kl': 0.0037237878, 'entropy': 7.7592344, 'entropy_coeff': 0.0}
2020-03-19 08:34:02,425	DEBUG multi_gpu_optimizer.py:205 -- 2 {'cur_kl_coeff': 0.5062500238418579, 'cur_lr': 4.999999873689376e-05, 'total_loss': 51427020.0, 'policy_loss': -0.008382696, 'vf_loss': 51427020.0, 'vf_explained_var': 0.0, 'kl': 0.0026941982, 'entropy': 7.7444153, 'entropy_coeff': 0.0}
2020-03-19 08:34:05,027	DEBUG multi_gpu_optimizer.py:205 -- 3 {'cur_kl_coeff': 0.5062500238418579, 'cur_

custom_metrics: {}
date: 2020-03-19_08-35-12
done: false
episode_len_mean: 701.0
episode_reward_max: 0.0
episode_reward_mean: 0.0
episode_reward_min: 0.0
episodes_this_iter: 1
episodes_total: 53
experiment_id: b6d1c176f8974af2bfd45911d6888178
hostname: 91380f02bba8
info:
  grad_time_ms: 78787.288
  learner:
    policy_0:
      cur_kl_coeff: 0.5062500238418579
      cur_lr: 4.999999873689376e-05
      entropy: 7.781811714172363
      entropy_coeff: 0.0
      kl: 0.011742925271391869
      policy_loss: -0.05333109572529793
      total_loss: 51427020.0
      vf_explained_var: 0.0
      vf_loss: 51427020.0
  load_time_ms: 17.278
  num_steps_sampled: 37153
  num_steps_trained: 33920
  sample_time_ms: 15623.874
  update_time_ms: 23.062
iterations_since_restore: 24
node_ip: 172.28.0.2
num_healthy_workers: 1
off_policy_estimator: {}
perf:
  cpu_util_percent: 97.72767857142857
  ram_util_percent: 19.900000000000002
pid: 5591
policy_reward_max:
  policy_0: 128607.0
  policy_1: 77936.0
  policy_2

2020-03-19 08:36:30,321	INFO multi_gpu_optimizer.py:143 -- Collected more training samples than expected (actual=701, train_batch_size=128). This may be because you have many workers or long episodes in 'complete_episodes' batch mode.
2020-03-19 08:36:30,322	INFO tf_policy.py:395 -- Optimizing variable <tf.Variable 'policy_0/default_model/fc2/kernel:0' shape=(512, 512) dtype=float32_ref>
2020-03-19 08:36:30,328	INFO tf_policy.py:395 -- Optimizing variable <tf.Variable 'policy_0/default_model/fc2/bias:0' shape=(512,) dtype=float32_ref>
2020-03-19 08:36:30,330	INFO tf_policy.py:395 -- Optimizing variable <tf.Variable 'policy_0/default_model/fc3/kernel:0' shape=(512, 512) dtype=float32_ref>
2020-03-19 08:36:30,331	INFO tf_policy.py:395 -- Optimizing variable <tf.Variable 'policy_0/default_model/fc3/bias:0' shape=(512,) dtype=float32_ref>
2020-03-19 08:36:30,331	INFO tf_policy.py:395 -- Optimizing variable <tf.Variable 'policy_0/default_model/rnn/lstm_cell/kernel:0' shape=(768, 1024) dtype

custom_metrics: {}
date: 2020-03-19_08-37-51
done: false
episode_len_mean: 701.0
episode_reward_max: 0.0
episode_reward_mean: 0.0
episode_reward_min: 0.0
episodes_this_iter: 1
episodes_total: 54
experiment_id: b6d1c176f8974af2bfd45911d6888178
hostname: 91380f02bba8
info:
  grad_time_ms: 79166.109
  learner:
    policy_0:
      cur_kl_coeff: 0.5062500238418579
      cur_lr: 4.999999873689376e-05
      entropy: 7.8424811363220215
      entropy_coeff: 0.0
      kl: 0.01885565184056759
      policy_loss: -0.09853583574295044
      total_loss: 5884383.0
      vf_explained_var: 0.0
      vf_loss: 5884383.0
  load_time_ms: 17.189
  num_steps_sampled: 37854
  num_steps_trained: 34560
  sample_time_ms: 23384.579
  update_time_ms: 22.846
iterations_since_restore: 25
node_ip: 172.28.0.2
num_healthy_workers: 1
off_policy_estimator: {}
perf:
  cpu_util_percent: 77.71858407079645
  ram_util_percent: 20.329646017699115
pid: 5591
policy_reward_max:
  policy_0: 128607.0
  policy_1: 77936.0
  policy_2: 

2020-03-19 08:37:54,037	DEBUG multi_gpu_optimizer.py:205 -- 0 {'cur_kl_coeff': 0.5062500238418579, 'cur_lr': 4.999999873689376e-05, 'total_loss': 21763476.0, 'policy_loss': 0.0061854436, 'vf_loss': 21763476.0, 'vf_explained_var': 0.0, 'kl': 0.011442553, 'entropy': 7.811034, 'entropy_coeff': 0.0}
2020-03-19 08:37:56,746	DEBUG multi_gpu_optimizer.py:205 -- 1 {'cur_kl_coeff': 0.5062500238418579, 'cur_lr': 4.999999873689376e-05, 'total_loss': 21763476.0, 'policy_loss': 0.00039393306, 'vf_loss': 21763472.0, 'vf_explained_var': 0.0, 'kl': 0.004990487, 'entropy': 7.7786584, 'entropy_coeff': 0.0}
2020-03-19 08:37:59,585	DEBUG multi_gpu_optimizer.py:205 -- 2 {'cur_kl_coeff': 0.5062500238418579, 'cur_lr': 4.999999873689376e-05, 'total_loss': 21763476.0, 'policy_loss': -0.0022594752, 'vf_loss': 21763476.0, 'vf_explained_var': 0.0, 'kl': 0.0027740828, 'entropy': 7.7564406, 'entropy_coeff': 0.0}
2020-03-19 08:38:02,232	DEBUG multi_gpu_optimizer.py:205 -- 3 {'cur_kl_coeff': 0.5062500238418579, 'cur_

custom_metrics: {}
date: 2020-03-19_08-39-13
done: false
episode_len_mean: 701.0
episode_reward_max: 0.0
episode_reward_mean: 0.0
episode_reward_min: 0.0
episodes_this_iter: 1
episodes_total: 55
experiment_id: b6d1c176f8974af2bfd45911d6888178
hostname: 91380f02bba8
info:
  grad_time_ms: 79624.496
  learner:
    policy_0:
      cur_kl_coeff: 0.5062500238418579
      cur_lr: 4.999999873689376e-05
      entropy: 7.816381931304932
      entropy_coeff: 0.0
      kl: 0.014014827087521553
      policy_loss: -0.05105217173695564
      total_loss: 21763476.0
      vf_explained_var: 0.0
      vf_loss: 21763472.0
  load_time_ms: 16.876
  num_steps_sampled: 38555
  num_steps_trained: 35200
  sample_time_ms: 23384.63
  update_time_ms: 23.342
iterations_since_restore: 26
node_ip: 172.28.0.2
num_healthy_workers: 1
off_policy_estimator: {}
perf:
  cpu_util_percent: 97.77777777777776
  ram_util_percent: 20.5
pid: 5591
policy_reward_max:
  policy_0: 128607.0
  policy_1: 77936.0
  policy_2: 67115.0
  pol

2020-03-19 08:39:15,894	DEBUG multi_gpu_optimizer.py:205 -- 0 {'cur_kl_coeff': 0.5062500238418579, 'cur_lr': 4.999999873689376e-05, 'total_loss': 28478042.0, 'policy_loss': 0.029746776, 'vf_loss': 28478042.0, 'vf_explained_var': 0.0, 'kl': 0.010115964, 'entropy': 7.7992477, 'entropy_coeff': 0.0}
2020-03-19 08:39:18,612	DEBUG multi_gpu_optimizer.py:205 -- 1 {'cur_kl_coeff': 0.5062500238418579, 'cur_lr': 4.999999873689376e-05, 'total_loss': 28478042.0, 'policy_loss': 0.018841984, 'vf_loss': 28478042.0, 'vf_explained_var': 0.0, 'kl': 0.0073009073, 'entropy': 7.7773185, 'entropy_coeff': 0.0}
2020-03-19 08:39:21,338	DEBUG multi_gpu_optimizer.py:205 -- 2 {'cur_kl_coeff': 0.5062500238418579, 'cur_lr': 4.999999873689376e-05, 'total_loss': 28478042.0, 'policy_loss': 0.012919143, 'vf_loss': 28478042.0, 'vf_explained_var': 0.0, 'kl': 0.006099367, 'entropy': 7.7673197, 'entropy_coeff': 0.0}
2020-03-19 08:39:24,064	DEBUG multi_gpu_optimizer.py:205 -- 3 {'cur_kl_coeff': 0.5062500238418579, 'cur_lr':

custom_metrics: {}
date: 2020-03-19_08-40-35
done: false
episode_len_mean: 701.0
episode_reward_max: 0.0
episode_reward_mean: 0.0
episode_reward_min: 0.0
episodes_this_iter: 1
episodes_total: 56
experiment_id: b6d1c176f8974af2bfd45911d6888178
hostname: 91380f02bba8
info:
  grad_time_ms: 79528.914
  learner:
    policy_0:
      cur_kl_coeff: 0.5062500238418579
      cur_lr: 4.999999873689376e-05
      entropy: 7.766786098480225
      entropy_coeff: 0.0
      kl: 0.013120660558342934
      policy_loss: -0.040953006595373154
      total_loss: 28478038.0
      vf_explained_var: 0.0
      vf_loss: 28478038.0
  load_time_ms: 14.431
  num_steps_sampled: 39256
  num_steps_trained: 35840
  sample_time_ms: 15389.437
  update_time_ms: 23.297
iterations_since_restore: 27
node_ip: 172.28.0.2
num_healthy_workers: 1
off_policy_estimator: {}
perf:
  cpu_util_percent: 97.81452991452991
  ram_util_percent: 20.599999999999994
pid: 5591
policy_reward_max:
  policy_0: 128607.0
  policy_1: 77936.0
  policy_

2020-03-19 08:40:38,040	DEBUG multi_gpu_optimizer.py:205 -- 0 {'cur_kl_coeff': 0.5062500238418579, 'cur_lr': 4.999999873689376e-05, 'total_loss': 1244473.9, 'policy_loss': 0.002398199, 'vf_loss': 1244473.8, 'vf_explained_var': 0.0, 'kl': 0.008823524, 'entropy': 7.758541, 'entropy_coeff': 0.0}
2020-03-19 08:40:40,715	DEBUG multi_gpu_optimizer.py:205 -- 1 {'cur_kl_coeff': 0.5062500238418579, 'cur_lr': 4.999999873689376e-05, 'total_loss': 1244473.8, 'policy_loss': -0.008708214, 'vf_loss': 1244473.8, 'vf_explained_var': 0.0, 'kl': 0.0035484496, 'entropy': 7.7364655, 'entropy_coeff': 0.0}
2020-03-19 08:40:43,434	DEBUG multi_gpu_optimizer.py:205 -- 2 {'cur_kl_coeff': 0.5062500238418579, 'cur_lr': 4.999999873689376e-05, 'total_loss': 1244473.8, 'policy_loss': -0.015359233, 'vf_loss': 1244473.8, 'vf_explained_var': 0.0, 'kl': 0.0027115964, 'entropy': 7.7265344, 'entropy_coeff': 0.0}
2020-03-19 08:40:46,091	DEBUG multi_gpu_optimizer.py:205 -- 3 {'cur_kl_coeff': 0.5062500238418579, 'cur_lr': 4.9

custom_metrics: {}
date: 2020-03-19_08-41-55
done: false
episode_len_mean: 701.0
episode_reward_max: 0.0
episode_reward_mean: 0.0
episode_reward_min: 0.0
episodes_this_iter: 1
episodes_total: 57
experiment_id: b6d1c176f8974af2bfd45911d6888178
hostname: 91380f02bba8
info:
  grad_time_ms: 79334.06
  learner:
    policy_0:
      cur_kl_coeff: 0.5062500238418579
      cur_lr: 4.999999873689376e-05
      entropy: 7.755210876464844
      entropy_coeff: 0.0
      kl: 0.012456420809030533
      policy_loss: -0.06198450177907944
      total_loss: 1244473.75
      vf_explained_var: 0.0
      vf_loss: 1244473.75
  load_time_ms: 14.561
  num_steps_sampled: 39957
  num_steps_trained: 36480
  sample_time_ms: 15389.408
  update_time_ms: 23.086
iterations_since_restore: 28
node_ip: 172.28.0.2
num_healthy_workers: 1
off_policy_estimator: {}
perf:
  cpu_util_percent: 97.68434782608695
  ram_util_percent: 20.699999999999992
pid: 5591
policy_reward_max:
  policy_0: 128607.0
  policy_1: 77936.0
  policy_2:

2020-03-19 08:43:15,361	INFO multi_gpu_optimizer.py:143 -- Collected more training samples than expected (actual=701, train_batch_size=128). This may be because you have many workers or long episodes in 'complete_episodes' batch mode.
2020-03-19 08:43:15,362	INFO tf_policy.py:395 -- Optimizing variable <tf.Variable 'policy_0/default_model/fc2/kernel:0' shape=(512, 512) dtype=float32_ref>
2020-03-19 08:43:15,363	INFO tf_policy.py:395 -- Optimizing variable <tf.Variable 'policy_0/default_model/fc2/bias:0' shape=(512,) dtype=float32_ref>
2020-03-19 08:43:15,364	INFO tf_policy.py:395 -- Optimizing variable <tf.Variable 'policy_0/default_model/fc3/kernel:0' shape=(512, 512) dtype=float32_ref>
2020-03-19 08:43:15,365	INFO tf_policy.py:395 -- Optimizing variable <tf.Variable 'policy_0/default_model/fc3/bias:0' shape=(512,) dtype=float32_ref>
2020-03-19 08:43:15,366	INFO tf_policy.py:395 -- Optimizing variable <tf.Variable 'policy_0/default_model/rnn/lstm_cell/kernel:0' shape=(768, 1024) dtype

custom_metrics: {}
date: 2020-03-19_08-44-36
done: false
episode_len_mean: 701.0
episode_reward_max: 0.0
episode_reward_mean: 0.0
episode_reward_min: 0.0
episodes_this_iter: 1
episodes_total: 58
experiment_id: b6d1c176f8974af2bfd45911d6888178
hostname: 91380f02bba8
info:
  grad_time_ms: 79462.244
  learner:
    policy_0:
      cur_kl_coeff: 0.5062500238418579
      cur_lr: 4.999999873689376e-05
      entropy: 7.759871006011963
      entropy_coeff: 0.0
      kl: 0.008400293067097664
      policy_loss: -0.041761450469493866
      total_loss: 35709320.0
      vf_explained_var: 0.0
      vf_loss: 35709320.0
  load_time_ms: 14.16
  num_steps_sampled: 40658
  num_steps_trained: 37120
  sample_time_ms: 23349.068
  update_time_ms: 23.13
iterations_since_restore: 29
node_ip: 172.28.0.2
num_healthy_workers: 1
off_policy_estimator: {}
perf:
  cpu_util_percent: 77.86260869565218
  ram_util_percent: 20.86739130434783
pid: 5591
policy_reward_max:
  policy_0: 128607.0
  policy_1: 77936.0
  policy_2: 

2020-03-19 08:44:39,538	DEBUG multi_gpu_optimizer.py:205 -- 0 {'cur_kl_coeff': 0.5062500238418579, 'cur_lr': 4.999999873689376e-05, 'total_loss': 20945772.0, 'policy_loss': -0.010612282, 'vf_loss': 20945772.0, 'vf_explained_var': 0.0, 'kl': 0.009538524, 'entropy': 7.758176, 'entropy_coeff': 0.0}
2020-03-19 08:44:42,217	DEBUG multi_gpu_optimizer.py:205 -- 1 {'cur_kl_coeff': 0.5062500238418579, 'cur_lr': 4.999999873689376e-05, 'total_loss': 20945772.0, 'policy_loss': -0.019643921, 'vf_loss': 20945772.0, 'vf_explained_var': 0.0, 'kl': 0.00422633, 'entropy': 7.740348, 'entropy_coeff': 0.0}
2020-03-19 08:44:44,876	DEBUG multi_gpu_optimizer.py:205 -- 2 {'cur_kl_coeff': 0.5062500238418579, 'cur_lr': 4.999999873689376e-05, 'total_loss': 20945772.0, 'policy_loss': -0.026252622, 'vf_loss': 20945772.0, 'vf_explained_var': 0.0, 'kl': 0.0025179698, 'entropy': 7.7299843, 'entropy_coeff': 0.0}
2020-03-19 08:44:47,570	DEBUG multi_gpu_optimizer.py:205 -- 3 {'cur_kl_coeff': 0.5062500238418579, 'cur_lr':

custom_metrics: {}
date: 2020-03-19_08-45-55
done: false
episode_len_mean: 701.0
episode_reward_max: 0.0
episode_reward_mean: 0.0
episode_reward_min: 0.0
episodes_this_iter: 1
episodes_total: 59
experiment_id: b6d1c176f8974af2bfd45911d6888178
hostname: 91380f02bba8
info:
  grad_time_ms: 79619.822
  learner:
    policy_0:
      cur_kl_coeff: 0.5062500238418579
      cur_lr: 4.999999873689376e-05
      entropy: 7.752533912658691
      entropy_coeff: 0.0
      kl: 0.009774331003427505
      policy_loss: -0.07509350031614304
      total_loss: 20945772.0
      vf_explained_var: 0.0
      vf_loss: 20945772.0
  load_time_ms: 14.566
  num_steps_sampled: 41359
  num_steps_trained: 37760
  sample_time_ms: 23349.058
  update_time_ms: 22.844
iterations_since_restore: 30
node_ip: 172.28.0.2
num_healthy_workers: 1
off_policy_estimator: {}
perf:
  cpu_util_percent: 97.63893805309733
  ram_util_percent: 20.900000000000002
pid: 5591
policy_reward_max:
  policy_0: 128607.0
  policy_1: 77936.0
  policy_2