In [1]:
import numpy as np
import os
import tensorflow as tf
import warnings

from keras.models import Model
from keras.layers import Dense, Flatten, Convolution2D, Input, Concatenate, Activation
from keras.optimizers import Adam
from pommerman.configs import ffa_v0_env
from pommerman.envs.v0 import Pomme
from pommerman.agents import SimpleAgent, BaseAgent
from pommerman.constants import BOARD_SIZE
from rl.agents import DDPGAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory
from rl.core import Env, Processor
from rl.callbacks import FileLogger, ModelIntervalCheckpoint, Callback
from rl.random import OrnsteinUhlenbeckProcess

  return f(*args, **kwds)
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Number of steps handed to agent.fit(...) as nb_steps in the training cell.
number_of_training_steps = 500000
# Interval shared by the FileLogger and ModelIntervalCheckpoint callbacks.
log_interval = 1000
# Output locations: keras-rl file log, TensorBoard event files, model checkpoints.
# NOTE(review): 'ddgp' in the run name looks like a typo for 'ddpg' - confirm before renaming,
# since existing logs/checkpoints may already live under these paths.
file_log_path = './dqn/rl_logs/ddgp_cnn128_3_3_dense_64_1/log.txt'
tensorboard_path = './dqn/logs/ddgp_cnn128_3_3_dense_64_1/'
# '{step}' is a format placeholder left in the template; it is never filled in by this notebook
# itself, so presumably the checkpoint callback substitutes it when saving - TODO confirm.
# NOTE(review): the '.h4' extension looks like a typo for '.h5' - confirm.
model_path = './dqn/model/ddgp_cnn128_3_3_dense_64_1/model{step}.h4'

In [3]:
# Make sure the parent directories of the log file and the checkpoint template exist
# before any callback tries to write into them.
for output_path in (file_log_path, model_path):
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

In [4]:
class TensorforceAgent(BaseAgent):
    """Placeholder agent that occupies the slot of the agent being trained.

    It never picks an action itself: `EnvWrapper.step` inserts the learner's
    action at the training agent's index instead.
    """

    def act(self, obs, action_space):
        """Ignore the observation and action space and return ``None``."""
        return None


class TensorboardLogger(Callback):
    """Logging in tensorboard without tensorflow ops.

    Per-step observations, rewards, actions and metrics are buffered per episode
    and reduced to scalar summaries when the episode ends.
    """

    def __init__(self, log_dir):
        """Create the callback together with a summary writer logging to ``log_dir``.

        Parameters
        ----------
        log_dir : str
            Directory handed to ``tf.summary.FileWriter``.
        """
        super(TensorboardLogger, self).__init__()
        # Some algorithms compute multiple episodes at once since they are multi-threaded.
        # We therefore use a dictionary that is indexed by the episode to separate episodes
        # from each other.
        self.observations = {}
        self.rewards = {}
        self.actions = {}
        self.metrics = {}
        # Filled in on_train_begin; defined here so the attribute always exists.
        self.metrics_names = []
        # Running count of steps seen across all episodes.
        self.step = 0
        self.writer = tf.summary.FileWriter(log_dir)

    def log_scalar(self, tag, value, step):
        """Log a scalar variable.

        Parameters
        ----------
        tag : basestring
            Name of the scalar
        value
            Scalar value to record.
        step : int
            training iteration
        """
        summary = tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=value)])
        self.writer.add_summary(summary, step)

    def on_train_begin(self, logs):
        """Remember the metric names of the model being trained."""
        self.metrics_names = self.model.metrics_names

    def on_episode_begin(self, episode, logs):
        """Start empty per-step buffers for ``episode``."""
        self.observations[episode] = []
        self.rewards[episode] = []
        self.actions[episode] = []
        self.metrics[episode] = []

    def on_episode_end(self, episode, logs):
        """Reduce the buffered steps of ``episode`` to scalars, log them and free the buffers.

        Every scalar is written with ``episode + 1`` as its step. Episodes without any
        recorded step are skipped, because the min/max reductions are undefined on
        empty input.
        """
        observations = self.observations[episode]
        rewards = self.rewards[episode]
        episode_steps = len(observations)

        if episode_steps > 0:
            actions = np.asarray(self.actions[episode])
            # Vector-valued actions are reduced to the index of their largest component;
            # scalar (already discrete) actions are used as they are.
            action_indices = np.argmax(actions, axis=1) if actions.ndim > 1 else actions

            variables = {
                'step': self.step,
                'nb_steps': self.params['nb_steps'],
                'episode_steps': episode_steps,
                'episode_reward': np.sum(rewards),
                'reward_mean': np.mean(rewards),
                'reward_min': np.min(rewards),
                'reward_max': np.max(rewards),
                'action_mean': np.mean(action_indices),
                'action_min': np.min(action_indices),
                'action_max': np.max(action_indices),
                'obs_mean': np.mean(observations),
                'obs_min': np.min(observations),
                'obs_max': np.max(observations),
            }

            # Format all metrics.
            metrics = np.array(self.metrics[episode])
            with warnings.catch_warnings():
                # Turn warnings into exceptions so that a warning raised by nanmean
                # (e.g. for an all-NaN column) maps to the sentinel value -1.
                warnings.filterwarnings('error')
                for idx, name in enumerate(self.metrics_names):
                    try:
                        value = np.nanmean(metrics[:, idx])
                    except Warning:
                        value = -1
                    variables[name] = value

            for key, value in variables.items():
                self.log_scalar(key, value, episode + 1)
            # Push the summaries to disk now so they are not lost if the kernel is
            # interrupted in the middle of a long training run.
            self.writer.flush()

        # Free up resources.
        del self.observations[episode]
        del self.rewards[episode]
        del self.actions[episode]
        del self.metrics[episode]

    def on_step_end(self, step, logs):
        """Append the data of one step to the buffers of the episode it belongs to."""
        episode = logs['episode']
        self.observations[episode].append(logs['observation'])
        self.rewards[episode].append(logs['reward'])
        self.actions[episode].append(logs['action'])
        self.metrics[episode].append(logs['metrics'])
        self.step += 1

In [5]:
# Instantiate the environment
config = ffa_v0_env()
env = Pomme(**config["env_kwargs"])
# Seed NumPy's global RNG and the environment.
# NOTE(review): TensorFlow's own RNG is not seeded here, so network initialisation
# presumably still differs between runs - confirm if full reproducibility is required.
np.random.seed(0)
env.seed(0)

# NOTE(review): passing None presumably means "no predefined initial game state" - confirm.
env.set_init_game_state(None)
# Size of the action space; used below to size the actor output and the critic's action input.
nb_actions = env.action_space.n


def create_actor(actions, input_shape=(13, 13, 17,)):
    """Build the actor network.

    Two 3x3 ReLU convolutions with 128 filters each are followed by a 64-unit
    ReLU dense layer and a softmax over ``actions`` outputs.

    # Arguments
        actions (int): Number of units in the output layer.
        input_shape (tuple): Shape of a single observation.
    # Returns
        The (uncompiled) Keras model mapping an observation to the softmax output.
    """
    observation = Input(input_shape)

    hidden = observation
    for _ in range(2):
        hidden = Convolution2D(128, 3, activation='relu')(hidden)
    hidden = Flatten()(hidden)
    hidden = Dense(64, activation='relu')(hidden)

    logits = Dense(actions)(hidden)
    probabilities = Activation('softmax')(logits)
    return Model(inputs=observation, outputs=probabilities)


def create_critic(actions, input_shape=(13, 13, 17,)):
    """Build the critic network.

    The observation passes through two 3x3 ReLU convolutions with 128 filters,
    is flattened and concatenated with the action vector, and then goes through
    two 32-unit ReLU dense layers down to a single linear output.

    # Arguments
        actions (int): Length of the action vector fed into the critic.
        input_shape (tuple): Shape of a single observation.
    # Returns
        Tuple ``(action_input, model)``: the action input tensor and the
        (uncompiled) Keras model taking ``[action_input, observation_input]``.
    """
    action_input = Input(shape=(actions,), name='action_input')
    observation_input = Input(shape=input_shape, name='observation_input')

    features = observation_input
    for _ in range(2):
        features = Convolution2D(128, 3, activation='relu')(features)
    features = Flatten()(features)

    hidden = Concatenate()([action_input, features])
    for _ in range(2):
        hidden = Dense(32)(hidden)
        hidden = Activation('relu')(hidden)
    q_value = Dense(1)(hidden)

    critic_model = Model(inputs=[action_input, observation_input], outputs=q_value)
    return action_input, critic_model


# Build both networks; the critic's action input tensor is kept because the agent
# constructor needs it (critic_action_input).
actor = create_actor(nb_actions)
action_input, critic = create_critic(nb_actions)
# Model.summary() prints the table itself and returns None, so wrapping it in
# print() would only add a stray "None" line to the output.
actor.summary()
critic.summary()

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 13, 13, 17)        0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 11, 11, 128)       19712     
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 9, 9, 128)         147584    
_________________________________________________________________
flatten_1 (Flatten)          (None, 10368)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 64)                663616    
_________________________________________________________________
dense_2 (Dense)              (None, 6)                 390       
__________________________________

In [6]:
class EnvWrapper(Env):
    """Single-agent view of a multi-agent Pommerman environment.

    The wrapped environment (``gym``) steps all agents at once. This wrapper exposes
    the `step` / `reset` / `render` / `close` API expected by the learning agent:
    it asks the wrapped environment for the other agents' actions, inserts the
    learner's action at the training agent's index, and returns only the training
    agent's (featurized) observation and reward.
    """
    reward_range = (-1, 1)
    action_space = None
    observation_space = None

    def __init__(self, gym, board_size):
        """Wrap ``gym`` and mirror its spaces and reward range.

        # Arguments
            gym (object): The multi-agent environment to wrap.
            board_size (int): Side length of the square board, used by `featurize`.
        """
        self.gym = gym
        self.action_space = gym.action_space
        self.observation_space = gym.observation_space
        self.reward_range = gym.reward_range
        self.board_size = board_size

    def step(self, action):
        """Run one timestep of the environment's dynamics.
        Accepts an action and returns a tuple (observation, reward, done, info).
        # Arguments
            action (object): Action vector produced by the learner; the index of its
                largest entry is used as the discrete action.
        # Returns
            observation (object): Featurized observation of the training agent.
            reward (float) : Reward of the training agent for this step.
            done (boolean): Whether the episode has ended, in which case further step() calls will return undefined results.
            info (dict): Contains auxiliary diagnostic information (helpful for debugging, and sometimes learning).
        """
        action = np.argmax(action)
        obs = self.gym.get_observations()
        # Actions of the other agents; the learner's action is spliced in at its own index.
        all_actions = self.gym.act(obs)
        all_actions.insert(self.gym.training_agent, action)
        state, reward, terminal, info = self.gym.step(all_actions)
        agent_state = self.featurize(state[self.gym.training_agent])
        agent_reward = reward[self.gym.training_agent]
        return agent_state, agent_reward, terminal, info

    def reset(self):
        """
        Resets the state of the environment and returns an initial observation.

        The training agent is placed at a randomly drawn index among four agents;
        the remaining three slots are filled with `SimpleAgent` opponents.
        # Returns
            observation (object): The initial featurized observation of the training agent. Initial reward is assumed to be 0.
        """
        train_agent_pos = np.random.randint(0, 4)
        agents = []
        for agent_id in range(4):
            agent_class = TensorforceAgent if agent_id == train_agent_pos else SimpleAgent
            agents.append(agent_class(config["agent"](agent_id, config["game_type"])))
        self.gym.set_agents(agents)
        self.gym.set_training_agent(agents[train_agent_pos].agent_id)

        obs = self.gym.reset()
        agent_obs = self.featurize(obs[self.gym.training_agent])
        return agent_obs

    def render(self, mode='human', close=False):
        """Renders the environment.
        The set of supported modes varies per environment. (And some
        environments do not support rendering at all.)
        # Arguments
            mode (str): The mode to render with.
            close (bool): Close all open renderings.
        """
        self.gym.render(mode=mode, close=close)

    def close(self):
        """Close the wrapped environment."""
        self.gym.close()

    def seed(self, seed=None):
        """Sets the seed for this env's random number generator(s).
        # Returns
            Whatever the wrapped environment's `seed` returns.
        """
        # The result has to be returned, not raised: raising a non-exception value
        # fails with a TypeError.
        return self.gym.seed(seed)

    def configure(self, *args, **kwargs):
        """Provides runtime configuration to the environment.

        Not supported by this wrapper.
        # Raises
            NotImplementedError: Always.
        """
        raise NotImplementedError()

    def featurize(self, obs):
        """Convert one agent's observation dict into a stack of board-sized planes.

        Planes are concatenated along the last axis in this order: own position,
        enemies, teammate, one binary plane for each board item value 1..9
        (Rigid, Wood, Bomb, Flames, Fog, ExtraBomb, IncrRange, Kick, Skull),
        bomb blast strength, bomb life, and three constant planes filled with
        ammo, blast strength and the can-kick flag.

        # Arguments
            obs (dict): Observation with the keys 'board', 'position', 'teammate',
                'enemies', 'bomb_blast_strength', 'bomb_life', 'ammo',
                'blast_strength' and 'can_kick'.
        # Returns
            Array of shape (board_size, board_size, 17).
        """
        shape = (self.board_size, self.board_size, 1)

        def as_plane(key):
            # Board-shaped entry of the observation as a float32 plane.
            return obs[key].reshape(shape).astype(np.float32)

        def item_plane(item):
            # Binary mask of the board cells holding `item`.
            plane = np.zeros(shape)
            plane[board == item] = 1
            return plane

        board = as_plane('board')

        # TODO: probably not needed Passage = 0
        # Rigid = 1, Wood = 2, Bomb = 3, Flames = 4, Fog = 5 (TODO: not used for first
        # two stages), ExtraBomb = 6, IncrRange = 7, Kick = 8, Skull = 9
        item_planes = [item_plane(item) for item in range(1, 10)]

        position = obs["position"]
        my_position = np.zeros(shape)
        my_position[position[0], position[1], 0] = 1

        team_mates = item_plane(obs["teammate"].value)  # TODO during documentation it should be an array

        enemies = np.zeros(shape)
        for enemy in obs["enemies"]:
            enemies[board == enemy.value] = 1

        bomb_blast_strength = as_plane('bomb_blast_strength')
        bomb_life = as_plane('bomb_life')

        # Scalar attributes are broadcast to full planes so they can be stacked with the maps.
        ammo = np.full(shape, obs["ammo"])
        blast_strength = np.full(shape, obs["blast_strength"])
        can_kick = np.full(shape, int(obs["can_kick"]))

        planes = [my_position, enemies, team_mates]
        planes.extend(item_planes)
        planes.extend([bomb_blast_strength, bomb_life, ammo, blast_strength, can_kick])
        return np.concatenate(planes, axis=2)

    def __del__(self):
        # __init__ may have failed before self.gym was assigned; closing then would
        # raise an AttributeError during garbage collection.
        if getattr(self, 'gym', None) is not None:
            self.close()

    def __str__(self):
        return '<{} instance>'.format(type(self).__name__)


class CustomProcessor(Processor):
    """Processor adapting state batches and step info for the agent."""

    def process_state_batch(self, batch):
        """Drop the length-one axis 1 from a batch of states.

        # Arguments
            batch (list): List of states
        # Returns
            The batch with axis 1 removed.
        """
        # presumably axis 1 is the memory window axis (window_length=1) - TODO confirm
        return np.squeeze(batch, axis=1)

    def process_info(self, info):
        """Replace ``info['result']`` with its ``.value`` and return the same dict.

        The dict is modified in place.
        """
        result = info['result']
        info['result'] = result.value
        return info

In [7]:
def find_latest_checkpoint_step(path_template):
    """Return the highest step for which a checkpoint file exists, or ``None``.

    ``path_template`` contains a ``{step}`` placeholder in its file name. Files are
    matched on the part of the name before the placeholder followed by digits,
    rather than on the exact formatted name, because keras-rl's DDPG agent stores
    actor and critic in separate files whose names are derived from the given path.
    """
    directory, file_template = os.path.split(path_template)
    prefix, placeholder, _ = file_template.partition('{step}')
    directory = directory or '.'
    if not placeholder or not os.path.isdir(directory):
        return None

    steps = []
    for file_name in os.listdir(directory):
        if not file_name.startswith(prefix):
            continue
        remainder = file_name[len(prefix):]
        digits = remainder[:len(remainder) - len(remainder.lstrip('0123456789'))]
        if digits:
            steps.append(int(digits))
    return max(steps) if steps else None


env_wrapper = EnvWrapper(env, BOARD_SIZE)
processor = CustomProcessor()


# Replay memory, exploration noise and the DDPG agent itself.
memory = SequentialMemory(limit=100000, window_length=1)
random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=.15, mu=0., sigma=.3)
agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input,
                  memory=memory, nb_steps_warmup_critic=512, nb_steps_warmup_actor=512,
                  random_process=random_process, gamma=.99, target_model_update=1e-3,
                  batch_size=512, processor=processor)
agent.compile(Adam(lr=0.0001, clipnorm=1.), metrics=['mae'])


file_logger = FileLogger(file_log_path, interval=log_interval)
checkpoint = ModelIntervalCheckpoint(model_path, interval=log_interval)
tensorboard = TensorboardLogger(tensorboard_path)
callbacks=[file_logger, checkpoint, tensorboard]

# Resume from the most recent checkpoint, if any. model_path is a template with a
# '{step}' placeholder, so testing it with os.path.isfile() directly can never match
# a saved checkpoint; the placeholder has to be resolved to a concrete step first.
latest_step = find_latest_checkpoint_step(model_path)
if latest_step is not None:
    agent.load_weights(model_path.format(step=latest_step))

In [None]:
# Train the agent on the wrapped environment. Episodes are capped at the
# environment's own step limit, and all three callbacks are attached.
history = agent.fit(
    env_wrapper,
    nb_steps=number_of_training_steps,
    visualize=False,
    verbose=2,
    nb_max_episode_steps=env._max_steps,
    callbacks=callbacks,
)

Training for 500000 steps ...
    154/500000: episode: 1, duration: 3.560s, episode steps: 154, steps per second: 43, episode reward: -1.000, mean reward: -0.006 [-1.000, 0.000], mean action: 0.202 [-0.255, 0.758], mean observation: 0.278 [0.000, 24.000], loss: --, mean_absolute_error: --, mean_q: --
   1124/500000: episode: 2, duration: 58.083s, episode steps: 970, steps per second: 17, episode reward: -1.000, mean reward: -0.001 [-1.000, 0.000], mean action: 0.214 [-0.885, 1.232], mean observation: 0.263 [0.000, 24.000], loss: 0.000180, mean_absolute_error: 0.006944, mean_q: 0.025429
   3624/500000: episode: 3, duration: 207.377s, episode steps: 2500, steps per second: 12, episode reward: 0.000, mean reward: 0.000 [0.000, 0.000], mean action: 0.165 [-1.327, 1.809], mean observation: 0.265 [0.000, 24.000], loss: 0.000074, mean_absolute_error: 0.003767, mean_q: 0.022566
   4083/500000: episode: 4, duration: 37.490s, episode steps: 459, steps per second: 12, episode reward: -1.000, mean

  24243/500000: episode: 30, duration: 68.336s, episode steps: 923, steps per second: 14, episode reward: -1.000, mean reward: -0.001 [-1.000, 0.000], mean action: 0.432 [-1.152, 2.108], mean observation: 0.265 [0.000, 24.000], loss: 0.000056, mean_absolute_error: 0.004104, mean_q: -0.001009
  24650/500000: episode: 31, duration: 31.858s, episode steps: 407, steps per second: 13, episode reward: -1.000, mean reward: -0.002 [-1.000, 0.000], mean action: 0.250 [-0.520, 1.231], mean observation: 0.275 [0.000, 24.000], loss: 0.000052, mean_absolute_error: 0.004107, mean_q: -0.002057
  25005/500000: episode: 32, duration: 28.883s, episode steps: 355, steps per second: 12, episode reward: -1.000, mean reward: -0.003 [-1.000, 0.000], mean action: 0.148 [-0.638, 1.200], mean observation: 0.277 [0.000, 24.000], loss: 0.000044, mean_absolute_error: 0.003961, mean_q: -0.003495
  25313/500000: episode: 33, duration: 24.375s, episode steps: 308, steps per second: 13, episode reward: -1.000, mean re

  48666/500000: episode: 58, duration: 27.750s, episode steps: 340, steps per second: 12, episode reward: -1.000, mean reward: -0.003 [-1.000, 0.000], mean action: 0.019 [-0.784, 1.192], mean observation: 0.275 [0.000, 24.000], loss: 0.000048, mean_absolute_error: 0.004241, mean_q: -0.045635
  49208/500000: episode: 59, duration: 43.708s, episode steps: 542, steps per second: 12, episode reward: -1.000, mean reward: -0.002 [-1.000, 0.000], mean action: 0.267 [-0.642, 1.828], mean observation: 0.273 [0.000, 24.000], loss: 0.000058, mean_absolute_error: 0.004751, mean_q: -0.046185
  49511/500000: episode: 60, duration: 24.375s, episode steps: 303, steps per second: 12, episode reward: -1.000, mean reward: -0.003 [-1.000, 0.000], mean action: 0.205 [-0.826, 1.648], mean observation: 0.278 [0.000, 24.000], loss: 0.000064, mean_absolute_error: 0.004894, mean_q: -0.047289
  52011/500000: episode: 61, duration: 190.061s, episode steps: 2500, steps per second: 13, episode reward: 0.000, mean r

  74768/500000: episode: 87, duration: 38.077s, episode steps: 462, steps per second: 12, episode reward: -1.000, mean reward: -0.002 [-1.000, 0.000], mean action: 0.149 [-0.941, 1.492], mean observation: 0.276 [0.000, 24.000], loss: 0.000082, mean_absolute_error: 0.005408, mean_q: -0.060638
  75249/500000: episode: 88, duration: 39.036s, episode steps: 481, steps per second: 12, episode reward: -1.000, mean reward: -0.002 [-1.000, 0.000], mean action: 0.252 [-0.530, 1.446], mean observation: 0.269 [0.000, 24.000], loss: 0.000080, mean_absolute_error: 0.005226, mean_q: -0.060878
  75616/500000: episode: 89, duration: 28.715s, episode steps: 367, steps per second: 13, episode reward: -1.000, mean reward: -0.003 [-1.000, 0.000], mean action: 0.227 [-1.143, 1.319], mean observation: 0.277 [0.000, 24.000], loss: 0.000098, mean_absolute_error: 0.005480, mean_q: -0.061589
  78116/500000: episode: 90, duration: 203.771s, episode steps: 2500, steps per second: 12, episode reward: 0.000, mean r

  91206/500000: episode: 115, duration: 32.498s, episode steps: 403, steps per second: 12, episode reward: -1.000, mean reward: -0.002 [-1.000, 0.000], mean action: 0.082 [-0.696, 1.385], mean observation: 0.276 [0.000, 24.000], loss: 0.000079, mean_absolute_error: 0.005340, mean_q: -0.071743
  91587/500000: episode: 116, duration: 31.007s, episode steps: 381, steps per second: 12, episode reward: -1.000, mean reward: -0.003 [-1.000, 0.000], mean action: 0.332 [-0.404, 1.780], mean observation: 0.279 [0.000, 24.000], loss: 0.000070, mean_absolute_error: 0.005158, mean_q: -0.071814
  94087/500000: episode: 117, duration: 204.057s, episode steps: 2500, steps per second: 12, episode reward: 0.000, mean reward: 0.000 [0.000, 0.000], mean action: -0.008 [-1.554, 1.998], mean observation: 0.265 [0.000, 24.000], loss: 0.000062, mean_absolute_error: 0.005070, mean_q: -0.072692
  94447/500000: episode: 118, duration: 28.161s, episode steps: 360, steps per second: 13, episode reward: -1.000, mea

 111481/500000: episode: 143, duration: 30.839s, episode steps: 383, steps per second: 12, episode reward: -1.000, mean reward: -0.003 [-1.000, 0.000], mean action: 0.214 [-0.630, 1.438], mean observation: 0.277 [0.000, 24.000], loss: 0.000072, mean_absolute_error: 0.005345, mean_q: -0.085778
 111837/500000: episode: 144, duration: 29.091s, episode steps: 356, steps per second: 12, episode reward: -1.000, mean reward: -0.003 [-1.000, 0.000], mean action: 0.229 [-0.973, 1.733], mean observation: 0.278 [0.000, 24.000], loss: 0.000062, mean_absolute_error: 0.005211, mean_q: -0.086440
 112397/500000: episode: 145, duration: 44.263s, episode steps: 560, steps per second: 13, episode reward: -1.000, mean reward: -0.002 [-1.000, 0.000], mean action: 0.125 [-0.828, 1.584], mean observation: 0.274 [0.000, 24.000], loss: 0.000068, mean_absolute_error: 0.005152, mean_q: -0.085725
 112820/500000: episode: 146, duration: 34.380s, episode steps: 423, steps per second: 12, episode reward: -1.000, mea

 130104/500000: episode: 171, duration: 13.588s, episode steps: 167, steps per second: 12, episode reward: -1.000, mean reward: -0.006 [-1.000, 0.000], mean action: -0.009 [-0.452, 0.971], mean observation: 0.278 [0.000, 24.000], loss: 0.000071, mean_absolute_error: 0.004925, mean_q: -0.096899
 130534/500000: episode: 172, duration: 34.399s, episode steps: 430, steps per second: 13, episode reward: -1.000, mean reward: -0.002 [-1.000, 0.000], mean action: -0.075 [-1.111, 1.263], mean observation: 0.276 [0.000, 24.000], loss: 0.000062, mean_absolute_error: 0.004549, mean_q: -0.097868
 133034/500000: episode: 173, duration: 209.344s, episode steps: 2500, steps per second: 12, episode reward: 0.000, mean reward: 0.000 [0.000, 0.000], mean action: 0.335 [-1.537, 2.020], mean observation: 0.265 [0.000, 24.000], loss: 0.000060, mean_absolute_error: 0.004563, mean_q: -0.098612
 133417/500000: episode: 174, duration: 31.481s, episode steps: 383, steps per second: 12, episode reward: -1.000, me

 160020/500000: episode: 199, duration: 50.373s, episode steps: 557, steps per second: 11, episode reward: -1.000, mean reward: -0.002 [-1.000, 0.000], mean action: 0.162 [-1.096, 1.405], mean observation: 0.272 [0.000, 24.000], loss: 0.000053, mean_absolute_error: 0.004109, mean_q: -0.110607
 160411/500000: episode: 200, duration: 30.764s, episode steps: 391, steps per second: 13, episode reward: -1.000, mean reward: -0.003 [-1.000, 0.000], mean action: 0.203 [-0.734, 1.133], mean observation: 0.278 [0.000, 24.000], loss: 0.000068, mean_absolute_error: 0.004421, mean_q: -0.110725
 160980/500000: episode: 201, duration: 44.734s, episode steps: 569, steps per second: 13, episode reward: -1.000, mean reward: -0.002 [-1.000, 0.000], mean action: 0.331 [-0.624, 1.930], mean observation: 0.274 [0.000, 24.000], loss: 0.000051, mean_absolute_error: 0.004087, mean_q: -0.111403
 161394/500000: episode: 202, duration: 32.177s, episode steps: 414, steps per second: 13, episode reward: -1.000, mea

 177169/500000: episode: 227, duration: 23.880s, episode steps: 320, steps per second: 13, episode reward: -1.000, mean reward: -0.003 [-1.000, 0.000], mean action: 0.272 [-0.556, 1.481], mean observation: 0.276 [0.000, 24.000], loss: 0.000067, mean_absolute_error: 0.004612, mean_q: -0.125257
 177653/500000: episode: 228, duration: 37.777s, episode steps: 484, steps per second: 13, episode reward: -1.000, mean reward: -0.002 [-1.000, 0.000], mean action: 0.166 [-1.055, 1.539], mean observation: 0.274 [0.000, 24.000], loss: 0.000057, mean_absolute_error: 0.004365, mean_q: -0.125283
 177963/500000: episode: 229, duration: 24.054s, episode steps: 310, steps per second: 13, episode reward: -1.000, mean reward: -0.003 [-1.000, 0.000], mean action: 0.310 [-0.701, 1.113], mean observation: 0.275 [0.000, 24.000], loss: 0.000057, mean_absolute_error: 0.004459, mean_q: -0.126656
 178870/500000: episode: 230, duration: 66.068s, episode steps: 907, steps per second: 14, episode reward: -1.000, mea

 198123/500000: episode: 255, duration: 37.537s, episode steps: 459, steps per second: 12, episode reward: -1.000, mean reward: -0.002 [-1.000, 0.000], mean action: 0.086 [-1.212, 1.777], mean observation: 0.276 [0.000, 24.000], loss: 0.000067, mean_absolute_error: 0.004504, mean_q: -0.141689
 198497/500000: episode: 256, duration: 30.796s, episode steps: 374, steps per second: 12, episode reward: -1.000, mean reward: -0.003 [-1.000, 0.000], mean action: 0.143 [-0.529, 1.307], mean observation: 0.292 [0.000, 24.000], loss: 0.000066, mean_absolute_error: 0.004579, mean_q: -0.141308
 198850/500000: episode: 257, duration: 28.670s, episode steps: 353, steps per second: 12, episode reward: -1.000, mean reward: -0.003 [-1.000, 0.000], mean action: 0.181 [-0.820, 1.583], mean observation: 0.278 [0.000, 24.000], loss: 0.000054, mean_absolute_error: 0.004340, mean_q: -0.141142
 198917/500000: episode: 258, duration: 5.501s, episode steps: 67, steps per second: 12, episode reward: -1.000, mean 

 215632/500000: episode: 283, duration: 45.296s, episode steps: 552, steps per second: 12, episode reward: -1.000, mean reward: -0.002 [-1.000, 0.000], mean action: 0.104 [-0.895, 1.739], mean observation: 0.271 [0.000, 24.000], loss: 0.000069, mean_absolute_error: 0.004523, mean_q: -0.143145
 215817/500000: episode: 284, duration: 14.711s, episode steps: 185, steps per second: 13, episode reward: -1.000, mean reward: -0.005 [-1.000, 0.000], mean action: 0.069 [-0.774, 1.034], mean observation: 0.275 [0.000, 24.000], loss: 0.000098, mean_absolute_error: 0.004673, mean_q: -0.142513
 216158/500000: episode: 285, duration: 28.331s, episode steps: 341, steps per second: 12, episode reward: -1.000, mean reward: -0.003 [-1.000, 0.000], mean action: 0.161 [-0.561, 2.053], mean observation: 0.283 [0.000, 24.000], loss: 0.000074, mean_absolute_error: 0.004596, mean_q: -0.142644
 217952/500000: episode: 286, duration: 131.313s, episode steps: 1794, steps per second: 14, episode reward: -1.000, m

 238633/500000: episode: 311, duration: 31.720s, episode steps: 406, steps per second: 13, episode reward: -1.000, mean reward: -0.002 [-1.000, 0.000], mean action: -0.109 [-1.039, 1.102], mean observation: 0.276 [0.000, 24.000], loss: 0.000067, mean_absolute_error: 0.004266, mean_q: -0.138839
 239025/500000: episode: 312, duration: 30.651s, episode steps: 392, steps per second: 13, episode reward: -1.000, mean reward: -0.003 [-1.000, 0.000], mean action: 0.082 [-0.819, 0.999], mean observation: 0.273 [0.000, 24.000], loss: 0.000076, mean_absolute_error: 0.004628, mean_q: -0.138748
 239300/500000: episode: 313, duration: 21.347s, episode steps: 275, steps per second: 13, episode reward: -1.000, mean reward: -0.004 [-1.000, 0.000], mean action: 0.416 [-0.386, 1.551], mean observation: 0.280 [0.000, 24.000], loss: 0.000071, mean_absolute_error: 0.004529, mean_q: -0.139664
 239908/500000: episode: 314, duration: 50.193s, episode steps: 608, steps per second: 12, episode reward: -1.000, me

 257245/500000: episode: 339, duration: 40.301s, episode steps: 498, steps per second: 12, episode reward: -1.000, mean reward: -0.002 [-1.000, 0.000], mean action: 0.217 [-0.523, 1.290], mean observation: 0.274 [0.000, 24.000], loss: 0.000070, mean_absolute_error: 0.004822, mean_q: -0.161209
 257936/500000: episode: 340, duration: 56.546s, episode steps: 691, steps per second: 12, episode reward: -1.000, mean reward: -0.001 [-1.000, 0.000], mean action: 0.295 [-0.874, 1.965], mean observation: 0.270 [0.000, 24.000], loss: 0.000073, mean_absolute_error: 0.005039, mean_q: -0.161896
 258145/500000: episode: 341, duration: 17.305s, episode steps: 209, steps per second: 12, episode reward: -1.000, mean reward: -0.005 [-1.000, 0.000], mean action: 0.095 [-0.457, 0.988], mean observation: 0.279 [0.000, 24.000], loss: 0.000070, mean_absolute_error: 0.004844, mean_q: -0.162224
 259117/500000: episode: 342, duration: 77.438s, episode steps: 972, steps per second: 13, episode reward: -1.000, mea

 280760/500000: episode: 367, duration: 92.934s, episode steps: 1210, steps per second: 13, episode reward: -1.000, mean reward: -0.001 [-1.000, 0.000], mean action: 0.218 [-1.004, 1.711], mean observation: 0.264 [0.000, 24.000], loss: 0.000071, mean_absolute_error: 0.004578, mean_q: -0.163114
 281135/500000: episode: 368, duration: 29.160s, episode steps: 375, steps per second: 13, episode reward: -1.000, mean reward: -0.003 [-1.000, 0.000], mean action: 0.119 [-0.842, 1.597], mean observation: 0.277 [0.000, 24.000], loss: 0.000060, mean_absolute_error: 0.004415, mean_q: -0.163759
 281527/500000: episode: 369, duration: 32.011s, episode steps: 392, steps per second: 12, episode reward: -1.000, mean reward: -0.003 [-1.000, 0.000], mean action: 0.143 [-0.861, 1.086], mean observation: 0.280 [0.000, 24.000], loss: 0.000073, mean_absolute_error: 0.004703, mean_q: -0.164209
 282419/500000: episode: 370, duration: 70.733s, episode steps: 892, steps per second: 13, episode reward: -1.000, me

 303623/500000: episode: 395, duration: 33.282s, episode steps: 410, steps per second: 12, episode reward: -1.000, mean reward: -0.002 [-1.000, 0.000], mean action: 0.035 [-1.009, 1.543], mean observation: 0.275 [0.000, 24.000], loss: 0.000071, mean_absolute_error: 0.004420, mean_q: -0.149678
 305937/500000: episode: 396, duration: 184.684s, episode steps: 2314, steps per second: 13, episode reward: -1.000, mean reward: -0.000 [-1.000, 0.000], mean action: 0.286 [-1.727, 1.861], mean observation: 0.264 [0.000, 24.000], loss: 0.000067, mean_absolute_error: 0.004312, mean_q: -0.150835
 306348/500000: episode: 397, duration: 34.344s, episode steps: 411, steps per second: 12, episode reward: -1.000, mean reward: -0.002 [-1.000, 0.000], mean action: 0.023 [-1.174, 1.326], mean observation: 0.276 [0.000, 24.000], loss: 0.000059, mean_absolute_error: 0.004080, mean_q: -0.150888
 307618/500000: episode: 398, duration: 99.316s, episode steps: 1270, steps per second: 13, episode reward: -1.000, 

 326969/500000: episode: 423, duration: 36.539s, episode steps: 449, steps per second: 12, episode reward: -1.000, mean reward: -0.002 [-1.000, 0.000], mean action: 0.173 [-1.295, 1.802], mean observation: 0.272 [0.000, 24.000], loss: 0.000057, mean_absolute_error: 0.004114, mean_q: -0.154863
 328126/500000: episode: 424, duration: 87.407s, episode steps: 1157, steps per second: 13, episode reward: -1.000, mean reward: -0.001 [-1.000, 0.000], mean action: 0.083 [-1.587, 1.817], mean observation: 0.262 [0.000, 24.000], loss: 0.000060, mean_absolute_error: 0.004188, mean_q: -0.156755
 328894/500000: episode: 425, duration: 60.809s, episode steps: 768, steps per second: 13, episode reward: 1.000, mean reward: 0.001 [0.000, 1.000], mean action: 0.060 [-1.162, 1.673], mean observation: 0.270 [0.000, 24.000], loss: 0.000057, mean_absolute_error: 0.004126, mean_q: -0.157582
 329161/500000: episode: 426, duration: 22.603s, episode steps: 267, steps per second: 12, episode reward: -1.000, mean 

 345451/500000: episode: 453, duration: 23.252s, episode steps: 276, steps per second: 12, episode reward: -1.000, mean reward: -0.004 [-1.000, 0.000], mean action: 0.150 [-0.770, 1.228], mean observation: 0.284 [0.000, 24.000], loss: 0.000076, mean_absolute_error: 0.004563, mean_q: -0.152466
 345698/500000: episode: 454, duration: 20.565s, episode steps: 247, steps per second: 12, episode reward: -1.000, mean reward: -0.004 [-1.000, 0.000], mean action: 0.148 [-0.607, 1.416], mean observation: 0.281 [0.000, 24.000], loss: 0.000058, mean_absolute_error: 0.004220, mean_q: -0.152476
 346135/500000: episode: 455, duration: 50.747s, episode steps: 437, steps per second: 9, episode reward: -1.000, mean reward: -0.002 [-1.000, 0.000], mean action: -0.011 [-1.094, 1.297], mean observation: 0.275 [0.000, 24.000], loss: 0.000058, mean_absolute_error: 0.004210, mean_q: -0.151591
 346761/500000: episode: 456, duration: 53.148s, episode steps: 626, steps per second: 12, episode reward: -1.000, mea

 362239/500000: episode: 481, duration: 40.062s, episode steps: 514, steps per second: 13, episode reward: -1.000, mean reward: -0.002 [-1.000, 0.000], mean action: 0.233 [-0.724, 1.453], mean observation: 0.273 [0.000, 24.000], loss: 0.000065, mean_absolute_error: 0.004067, mean_q: -0.150862
 362631/500000: episode: 482, duration: 32.929s, episode steps: 392, steps per second: 12, episode reward: -1.000, mean reward: -0.003 [-1.000, 0.000], mean action: -0.117 [-1.073, 1.273], mean observation: 0.277 [0.000, 24.000], loss: 0.000071, mean_absolute_error: 0.004325, mean_q: -0.150968
 363352/500000: episode: 483, duration: 59.163s, episode steps: 721, steps per second: 12, episode reward: -1.000, mean reward: -0.001 [-1.000, 0.000], mean action: 0.167 [-0.947, 1.795], mean observation: 0.272 [0.000, 24.000], loss: 0.000066, mean_absolute_error: 0.004169, mean_q: -0.149694
 363654/500000: episode: 484, duration: 25.071s, episode steps: 302, steps per second: 12, episode reward: -1.000, me

 378173/500000: episode: 509, duration: 32.468s, episode steps: 390, steps per second: 12, episode reward: -1.000, mean reward: -0.003 [-1.000, 0.000], mean action: 0.135 [-0.951, 1.298], mean observation: 0.271 [0.000, 24.000], loss: 0.000056, mean_absolute_error: 0.004567, mean_q: -0.161750
 378902/500000: episode: 510, duration: 57.470s, episode steps: 729, steps per second: 13, episode reward: -1.000, mean reward: -0.001 [-1.000, 0.000], mean action: 0.245 [-0.852, 1.519], mean observation: 0.269 [0.000, 24.000], loss: 0.000057, mean_absolute_error: 0.004535, mean_q: -0.163210
 379349/500000: episode: 511, duration: 36.233s, episode steps: 447, steps per second: 12, episode reward: -1.000, mean reward: -0.002 [-1.000, 0.000], mean action: -0.008 [-0.690, 1.069], mean observation: 0.275 [0.000, 24.000], loss: 0.000064, mean_absolute_error: 0.004572, mean_q: -0.163753
 379689/500000: episode: 512, duration: 27.107s, episode steps: 340, steps per second: 13, episode reward: -1.000, me

 395310/500000: episode: 537, duration: 40.455s, episode steps: 503, steps per second: 12, episode reward: -1.000, mean reward: -0.002 [-1.000, 0.000], mean action: 0.243 [-0.838, 1.454], mean observation: 0.273 [0.000, 24.000], loss: 0.000061, mean_absolute_error: 0.004598, mean_q: -0.167024
 395555/500000: episode: 538, duration: 20.553s, episode steps: 245, steps per second: 12, episode reward: -1.000, mean reward: -0.004 [-1.000, 0.000], mean action: 0.098 [-0.594, 1.060], mean observation: 0.284 [0.000, 24.000], loss: 0.000060, mean_absolute_error: 0.004683, mean_q: -0.167868
 396186/500000: episode: 539, duration: 49.036s, episode steps: 631, steps per second: 13, episode reward: -1.000, mean reward: -0.002 [-1.000, 0.000], mean action: 0.164 [-0.635, 1.606], mean observation: 0.270 [0.000, 24.000], loss: 0.000068, mean_absolute_error: 0.004869, mean_q: -0.168920
 397439/500000: episode: 540, duration: 98.417s, episode steps: 1253, steps per second: 13, episode reward: -1.000, me

 411487/500000: episode: 565, duration: 6.146s, episode steps: 73, steps per second: 12, episode reward: -1.000, mean reward: -0.014 [-1.000, 0.000], mean action: 0.169 [-0.240, 0.988], mean observation: 0.268 [0.000, 24.000], loss: 0.000086, mean_absolute_error: 0.005276, mean_q: -0.172194
 413987/500000: episode: 566, duration: 205.320s, episode steps: 2500, steps per second: 12, episode reward: 0.000, mean reward: 0.000 [0.000, 0.000], mean action: 0.178 [-1.350, 1.970], mean observation: 0.270 [0.000, 24.000], loss: 0.000069, mean_absolute_error: 0.004819, mean_q: -0.172241
 416487/500000: episode: 567, duration: 190.373s, episode steps: 2500, steps per second: 13, episode reward: 0.000, mean reward: 0.000 [0.000, 0.000], mean action: -0.063 [-1.524, 1.943], mean observation: 0.260 [0.000, 24.000], loss: 0.000071, mean_absolute_error: 0.004780, mean_q: -0.172610
 417246/500000: episode: 568, duration: 70.347s, episode steps: 759, steps per second: 11, episode reward: -1.000, mean r

 437917/500000: episode: 593, duration: 190.307s, episode steps: 2500, steps per second: 13, episode reward: 0.000, mean reward: 0.000 [0.000, 0.000], mean action: 0.248 [-0.999, 1.875], mean observation: 0.264 [0.000, 24.000], loss: 0.000073, mean_absolute_error: 0.005085, mean_q: -0.190448
 438247/500000: episode: 594, duration: 27.229s, episode steps: 330, steps per second: 12, episode reward: -1.000, mean reward: -0.003 [-1.000, 0.000], mean action: 0.247 [-0.496, 1.781], mean observation: 0.279 [0.000, 24.000], loss: 0.000075, mean_absolute_error: 0.005224, mean_q: -0.189558
 438454/500000: episode: 595, duration: 17.438s, episode steps: 207, steps per second: 12, episode reward: -1.000, mean reward: -0.005 [-1.000, 0.000], mean action: 0.027 [-0.545, 0.998], mean observation: 0.280 [0.000, 24.000], loss: 0.000063, mean_absolute_error: 0.004916, mean_q: -0.188142
 438956/500000: episode: 596, duration: 39.948s, episode steps: 502, steps per second: 13, episode reward: -1.000, mean

In [None]:
# Persist the final weights once training has finished.
# `model_path` is a template containing a '{step}' placeholder, so it must be
# formatted before use; passing it raw would write files with a literal
# '{step}' in their name. The configured training length is used as the step
# label -- NOTE(review): if training was interrupted early, the real step
# count is lower than this label.
final_model_path = model_path.format(step=number_of_training_steps)
agent.save_weights(final_model_path, overwrite=True)