In [10]:
import numpy as np
import os
import tensorflow as tf
import warnings

from keras.models import Model
from keras.layers import Dense, Flatten, Convolution2D, Input, Concatenate, Activation
from keras.optimizers import Adam
from pommerman.configs import ffa_v0_env
from pommerman.envs.v0 import Pomme
from pommerman.agents import SimpleAgent, BaseAgent
from pommerman.constants import BOARD_SIZE
from rl.agents import DDPGAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory
from rl.core import Env, Processor
from rl.callbacks import FileLogger, ModelIntervalCheckpoint, Callback
from rl.random import OrnsteinUhlenbeckProcess

In [11]:
# Length of the training run and the interval handed to the file logger and
# the checkpoint callback.
number_of_training_steps = 500000
log_interval = 1000

# Every output location embeds the same run identifier.
_run_id = 'ddgp_dense_128_1'
file_log_path = './dqn/rl_logs/' + _run_id + '/log.txt'
tensorboard_path = './dqn/logs/' + _run_id + '/'
# `{step}` is kept as a literal placeholder inside the checkpoint file name.
model_path = './dqn/model/' + _run_id + '/model{step}.h4'

In [12]:
# Make sure the folders that will receive the text log and the model
# checkpoints exist. `exist_ok=True` keeps the cell idempotent on re-runs and
# avoids the race between checking for the directory and creating it.
for _output_path in (file_log_path, model_path):
    os.makedirs(os.path.dirname(_output_path), exist_ok=True)

In [13]:
class TensorforceAgent(BaseAgent):
    """Stand-in agent used for the seat of the agent being trained."""

    def act(self, obs, action_space):
        """Deliberately produce no action: always returns ``None``."""
        return None


class TensorboardLogger(Callback):
    """Callback that writes per-episode statistics as TensorBoard scalars.

    Summaries are assembled directly as ``tf.Summary`` protos, so no TensorFlow
    graph ops are needed for logging.
    """

    def __init__(self, log_dir):
        """Create the logger and a summary writer that logs to ``log_dir``.

        Parameter
        ----------
        log_dir : basestring
            Directory handed to ``tf.summary.FileWriter``.
        """
        super(TensorboardLogger, self).__init__()
        # Some algorithms compute multiple episodes at once since they are multi-threaded.
        # We therefore use a dictionary that is indexed by the episode to separate episodes
        # from each other.
        self.observations = {}
        self.rewards = {}
        self.actions = {}
        self.metrics = {}
        # Total number of steps seen so far, across all episodes.
        self.step = 0
        self.writer = tf.summary.FileWriter(log_dir)

    def log_scalar(self, tag, value, step):
        """Log a scalar variable.
        Parameter
        ----------
        tag : basestring
            Name of the scalar
        value
            Number stored as the summary's ``simple_value``
        step : int
            training iteration
        """
        summary = tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=value)])
        self.writer.add_summary(summary, step)

    def on_train_begin(self, logs):
        """Remember the metric names of the model being trained."""
        self.metrics_names = self.model.metrics_names

    def on_train_end(self, logs=None):
        """Flush buffered summaries so the last episodes reach the event file."""
        self.writer.flush()

    def on_episode_begin(self, episode, logs):
        """Open empty per-step buffers for ``episode``."""
        self.observations[episode] = []
        self.rewards[episode] = []
        self.actions[episode] = []
        self.metrics[episode] = []

    def on_episode_end(self, episode, logs):
        """Aggregate the buffered steps of ``episode`` and log them as scalars.

        Every scalar is written with ``episode + 1`` as its step value. The
        buffers of the episode are released afterwards.
        """
        rewards = self.rewards[episode]
        observations = self.observations[episode]
        # Index of the strongest component of every recorded action vector;
        # computed once and reused for mean/min/max.
        chosen_actions = np.argmax(self.actions[episode], axis=1)
        variables = {
            'step': self.step,
            'nb_steps': self.params['nb_steps'],
            'episode_steps': len(observations),
            'episode_reward': np.sum(rewards),
            'reward_mean': np.mean(rewards),
            'reward_min': np.min(rewards),
            'reward_max': np.max(rewards),
            'action_mean': np.mean(chosen_actions),
            'action_min': np.min(chosen_actions),
            'action_max': np.max(chosen_actions),
            'obs_mean': np.mean(observations),
            'obs_min': np.min(observations),
            'obs_max': np.max(observations),
        }

        # Format all metrics.
        metrics = np.array(self.metrics[episode])
        with warnings.catch_warnings():
            # Promote warnings to exceptions so that a warning raised by
            # `nanmean` can be caught and mapped to the sentinel value -1.
            warnings.filterwarnings('error')
            for idx, name in enumerate(self.metrics_names):
                try:
                    value = np.nanmean(metrics[:, idx])
                except Warning:
                    value = -1
                variables[name] = value
        for key, value in variables.items():
            self.log_scalar(key, value, episode + 1)

        # Free up resources.
        del self.observations[episode]
        del self.rewards[episode]
        del self.actions[episode]
        del self.metrics[episode]

    def on_step_end(self, step, logs):
        """Append the step's observation, reward, action and metrics to its episode."""
        episode = logs['episode']
        self.observations[episode].append(logs['observation'])
        self.rewards[episode].append(logs['reward'])
        self.actions[episode].append(logs['action'])
        self.metrics[episode].append(logs['metrics'])
        self.step += 1

In [14]:
# Instantiate the environment from the free-for-all (ffa_v0) configuration.
config = ffa_v0_env()
env = Pomme(**config["env_kwargs"])
# Seed NumPy's global RNG and the environment with the same fixed value.
np.random.seed(0)
env.seed(0)

# NOTE(review): passing None presumably means "no saved initial game state" — confirm.
env.set_init_game_state(None)
# Size of the environment's action space; used below as the width of the
# actor output and of the critic's action input.
nb_actions = env.action_space.n


def create_actor(actions, input_shape=(2369,)):
    """Build the actor network: one 128-unit ReLU layer and a softmax head.

    Parameters
    ----------
    actions : int
        Number of units in the output layer.
    input_shape : tuple
        Shape of a single input sample.

    Returns
    -------
    Model
        Keras model mapping one input tensor to `actions` softmax outputs.
    """
    observation = Input(shape=input_shape)
    hidden = Dense(128)(observation)
    hidden = Activation('relu')(hidden)
    scores = Dense(actions)(hidden)
    policy = Activation('softmax')(scores)
    return Model(inputs=observation, outputs=policy)


def create_critic(actions, input_shape=(2369,)):
    """Build the critic network that scores an (action, observation) pair.

    Parameters
    ----------
    actions : int
        Width of the action input.
    input_shape : tuple
        Shape of a single observation sample.

    Returns
    -------
    tuple
        ``(action_input, model)``: the action input tensor and the Keras model
        whose single output unit is computed from the concatenated inputs.
    """
    action_input = Input(shape=(actions,), name='action_input')
    observation_input = Input(shape=input_shape, name='observation_input')
    both_inputs = [action_input, observation_input]

    merged = Concatenate()(both_inputs)
    hidden = Dense(128)(merged)
    hidden = Activation('relu')(hidden)
    q_value = Dense(1)(hidden)

    critic_model = Model(inputs=both_inputs, outputs=q_value)
    return action_input, critic_model


# Build both networks and display their architectures.
actor = create_actor(nb_actions)
action_input, critic = create_critic(nb_actions)
# `summary()` prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
actor.summary()
critic.summary()

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 2369)              0         
_________________________________________________________________
dense_5 (Dense)              (None, 128)               303360    
_________________________________________________________________
activation_4 (Activation)    (None, 128)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 6)                 774       
_________________________________________________________________
activation_5 (Activation)    (None, 6)                 0         
Total params: 304,134
Trainable params: 304,134
Non-trainable params: 0
_________________________________________________________________
None
_______________________

In [15]:
class EnvWrapper(Env):
    """Adapter exposing a multi-agent Pommerman environment through the single-agent `Env` API.

    Only the training agent's observation and reward are passed on to the
    caller. Observations are converted by `featurize` into a flat vector.
    The class implements `step`, `reset`, `render`, `close` and `seed` by
    delegating to the wrapped environment.
    """
    reward_range = (-1, 1)
    action_space = None
    observation_space = None

    def __init__(self, gym, board_size):
        """Wrap an environment.

        # Arguments
            gym (object): The wrapped environment; its action space, observation
                space and reward range are mirrored on this wrapper.
            board_size (int): Side length of the square board, used by `featurize`.
        """
        self.gym = gym
        self.action_space = gym.action_space
        self.observation_space = gym.observation_space
        self.reward_range = gym.reward_range
        self.board_size = board_size

    def step(self, action):
        """Run one timestep of the environment's dynamics.
        Accepts an action and returns a tuple (observation, reward, done, info).
        # Arguments
            action (object): Action vector; the index of its largest entry is
                the discrete action that is played.
        # Returns
            observation (object): Featurized observation of the training agent.
            reward (float) : Reward of the training agent for this step.
            done (boolean): Whether the episode has ended, in which case further step() calls will return undefined results.
            info (dict): Contains auxiliary diagnostic information (helpful for debugging, and sometimes learning).
        """
        action = np.argmax(action)
        obs = self.gym.get_observations()
        all_actions = self.gym.act(obs)
        # Put the training agent's action at its own index among the actions
        # returned by the wrapped environment.
        all_actions.insert(self.gym.training_agent, action)
        state, reward, terminal, info = self.gym.step(all_actions)
        agent_state = self.featurize(state[self.gym.training_agent])
        agent_reward = reward[self.gym.training_agent]
        return agent_state, agent_reward, terminal, info

    def reset(self):
        """
        Resets the state of the environment and returns an initial observation.

        The training agent is placed at a random seat (0-3); the other three
        seats are filled with `SimpleAgent` opponents.
        # Returns
            observation (object): The initial featurized observation of the training agent. Initial reward is assumed to be 0.
        """
        train_agent_pos = np.random.randint(0, 4)
        agents = []
        for agent_id in range(4):
            if agent_id == train_agent_pos:
                agents.append(TensorforceAgent(config["agent"](agent_id, config["game_type"])))
            else:
                agents.append(SimpleAgent(config["agent"](agent_id, config["game_type"])))
        self.gym.set_agents(agents)
        self.gym.set_training_agent(agents[train_agent_pos].agent_id)

        obs = self.gym.reset()
        agent_obs = self.featurize(obs[self.gym.training_agent])
        return agent_obs

    def render(self, mode='human', close=False):
        """Renders the environment.
        The set of supported modes varies per environment. (And some
        environments do not support rendering at all.)
        # Arguments
            mode (str): The mode to render with.
            close (bool): Close all open renderings.
        """
        self.gym.render(mode=mode, close=close)

    def close(self):
        """Close the wrapped environment."""
        self.gym.close()

    def seed(self, seed=None):
        """Sets the seed for this env's random number generator(s).
        # Returns
            Whatever the wrapped environment's `seed` returns (for Gym-style
            environments, the list of seeds used).
        """
        # Return (not raise) the result: raising a non-exception value fails
        # with a TypeError and hides the seeds from the caller.
        return self.gym.seed(seed)

    def configure(self, *args, **kwargs):
        """Provides runtime configuration to the environment.

        Not supported by this wrapper.
        # Raises
            NotImplementedError: always.
        """
        raise NotImplementedError()

    def featurize(self, obs):
        """Convert one agent observation into a flat feature vector.

        Fourteen board-sized planes (own position, enemies, teammate, one
        indicator plane per board item 1-9, bomb blast strength, bomb life)
        are stacked and flattened, then ammo, blast strength and the kick flag
        are appended.
        # Arguments
            obs (dict): Observation with the keys 'board', 'position',
                'teammate', 'enemies', 'bomb_blast_strength', 'bomb_life',
                'ammo', 'blast_strength' and 'can_kick'.
        # Returns
            1-D array of length 14 * board_size**2 + 3.
        """
        shape = (self.board_size, self.board_size, 1)

        def get_matrix(source, key):
            # Board-shaped entry of the observation as a float32 plane.
            return source[key].reshape(shape).astype(np.float32)

        def get_map(board, item):
            # Indicator plane: 1 where the board holds `item`, 0 elsewhere.
            mask = np.zeros(shape)
            mask[board == item] = 1
            return mask

        board = get_matrix(obs, 'board')

        # TODO: probably not needed Passage = 0
        rigid_map = get_map(board, 1)               # Rigid = 1
        wood_map = get_map(board, 2)                # Wood = 2
        bomb_map = get_map(board, 3)                # Bomb = 3
        flames_map = get_map(board, 4)              # Flames = 4
        fog_map = get_map(board, 5)                 # TODO: not used for first two stages Fog = 5
        extra_bomb_map = get_map(board, 6)          # ExtraBomb = 6
        incr_range_map = get_map(board, 7)          # IncrRange = 7
        kick_map = get_map(board, 8)                # Kick = 8
        skull_map = get_map(board, 9)               # Skull = 9

        position = obs["position"]
        my_position = np.zeros(shape)
        my_position[position[0], position[1], 0] = 1

        team_mates = get_map(board, obs["teammate"].value) # TODO during documentation it should be an array

        enemies = np.zeros(shape)
        for enemy in obs["enemies"]:
            enemies[board == enemy.value] = 1

        bomb_blast_strength = get_matrix(obs, 'bomb_blast_strength')
        bomb_life = get_matrix(obs, 'bomb_life')

        ammo = obs["ammo"]
        blast_strength = obs["blast_strength"]
        can_kick = int(obs["can_kick"])

        obs = np.concatenate([my_position, enemies, team_mates, rigid_map,
                              wood_map, bomb_map, flames_map,
                              fog_map, extra_bomb_map, incr_range_map,
                              kick_map, skull_map, bomb_blast_strength,
                              bomb_life], axis=2).flatten()
        obs = np.append(obs, [ammo, blast_strength, can_kick])
        return obs

    def __del__(self):
        # Close the wrapped environment when the wrapper is garbage collected.
        self.close()

    def __str__(self):
        return '<{} instance>'.format(type(self).__name__)


class CustomProcessor(Processor):
    """Processor that reshapes state batches and simplifies the info dict."""

    def process_state_batch(self, batch):
        """Remove axis 1 from a batch of states.
        # Arguments
            batch (list): List of states; axis 1 must have length 1
                (presumably the memory window dimension — TODO confirm).
        # Returns
            The batch with axis 1 squeezed out.
        """
        return np.squeeze(batch, axis=1)

    def process_info(self, info):
        """Replace ``info['result']`` by its ``.value`` and return the dict.

        The dictionary is modified in place.
        """
        result = info['result']
        info['result'] = result.value
        return info

In [16]:
env_wrapper = EnvWrapper(env, BOARD_SIZE)
processor = CustomProcessor()


# Replay memory, exploration noise and the DDPG agent itself.
memory = SequentialMemory(limit=100000, window_length=1)
random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=.15, mu=0., sigma=.3)
agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input,
                  memory=memory, nb_steps_warmup_critic=512, nb_steps_warmup_actor=512,
                  random_process=random_process, gamma=.99, target_model_update=1e-3,
                  batch_size=512, processor=processor)
agent.compile(Adam(lr=0.0001, clipnorm=1.), metrics=['mae'])


file_logger = FileLogger(file_log_path, interval=log_interval)
checkpoint = ModelIntervalCheckpoint(model_path, interval=log_interval)
tensorboard = TensorboardLogger(tensorboard_path)
callbacks = [file_logger, checkpoint, tensorboard]

# Resume from previously saved weights when they exist. DDPGAgent stores the
# actor and the critic in two files, `<root>_actor<ext>` and
# `<root>_critic<ext>`, so `model_path` itself is never written and checking
# it directly could never succeed.
_weights_root, _weights_ext = os.path.splitext(model_path)
_weight_files = [_weights_root + suffix + _weights_ext for suffix in ('_actor', '_critic')]
if all(os.path.isfile(weight_file) for weight_file in _weight_files):
    agent.load_weights(model_path)

In [None]:
# Train the agent; each episode is capped at the environment's own step limit.
history = agent.fit(
    env_wrapper,
    nb_steps=number_of_training_steps,
    visualize=False,
    verbose=2,
    nb_max_episode_steps=env._max_steps,
    callbacks=callbacks,
)

Training for 500000 steps ...
     25/500000: episode: 1, duration: 0.775s, episode steps: 25, steps per second: 32, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 0.189 [0.023, 0.510], mean observation: 0.072 [0.000, 24.000], loss: --, mean_absolute_error: --, mean_q: --
     60/500000: episode: 2, duration: 0.415s, episode steps: 35, steps per second: 84, episode reward: -1.000, mean reward: -0.029 [-1.000, 0.000], mean action: 0.092 [-0.621, 0.481], mean observation: 0.069 [0.000, 24.000], loss: --, mean_absolute_error: --, mean_q: --
    100/500000: episode: 3, duration: 0.452s, episode steps: 40, steps per second: 89, episode reward: -1.000, mean reward: -0.025 [-1.000, 0.000], mean action: 0.134 [-0.205, 0.901], mean observation: 0.070 [0.000, 24.000], loss: --, mean_absolute_error: --, mean_q: --
   1081/500000: episode: 4, duration: 30.205s, episode steps: 981, steps per second: 32, episode reward: -1.000, mean reward: -0.001 [-1.000, 0.000], mean act

   3499/500000: episode: 30, duration: 1.499s, episode steps: 25, steps per second: 17, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 0.149 [-0.321, 1.041], mean observation: 0.064 [0.000, 24.000], loss: 0.001557, mean_absolute_error: 0.028494, mean_q: 0.008415
   3524/500000: episode: 31, duration: 1.460s, episode steps: 25, steps per second: 17, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 0.128 [-0.241, 1.009], mean observation: 0.072 [0.000, 24.000], loss: 0.001709, mean_absolute_error: 0.028903, mean_q: 0.009327
   3549/500000: episode: 32, duration: 1.419s, episode steps: 25, steps per second: 18, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 0.124 [-0.187, 0.984], mean observation: 0.072 [0.000, 24.000], loss: 0.001623, mean_absolute_error: 0.029130, mean_q: 0.014595
   3574/500000: episode: 33, duration: 1.363s, episode steps: 25, steps per second: 18, episode reward: -1.000, mean reward: -0.04

   4227/500000: episode: 59, duration: 1.255s, episode steps: 25, steps per second: 20, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 0.114 [-0.257, 1.015], mean observation: 0.072 [0.000, 24.000], loss: 0.001656, mean_absolute_error: 0.028519, mean_q: -0.011971
   4252/500000: episode: 60, duration: 1.319s, episode steps: 25, steps per second: 19, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 0.154 [-0.232, 1.130], mean observation: 0.072 [0.000, 24.000], loss: 0.001901, mean_absolute_error: 0.029740, mean_q: -0.008481
   4277/500000: episode: 61, duration: 1.440s, episode steps: 25, steps per second: 17, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 0.120 [-0.457, 1.125], mean observation: 0.072 [0.000, 24.000], loss: 0.001334, mean_absolute_error: 0.027403, mean_q: -0.013001
   4302/500000: episode: 62, duration: 1.341s, episode steps: 25, steps per second: 19, episode reward: -1.000, mean reward: -0

   4952/500000: episode: 88, duration: 1.466s, episode steps: 25, steps per second: 17, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 0.156 [-0.237, 1.091], mean observation: 0.072 [0.000, 24.000], loss: 0.001208, mean_absolute_error: 0.026391, mean_q: -0.007915
   4977/500000: episode: 89, duration: 1.372s, episode steps: 25, steps per second: 18, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 0.229 [-0.114, 1.267], mean observation: 0.072 [0.000, 24.000], loss: 0.001336, mean_absolute_error: 0.026435, mean_q: -0.011337
   5002/500000: episode: 90, duration: 1.444s, episode steps: 25, steps per second: 17, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 0.187 [-0.141, 1.006], mean observation: 0.070 [0.000, 24.000], loss: 0.001092, mean_absolute_error: 0.025061, mean_q: -0.007136
   5027/500000: episode: 91, duration: 1.455s, episode steps: 25, steps per second: 17, episode reward: -1.000, mean reward: -0

   5677/500000: episode: 117, duration: 1.367s, episode steps: 25, steps per second: 18, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 0.149 [-0.215, 1.083], mean observation: 0.072 [0.000, 24.000], loss: 0.001219, mean_absolute_error: 0.025248, mean_q: -0.041823
   5702/500000: episode: 118, duration: 1.338s, episode steps: 25, steps per second: 19, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 0.153 [-0.248, 1.098], mean observation: 0.071 [0.000, 24.000], loss: 0.001152, mean_absolute_error: 0.026235, mean_q: -0.029824
   5727/500000: episode: 119, duration: 1.406s, episode steps: 25, steps per second: 18, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 0.194 [-0.321, 1.084], mean observation: 0.071 [0.000, 24.000], loss: 0.001133, mean_absolute_error: 0.026189, mean_q: -0.042656
   5752/500000: episode: 120, duration: 1.365s, episode steps: 25, steps per second: 18, episode reward: -1.000, mean reward

   6402/500000: episode: 146, duration: 1.356s, episode steps: 25, steps per second: 18, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 0.191 [-0.144, 1.079], mean observation: 0.071 [0.000, 24.000], loss: 0.000902, mean_absolute_error: 0.023659, mean_q: -0.031841
   6427/500000: episode: 147, duration: 1.356s, episode steps: 25, steps per second: 18, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 0.212 [-0.106, 1.219], mean observation: 0.071 [0.000, 24.000], loss: 0.001194, mean_absolute_error: 0.025433, mean_q: -0.040043
   6452/500000: episode: 148, duration: 1.362s, episode steps: 25, steps per second: 18, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 0.185 [-0.253, 1.066], mean observation: 0.071 [0.000, 24.000], loss: 0.001103, mean_absolute_error: 0.025159, mean_q: -0.041006
   6477/500000: episode: 149, duration: 1.486s, episode steps: 25, steps per second: 17, episode reward: -1.000, mean reward

   7127/500000: episode: 175, duration: 1.392s, episode steps: 25, steps per second: 18, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 0.206 [-0.182, 1.208], mean observation: 0.072 [0.000, 24.000], loss: 0.001086, mean_absolute_error: 0.024333, mean_q: -0.057240
   7152/500000: episode: 176, duration: 1.409s, episode steps: 25, steps per second: 18, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 0.187 [-0.225, 1.159], mean observation: 0.072 [0.000, 24.000], loss: 0.001020, mean_absolute_error: 0.024904, mean_q: -0.052902
   7177/500000: episode: 177, duration: 1.385s, episode steps: 25, steps per second: 18, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 0.154 [-0.212, 1.086], mean observation: 0.072 [0.000, 24.000], loss: 0.001034, mean_absolute_error: 0.024281, mean_q: -0.048666
   7202/500000: episode: 178, duration: 1.343s, episode steps: 25, steps per second: 19, episode reward: -1.000, mean reward

   8201/500000: episode: 204, duration: 1.410s, episode steps: 25, steps per second: 18, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 0.237 [-0.164, 1.059], mean observation: 0.071 [0.000, 24.000], loss: 0.000998, mean_absolute_error: 0.025380, mean_q: -0.092489
   8293/500000: episode: 205, duration: 5.013s, episode steps: 92, steps per second: 18, episode reward: -1.000, mean reward: -0.011 [-1.000, 0.000], mean action: 0.144 [-0.545, 1.183], mean observation: 0.069 [0.000, 24.000], loss: 0.001079, mean_absolute_error: 0.025365, mean_q: -0.094235
   8318/500000: episode: 206, duration: 1.383s, episode steps: 25, steps per second: 18, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 0.163 [-0.287, 1.050], mean observation: 0.072 [0.000, 24.000], loss: 0.001251, mean_absolute_error: 0.027733, mean_q: -0.094385
   8345/500000: episode: 207, duration: 1.426s, episode steps: 27, steps per second: 19, episode reward: -1.000, mean reward

   9270/500000: episode: 233, duration: 5.356s, episode steps: 108, steps per second: 20, episode reward: -1.000, mean reward: -0.009 [-1.000, 0.000], mean action: 0.207 [-0.466, 1.346], mean observation: 0.065 [0.000, 24.000], loss: 0.001191, mean_absolute_error: 0.025156, mean_q: -0.115346
   9297/500000: episode: 234, duration: 1.544s, episode steps: 27, steps per second: 17, episode reward: -1.000, mean reward: -0.037 [-1.000, 0.000], mean action: 0.109 [-0.303, 1.003], mean observation: 0.070 [0.000, 24.000], loss: 0.001292, mean_absolute_error: 0.026175, mean_q: -0.114493
   9322/500000: episode: 235, duration: 1.454s, episode steps: 25, steps per second: 17, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 0.103 [-0.299, 0.983], mean observation: 0.072 [0.000, 24.000], loss: 0.001048, mean_absolute_error: 0.025649, mean_q: -0.122187
   9399/500000: episode: 236, duration: 4.294s, episode steps: 77, steps per second: 18, episode reward: -1.000, mean rewar

  10336/500000: episode: 262, duration: 1.383s, episode steps: 25, steps per second: 18, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 0.174 [-0.107, 1.052], mean observation: 0.071 [0.000, 24.000], loss: 0.001045, mean_absolute_error: 0.024777, mean_q: -0.127448
  10361/500000: episode: 263, duration: 1.386s, episode steps: 25, steps per second: 18, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 0.161 [-0.261, 1.005], mean observation: 0.072 [0.000, 24.000], loss: 0.000995, mean_absolute_error: 0.024782, mean_q: -0.129579
  10386/500000: episode: 264, duration: 1.468s, episode steps: 25, steps per second: 17, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 0.160 [-0.131, 1.151], mean observation: 0.071 [0.000, 24.000], loss: 0.001079, mean_absolute_error: 0.024893, mean_q: -0.140162
  10411/500000: episode: 265, duration: 1.444s, episode steps: 25, steps per second: 17, episode reward: -1.000, mean reward

  11655/500000: episode: 291, duration: 4.098s, episode steps: 76, steps per second: 19, episode reward: -1.000, mean reward: -0.013 [-1.000, 0.000], mean action: 0.244 [-0.206, 1.288], mean observation: 0.062 [0.000, 24.000], loss: 0.000986, mean_absolute_error: 0.025519, mean_q: -0.166321
  11837/500000: episode: 292, duration: 9.866s, episode steps: 182, steps per second: 18, episode reward: -1.000, mean reward: -0.005 [-1.000, 0.000], mean action: 0.188 [-0.727, 1.646], mean observation: 0.058 [0.000, 24.000], loss: 0.000997, mean_absolute_error: 0.025728, mean_q: -0.159088
  11862/500000: episode: 293, duration: 1.394s, episode steps: 25, steps per second: 18, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 0.181 [-0.129, 1.115], mean observation: 0.072 [0.000, 24.000], loss: 0.000915, mean_absolute_error: 0.024151, mean_q: -0.157439
  11889/500000: episode: 294, duration: 1.501s, episode steps: 27, steps per second: 18, episode reward: -1.000, mean rewar

  13429/500000: episode: 320, duration: 1.579s, episode steps: 27, steps per second: 17, episode reward: -1.000, mean reward: -0.037 [-1.000, 0.000], mean action: 0.090 [-0.239, 1.103], mean observation: 0.070 [0.000, 24.000], loss: 0.000962, mean_absolute_error: 0.026092, mean_q: -0.172653
  13456/500000: episode: 321, duration: 1.450s, episode steps: 27, steps per second: 19, episode reward: -1.000, mean reward: -0.037 [-1.000, 0.000], mean action: 0.207 [-0.144, 1.068], mean observation: 0.071 [0.000, 24.000], loss: 0.000854, mean_absolute_error: 0.024995, mean_q: -0.168623
  13496/500000: episode: 322, duration: 2.166s, episode steps: 40, steps per second: 18, episode reward: -1.000, mean reward: -0.025 [-1.000, 0.000], mean action: 0.200 [-0.130, 1.120], mean observation: 0.070 [0.000, 24.000], loss: 0.001041, mean_absolute_error: 0.026107, mean_q: -0.173119
  13523/500000: episode: 323, duration: 1.512s, episode steps: 27, steps per second: 18, episode reward: -1.000, mean reward

  16048/500000: episode: 349, duration: 13.062s, episode steps: 238, steps per second: 18, episode reward: -1.000, mean reward: -0.004 [-1.000, 0.000], mean action: 0.112 [-0.882, 1.422], mean observation: 0.056 [0.000, 24.000], loss: 0.001121, mean_absolute_error: 0.027925, mean_q: -0.193411
  16381/500000: episode: 350, duration: 18.242s, episode steps: 333, steps per second: 18, episode reward: -1.000, mean reward: -0.003 [-1.000, 0.000], mean action: 0.214 [-0.813, 1.869], mean observation: 0.057 [0.000, 24.000], loss: 0.001144, mean_absolute_error: 0.028264, mean_q: -0.187700
  16432/500000: episode: 351, duration: 2.889s, episode steps: 51, steps per second: 18, episode reward: -1.000, mean reward: -0.020 [-1.000, 0.000], mean action: 0.150 [-0.386, 1.089], mean observation: 0.071 [0.000, 24.000], loss: 0.001215, mean_absolute_error: 0.029136, mean_q: -0.180236
  16483/500000: episode: 352, duration: 2.858s, episode steps: 51, steps per second: 18, episode reward: -1.000, mean re

  17890/500000: episode: 378, duration: 1.410s, episode steps: 25, steps per second: 18, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 0.195 [-0.175, 1.152], mean observation: 0.072 [0.000, 24.000], loss: 0.000945, mean_absolute_error: 0.028038, mean_q: -0.199787
  17915/500000: episode: 379, duration: 1.320s, episode steps: 25, steps per second: 19, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 0.176 [-0.210, 1.171], mean observation: 0.072 [0.000, 24.000], loss: 0.000837, mean_absolute_error: 0.025694, mean_q: -0.203151
  17941/500000: episode: 380, duration: 1.422s, episode steps: 26, steps per second: 18, episode reward: -1.000, mean reward: -0.038 [-1.000, 0.000], mean action: 0.165 [-0.223, 1.072], mean observation: 0.070 [0.000, 24.000], loss: 0.000978, mean_absolute_error: 0.027304, mean_q: -0.198149
  17971/500000: episode: 381, duration: 1.770s, episode steps: 30, steps per second: 17, episode reward: -1.000, mean reward

  22078/500000: episode: 407, duration: 1.367s, episode steps: 25, steps per second: 18, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 0.195 [-0.140, 1.089], mean observation: 0.072 [0.000, 24.000], loss: 0.000581, mean_absolute_error: 0.022046, mean_q: -0.166667
  22168/500000: episode: 408, duration: 4.796s, episode steps: 90, steps per second: 19, episode reward: -1.000, mean reward: -0.011 [-1.000, 0.000], mean action: 0.171 [-0.288, 1.208], mean observation: 0.068 [0.000, 24.000], loss: 0.000677, mean_absolute_error: 0.022416, mean_q: -0.162147
  22275/500000: episode: 409, duration: 5.701s, episode steps: 107, steps per second: 19, episode reward: -1.000, mean reward: -0.009 [-1.000, 0.000], mean action: 0.166 [-0.720, 1.266], mean observation: 0.064 [0.000, 24.000], loss: 0.000679, mean_absolute_error: 0.022622, mean_q: -0.163987
  22300/500000: episode: 410, duration: 1.341s, episode steps: 25, steps per second: 19, episode reward: -1.000, mean rewar

  24479/500000: episode: 436, duration: 8.433s, episode steps: 160, steps per second: 19, episode reward: -1.000, mean reward: -0.006 [-1.000, 0.000], mean action: 0.113 [-0.672, 1.360], mean observation: 0.060 [0.000, 24.000], loss: 0.000765, mean_absolute_error: 0.024044, mean_q: -0.169521
  24510/500000: episode: 437, duration: 1.671s, episode steps: 31, steps per second: 19, episode reward: -1.000, mean reward: -0.032 [-1.000, 0.000], mean action: 0.219 [-0.136, 1.134], mean observation: 0.069 [0.000, 24.000], loss: 0.000880, mean_absolute_error: 0.025703, mean_q: -0.169612
  24625/500000: episode: 438, duration: 6.373s, episode steps: 115, steps per second: 18, episode reward: -1.000, mean reward: -0.009 [-1.000, 0.000], mean action: 0.021 [-0.736, 1.148], mean observation: 0.066 [0.000, 24.000], loss: 0.000800, mean_absolute_error: 0.024787, mean_q: -0.170587
  24675/500000: episode: 439, duration: 2.629s, episode steps: 50, steps per second: 19, episode reward: -1.000, mean rewa

  28797/500000: episode: 465, duration: 6.126s, episode steps: 117, steps per second: 19, episode reward: -1.000, mean reward: -0.009 [-1.000, 0.000], mean action: 0.227 [-0.514, 1.422], mean observation: 0.065 [0.000, 24.000], loss: 0.000894, mean_absolute_error: 0.026186, mean_q: -0.158774
  29062/500000: episode: 466, duration: 15.289s, episode steps: 265, steps per second: 17, episode reward: -1.000, mean reward: -0.004 [-1.000, 0.000], mean action: 0.089 [-0.693, 1.647], mean observation: 0.060 [0.000, 24.000], loss: 0.000912, mean_absolute_error: 0.026462, mean_q: -0.152616
  29088/500000: episode: 467, duration: 1.438s, episode steps: 26, steps per second: 18, episode reward: -1.000, mean reward: -0.038 [-1.000, 0.000], mean action: 0.161 [-0.139, 1.127], mean observation: 0.071 [0.000, 24.000], loss: 0.000909, mean_absolute_error: 0.026563, mean_q: -0.156682
  29191/500000: episode: 468, duration: 5.516s, episode steps: 103, steps per second: 19, episode reward: -1.000, mean re

  32427/500000: episode: 493, duration: 1.438s, episode steps: 26, steps per second: 18, episode reward: -1.000, mean reward: -0.038 [-1.000, 0.000], mean action: 0.146 [-0.147, 1.034], mean observation: 0.059 [0.000, 24.000], loss: 0.000935, mean_absolute_error: 0.026258, mean_q: -0.147370
  32879/500000: episode: 494, duration: 23.653s, episode steps: 452, steps per second: 19, episode reward: -1.000, mean reward: -0.002 [-1.000, 0.000], mean action: 0.069 [-1.080, 1.732], mean observation: 0.049 [0.000, 24.000], loss: 0.000889, mean_absolute_error: 0.026151, mean_q: -0.148363
  32905/500000: episode: 495, duration: 1.472s, episode steps: 26, steps per second: 18, episode reward: -1.000, mean reward: -0.038 [-1.000, 0.000], mean action: 0.163 [-0.180, 1.012], mean observation: 0.071 [0.000, 24.000], loss: 0.000828, mean_absolute_error: 0.025729, mean_q: -0.139322
  32935/500000: episode: 496, duration: 1.603s, episode steps: 30, steps per second: 19, episode reward: -1.000, mean rewa

In [None]:
# Persist the final weights, allowing existing files to be overwritten.
# NOTE(review): `model_path` still contains the literal `{step}` placeholder
# here, so it ends up verbatim in the file name — confirm this is intended.
agent.save_weights(model_path, overwrite=True)