In [1]:
import numpy as np
import os
import tensorflow as tf
import warnings

from keras.models import Model
from keras.layers import Dense, Flatten, Convolution2D, Input
from keras.optimizers import Adam
from pommerman.configs import ffa_v0_env
from pommerman.envs.v0 import Pomme
from pommerman.agents import SimpleAgent, BaseAgent
from pommerman.constants import BOARD_SIZE
from rl.agents.dqn import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory
from rl.core import Env, Processor
from rl.callbacks import FileLogger, ModelIntervalCheckpoint, Callback

  return f(*args, **kwds)
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Training schedule: total number of environment steps, and the interval (in
# steps) handed to the file logger and the checkpoint callback.
number_of_training_steps = 5 * 10 ** 6
log_interval = 10 ** 3

# Output locations for this run.
tensorboard_path = './dqn/logs/dense_16_2/'
file_log_path = './dqn/rl_logs/dense_16_2/log.txt'
# The file name keeps a '{step}' template placeholder.
model_path = './dqn/model/dense_16_2/model{step}.h4'

In [3]:
# Make sure the directories that receive the text log and the model weights
# exist before anything tries to write into them.
for output_path in (file_log_path, model_path):
    # exist_ok=True keeps the cell re-runnable and avoids the race between
    # checking for the directory and creating it.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

In [4]:
class TensorforceAgent(BaseAgent):
    """Placeholder agent for the slot that is being trained.

    `act` makes no decision and yields None; in this notebook the action for
    the training slot is inserted by `EnvWrapper.step`.
    """

    def act(self, obs, action_space):
        # Intentionally no policy here.
        return None


class TensorboardLogger(Callback):
    """Logging in tensorboard without tensorflow ops.

    Per-step observations, rewards, actions and metrics are buffered per
    episode; when an episode ends, summary statistics over that episode are
    written as scalar summaries and the buffers for it are dropped.
    """

    def __init__(self, log_dir):
        """Create the per-episode buffers and a summary writer logging to log_dir.

        Parameter
        ----------
        log_dir : basestring
            Directory passed to ``tf.summary.FileWriter``.
        """
        # Let the base callback set up its own attributes first.
        super(TensorboardLogger, self).__init__()
        # Some algorithms compute multiple episodes at once since they are multi-threaded.
        # We therefore use a dictionary that is indexed by the episode to separate episodes
        # from each other.
        self.observations = {}
        self.rewards = {}
        self.actions = {}
        self.metrics = {}
        # Running count of steps seen across all episodes.
        self.step = 0
        self.writer = tf.summary.FileWriter(log_dir)

    def log_scalar(self, tag, value, step):
        """Log a scalar variable.
        Parameter
        ----------
        tag : basestring
            Name of the scalar
        value
            Value stored as the summary's ``simple_value``
        step : int
            training iteration
        """
        summary = tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=value)])
        self.writer.add_summary(summary, step)

    def on_train_begin(self, logs):
        """Remember the metric names of the model so they can be used as tags."""
        self.metrics_names = self.model.metrics_names

    def on_train_end(self, logs=None):
        """Flush buffered summaries so scalars queued at the end of training are written out."""
        self.writer.flush()

    def on_episode_begin(self, episode, logs):
        """Start empty buffers for ``episode``."""
        self.observations[episode] = []
        self.rewards[episode] = []
        self.actions[episode] = []
        self.metrics[episode] = []

    def on_episode_end(self, episode, logs):
        """Write the statistics of the finished ``episode`` and release its buffers.

        Every scalar is logged with ``episode + 1`` as its step value.
        """
        episode_steps = len(self.observations[episode])
        variables = {
            'step': self.step,
            'nb_steps': self.params['nb_steps'],
            'episode_steps': episode_steps,
            'episode_reward': np.sum(self.rewards[episode]),
            'reward_mean': np.mean(self.rewards[episode]),
            'reward_min': np.min(self.rewards[episode]),
            'reward_max': np.max(self.rewards[episode]),
            'action_mean': np.mean(self.actions[episode]),
            'action_min': np.min(self.actions[episode]),
            'action_max': np.max(self.actions[episode]),
            'obs_mean': np.mean(self.observations[episode]),
            'obs_min': np.min(self.observations[episode]),
            'obs_max': np.max(self.observations[episode]),
        }

        # Format all metrics.
        metrics = np.array(self.metrics[episode])
        with warnings.catch_warnings():
            # Turn warnings into exceptions so that a nanmean that warns
            # can be caught below and replaced by the -1 sentinel.
            warnings.filterwarnings('error')
            for idx, name in enumerate(self.metrics_names):
                try:
                    value = np.nanmean(metrics[:, idx])
                except Warning:
                    value = -1
                variables[name] = value
        for key, value in variables.items():
            self.log_scalar(key, value, episode + 1)

        # Free up resources.
        del self.observations[episode]
        del self.rewards[episode]
        del self.actions[episode]
        del self.metrics[episode]

    def on_step_end(self, step, logs):
        """Append this step's observation, reward, action and metrics to the episode buffers."""
        episode = logs['episode']
        self.observations[episode].append(logs['observation'])
        self.rewards[episode].append(logs['reward'])
        self.actions[episode].append(logs['action'])
        self.metrics[episode].append(logs['metrics'])
        self.step += 1

In [5]:
# Build the environment from the ffa_v0_env configuration and seed both NumPy
# and the environment.
config = ffa_v0_env()
env = Pomme(**config["env_kwargs"])
np.random.seed(0)
env.seed(0)

# Three SimpleAgent opponents take agent ids 0, 1 and 2 ...
agents = [
    SimpleAgent(config["agent"](opponent_id, config["game_type"]))
    for opponent_id in range(3)
]
# ... and the placeholder for the agent being trained takes the next id (3).
agents.append(TensorforceAgent(config["agent"](len(agents), config["game_type"])))

env.set_agents(agents)
env.set_training_agent(agents[-1].agent_id)
env.set_init_game_state(None)

# Size of the discrete action space; used as the width of the model's output layer.
nb_actions = env.action_space.n


def create_model(actions, input_shape=(2369,)):
    """Build and compile a small fully connected network.

    # Arguments
        actions (int): Number of units in the linear output layer.
        input_shape (tuple): Shape of one input sample (without the batch axis).
    # Returns
        A compiled Keras `Model` with two 16-unit ReLU hidden layers.
    """
    inputs = Input(input_shape)
    hidden = inputs
    # Two identical hidden layers.
    for _ in range(2):
        hidden = Dense(16, activation='relu')(hidden)
    outputs = Dense(actions)(hidden)

    network = Model(inputs=inputs, outputs=outputs)
    network.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return network


# Next, we build a very simple model without any dueling architecture of its own.
# If the dueling network is enabled in DQN, DQN builds a dueling network based on this model automatically.
# Alternatively, you can build a dueling network yourself and turn off the dueling network in DQN.
model = create_model(nb_actions)
# summary() prints the table itself and returns None, so wrapping it in print()
# only appends a stray "None" line to the output.
model.summary()

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 2369)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                37920     
_________________________________________________________________
dense_2 (Dense)              (None, 16)                272       
_________________________________________________________________
dense_3 (Dense)              (None, 6)                 102       
Total params: 38,294
Trainable params: 38,294
Non-trainable params: 0
_________________________________________________________________
None


In [6]:
class EnvWrapper(Env):
    """Single-agent wrapper around a multi-agent environment.

    The wrapped environment (`gym`) is stepped with one action per agent. This
    wrapper obtains the other agents' actions from the environment, inserts the
    caller's action at the training agent's index, and returns only the training
    agent's featurized observation and reward, exposing the `step` / `reset` /
    `render` / `close` API of `Env`.
    """
    reward_range = (-1, 1)
    action_space = None
    observation_space = None

    def __init__(self, gym, board_size):
        """Wrap `gym`.

        # Arguments
            gym (object): The wrapped environment; its action space, observation space
                and reward range are mirrored on this wrapper.
            board_size (int): Side length of the square board, used by `featurize`.
        """
        self.gym = gym
        self.action_space = gym.action_space
        self.observation_space = gym.observation_space
        self.reward_range = gym.reward_range
        self.board_size = board_size

    def step(self, action):
        """Run one timestep of the environment's dynamics.
        Accepts an action and returns a tuple (observation, reward, done, info).
        # Arguments
            action (object): The action of the training agent.
        # Returns
            observation (object): Featurized observation of the training agent.
            reward (float) : Reward of the training agent for this step.
            done (boolean): Whether the episode has ended, as reported by the wrapped environment.
            info (dict): Info returned by the wrapped environment.
        """
        obs = self.gym.get_observations()
        # Actions of the remaining agents; the training agent's action is
        # spliced in at its own index.
        all_actions = self.gym.act(obs)
        all_actions.insert(self.gym.training_agent, action)
        state, reward, terminal, info = self.gym.step(all_actions)
        agent_state = self.featurize(state[self.gym.training_agent])
        agent_reward = reward[self.gym.training_agent]
        return agent_state, agent_reward, terminal, info

    def reset(self):
        """
        Resets the state of the environment and returns an initial observation.
        # Returns
            observation (object): Featurized initial observation of the training agent.
        """
        obs = self.gym.reset()
        agent_obs = self.featurize(obs[self.gym.training_agent])
        return agent_obs

    def render(self, mode='human', close=False):
        """Renders the wrapped environment.
        # Arguments
            mode (str): The mode to render with.
            close (bool): Close all open renderings.
        """
        self.gym.render(mode=mode, close=close)

    def close(self):
        """Closes the wrapped environment."""
        self.gym.close()

    def seed(self, seed=None):
        """Sets the seed of the wrapped environment.
        # Returns
            Whatever the wrapped environment's `seed` returns.
        """
        # This used to `raise` the result, which made every call fail.
        return self.gym.seed(seed)

    def configure(self, *args, **kwargs):
        """Runtime configuration is not supported by this wrapper.
        # Raises
            NotImplementedError: always.
        """
        raise NotImplementedError()

    def featurize(self, obs):
        """Turn one agent's observation dict into a flat feature vector.

        Fourteen board-sized planes (own position, enemies, teammate, one binary
        plane per board item 1-9, bomb blast strength and bomb life) are stacked
        and flattened, then `ammo`, `blast_strength` and `can_kick` are appended,
        giving `14 * board_size**2 + 3` values.
        # Arguments
            obs (dict): Observation with the keys 'board', 'position', 'teammate',
                'enemies', 'bomb_blast_strength', 'bomb_life', 'ammo',
                'blast_strength' and 'can_kick'.
        # Returns
            1-D numpy array of features.
        """
        shape = (self.board_size, self.board_size, 1)

        def get_matrix(source, key):
            # Board-shaped entry of the observation as a float32 plane.
            return source[key].reshape(shape).astype(np.float32)

        def get_map(board, item):
            # Binary plane that is 1 wherever the board holds `item`.
            mask = np.zeros(shape)
            mask[board == item] = 1
            return mask

        board = get_matrix(obs, 'board')

        # TODO: probably not needed Passage = 0
        rigid_map = get_map(board, 1)               # Rigid = 1
        wood_map = get_map(board, 2)                # Wood = 2
        bomb_map = get_map(board, 3)                # Bomb = 3
        flames_map = get_map(board, 4)              # Flames = 4
        fog_map = get_map(board, 5)                 # TODO: not used for first two stages Fog = 5
        extra_bomb_map = get_map(board, 6)          # ExtraBomb = 6
        incr_range_map = get_map(board, 7)          # IncrRange = 7
        kick_map = get_map(board, 8)                # Kick = 8
        skull_map = get_map(board, 9)               # Skull = 9

        position = obs["position"]
        my_position = np.zeros(shape)
        my_position[position[0], position[1], 0] = 1

        team_mates = get_map(board, obs["teammate"].value) # TODO during documentation it should be an array

        enemies = np.zeros(shape)
        for enemy in obs["enemies"]:
            enemies[board == enemy.value] = 1

        bomb_blast_strength = get_matrix(obs, 'bomb_blast_strength')
        bomb_life = get_matrix(obs, 'bomb_life')

        ammo = obs["ammo"]
        blast_strength = obs["blast_strength"]
        can_kick = int(obs["can_kick"])

        # Stack the planes along the last axis, flatten, then append the scalars.
        features = np.concatenate([my_position, enemies, team_mates, rigid_map,
                                   wood_map, bomb_map, flames_map,
                                   fog_map, extra_bomb_map, incr_range_map,
                                   kick_map, skull_map, bomb_blast_strength,
                                   bomb_life], axis=2).flatten()
        features = np.append(features, [ammo, blast_strength, can_kick])
        return features

    def __del__(self):
        # __init__ may have failed before `gym` was assigned; nothing to close then.
        if getattr(self, 'gym', None) is not None:
            self.close()

    def __str__(self):
        return '<{} instance>'.format(type(self).__name__)


class CustomProcessor(Processor):
    """Processor adapting state batches and step info for the agent."""

    def process_state_batch(self, batch):
        """Drop axis 1 (which must have length 1) from a batch of states.
        # Arguments
            batch (list): List of states
        # Returns
            The batch with axis 1 squeezed out
        """
        return np.squeeze(batch, axis=1)

    def process_info(self, info):
        """Replace `info['result']` by its `.value`, in place, and return `info`."""
        result = info['result']
        info['result'] = result.value
        return info

In [7]:
# Present the multi-agent environment to the agent through the single-agent wrapper.
env_wrapper = EnvWrapper(env, BOARD_SIZE)
processor = CustomProcessor()

# Replay memory and action-selection policy.
memory = SequentialMemory(limit=500000, window_length=1)
policy = BoltzmannQPolicy()

# Callbacks: text log, periodic weight checkpoints, TensorBoard scalars.
file_logger = FileLogger(file_log_path, interval=log_interval)
checkpoint = ModelIntervalCheckpoint(model_path, interval=log_interval)
tensorboard = TensorboardLogger(tensorboard_path)
callbacks = [file_logger, checkpoint, tensorboard]

# Configure and compile the agent with the dueling network enabled.
# dueling_type can be one of {'avg', 'max', 'naive'}.
dqn = DQNAgent(
    model=model,
    nb_actions=nb_actions,
    memory=memory,
    policy=policy,
    processor=processor,
    nb_steps_warmup=512,
    batch_size=512,
    target_model_update=1e-2,
    enable_dueling_network=True,
    dueling_type='avg',
)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

# Resume from previously saved weights if present. model_path is used verbatim
# here, i.e. its '{step}' placeholder is not substituted.
if os.path.isfile(model_path):
    dqn.load_weights(model_path)

In [None]:
# Train for the configured number of steps, capping every episode at the
# environment's own step limit, then store the final weights.
history = dqn.fit(
    env_wrapper,
    nb_steps=number_of_training_steps,
    nb_max_episode_steps=env._max_steps,
    callbacks=callbacks,
    visualize=False,
    verbose=2,
)
dqn.save_weights(model_path, overwrite=True)

Training for 5000000 steps ...
      30/5000000: episode: 1, duration: 1.437s, episode steps: 30, steps per second: 21, episode reward: -1.000, mean reward: -0.033 [-1.000, 0.000], mean action: 2.633 [0.000, 5.000], mean observation: 0.069 [0.000, 24.000], loss: --, mean_absolute_error: --, mean_q: --




      90/5000000: episode: 2, duration: 3.764s, episode steps: 60, steps per second: 16, episode reward: -1.000, mean reward: -0.017 [-1.000, 0.000], mean action: 2.467 [0.000, 5.000], mean observation: 0.070 [0.000, 24.000], loss: 0.052536, mean_absolute_error: 0.245707, mean_q: 0.190787




     121/5000000: episode: 3, duration: 1.363s, episode steps: 31, steps per second: 23, episode reward: -1.000, mean reward: -0.032 [-1.000, 0.000], mean action: 2.516 [0.000, 5.000], mean observation: 0.069 [0.000, 24.000], loss: 0.044316, mean_absolute_error: 0.250631, mean_q: 0.099055




     207/5000000: episode: 4, duration: 3.743s, episode steps: 86, steps per second: 23, episode reward: -1.000, mean reward: -0.012 [-1.000, 0.000], mean action: 2.430 [0.000, 5.000], mean observation: 0.072 [0.000, 24.000], loss: 0.022808, mean_absolute_error: 0.273776, mean_q: 0.069738




     312/5000000: episode: 5, duration: 4.450s, episode steps: 105, steps per second: 24, episode reward: -1.000, mean reward: -0.010 [-1.000, 0.000], mean action: 2.095 [0.000, 5.000], mean observation: 0.068 [0.000, 24.000], loss: 0.006251, mean_absolute_error: 0.252224, mean_q: 0.150801




     338/5000000: episode: 6, duration: 1.292s, episode steps: 26, steps per second: 20, episode reward: -1.000, mean reward: -0.038 [-1.000, 0.000], mean action: 2.500 [0.000, 5.000], mean observation: 0.072 [0.000, 24.000], loss: 0.006272, mean_absolute_error: 0.280301, mean_q: 0.253209




     477/5000000: episode: 7, duration: 6.596s, episode steps: 139, steps per second: 21, episode reward: -1.000, mean reward: -0.007 [-1.000, 0.000], mean action: 2.827 [0.000, 5.000], mean observation: 0.069 [0.000, 24.000], loss: 0.006375, mean_absolute_error: 0.284122, mean_q: 0.279719




     594/5000000: episode: 8, duration: 5.506s, episode steps: 117, steps per second: 21, episode reward: -1.000, mean reward: -0.009 [-1.000, 0.000], mean action: 2.410 [0.000, 5.000], mean observation: 0.058 [0.000, 24.000], loss: 0.004469, mean_absolute_error: 0.273400, mean_q: 0.297819
     620/5000000: episode: 9, duration: 1.288s, episode steps: 26, steps per second: 20, episode reward: -1.000, mean reward: -0.038 [-1.000, 0.000], mean action: 2.269 [0.000, 5.000], mean observation: 0.071 [0.000, 24.000], loss: 0.004038, mean_absolute_error: 0.282602, mean_q: 0.322059
     703/5000000: episode: 10, duration: 4.042s, episode steps: 83, steps per second: 21, episode reward: -1.000, mean reward: -0.012 [-1.000, 0.000], mean action: 2.843 [0.000, 5.000], mean observation: 0.067 [0.000, 24.000], loss: 0.003680, mean_absolute_error: 0.290938, mean_q: 0.348442
     729/5000000: episode: 11, duration: 1.330s, episode steps: 26, steps per second: 20, episode reward: -1.000, mean reward: -

    2046/5000000: episode: 37, duration: 1.283s, episode steps: 26, steps per second: 20, episode reward: -1.000, mean reward: -0.038 [-1.000, 0.000], mean action: 2.154 [0.000, 5.000], mean observation: 0.071 [0.000, 24.000], loss: 0.006809, mean_absolute_error: 0.703314, mean_q: 0.783511
    2072/5000000: episode: 38, duration: 1.251s, episode steps: 26, steps per second: 21, episode reward: -1.000, mean reward: -0.038 [-1.000, 0.000], mean action: 2.385 [0.000, 5.000], mean observation: 0.071 [0.000, 24.000], loss: 0.006496, mean_absolute_error: 0.705944, mean_q: 0.780088
    2152/5000000: episode: 39, duration: 3.702s, episode steps: 80, steps per second: 22, episode reward: -1.000, mean reward: -0.013 [-1.000, 0.000], mean action: 2.663 [0.000, 5.000], mean observation: 0.068 [0.000, 24.000], loss: 0.006736, mean_absolute_error: 0.715262, mean_q: 0.799860
    2443/5000000: episode: 40, duration: 14.236s, episode steps: 291, steps per second: 20, episode reward: -1.000, mean reward

    3734/5000000: episode: 66, duration: 1.344s, episode steps: 27, steps per second: 20, episode reward: -1.000, mean reward: -0.037 [-1.000, 0.000], mean action: 2.926 [0.000, 5.000], mean observation: 0.071 [0.000, 24.000], loss: 0.028921, mean_absolute_error: 1.129578, mean_q: 1.314826
    3777/5000000: episode: 67, duration: 2.028s, episode steps: 43, steps per second: 21, episode reward: -1.000, mean reward: -0.023 [-1.000, 0.000], mean action: 1.953 [0.000, 5.000], mean observation: 0.070 [0.000, 24.000], loss: 0.035122, mean_absolute_error: 1.148795, mean_q: 1.339238
    3805/5000000: episode: 68, duration: 1.357s, episode steps: 28, steps per second: 21, episode reward: -1.000, mean reward: -0.036 [-1.000, 0.000], mean action: 2.286 [0.000, 5.000], mean observation: 0.070 [0.000, 24.000], loss: 0.038109, mean_absolute_error: 1.172363, mean_q: 1.368114
    3900/5000000: episode: 69, duration: 4.595s, episode steps: 95, steps per second: 21, episode reward: -1.000, mean reward: 

    5175/5000000: episode: 95, duration: 2.185s, episode steps: 46, steps per second: 21, episode reward: -1.000, mean reward: -0.022 [-1.000, 0.000], mean action: 2.652 [0.000, 5.000], mean observation: 0.069 [0.000, 24.000], loss: 0.239084, mean_absolute_error: 2.031946, mean_q: 2.607229
    5443/5000000: episode: 96, duration: 12.190s, episode steps: 268, steps per second: 22, episode reward: -1.000, mean reward: -0.004 [-1.000, 0.000], mean action: 2.619 [0.000, 5.000], mean observation: 0.060 [0.000, 24.000], loss: 0.288806, mean_absolute_error: 2.128265, mean_q: 2.717347
    5533/5000000: episode: 97, duration: 4.439s, episode steps: 90, steps per second: 20, episode reward: -1.000, mean reward: -0.011 [-1.000, 0.000], mean action: 3.078 [0.000, 5.000], mean observation: 0.069 [0.000, 24.000], loss: 0.381317, mean_absolute_error: 2.282888, mean_q: 2.915964
    5558/5000000: episode: 98, duration: 1.272s, episode steps: 25, steps per second: 20, episode reward: -1.000, mean reward

    6739/5000000: episode: 124, duration: 1.290s, episode steps: 26, steps per second: 20, episode reward: -1.000, mean reward: -0.038 [-1.000, 0.000], mean action: 2.308 [0.000, 5.000], mean observation: 0.071 [0.000, 24.000], loss: 1.092139, mean_absolute_error: 4.118625, mean_q: 5.281473
    6764/5000000: episode: 125, duration: 1.200s, episode steps: 25, steps per second: 21, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 2.640 [0.000, 5.000], mean observation: 0.071 [0.000, 24.000], loss: 0.980119, mean_absolute_error: 4.301497, mean_q: 5.524764
    6790/5000000: episode: 126, duration: 1.186s, episode steps: 26, steps per second: 22, episode reward: -1.000, mean reward: -0.038 [-1.000, 0.000], mean action: 2.846 [0.000, 5.000], mean observation: 0.070 [0.000, 24.000], loss: 0.891630, mean_absolute_error: 4.245378, mean_q: 5.452711
    6820/5000000: episode: 127, duration: 1.465s, episode steps: 30, steps per second: 20, episode reward: -1.000, mean rewa

    8630/5000000: episode: 153, duration: 3.640s, episode steps: 77, steps per second: 21, episode reward: -1.000, mean reward: -0.013 [-1.000, 0.000], mean action: 2.026 [0.000, 5.000], mean observation: 0.067 [0.000, 24.000], loss: 3.213791, mean_absolute_error: 8.915653, mean_q: 11.322460
    8728/5000000: episode: 154, duration: 4.806s, episode steps: 98, steps per second: 20, episode reward: -1.000, mean reward: -0.010 [-1.000, 0.000], mean action: 2.684 [0.000, 5.000], mean observation: 0.069 [0.000, 24.000], loss: 3.232484, mean_absolute_error: 9.353251, mean_q: 11.856326
    8835/5000000: episode: 155, duration: 4.980s, episode steps: 107, steps per second: 21, episode reward: -1.000, mean reward: -0.009 [-1.000, 0.000], mean action: 2.607 [0.000, 5.000], mean observation: 0.067 [0.000, 24.000], loss: 3.247223, mean_absolute_error: 9.744396, mean_q: 12.294409
    9033/5000000: episode: 156, duration: 9.107s, episode steps: 198, steps per second: 22, episode reward: -1.000, mean

   10644/5000000: episode: 181, duration: 4.908s, episode steps: 102, steps per second: 21, episode reward: -1.000, mean reward: -0.010 [-1.000, 0.000], mean action: 2.167 [0.000, 5.000], mean observation: 0.067 [0.000, 24.000], loss: 16.072033, mean_absolute_error: 19.882431, mean_q: 25.361235
   10876/5000000: episode: 182, duration: 11.361s, episode steps: 232, steps per second: 20, episode reward: -1.000, mean reward: -0.004 [-1.000, 0.000], mean action: 2.082 [0.000, 5.000], mean observation: 0.063 [0.000, 24.000], loss: 18.621986, mean_absolute_error: 20.983665, mean_q: 26.698381
   10903/5000000: episode: 183, duration: 1.418s, episode steps: 27, steps per second: 19, episode reward: -1.000, mean reward: -0.037 [-1.000, 0.000], mean action: 2.000 [0.000, 5.000], mean observation: 0.064 [0.000, 24.000], loss: 20.114305, mean_absolute_error: 23.333277, mean_q: 29.761124
   10943/5000000: episode: 184, duration: 1.934s, episode steps: 40, steps per second: 21, episode reward: -1.00

   12700/5000000: episode: 209, duration: 5.376s, episode steps: 113, steps per second: 21, episode reward: -1.000, mean reward: -0.009 [-1.000, 0.000], mean action: 2.044 [0.000, 5.000], mean observation: 0.066 [0.000, 24.000], loss: 91.446739, mean_absolute_error: 37.652191, mean_q: 47.241055
   12973/5000000: episode: 210, duration: 13.377s, episode steps: 273, steps per second: 20, episode reward: -1.000, mean reward: -0.004 [-1.000, 0.000], mean action: 2.374 [0.000, 5.000], mean observation: 0.061 [0.000, 24.000], loss: 81.529160, mean_absolute_error: 38.376240, mean_q: 48.115513
   13004/5000000: episode: 211, duration: 1.477s, episode steps: 31, steps per second: 21, episode reward: -1.000, mean reward: -0.032 [-1.000, 0.000], mean action: 2.000 [0.000, 5.000], mean observation: 0.071 [0.000, 24.000], loss: 81.049133, mean_absolute_error: 40.062618, mean_q: 50.468708
   13034/5000000: episode: 212, duration: 1.382s, episode steps: 30, steps per second: 22, episode reward: -1.00

   14554/5000000: episode: 237, duration: 1.724s, episode steps: 36, steps per second: 21, episode reward: -1.000, mean reward: -0.028 [-1.000, 0.000], mean action: 1.917 [0.000, 5.000], mean observation: 0.069 [0.000, 24.000], loss: 133.405197, mean_absolute_error: 52.948315, mean_q: 66.106720
   14579/5000000: episode: 238, duration: 1.238s, episode steps: 25, steps per second: 20, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 2.440 [0.000, 5.000], mean observation: 0.072 [0.000, 24.000], loss: 127.329727, mean_absolute_error: 55.293713, mean_q: 68.971863
   14755/5000000: episode: 239, duration: 8.562s, episode steps: 176, steps per second: 21, episode reward: -1.000, mean reward: -0.006 [-1.000, 0.000], mean action: 2.324 [0.000, 5.000], mean observation: 0.064 [0.000, 24.000], loss: 170.925232, mean_absolute_error: 53.298195, mean_q: 66.383644
   14783/5000000: episode: 240, duration: 1.405s, episode steps: 28, steps per second: 20, episode reward: -1.0

   16194/5000000: episode: 265, duration: 1.387s, episode steps: 29, steps per second: 21, episode reward: -1.000, mean reward: -0.034 [-1.000, 0.000], mean action: 2.448 [0.000, 5.000], mean observation: 0.069 [0.000, 24.000], loss: 260.890228, mean_absolute_error: 72.457680, mean_q: 90.670601
   16292/5000000: episode: 266, duration: 4.824s, episode steps: 98, steps per second: 20, episode reward: -1.000, mean reward: -0.010 [-1.000, 0.000], mean action: 2.796 [0.000, 5.000], mean observation: 0.068 [0.000, 24.000], loss: 486.012787, mean_absolute_error: 68.800682, mean_q: 86.245789
   16317/5000000: episode: 267, duration: 1.192s, episode steps: 25, steps per second: 21, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 3.600 [0.000, 5.000], mean observation: 0.071 [0.000, 24.000], loss: 332.956085, mean_absolute_error: 68.277466, mean_q: 85.763771
   16345/5000000: episode: 268, duration: 1.362s, episode steps: 28, steps per second: 21, episode reward: -1.00

   17292/5000000: episode: 293, duration: 1.240s, episode steps: 26, steps per second: 21, episode reward: -1.000, mean reward: -0.038 [-1.000, 0.000], mean action: 4.192 [0.000, 5.000], mean observation: 0.071 [0.000, 24.000], loss: 246.246689, mean_absolute_error: 84.632614, mean_q: 106.097977
   17317/5000000: episode: 294, duration: 1.201s, episode steps: 25, steps per second: 21, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 3.400 [0.000, 5.000], mean observation: 0.072 [0.000, 24.000], loss: 285.380951, mean_absolute_error: 86.251610, mean_q: 108.734924
   17344/5000000: episode: 295, duration: 1.219s, episode steps: 27, steps per second: 22, episode reward: -1.000, mean reward: -0.037 [-1.000, 0.000], mean action: 4.259 [0.000, 5.000], mean observation: 0.070 [0.000, 24.000], loss: 203.348328, mean_absolute_error: 84.412338, mean_q: 106.109039
   17369/5000000: episode: 296, duration: 1.213s, episode steps: 25, steps per second: 21, episode reward: -1

   18615/5000000: episode: 321, duration: 1.413s, episode steps: 28, steps per second: 20, episode reward: -1.000, mean reward: -0.036 [-1.000, 0.000], mean action: 3.286 [0.000, 5.000], mean observation: 0.071 [0.000, 24.000], loss: 643.804688, mean_absolute_error: 97.442680, mean_q: 121.311447
   18640/5000000: episode: 322, duration: 1.192s, episode steps: 25, steps per second: 21, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 3.000 [0.000, 5.000], mean observation: 0.072 [0.000, 24.000], loss: 232.376511, mean_absolute_error: 91.983147, mean_q: 114.683914
   18668/5000000: episode: 323, duration: 1.354s, episode steps: 28, steps per second: 21, episode reward: -1.000, mean reward: -0.036 [-1.000, 0.000], mean action: 4.214 [0.000, 5.000], mean observation: 0.069 [0.000, 24.000], loss: 652.274231, mean_absolute_error: 96.667419, mean_q: 120.498001
   18693/5000000: episode: 324, duration: 1.195s, episode steps: 25, steps per second: 21, episode reward: -1

   20302/5000000: episode: 349, duration: 2.508s, episode steps: 52, steps per second: 21, episode reward: -1.000, mean reward: -0.019 [-1.000, 0.000], mean action: 3.615 [0.000, 5.000], mean observation: 0.070 [0.000, 24.000], loss: 636.441101, mean_absolute_error: 115.360748, mean_q: 144.252792
   20327/5000000: episode: 350, duration: 1.201s, episode steps: 25, steps per second: 21, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 3.200 [2.000, 5.000], mean observation: 0.071 [0.000, 24.000], loss: 585.786621, mean_absolute_error: 113.541229, mean_q: 141.963806
   20353/5000000: episode: 351, duration: 1.375s, episode steps: 26, steps per second: 19, episode reward: -1.000, mean reward: -0.038 [-1.000, 0.000], mean action: 3.615 [0.000, 5.000], mean observation: 0.072 [0.000, 24.000], loss: 350.845276, mean_absolute_error: 118.375748, mean_q: 148.134018
   20378/5000000: episode: 352, duration: 1.254s, episode steps: 25, steps per second: 20, episode reward:

   21252/5000000: episode: 377, duration: 1.411s, episode steps: 30, steps per second: 21, episode reward: -1.000, mean reward: -0.033 [-1.000, 0.000], mean action: 4.200 [2.000, 5.000], mean observation: 0.070 [0.000, 24.000], loss: 1596.133057, mean_absolute_error: 130.428925, mean_q: 163.677383
   21277/5000000: episode: 378, duration: 1.298s, episode steps: 25, steps per second: 19, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 3.480 [0.000, 5.000], mean observation: 0.072 [0.000, 24.000], loss: 1069.050903, mean_absolute_error: 123.204010, mean_q: 154.040558
   21305/5000000: episode: 379, duration: 1.338s, episode steps: 28, steps per second: 21, episode reward: -1.000, mean reward: -0.036 [-1.000, 0.000], mean action: 3.786 [0.000, 5.000], mean observation: 0.069 [0.000, 24.000], loss: 840.945129, mean_absolute_error: 126.421303, mean_q: 157.762299
   21384/5000000: episode: 380, duration: 4.029s, episode steps: 79, steps per second: 20, episode rewar

   22371/5000000: episode: 405, duration: 1.274s, episode steps: 27, steps per second: 21, episode reward: -1.000, mean reward: -0.037 [-1.000, 0.000], mean action: 3.667 [0.000, 5.000], mean observation: 0.070 [0.000, 24.000], loss: 617.786804, mean_absolute_error: 143.897583, mean_q: 177.665100
   22398/5000000: episode: 406, duration: 1.401s, episode steps: 27, steps per second: 19, episode reward: -1.000, mean reward: -0.037 [-1.000, 0.000], mean action: 3.074 [0.000, 5.000], mean observation: 0.072 [0.000, 24.000], loss: 1747.225830, mean_absolute_error: 153.143173, mean_q: 189.461807
   22450/5000000: episode: 407, duration: 2.500s, episode steps: 52, steps per second: 21, episode reward: -1.000, mean reward: -0.019 [-1.000, 0.000], mean action: 3.250 [0.000, 5.000], mean observation: 0.068 [0.000, 24.000], loss: 822.323242, mean_absolute_error: 137.908005, mean_q: 170.540634
   22484/5000000: episode: 408, duration: 1.641s, episode steps: 34, steps per second: 21, episode reward

   24007/5000000: episode: 433, duration: 5.756s, episode steps: 120, steps per second: 21, episode reward: -1.000, mean reward: -0.008 [-1.000, 0.000], mean action: 3.267 [0.000, 5.000], mean observation: 0.066 [0.000, 24.000], loss: 1112.160156, mean_absolute_error: 165.623703, mean_q: 203.672684
   24081/5000000: episode: 434, duration: 3.438s, episode steps: 74, steps per second: 22, episode reward: -1.000, mean reward: -0.014 [-1.000, 0.000], mean action: 3.041 [0.000, 5.000], mean observation: 0.067 [0.000, 24.000], loss: 1020.163208, mean_absolute_error: 160.404449, mean_q: 197.025177
   24146/5000000: episode: 435, duration: 3.013s, episode steps: 65, steps per second: 22, episode reward: -1.000, mean reward: -0.015 [-1.000, 0.000], mean action: 3.154 [0.000, 5.000], mean observation: 0.067 [0.000, 24.000], loss: 1515.223267, mean_absolute_error: 162.066788, mean_q: 199.004593
   24203/5000000: episode: 436, duration: 2.792s, episode steps: 57, steps per second: 20, episode rew

   26884/5000000: episode: 461, duration: 2.402s, episode steps: 49, steps per second: 20, episode reward: -1.000, mean reward: -0.020 [-1.000, 0.000], mean action: 3.000 [1.000, 5.000], mean observation: 0.069 [0.000, 24.000], loss: 1573.771851, mean_absolute_error: 209.784119, mean_q: 262.462799
   26928/5000000: episode: 462, duration: 2.140s, episode steps: 44, steps per second: 21, episode reward: -1.000, mean reward: -0.023 [-1.000, 0.000], mean action: 2.682 [1.000, 5.000], mean observation: 0.069 [0.000, 24.000], loss: 1363.203247, mean_absolute_error: 204.600296, mean_q: 256.191132
   26971/5000000: episode: 463, duration: 2.033s, episode steps: 43, steps per second: 21, episode reward: -1.000, mean reward: -0.023 [-1.000, 0.000], mean action: 3.140 [1.000, 5.000], mean observation: 0.069 [0.000, 24.000], loss: 2178.035889, mean_absolute_error: 223.022888, mean_q: 277.506927
   27067/5000000: episode: 464, duration: 4.576s, episode steps: 96, steps per second: 21, episode rewa

   29003/5000000: episode: 489, duration: 2.142s, episode steps: 46, steps per second: 21, episode reward: -1.000, mean reward: -0.022 [-1.000, 0.000], mean action: 2.196 [1.000, 5.000], mean observation: 0.066 [0.000, 24.000], loss: 1644.252563, mean_absolute_error: 253.298599, mean_q: 313.223602
   29042/5000000: episode: 490, duration: 1.863s, episode steps: 39, steps per second: 21, episode reward: -1.000, mean reward: -0.026 [-1.000, 0.000], mean action: 3.077 [1.000, 5.000], mean observation: 0.071 [0.000, 24.000], loss: 1629.859253, mean_absolute_error: 256.956146, mean_q: 318.221497
   29091/5000000: episode: 491, duration: 2.456s, episode steps: 49, steps per second: 20, episode reward: -1.000, mean reward: -0.020 [-1.000, 0.000], mean action: 2.265 [1.000, 5.000], mean observation: 0.068 [0.000, 24.000], loss: 1703.216187, mean_absolute_error: 264.351471, mean_q: 327.443359
   29172/5000000: episode: 492, duration: 4.014s, episode steps: 81, steps per second: 20, episode rewa

   30751/5000000: episode: 517, duration: 3.550s, episode steps: 72, steps per second: 20, episode reward: -1.000, mean reward: -0.014 [-1.000, 0.000], mean action: 2.764 [0.000, 5.000], mean observation: 0.061 [0.000, 24.000], loss: 3684.555664, mean_absolute_error: 274.959900, mean_q: 340.382202
   30807/5000000: episode: 518, duration: 2.676s, episode steps: 56, steps per second: 21, episode reward: -1.000, mean reward: -0.018 [-1.000, 0.000], mean action: 3.000 [0.000, 5.000], mean observation: 0.068 [0.000, 24.000], loss: 4200.188477, mean_absolute_error: 294.019318, mean_q: 364.547455
   31039/5000000: episode: 519, duration: 11.284s, episode steps: 232, steps per second: 21, episode reward: -1.000, mean reward: -0.004 [-1.000, 0.000], mean action: 2.603 [0.000, 5.000], mean observation: 0.057 [0.000, 24.000], loss: 3214.854980, mean_absolute_error: 298.390167, mean_q: 369.252411
   31075/5000000: episode: 520, duration: 1.773s, episode steps: 36, steps per second: 20, episode re

   32777/5000000: episode: 545, duration: 2.282s, episode steps: 46, steps per second: 20, episode reward: -1.000, mean reward: -0.022 [-1.000, 0.000], mean action: 3.065 [3.000, 5.000], mean observation: 0.069 [0.000, 24.000], loss: 4794.031250, mean_absolute_error: 349.426727, mean_q: 433.326172
   32810/5000000: episode: 546, duration: 1.485s, episode steps: 33, steps per second: 22, episode reward: -1.000, mean reward: -0.030 [-1.000, 0.000], mean action: 3.606 [2.000, 5.000], mean observation: 0.069 [0.000, 24.000], loss: 4302.381348, mean_absolute_error: 348.797058, mean_q: 433.444611
   32837/5000000: episode: 547, duration: 1.412s, episode steps: 27, steps per second: 19, episode reward: -1.000, mean reward: -0.037 [-1.000, 0.000], mean action: 3.000 [2.000, 5.000], mean observation: 0.070 [0.000, 24.000], loss: 3856.890869, mean_absolute_error: 357.137299, mean_q: 443.054657
   32868/5000000: episode: 548, duration: 1.493s, episode steps: 31, steps per second: 21, episode rewa

   33829/5000000: episode: 573, duration: 4.329s, episode steps: 90, steps per second: 21, episode reward: -1.000, mean reward: -0.011 [-1.000, 0.000], mean action: 3.044 [3.000, 5.000], mean observation: 0.070 [0.000, 24.000], loss: 5696.390625, mean_absolute_error: 385.186981, mean_q: 477.171753
   33900/5000000: episode: 574, duration: 3.305s, episode steps: 71, steps per second: 21, episode reward: -1.000, mean reward: -0.014 [-1.000, 0.000], mean action: 3.352 [1.000, 5.000], mean observation: 0.068 [0.000, 24.000], loss: 5359.159668, mean_absolute_error: 405.280457, mean_q: 501.617126
   33929/5000000: episode: 575, duration: 1.390s, episode steps: 29, steps per second: 21, episode reward: -1.000, mean reward: -0.034 [-1.000, 0.000], mean action: 4.552 [2.000, 5.000], mean observation: 0.069 [0.000, 24.000], loss: 5906.999023, mean_absolute_error: 430.046204, mean_q: 532.078979
   33954/5000000: episode: 576, duration: 1.229s, episode steps: 25, steps per second: 20, episode rewa

   35145/5000000: episode: 601, duration: 1.224s, episode steps: 25, steps per second: 20, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 3.080 [0.000, 5.000], mean observation: 0.072 [0.000, 24.000], loss: 6949.111328, mean_absolute_error: 444.594147, mean_q: 549.210022
   35290/5000000: episode: 602, duration: 6.977s, episode steps: 145, steps per second: 21, episode reward: -1.000, mean reward: -0.007 [-1.000, 0.000], mean action: 3.076 [3.000, 5.000], mean observation: 0.065 [0.000, 24.000], loss: 7898.523438, mean_absolute_error: 448.815033, mean_q: 554.024353
   35343/5000000: episode: 603, duration: 2.517s, episode steps: 53, steps per second: 21, episode reward: -1.000, mean reward: -0.019 [-1.000, 0.000], mean action: 3.151 [1.000, 5.000], mean observation: 0.065 [0.000, 24.000], loss: 8270.734375, mean_absolute_error: 452.706024, mean_q: 558.946594
   35369/5000000: episode: 604, duration: 1.223s, episode steps: 26, steps per second: 21, episode rew

   37200/5000000: episode: 629, duration: 1.225s, episode steps: 25, steps per second: 20, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 2.920 [0.000, 5.000], mean observation: 0.072 [0.000, 24.000], loss: 12137.572266, mean_absolute_error: 516.760986, mean_q: 637.207336
   37227/5000000: episode: 630, duration: 1.363s, episode steps: 27, steps per second: 20, episode reward: -1.000, mean reward: -0.037 [-1.000, 0.000], mean action: 3.222 [3.000, 5.000], mean observation: 0.070 [0.000, 24.000], loss: 21896.914062, mean_absolute_error: 530.698486, mean_q: 653.696472
   37253/5000000: episode: 631, duration: 1.383s, episode steps: 26, steps per second: 19, episode reward: -1.000, mean reward: -0.038 [-1.000, 0.000], mean action: 4.769 [3.000, 5.000], mean observation: 0.072 [0.000, 24.000], loss: 19563.595703, mean_absolute_error: 509.041138, mean_q: 626.117004
   37278/5000000: episode: 632, duration: 1.186s, episode steps: 25, steps per second: 21, episode r

   38363/5000000: episode: 657, duration: 1.460s, episode steps: 27, steps per second: 18, episode reward: -1.000, mean reward: -0.037 [-1.000, 0.000], mean action: 3.370 [0.000, 5.000], mean observation: 0.072 [0.000, 24.000], loss: 14642.191406, mean_absolute_error: 582.695862, mean_q: 718.832886
   38389/5000000: episode: 658, duration: 1.322s, episode steps: 26, steps per second: 20, episode reward: -1.000, mean reward: -0.038 [-1.000, 0.000], mean action: 3.423 [3.000, 5.000], mean observation: 0.071 [0.000, 24.000], loss: 30711.886719, mean_absolute_error: 562.821594, mean_q: 692.194458
   38501/5000000: episode: 659, duration: 5.340s, episode steps: 112, steps per second: 21, episode reward: -1.000, mean reward: -0.009 [-1.000, 0.000], mean action: 3.196 [1.000, 5.000], mean observation: 0.066 [0.000, 24.000], loss: 28156.140625, mean_absolute_error: 571.039185, mean_q: 703.766968
   38549/5000000: episode: 660, duration: 2.385s, episode steps: 48, steps per second: 20, episode 

   39444/5000000: episode: 685, duration: 1.339s, episode steps: 27, steps per second: 20, episode reward: -1.000, mean reward: -0.037 [-1.000, 0.000], mean action: 3.519 [2.000, 5.000], mean observation: 0.070 [0.000, 24.000], loss: 35366.019531, mean_absolute_error: 605.181152, mean_q: 745.509521
   39469/5000000: episode: 686, duration: 1.190s, episode steps: 25, steps per second: 21, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 3.440 [1.000, 5.000], mean observation: 0.072 [0.000, 24.000], loss: 89438.773438, mean_absolute_error: 624.870300, mean_q: 765.142334
   39494/5000000: episode: 687, duration: 1.269s, episode steps: 25, steps per second: 20, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 3.920 [1.000, 5.000], mean observation: 0.072 [0.000, 24.000], loss: 34396.019531, mean_absolute_error: 628.275208, mean_q: 776.410645
   39519/5000000: episode: 688, duration: 1.215s, episode steps: 25, steps per second: 21, episode r

   40644/5000000: episode: 713, duration: 2.439s, episode steps: 50, steps per second: 21, episode reward: -1.000, mean reward: -0.020 [-1.000, 0.000], mean action: 4.640 [1.000, 5.000], mean observation: 0.071 [0.000, 24.000], loss: 35965.871094, mean_absolute_error: 656.320312, mean_q: 807.772278
   40669/5000000: episode: 714, duration: 1.247s, episode steps: 25, steps per second: 20, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 1.960 [0.000, 5.000], mean observation: 0.071 [0.000, 24.000], loss: 69498.367188, mean_absolute_error: 668.106628, mean_q: 822.473511
   40694/5000000: episode: 715, duration: 1.167s, episode steps: 25, steps per second: 21, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 4.720 [1.000, 5.000], mean observation: 0.072 [0.000, 24.000], loss: 48457.398438, mean_absolute_error: 674.313843, mean_q: 829.473145
   40719/5000000: episode: 716, duration: 1.120s, episode steps: 25, steps per second: 22, episode r

   41527/5000000: episode: 741, duration: 1.154s, episode steps: 25, steps per second: 22, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 5.000 [5.000, 5.000], mean observation: 0.071 [0.000, 24.000], loss: 143823.000000, mean_absolute_error: 712.038696, mean_q: 878.545288
   41561/5000000: episode: 742, duration: 1.573s, episode steps: 34, steps per second: 22, episode reward: -1.000, mean reward: -0.029 [-1.000, 0.000], mean action: 2.176 [0.000, 5.000], mean observation: 0.067 [0.000, 24.000], loss: 124176.585938, mean_absolute_error: 738.948975, mean_q: 911.503418
   41591/5000000: episode: 743, duration: 1.495s, episode steps: 30, steps per second: 20, episode reward: -1.000, mean reward: -0.033 [-1.000, 0.000], mean action: 2.300 [0.000, 5.000], mean observation: 0.069 [0.000, 24.000], loss: 54747.636719, mean_absolute_error: 772.908936, mean_q: 950.495239
   41631/5000000: episode: 744, duration: 1.777s, episode steps: 40, steps per second: 23, episode

   42640/5000000: episode: 769, duration: 1.188s, episode steps: 25, steps per second: 21, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 4.840 [1.000, 5.000], mean observation: 0.072 [0.000, 24.000], loss: 120893.820312, mean_absolute_error: 817.658752, mean_q: 1011.090637
   42665/5000000: episode: 770, duration: 1.161s, episode steps: 25, steps per second: 22, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 4.600 [0.000, 5.000], mean observation: 0.072 [0.000, 24.000], loss: 52059.351562, mean_absolute_error: 787.718933, mean_q: 974.420715
   42690/5000000: episode: 771, duration: 1.209s, episode steps: 25, steps per second: 21, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 5.000 [5.000, 5.000], mean observation: 0.072 [0.000, 24.000], loss: 119876.242188, mean_absolute_error: 805.873779, mean_q: 996.258728
   42715/5000000: episode: 772, duration: 1.268s, episode steps: 25, steps per second: 20, episod

   43912/5000000: episode: 797, duration: 1.352s, episode steps: 27, steps per second: 20, episode reward: -1.000, mean reward: -0.037 [-1.000, 0.000], mean action: 4.037 [1.000, 5.000], mean observation: 0.070 [0.000, 24.000], loss: 105970.218750, mean_absolute_error: 834.490540, mean_q: 1028.827026
   43938/5000000: episode: 798, duration: 1.293s, episode steps: 26, steps per second: 20, episode reward: -1.000, mean reward: -0.038 [-1.000, 0.000], mean action: 2.962 [1.000, 5.000], mean observation: 0.071 [0.000, 24.000], loss: 128475.179688, mean_absolute_error: 814.542969, mean_q: 1005.776611
   44022/5000000: episode: 799, duration: 4.091s, episode steps: 84, steps per second: 21, episode reward: -1.000, mean reward: -0.012 [-1.000, 0.000], mean action: 2.202 [0.000, 5.000], mean observation: 0.064 [0.000, 24.000], loss: 222560.953125, mean_absolute_error: 864.139709, mean_q: 1065.555664
   44052/5000000: episode: 800, duration: 1.437s, episode steps: 30, steps per second: 21, epi

   45285/5000000: episode: 825, duration: 1.239s, episode steps: 25, steps per second: 20, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 3.760 [0.000, 5.000], mean observation: 0.072 [0.000, 24.000], loss: 257242.953125, mean_absolute_error: 1102.756836, mean_q: 1363.156860
   45368/5000000: episode: 826, duration: 3.848s, episode steps: 83, steps per second: 22, episode reward: -1.000, mean reward: -0.012 [-1.000, 0.000], mean action: 2.663 [0.000, 5.000], mean observation: 0.071 [0.000, 24.000], loss: 363554.062500, mean_absolute_error: 1028.674805, mean_q: 1270.304077
   45524/5000000: episode: 827, duration: 7.398s, episode steps: 156, steps per second: 21, episode reward: -1.000, mean reward: -0.006 [-1.000, 0.000], mean action: 2.654 [0.000, 5.000], mean observation: 0.067 [0.000, 24.000], loss: 315076.093750, mean_absolute_error: 1014.739258, mean_q: 1253.076416
   45549/5000000: episode: 828, duration: 1.239s, episode steps: 25, steps per second: 20,

   47039/5000000: episode: 853, duration: 1.371s, episode steps: 27, steps per second: 20, episode reward: -1.000, mean reward: -0.037 [-1.000, 0.000], mean action: 2.481 [0.000, 5.000], mean observation: 0.070 [0.000, 24.000], loss: 299974.843750, mean_absolute_error: 1151.706055, mean_q: 1420.728027
   47097/5000000: episode: 854, duration: 2.793s, episode steps: 58, steps per second: 21, episode reward: -1.000, mean reward: -0.017 [-1.000, 0.000], mean action: 3.121 [0.000, 5.000], mean observation: 0.066 [0.000, 24.000], loss: 526611.062500, mean_absolute_error: 1167.130493, mean_q: 1431.016846
   47157/5000000: episode: 855, duration: 2.948s, episode steps: 60, steps per second: 20, episode reward: -1.000, mean reward: -0.017 [-1.000, 0.000], mean action: 2.383 [0.000, 5.000], mean observation: 0.071 [0.000, 24.000], loss: 460864.593750, mean_absolute_error: 1190.122681, mean_q: 1461.799316
   47223/5000000: episode: 856, duration: 3.273s, episode steps: 66, steps per second: 20, 

   48687/5000000: episode: 881, duration: 1.119s, episode steps: 25, steps per second: 22, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 2.680 [0.000, 5.000], mean observation: 0.072 [0.000, 24.000], loss: 444471.093750, mean_absolute_error: 1372.189697, mean_q: 1685.195801
   48804/5000000: episode: 882, duration: 5.054s, episode steps: 117, steps per second: 23, episode reward: -1.000, mean reward: -0.009 [-1.000, 0.000], mean action: 2.521 [0.000, 5.000], mean observation: 0.065 [0.000, 24.000], loss: 671530.312500, mean_absolute_error: 1304.497803, mean_q: 1603.978760
   48864/5000000: episode: 883, duration: 2.878s, episode steps: 60, steps per second: 21, episode reward: -1.000, mean reward: -0.017 [-1.000, 0.000], mean action: 2.167 [0.000, 5.000], mean observation: 0.069 [0.000, 24.000], loss: 332317.031250, mean_absolute_error: 1336.498413, mean_q: 1639.502319
   48918/5000000: episode: 884, duration: 2.543s, episode steps: 54, steps per second: 21,

   50712/5000000: episode: 908, duration: 2.737s, episode steps: 56, steps per second: 20, episode reward: -1.000, mean reward: -0.018 [-1.000, 0.000], mean action: 2.929 [0.000, 5.000], mean observation: 0.070 [0.000, 24.000], loss: 1228422.000000, mean_absolute_error: 1678.000977, mean_q: 2037.649292
   50768/5000000: episode: 909, duration: 2.591s, episode steps: 56, steps per second: 22, episode reward: -1.000, mean reward: -0.018 [-1.000, 0.000], mean action: 2.411 [0.000, 5.000], mean observation: 0.069 [0.000, 24.000], loss: 1416386.625000, mean_absolute_error: 1671.809570, mean_q: 2043.048096
   50823/5000000: episode: 910, duration: 2.514s, episode steps: 55, steps per second: 22, episode reward: -1.000, mean reward: -0.018 [-1.000, 0.000], mean action: 2.382 [0.000, 5.000], mean observation: 0.066 [0.000, 24.000], loss: 608340.000000, mean_absolute_error: 1689.295166, mean_q: 2070.196777
   50964/5000000: episode: 911, duration: 6.632s, episode steps: 141, steps per second: 2

   52590/5000000: episode: 935, duration: 6.127s, episode steps: 125, steps per second: 20, episode reward: -1.000, mean reward: -0.008 [-1.000, 0.000], mean action: 2.664 [0.000, 5.000], mean observation: 0.066 [0.000, 24.000], loss: 1859690.000000, mean_absolute_error: 1956.981201, mean_q: 2389.634766
   52617/5000000: episode: 936, duration: 1.386s, episode steps: 27, steps per second: 19, episode reward: -1.000, mean reward: -0.037 [-1.000, 0.000], mean action: 2.407 [0.000, 5.000], mean observation: 0.071 [0.000, 24.000], loss: 799163.562500, mean_absolute_error: 1948.598389, mean_q: 2384.162598
   52679/5000000: episode: 937, duration: 3.026s, episode steps: 62, steps per second: 20, episode reward: -1.000, mean reward: -0.016 [-1.000, 0.000], mean action: 3.145 [0.000, 5.000], mean observation: 0.068 [0.000, 24.000], loss: 2283334.000000, mean_absolute_error: 1897.678955, mean_q: 2316.779541
   52705/5000000: episode: 938, duration: 1.225s, episode steps: 26, steps per second: 2

   54008/5000000: episode: 962, duration: 1.946s, episode steps: 43, steps per second: 22, episode reward: -1.000, mean reward: -0.023 [-1.000, 0.000], mean action: 2.326 [0.000, 5.000], mean observation: 0.067 [0.000, 24.000], loss: 1871924.625000, mean_absolute_error: 2071.746338, mean_q: 2537.257324
   54098/5000000: episode: 963, duration: 4.340s, episode steps: 90, steps per second: 21, episode reward: -1.000, mean reward: -0.011 [-1.000, 0.000], mean action: 2.544 [0.000, 5.000], mean observation: 0.064 [0.000, 24.000], loss: 3629339.750000, mean_absolute_error: 2337.366943, mean_q: 2842.648438
   54141/5000000: episode: 964, duration: 2.065s, episode steps: 43, steps per second: 21, episode reward: -1.000, mean reward: -0.023 [-1.000, 0.000], mean action: 2.070 [0.000, 5.000], mean observation: 0.072 [0.000, 24.000], loss: 1147876.750000, mean_absolute_error: 2349.723877, mean_q: 2846.072266
   54307/5000000: episode: 965, duration: 8.130s, episode steps: 166, steps per second: 

   55761/5000000: episode: 989, duration: 6.234s, episode steps: 125, steps per second: 20, episode reward: -1.000, mean reward: -0.008 [-1.000, 0.000], mean action: 2.488 [0.000, 5.000], mean observation: 0.068 [0.000, 24.000], loss: 4129789.000000, mean_absolute_error: 2504.731445, mean_q: 3060.682861
   55900/5000000: episode: 990, duration: 6.631s, episode steps: 139, steps per second: 21, episode reward: -1.000, mean reward: -0.007 [-1.000, 0.000], mean action: 2.619 [0.000, 5.000], mean observation: 0.066 [0.000, 24.000], loss: 3716076.500000, mean_absolute_error: 2598.209961, mean_q: 3180.192383
   55925/5000000: episode: 991, duration: 1.211s, episode steps: 25, steps per second: 21, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 2.360 [0.000, 5.000], mean observation: 0.072 [0.000, 24.000], loss: 1905230.875000, mean_absolute_error: 2482.255859, mean_q: 3040.997559
   55986/5000000: episode: 992, duration: 2.952s, episode steps: 61, steps per second:

   57344/5000000: episode: 1016, duration: 1.793s, episode steps: 37, steps per second: 21, episode reward: -1.000, mean reward: -0.027 [-1.000, 0.000], mean action: 2.135 [0.000, 5.000], mean observation: 0.069 [0.000, 24.000], loss: 4161709.500000, mean_absolute_error: 3122.834229, mean_q: 3824.934082
   57370/5000000: episode: 1017, duration: 1.224s, episode steps: 26, steps per second: 21, episode reward: -1.000, mean reward: -0.038 [-1.000, 0.000], mean action: 2.615 [0.000, 5.000], mean observation: 0.069 [0.000, 24.000], loss: 3046029.000000, mean_absolute_error: 3641.902344, mean_q: 4442.998047
   57406/5000000: episode: 1018, duration: 1.678s, episode steps: 36, steps per second: 21, episode reward: -1.000, mean reward: -0.028 [-1.000, 0.000], mean action: 2.806 [0.000, 5.000], mean observation: 0.067 [0.000, 24.000], loss: 6631984.500000, mean_absolute_error: 3217.614258, mean_q: 3924.109375
   57547/5000000: episode: 1019, duration: 6.793s, episode steps: 141, steps per seco

   59461/5000000: episode: 1043, duration: 1.185s, episode steps: 25, steps per second: 21, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 2.200 [0.000, 5.000], mean observation: 0.071 [0.000, 24.000], loss: 9408040.000000, mean_absolute_error: 5183.107422, mean_q: 6349.385742
   59532/5000000: episode: 1044, duration: 3.320s, episode steps: 71, steps per second: 21, episode reward: -1.000, mean reward: -0.014 [-1.000, 0.000], mean action: 2.493 [0.000, 5.000], mean observation: 0.069 [0.000, 24.000], loss: 7765318.500000, mean_absolute_error: 4405.760254, mean_q: 5374.086426
   59557/5000000: episode: 1045, duration: 1.210s, episode steps: 25, steps per second: 21, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 2.640 [0.000, 5.000], mean observation: 0.064 [0.000, 24.000], loss: 2894445.500000, mean_absolute_error: 4139.917969, mean_q: 5102.597168
   59594/5000000: episode: 1046, duration: 1.642s, episode steps: 37, steps per secon

   60923/5000000: episode: 1070, duration: 1.261s, episode steps: 25, steps per second: 20, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 2.840 [0.000, 5.000], mean observation: 0.072 [0.000, 24.000], loss: 4349967.000000, mean_absolute_error: 5089.900391, mean_q: 6272.134766
   61015/5000000: episode: 1071, duration: 4.435s, episode steps: 92, steps per second: 21, episode reward: -1.000, mean reward: -0.011 [-1.000, 0.000], mean action: 2.522 [0.000, 5.000], mean observation: 0.064 [0.000, 24.000], loss: 11761504.000000, mean_absolute_error: 5375.973633, mean_q: 6607.303223
   61043/5000000: episode: 1072, duration: 1.321s, episode steps: 28, steps per second: 21, episode reward: -1.000, mean reward: -0.036 [-1.000, 0.000], mean action: 2.607 [0.000, 5.000], mean observation: 0.070 [0.000, 24.000], loss: 7766805.000000, mean_absolute_error: 5089.201660, mean_q: 6249.171875
   61212/5000000: episode: 1073, duration: 8.077s, episode steps: 169, steps per sec

   62518/5000000: episode: 1097, duration: 1.970s, episode steps: 41, steps per second: 21, episode reward: -1.000, mean reward: -0.024 [-1.000, 0.000], mean action: 2.146 [0.000, 5.000], mean observation: 0.069 [0.000, 24.000], loss: 16104214.000000, mean_absolute_error: 5972.692383, mean_q: 7312.963379
   62628/5000000: episode: 1098, duration: 5.545s, episode steps: 110, steps per second: 20, episode reward: -1.000, mean reward: -0.009 [-1.000, 0.000], mean action: 2.727 [0.000, 5.000], mean observation: 0.068 [0.000, 24.000], loss: 20855808.000000, mean_absolute_error: 6229.128418, mean_q: 7632.004395
   62792/5000000: episode: 1099, duration: 8.054s, episode steps: 164, steps per second: 20, episode reward: -1.000, mean reward: -0.006 [-1.000, 0.000], mean action: 2.780 [0.000, 5.000], mean observation: 0.065 [0.000, 24.000], loss: 15853947.000000, mean_absolute_error: 6277.824219, mean_q: 7702.210449
   62817/5000000: episode: 1100, duration: 1.229s, episode steps: 25, steps per 

   64030/5000000: episode: 1124, duration: 1.431s, episode steps: 27, steps per second: 19, episode reward: -1.000, mean reward: -0.037 [-1.000, 0.000], mean action: 2.370 [0.000, 5.000], mean observation: 0.070 [0.000, 24.000], loss: 44208800.000000, mean_absolute_error: 7303.944336, mean_q: 8928.365234
   64087/5000000: episode: 1125, duration: 2.833s, episode steps: 57, steps per second: 20, episode reward: -1.000, mean reward: -0.018 [-1.000, 0.000], mean action: 2.088 [0.000, 5.000], mean observation: 0.065 [0.000, 24.000], loss: 19394342.000000, mean_absolute_error: 7189.373047, mean_q: 8802.469727
   64259/5000000: episode: 1126, duration: 8.252s, episode steps: 172, steps per second: 21, episode reward: -1.000, mean reward: -0.006 [-1.000, 0.000], mean action: 2.407 [0.000, 5.000], mean observation: 0.065 [0.000, 24.000], loss: 15897176.000000, mean_absolute_error: 7433.559570, mean_q: 9116.130859
   64287/5000000: episode: 1127, duration: 1.418s, episode steps: 28, steps per s

   65933/5000000: episode: 1151, duration: 1.234s, episode steps: 25, steps per second: 20, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 2.320 [0.000, 5.000], mean observation: 0.065 [0.000, 24.000], loss: 31486562.000000, mean_absolute_error: 8938.031250, mean_q: 10944.764648
   66028/5000000: episode: 1152, duration: 4.728s, episode steps: 95, steps per second: 20, episode reward: -1.000, mean reward: -0.011 [-1.000, 0.000], mean action: 2.758 [0.000, 5.000], mean observation: 0.070 [0.000, 24.000], loss: 24618448.000000, mean_absolute_error: 8993.960938, mean_q: 11032.444336
   66060/5000000: episode: 1153, duration: 1.584s, episode steps: 32, steps per second: 20, episode reward: -1.000, mean reward: -0.031 [-1.000, 0.000], mean action: 2.375 [0.000, 5.000], mean observation: 0.070 [0.000, 24.000], loss: 26294880.000000, mean_absolute_error: 8918.624023, mean_q: 10912.685547
   66122/5000000: episode: 1154, duration: 3.088s, episode steps: 62, steps per

   67940/5000000: episode: 1178, duration: 4.688s, episode steps: 94, steps per second: 20, episode reward: -1.000, mean reward: -0.011 [-1.000, 0.000], mean action: 2.691 [0.000, 5.000], mean observation: 0.070 [0.000, 24.000], loss: 9355460.000000, mean_absolute_error: 9540.197266, mean_q: 11662.978516
   67980/5000000: episode: 1179, duration: 1.969s, episode steps: 40, steps per second: 20, episode reward: -1.000, mean reward: -0.025 [-1.000, 0.000], mean action: 2.450 [0.000, 5.000], mean observation: 0.069 [0.000, 24.000], loss: 15890754.000000, mean_absolute_error: 10820.423828, mean_q: 13282.443359
   68008/5000000: episode: 1180, duration: 1.446s, episode steps: 28, steps per second: 19, episode reward: -1.000, mean reward: -0.036 [-1.000, 0.000], mean action: 2.250 [0.000, 5.000], mean observation: 0.070 [0.000, 24.000], loss: 20952142.000000, mean_absolute_error: 10700.806641, mean_q: 13102.129883
   68062/5000000: episode: 1181, duration: 2.640s, episode steps: 54, steps pe

   69403/5000000: episode: 1205, duration: 5.832s, episode steps: 121, steps per second: 21, episode reward: -1.000, mean reward: -0.008 [-1.000, 0.000], mean action: 2.612 [0.000, 5.000], mean observation: 0.068 [0.000, 24.000], loss: 30870304.000000, mean_absolute_error: 11172.405273, mean_q: 13651.576172
   69431/5000000: episode: 1206, duration: 1.415s, episode steps: 28, steps per second: 20, episode reward: -1.000, mean reward: -0.036 [-1.000, 0.000], mean action: 1.893 [0.000, 5.000], mean observation: 0.072 [0.000, 24.000], loss: 16395562.000000, mean_absolute_error: 11482.123047, mean_q: 14050.315430
   69463/5000000: episode: 1207, duration: 1.487s, episode steps: 32, steps per second: 22, episode reward: -1.000, mean reward: -0.031 [-1.000, 0.000], mean action: 2.594 [0.000, 5.000], mean observation: 0.068 [0.000, 24.000], loss: 19550456.000000, mean_absolute_error: 10817.205078, mean_q: 13194.735352
   69491/5000000: episode: 1208, duration: 1.389s, episode steps: 28, steps

   72097/5000000: episode: 1232, duration: 1.665s, episode steps: 33, steps per second: 20, episode reward: -1.000, mean reward: -0.030 [-1.000, 0.000], mean action: 2.848 [0.000, 5.000], mean observation: 0.070 [0.000, 24.000], loss: 33505498.000000, mean_absolute_error: 14104.012695, mean_q: 17239.218750
   72312/5000000: episode: 1233, duration: 10.328s, episode steps: 215, steps per second: 21, episode reward: -1.000, mean reward: -0.005 [-1.000, 0.000], mean action: 2.544 [0.000, 5.000], mean observation: 0.065 [0.000, 24.000], loss: 64815236.000000, mean_absolute_error: 13281.637695, mean_q: 16180.841797
   72426/5000000: episode: 1234, duration: 5.258s, episode steps: 114, steps per second: 22, episode reward: -1.000, mean reward: -0.009 [-1.000, 0.000], mean action: 2.342 [0.000, 5.000], mean observation: 0.065 [0.000, 24.000], loss: 68230888.000000, mean_absolute_error: 13649.189453, mean_q: 16631.513672
   72521/5000000: episode: 1235, duration: 4.664s, episode steps: 95, ste

   73900/5000000: episode: 1259, duration: 1.624s, episode steps: 32, steps per second: 20, episode reward: -1.000, mean reward: -0.031 [-1.000, 0.000], mean action: 2.656 [0.000, 5.000], mean observation: 0.070 [0.000, 24.000], loss: 97810592.000000, mean_absolute_error: 15346.121094, mean_q: 18721.466797
   73932/5000000: episode: 1260, duration: 1.575s, episode steps: 32, steps per second: 20, episode reward: -1.000, mean reward: -0.031 [-1.000, 0.000], mean action: 2.969 [0.000, 5.000], mean observation: 0.070 [0.000, 24.000], loss: 80768104.000000, mean_absolute_error: 14953.630859, mean_q: 18221.798828
   74019/5000000: episode: 1261, duration: 4.218s, episode steps: 87, steps per second: 21, episode reward: -1.000, mean reward: -0.011 [-1.000, 0.000], mean action: 2.483 [0.000, 5.000], mean observation: 0.070 [0.000, 24.000], loss: 50922944.000000, mean_absolute_error: 14683.000977, mean_q: 17900.441406
   74044/5000000: episode: 1262, duration: 1.285s, episode steps: 25, steps 

   75380/5000000: episode: 1286, duration: 1.206s, episode steps: 25, steps per second: 21, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 2.960 [0.000, 5.000], mean observation: 0.072 [0.000, 24.000], loss: 94120472.000000, mean_absolute_error: 17114.148438, mean_q: 20865.685547
   75405/5000000: episode: 1287, duration: 1.201s, episode steps: 25, steps per second: 21, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 3.040 [0.000, 5.000], mean observation: 0.072 [0.000, 24.000], loss: 30277570.000000, mean_absolute_error: 16272.572266, mean_q: 19863.201172
   75496/5000000: episode: 1288, duration: 4.444s, episode steps: 91, steps per second: 20, episode reward: -1.000, mean reward: -0.011 [-1.000, 0.000], mean action: 2.549 [0.000, 5.000], mean observation: 0.068 [0.000, 24.000], loss: 52641352.000000, mean_absolute_error: 16076.980469, mean_q: 19593.343750
   75530/5000000: episode: 1289, duration: 1.586s, episode steps: 34, steps 

   76865/5000000: episode: 1313, duration: 2.932s, episode steps: 56, steps per second: 19, episode reward: -1.000, mean reward: -0.018 [-1.000, 0.000], mean action: 2.411 [0.000, 5.000], mean observation: 0.072 [0.000, 24.000], loss: 95799000.000000, mean_absolute_error: 17604.847656, mean_q: 21402.529297
   76897/5000000: episode: 1314, duration: 1.562s, episode steps: 32, steps per second: 20, episode reward: -1.000, mean reward: -0.031 [-1.000, 0.000], mean action: 2.094 [0.000, 5.000], mean observation: 0.068 [0.000, 24.000], loss: 26331156.000000, mean_absolute_error: 16878.955078, mean_q: 20502.660156
   77082/5000000: episode: 1315, duration: 9.113s, episode steps: 185, steps per second: 20, episode reward: -1.000, mean reward: -0.005 [-1.000, 0.000], mean action: 2.670 [0.000, 5.000], mean observation: 0.068 [0.000, 24.000], loss: 63307620.000000, mean_absolute_error: 17077.177734, mean_q: 20766.339844
   77109/5000000: episode: 1316, duration: 1.218s, episode steps: 27, steps

   78329/5000000: episode: 1340, duration: 1.374s, episode steps: 28, steps per second: 20, episode reward: -1.000, mean reward: -0.036 [-1.000, 0.000], mean action: 2.179 [0.000, 5.000], mean observation: 0.064 [0.000, 24.000], loss: 234478752.000000, mean_absolute_error: 18761.826172, mean_q: 22765.853516
   78359/5000000: episode: 1341, duration: 1.470s, episode steps: 30, steps per second: 20, episode reward: -1.000, mean reward: -0.033 [-1.000, 0.000], mean action: 2.433 [0.000, 5.000], mean observation: 0.068 [0.000, 24.000], loss: 22978942.000000, mean_absolute_error: 18865.542969, mean_q: 22970.519531
   78621/5000000: episode: 1342, duration: 13.176s, episode steps: 262, steps per second: 20, episode reward: -1.000, mean reward: -0.004 [-1.000, 0.000], mean action: 2.550 [0.000, 5.000], mean observation: 0.062 [0.000, 24.000], loss: 94585088.000000, mean_absolute_error: 18657.166016, mean_q: 22680.679688
   78687/5000000: episode: 1343, duration: 3.254s, episode steps: 66, ste

   79821/5000000: episode: 1367, duration: 1.334s, episode steps: 26, steps per second: 19, episode reward: -1.000, mean reward: -0.038 [-1.000, 0.000], mean action: 2.615 [0.000, 5.000], mean observation: 0.071 [0.000, 24.000], loss: 139215456.000000, mean_absolute_error: 20080.126953, mean_q: 24374.080078
   79886/5000000: episode: 1368, duration: 2.973s, episode steps: 65, steps per second: 22, episode reward: -1.000, mean reward: -0.015 [-1.000, 0.000], mean action: 2.231 [0.000, 5.000], mean observation: 0.073 [0.000, 24.000], loss: 93204176.000000, mean_absolute_error: 19488.478516, mean_q: 23672.904297
   79916/5000000: episode: 1369, duration: 1.314s, episode steps: 30, steps per second: 23, episode reward: -1.000, mean reward: -0.033 [-1.000, 0.000], mean action: 2.200 [0.000, 5.000], mean observation: 0.069 [0.000, 24.000], loss: 74033504.000000, mean_absolute_error: 20496.388672, mean_q: 24921.054688
   79945/5000000: episode: 1370, duration: 1.476s, episode steps: 29, steps

   81060/5000000: episode: 1394, duration: 2.673s, episode steps: 55, steps per second: 21, episode reward: -1.000, mean reward: -0.018 [-1.000, 0.000], mean action: 2.545 [0.000, 5.000], mean observation: 0.063 [0.000, 24.000], loss: 22002194.000000, mean_absolute_error: 19342.843750, mean_q: 23439.794922
   81085/5000000: episode: 1395, duration: 1.298s, episode steps: 25, steps per second: 19, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 2.320 [0.000, 5.000], mean observation: 0.072 [0.000, 24.000], loss: 25633846.000000, mean_absolute_error: 20094.410156, mean_q: 24357.939453
   81111/5000000: episode: 1396, duration: 1.248s, episode steps: 26, steps per second: 21, episode reward: -1.000, mean reward: -0.038 [-1.000, 0.000], mean action: 2.346 [0.000, 5.000], mean observation: 0.070 [0.000, 24.000], loss: 65258616.000000, mean_absolute_error: 20484.613281, mean_q: 24871.716797
   81143/5000000: episode: 1397, duration: 1.562s, episode steps: 32, steps 

   82610/5000000: episode: 1421, duration: 3.102s, episode steps: 60, steps per second: 19, episode reward: -1.000, mean reward: -0.017 [-1.000, 0.000], mean action: 2.583 [0.000, 5.000], mean observation: 0.071 [0.000, 24.000], loss: 239608768.000000, mean_absolute_error: 21919.082031, mean_q: 26556.464844
   82663/5000000: episode: 1422, duration: 2.637s, episode steps: 53, steps per second: 20, episode reward: -1.000, mean reward: -0.019 [-1.000, 0.000], mean action: 2.774 [0.000, 5.000], mean observation: 0.072 [0.000, 24.000], loss: 106391792.000000, mean_absolute_error: 24932.144531, mean_q: 30370.201172
   82774/5000000: episode: 1423, duration: 5.329s, episode steps: 111, steps per second: 21, episode reward: -1.000, mean reward: -0.009 [-1.000, 0.000], mean action: 2.730 [0.000, 5.000], mean observation: 0.070 [0.000, 24.000], loss: 134280896.000000, mean_absolute_error: 21981.650391, mean_q: 26656.152344
   82847/5000000: episode: 1424, duration: 3.478s, episode steps: 73, st

   84254/5000000: episode: 1448, duration: 4.063s, episode steps: 85, steps per second: 21, episode reward: -1.000, mean reward: -0.012 [-1.000, 0.000], mean action: 2.529 [0.000, 5.000], mean observation: 0.073 [0.000, 24.000], loss: 116431888.000000, mean_absolute_error: 24614.867188, mean_q: 29928.128906
   84279/5000000: episode: 1449, duration: 1.207s, episode steps: 25, steps per second: 21, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 2.000 [0.000, 5.000], mean observation: 0.072 [0.000, 24.000], loss: 184468688.000000, mean_absolute_error: 25272.839844, mean_q: 30718.835938
   84307/5000000: episode: 1450, duration: 1.345s, episode steps: 28, steps per second: 21, episode reward: -1.000, mean reward: -0.036 [-1.000, 0.000], mean action: 2.214 [0.000, 5.000], mean observation: 0.070 [0.000, 24.000], loss: 99299696.000000, mean_absolute_error: 23142.250000, mean_q: 28055.494141
   84441/5000000: episode: 1451, duration: 6.647s, episode steps: 134, ste

   85707/5000000: episode: 1475, duration: 2.737s, episode steps: 55, steps per second: 20, episode reward: -1.000, mean reward: -0.018 [-1.000, 0.000], mean action: 2.164 [0.000, 5.000], mean observation: 0.069 [0.000, 24.000], loss: 240941152.000000, mean_absolute_error: 27819.296875, mean_q: 33841.441406
   85733/5000000: episode: 1476, duration: 1.296s, episode steps: 26, steps per second: 20, episode reward: -1.000, mean reward: -0.038 [-1.000, 0.000], mean action: 2.231 [0.000, 5.000], mean observation: 0.071 [0.000, 24.000], loss: 152537920.000000, mean_absolute_error: 25702.251953, mean_q: 31255.160156
   85759/5000000: episode: 1477, duration: 1.275s, episode steps: 26, steps per second: 20, episode reward: -1.000, mean reward: -0.038 [-1.000, 0.000], mean action: 2.615 [0.000, 5.000], mean observation: 0.071 [0.000, 24.000], loss: 325322528.000000, mean_absolute_error: 26145.646484, mean_q: 31719.123047
   85817/5000000: episode: 1478, duration: 2.876s, episode steps: 58, ste

   86833/5000000: episode: 1502, duration: 1.425s, episode steps: 28, steps per second: 20, episode reward: -1.000, mean reward: -0.036 [-1.000, 0.000], mean action: 2.536 [0.000, 5.000], mean observation: 0.070 [0.000, 24.000], loss: 170441936.000000, mean_absolute_error: 26375.970703, mean_q: 31961.937500
   86861/5000000: episode: 1503, duration: 1.311s, episode steps: 28, steps per second: 21, episode reward: -1.000, mean reward: -0.036 [-1.000, 0.000], mean action: 2.964 [0.000, 5.000], mean observation: 0.070 [0.000, 24.000], loss: 384197728.000000, mean_absolute_error: 29396.359375, mean_q: 35722.332031
   86941/5000000: episode: 1504, duration: 3.869s, episode steps: 80, steps per second: 21, episode reward: -1.000, mean reward: -0.013 [-1.000, 0.000], mean action: 2.350 [0.000, 5.000], mean observation: 0.069 [0.000, 24.000], loss: 589706688.000000, mean_absolute_error: 27424.683594, mean_q: 33238.722656
   86969/5000000: episode: 1505, duration: 1.357s, episode steps: 28, ste

   88524/5000000: episode: 1529, duration: 2.886s, episode steps: 64, steps per second: 22, episode reward: -1.000, mean reward: -0.016 [-1.000, 0.000], mean action: 2.625 [0.000, 5.000], mean observation: 0.070 [0.000, 24.000], loss: 175745104.000000, mean_absolute_error: 29137.664062, mean_q: 35339.898438
   88557/5000000: episode: 1530, duration: 1.652s, episode steps: 33, steps per second: 20, episode reward: -1.000, mean reward: -0.030 [-1.000, 0.000], mean action: 2.303 [0.000, 5.000], mean observation: 0.072 [0.000, 24.000], loss: 269241280.000000, mean_absolute_error: 28865.453125, mean_q: 34960.578125
   88582/5000000: episode: 1531, duration: 1.254s, episode steps: 25, steps per second: 20, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 2.440 [0.000, 5.000], mean observation: 0.072 [0.000, 24.000], loss: 242753632.000000, mean_absolute_error: 28960.800781, mean_q: 35132.496094
   88608/5000000: episode: 1532, duration: 1.308s, episode steps: 26, ste

   89798/5000000: episode: 1556, duration: 3.551s, episode steps: 78, steps per second: 22, episode reward: -1.000, mean reward: -0.013 [-1.000, 0.000], mean action: 2.846 [0.000, 5.000], mean observation: 0.072 [0.000, 24.000], loss: 515730080.000000, mean_absolute_error: 28204.888672, mean_q: 34085.835938
   89831/5000000: episode: 1557, duration: 1.526s, episode steps: 33, steps per second: 22, episode reward: -1.000, mean reward: -0.030 [-1.000, 0.000], mean action: 2.667 [0.000, 5.000], mean observation: 0.069 [0.000, 24.000], loss: 276592864.000000, mean_absolute_error: 27504.937500, mean_q: 33274.968750
   89858/5000000: episode: 1558, duration: 1.302s, episode steps: 27, steps per second: 21, episode reward: -1.000, mean reward: -0.037 [-1.000, 0.000], mean action: 2.444 [0.000, 5.000], mean observation: 0.070 [0.000, 24.000], loss: 503873408.000000, mean_absolute_error: 28487.972656, mean_q: 34507.257812
   89887/5000000: episode: 1559, duration: 1.441s, episode steps: 29, ste

   91176/5000000: episode: 1583, duration: 1.571s, episode steps: 30, steps per second: 19, episode reward: -1.000, mean reward: -0.033 [-1.000, 0.000], mean action: 2.200 [0.000, 5.000], mean observation: 0.069 [0.000, 24.000], loss: 320247264.000000, mean_absolute_error: 32503.089844, mean_q: 39332.574219
   91219/5000000: episode: 1584, duration: 2.123s, episode steps: 43, steps per second: 20, episode reward: -1.000, mean reward: -0.023 [-1.000, 0.000], mean action: 2.256 [0.000, 5.000], mean observation: 0.071 [0.000, 24.000], loss: 168354112.000000, mean_absolute_error: 31952.837891, mean_q: 38790.632812
   91271/5000000: episode: 1585, duration: 2.403s, episode steps: 52, steps per second: 22, episode reward: -1.000, mean reward: -0.019 [-1.000, 0.000], mean action: 2.942 [0.000, 5.000], mean observation: 0.068 [0.000, 24.000], loss: 340802944.000000, mean_absolute_error: 29646.888672, mean_q: 35905.542969
   91300/5000000: episode: 1586, duration: 1.466s, episode steps: 29, ste

   92838/5000000: episode: 1610, duration: 4.037s, episode steps: 84, steps per second: 21, episode reward: -1.000, mean reward: -0.012 [-1.000, 0.000], mean action: 2.524 [0.000, 5.000], mean observation: 0.071 [0.000, 24.000], loss: 338110976.000000, mean_absolute_error: 32974.890625, mean_q: 39929.859375
   92873/5000000: episode: 1611, duration: 1.742s, episode steps: 35, steps per second: 20, episode reward: -1.000, mean reward: -0.029 [-1.000, 0.000], mean action: 2.571 [0.000, 5.000], mean observation: 0.069 [0.000, 24.000], loss: 453296800.000000, mean_absolute_error: 31905.931641, mean_q: 38462.546875
   92917/5000000: episode: 1612, duration: 2.065s, episode steps: 44, steps per second: 21, episode reward: -1.000, mean reward: -0.023 [-1.000, 0.000], mean action: 1.909 [0.000, 5.000], mean observation: 0.069 [0.000, 24.000], loss: 157684560.000000, mean_absolute_error: 31888.257812, mean_q: 38650.152344
   92942/5000000: episode: 1613, duration: 1.206s, episode steps: 25, ste

   94348/5000000: episode: 1637, duration: 1.220s, episode steps: 25, steps per second: 20, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 2.440 [0.000, 5.000], mean observation: 0.071 [0.000, 24.000], loss: 1095552896.000000, mean_absolute_error: 32477.748047, mean_q: 39248.183594
   94382/5000000: episode: 1638, duration: 1.638s, episode steps: 34, steps per second: 21, episode reward: -1.000, mean reward: -0.029 [-1.000, 0.000], mean action: 2.500 [0.000, 5.000], mean observation: 0.064 [0.000, 24.000], loss: 806925440.000000, mean_absolute_error: 33263.773438, mean_q: 40202.792969
   94439/5000000: episode: 1639, duration: 2.643s, episode steps: 57, steps per second: 22, episode reward: -1.000, mean reward: -0.018 [-1.000, 0.000], mean action: 2.772 [0.000, 5.000], mean observation: 0.071 [0.000, 24.000], loss: 419697920.000000, mean_absolute_error: 31953.486328, mean_q: 38639.578125
   94464/5000000: episode: 1640, duration: 1.189s, episode steps: 25, st

   95723/5000000: episode: 1664, duration: 4.768s, episode steps: 97, steps per second: 20, episode reward: -1.000, mean reward: -0.010 [-1.000, 0.000], mean action: 2.485 [0.000, 5.000], mean observation: 0.066 [0.000, 24.000], loss: 952296704.000000, mean_absolute_error: 36553.472656, mean_q: 44234.050781
   95813/5000000: episode: 1665, duration: 4.398s, episode steps: 90, steps per second: 20, episode reward: -1.000, mean reward: -0.011 [-1.000, 0.000], mean action: 2.378 [0.000, 5.000], mean observation: 0.066 [0.000, 24.000], loss: 607291072.000000, mean_absolute_error: 36825.910156, mean_q: 44300.398438
   95929/5000000: episode: 1666, duration: 5.769s, episode steps: 116, steps per second: 20, episode reward: -1.000, mean reward: -0.009 [-1.000, 0.000], mean action: 2.578 [0.000, 5.000], mean observation: 0.069 [0.000, 24.000], loss: 703453696.000000, mean_absolute_error: 34899.476562, mean_q: 42237.167969
   96035/5000000: episode: 1667, duration: 5.322s, episode steps: 106, s

   97353/5000000: episode: 1691, duration: 5.344s, episode steps: 108, steps per second: 20, episode reward: -1.000, mean reward: -0.009 [-1.000, 0.000], mean action: 2.398 [0.000, 5.000], mean observation: 0.065 [0.000, 24.000], loss: 569980800.000000, mean_absolute_error: 39505.250000, mean_q: 47768.292969
   97537/5000000: episode: 1692, duration: 8.871s, episode steps: 184, steps per second: 21, episode reward: -1.000, mean reward: -0.005 [-1.000, 0.000], mean action: 2.380 [0.000, 5.000], mean observation: 0.069 [0.000, 24.000], loss: 776005184.000000, mean_absolute_error: 36615.265625, mean_q: 44241.648438
   97674/5000000: episode: 1693, duration: 6.692s, episode steps: 137, steps per second: 20, episode reward: -1.000, mean reward: -0.007 [-1.000, 0.000], mean action: 2.372 [0.000, 5.000], mean observation: 0.070 [0.000, 24.000], loss: 540547520.000000, mean_absolute_error: 36658.234375, mean_q: 44353.671875
   97770/5000000: episode: 1694, duration: 4.802s, episode steps: 96, 

   98751/5000000: episode: 1718, duration: 2.782s, episode steps: 57, steps per second: 20, episode reward: -1.000, mean reward: -0.018 [-1.000, 0.000], mean action: 2.789 [0.000, 5.000], mean observation: 0.070 [0.000, 24.000], loss: 984883840.000000, mean_absolute_error: 37729.558594, mean_q: 45652.968750
   98825/5000000: episode: 1719, duration: 3.513s, episode steps: 74, steps per second: 21, episode reward: -1.000, mean reward: -0.014 [-1.000, 0.000], mean action: 2.297 [0.000, 5.000], mean observation: 0.068 [0.000, 24.000], loss: 1025396544.000000, mean_absolute_error: 40253.742188, mean_q: 48745.761719
   98853/5000000: episode: 1720, duration: 1.424s, episode steps: 28, steps per second: 20, episode reward: -1.000, mean reward: -0.036 [-1.000, 0.000], mean action: 2.536 [0.000, 5.000], mean observation: 0.070 [0.000, 24.000], loss: 343118784.000000, mean_absolute_error: 38023.804688, mean_q: 46022.226562
   99038/5000000: episode: 1721, duration: 16.460s, episode steps: 185, 

  100289/5000000: episode: 1745, duration: 8.640s, episode steps: 178, steps per second: 21, episode reward: -1.000, mean reward: -0.006 [-1.000, 0.000], mean action: 2.522 [0.000, 5.000], mean observation: 0.068 [0.000, 24.000], loss: 628511616.000000, mean_absolute_error: 39694.976562, mean_q: 47916.718750
  100314/5000000: episode: 1746, duration: 1.349s, episode steps: 25, steps per second: 19, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 2.720 [0.000, 5.000], mean observation: 0.072 [0.000, 24.000], loss: 389578304.000000, mean_absolute_error: 44283.488281, mean_q: 53579.019531
  100348/5000000: episode: 1747, duration: 1.684s, episode steps: 34, steps per second: 20, episode reward: -1.000, mean reward: -0.029 [-1.000, 0.000], mean action: 2.147 [0.000, 5.000], mean observation: 0.070 [0.000, 24.000], loss: 1628201472.000000, mean_absolute_error: 45147.042969, mean_q: 54349.003906
  100481/5000000: episode: 1748, duration: 6.437s, episode steps: 133, 

  101938/5000000: episode: 1772, duration: 1.313s, episode steps: 28, steps per second: 21, episode reward: -1.000, mean reward: -0.036 [-1.000, 0.000], mean action: 2.750 [0.000, 5.000], mean observation: 0.069 [0.000, 24.000], loss: 847160512.000000, mean_absolute_error: 49514.914062, mean_q: 59589.777344
  102023/5000000: episode: 1773, duration: 3.995s, episode steps: 85, steps per second: 21, episode reward: -1.000, mean reward: -0.012 [-1.000, 0.000], mean action: 2.635 [0.000, 5.000], mean observation: 0.067 [0.000, 24.000], loss: 1567071744.000000, mean_absolute_error: 44480.535156, mean_q: 53756.347656
  102167/5000000: episode: 1774, duration: 6.744s, episode steps: 144, steps per second: 21, episode reward: -1.000, mean reward: -0.007 [-1.000, 0.000], mean action: 2.590 [0.000, 5.000], mean observation: 0.073 [0.000, 24.000], loss: 843705088.000000, mean_absolute_error: 44343.492188, mean_q: 53578.765625
  102204/5000000: episode: 1775, duration: 1.755s, episode steps: 37, s

  103647/5000000: episode: 1799, duration: 3.847s, episode steps: 80, steps per second: 21, episode reward: -1.000, mean reward: -0.013 [-1.000, 0.000], mean action: 3.237 [0.000, 5.000], mean observation: 0.069 [0.000, 24.000], loss: 2063023744.000000, mean_absolute_error: 48526.054688, mean_q: 58566.210938
  103673/5000000: episode: 1800, duration: 1.190s, episode steps: 26, steps per second: 22, episode reward: -1.000, mean reward: -0.038 [-1.000, 0.000], mean action: 2.654 [0.000, 5.000], mean observation: 0.071 [0.000, 24.000], loss: 525544096.000000, mean_absolute_error: 43680.414062, mean_q: 52762.742188
  103736/5000000: episode: 1801, duration: 2.901s, episode steps: 63, steps per second: 22, episode reward: -1.000, mean reward: -0.016 [-1.000, 0.000], mean action: 2.238 [0.000, 5.000], mean observation: 0.068 [0.000, 24.000], loss: 1540161792.000000, mean_absolute_error: 44140.878906, mean_q: 53147.058594
  103771/5000000: episode: 1802, duration: 1.696s, episode steps: 35, s

  105457/5000000: episode: 1826, duration: 1.360s, episode steps: 28, steps per second: 21, episode reward: -1.000, mean reward: -0.036 [-1.000, 0.000], mean action: 2.643 [0.000, 5.000], mean observation: 0.070 [0.000, 24.000], loss: 2007639680.000000, mean_absolute_error: 50443.238281, mean_q: 60814.863281
  105569/5000000: episode: 1827, duration: 5.678s, episode steps: 112, steps per second: 20, episode reward: -1.000, mean reward: -0.009 [-1.000, 0.000], mean action: 2.277 [0.000, 5.000], mean observation: 0.065 [0.000, 24.000], loss: 1309631616.000000, mean_absolute_error: 48549.550781, mean_q: 58631.367188
  105628/5000000: episode: 1828, duration: 2.820s, episode steps: 59, steps per second: 21, episode reward: -1.000, mean reward: -0.017 [-1.000, 0.000], mean action: 2.780 [0.000, 5.000], mean observation: 0.069 [0.000, 24.000], loss: 674077952.000000, mean_absolute_error: 50768.683594, mean_q: 61219.878906
  105683/5000000: episode: 1829, duration: 2.595s, episode steps: 55, 

  107184/5000000: episode: 1853, duration: 7.006s, episode steps: 141, steps per second: 20, episode reward: -1.000, mean reward: -0.007 [-1.000, 0.000], mean action: 2.553 [0.000, 5.000], mean observation: 0.064 [0.000, 24.000], loss: 2039426816.000000, mean_absolute_error: 53927.296875, mean_q: 64905.566406
  107215/5000000: episode: 1854, duration: 1.610s, episode steps: 31, steps per second: 19, episode reward: -1.000, mean reward: -0.032 [-1.000, 0.000], mean action: 3.194 [0.000, 5.000], mean observation: 0.068 [0.000, 24.000], loss: 613900096.000000, mean_absolute_error: 51205.953125, mean_q: 61929.500000
  107281/5000000: episode: 1855, duration: 3.248s, episode steps: 66, steps per second: 20, episode reward: -1.000, mean reward: -0.015 [-1.000, 0.000], mean action: 2.242 [0.000, 5.000], mean observation: 0.065 [0.000, 24.000], loss: 2002827264.000000, mean_absolute_error: 53390.410156, mean_q: 64525.507812
  107357/5000000: episode: 1856, duration: 3.692s, episode steps: 76, 

  108744/5000000: episode: 1880, duration: 1.451s, episode steps: 30, steps per second: 21, episode reward: -1.000, mean reward: -0.033 [-1.000, 0.000], mean action: 2.500 [0.000, 5.000], mean observation: 0.069 [0.000, 24.000], loss: 981051264.000000, mean_absolute_error: 51776.296875, mean_q: 62461.570312
  108796/5000000: episode: 1881, duration: 2.514s, episode steps: 52, steps per second: 21, episode reward: -1.000, mean reward: -0.019 [-1.000, 0.000], mean action: 2.827 [0.000, 5.000], mean observation: 0.070 [0.000, 24.000], loss: 675325696.000000, mean_absolute_error: 55240.585938, mean_q: 66828.679688
  108821/5000000: episode: 1882, duration: 1.132s, episode steps: 25, steps per second: 22, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 2.440 [0.000, 5.000], mean observation: 0.072 [0.000, 24.000], loss: 3530383104.000000, mean_absolute_error: 61918.671875, mean_q: 74269.218750
  108874/5000000: episode: 1883, duration: 2.627s, episode steps: 53, st

  110280/5000000: episode: 1907, duration: 11.944s, episode steps: 254, steps per second: 21, episode reward: -1.000, mean reward: -0.004 [-1.000, 0.000], mean action: 2.280 [0.000, 5.000], mean observation: 0.062 [0.000, 24.000], loss: 2279296256.000000, mean_absolute_error: 55622.222656, mean_q: 67158.898438
  110453/5000000: episode: 1908, duration: 7.516s, episode steps: 173, steps per second: 23, episode reward: -1.000, mean reward: -0.006 [-1.000, 0.000], mean action: 2.572 [0.000, 5.000], mean observation: 0.060 [0.000, 24.000], loss: 1909885312.000000, mean_absolute_error: 55275.273438, mean_q: 66783.664062
  110545/5000000: episode: 1909, duration: 4.308s, episode steps: 92, steps per second: 21, episode reward: -1.000, mean reward: -0.011 [-1.000, 0.000], mean action: 2.641 [0.000, 5.000], mean observation: 0.065 [0.000, 24.000], loss: 468031456.000000, mean_absolute_error: 54083.175781, mean_q: 65418.433594
  110571/5000000: episode: 1910, duration: 1.369s, episode steps: 26

  112351/5000000: episode: 1934, duration: 6.606s, episode steps: 142, steps per second: 21, episode reward: -1.000, mean reward: -0.007 [-1.000, 0.000], mean action: 2.373 [0.000, 5.000], mean observation: 0.067 [0.000, 24.000], loss: 1581553792.000000, mean_absolute_error: 60481.437500, mean_q: 73123.593750
  112502/5000000: episode: 1935, duration: 7.464s, episode steps: 151, steps per second: 20, episode reward: -1.000, mean reward: -0.007 [-1.000, 0.000], mean action: 2.450 [0.000, 5.000], mean observation: 0.065 [0.000, 24.000], loss: 2025570432.000000, mean_absolute_error: 59710.531250, mean_q: 72108.281250
  112623/5000000: episode: 1936, duration: 5.767s, episode steps: 121, steps per second: 21, episode reward: -1.000, mean reward: -0.008 [-1.000, 0.000], mean action: 2.388 [0.000, 5.000], mean observation: 0.065 [0.000, 24.000], loss: 2068395136.000000, mean_absolute_error: 60799.667969, mean_q: 73443.835938
  112651/5000000: episode: 1937, duration: 1.308s, episode steps: 2

  114153/5000000: episode: 1961, duration: 4.150s, episode steps: 82, steps per second: 20, episode reward: -1.000, mean reward: -0.012 [-1.000, 0.000], mean action: 2.585 [0.000, 5.000], mean observation: 0.073 [0.000, 24.000], loss: 973254848.000000, mean_absolute_error: 60121.089844, mean_q: 72646.929688
  114189/5000000: episode: 1962, duration: 1.653s, episode steps: 36, steps per second: 22, episode reward: -1.000, mean reward: -0.028 [-1.000, 0.000], mean action: 2.056 [0.000, 5.000], mean observation: 0.071 [0.000, 24.000], loss: 483753248.000000, mean_absolute_error: 57882.386719, mean_q: 69895.843750
  114289/5000000: episode: 1963, duration: 4.780s, episode steps: 100, steps per second: 21, episode reward: -1.000, mean reward: -0.010 [-1.000, 0.000], mean action: 2.500 [0.000, 5.000], mean observation: 0.069 [0.000, 24.000], loss: 2118797440.000000, mean_absolute_error: 64079.863281, mean_q: 77463.796875
  114323/5000000: episode: 1964, duration: 1.576s, episode steps: 34, s

  115798/5000000: episode: 1988, duration: 6.383s, episode steps: 134, steps per second: 21, episode reward: -1.000, mean reward: -0.007 [-1.000, 0.000], mean action: 2.604 [0.000, 5.000], mean observation: 0.067 [0.000, 24.000], loss: 2776354048.000000, mean_absolute_error: 60859.140625, mean_q: 73484.593750
  115860/5000000: episode: 1989, duration: 2.975s, episode steps: 62, steps per second: 21, episode reward: -1.000, mean reward: -0.016 [-1.000, 0.000], mean action: 2.435 [0.000, 5.000], mean observation: 0.069 [0.000, 24.000], loss: 2480535808.000000, mean_absolute_error: 64942.375000, mean_q: 78559.164062
  115915/5000000: episode: 1990, duration: 2.765s, episode steps: 55, steps per second: 20, episode reward: -1.000, mean reward: -0.018 [-1.000, 0.000], mean action: 2.109 [0.000, 5.000], mean observation: 0.076 [0.000, 24.000], loss: 1452426368.000000, mean_absolute_error: 62925.769531, mean_q: 76205.312500
  115966/5000000: episode: 1991, duration: 2.648s, episode steps: 51,

  117602/5000000: episode: 2015, duration: 1.179s, episode steps: 25, steps per second: 21, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 3.120 [0.000, 5.000], mean observation: 0.072 [0.000, 24.000], loss: 2563503360.000000, mean_absolute_error: 63035.589844, mean_q: 75692.148438
  117627/5000000: episode: 2016, duration: 1.256s, episode steps: 25, steps per second: 20, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 3.160 [0.000, 5.000], mean observation: 0.072 [0.000, 24.000], loss: 2025995520.000000, mean_absolute_error: 69690.601562, mean_q: 84367.890625
  117663/5000000: episode: 2017, duration: 1.709s, episode steps: 36, steps per second: 21, episode reward: -1.000, mean reward: -0.028 [-1.000, 0.000], mean action: 2.389 [0.000, 5.000], mean observation: 0.070 [0.000, 24.000], loss: 2355513088.000000, mean_absolute_error: 69763.929688, mean_q: 83998.523438
  117691/5000000: episode: 2018, duration: 1.380s, episode steps: 28, 

  119135/5000000: episode: 2042, duration: 1.289s, episode steps: 26, steps per second: 20, episode reward: -1.000, mean reward: -0.038 [-1.000, 0.000], mean action: 2.462 [0.000, 5.000], mean observation: 0.070 [0.000, 24.000], loss: 947488064.000000, mean_absolute_error: 65535.480469, mean_q: 79417.437500
  119165/5000000: episode: 2043, duration: 1.513s, episode steps: 30, steps per second: 20, episode reward: -1.000, mean reward: -0.033 [-1.000, 0.000], mean action: 2.400 [0.000, 5.000], mean observation: 0.065 [0.000, 24.000], loss: 984055488.000000, mean_absolute_error: 73587.343750, mean_q: 89350.789062
  119237/5000000: episode: 2044, duration: 3.130s, episode steps: 72, steps per second: 23, episode reward: -1.000, mean reward: -0.014 [-1.000, 0.000], mean action: 2.542 [0.000, 5.000], mean observation: 0.066 [0.000, 24.000], loss: 945761600.000000, mean_absolute_error: 66867.476562, mean_q: 80934.960938
  119267/5000000: episode: 2045, duration: 1.388s, episode steps: 30, ste

  120410/5000000: episode: 2069, duration: 1.309s, episode steps: 26, steps per second: 20, episode reward: -1.000, mean reward: -0.038 [-1.000, 0.000], mean action: 1.692 [0.000, 5.000], mean observation: 0.072 [0.000, 24.000], loss: 780189312.000000, mean_absolute_error: 64786.398438, mean_q: 78260.515625
  120436/5000000: episode: 2070, duration: 1.303s, episode steps: 26, steps per second: 20, episode reward: -1.000, mean reward: -0.038 [-1.000, 0.000], mean action: 2.885 [0.000, 5.000], mean observation: 0.071 [0.000, 24.000], loss: 1224515584.000000, mean_absolute_error: 73126.203125, mean_q: 88431.460938
  120462/5000000: episode: 2071, duration: 1.240s, episode steps: 26, steps per second: 21, episode reward: -1.000, mean reward: -0.038 [-1.000, 0.000], mean action: 2.538 [0.000, 5.000], mean observation: 0.070 [0.000, 24.000], loss: 1473502976.000000, mean_absolute_error: 65384.972656, mean_q: 78934.093750
  120495/5000000: episode: 2072, duration: 1.652s, episode steps: 33, s

  121777/5000000: episode: 2096, duration: 5.055s, episode steps: 105, steps per second: 21, episode reward: -1.000, mean reward: -0.010 [-1.000, 0.000], mean action: 2.505 [0.000, 5.000], mean observation: 0.070 [0.000, 24.000], loss: 1843809536.000000, mean_absolute_error: 71978.031250, mean_q: 86988.257812
  121803/5000000: episode: 2097, duration: 1.292s, episode steps: 26, steps per second: 20, episode reward: -1.000, mean reward: -0.038 [-1.000, 0.000], mean action: 2.808 [0.000, 5.000], mean observation: 0.071 [0.000, 24.000], loss: 1016218688.000000, mean_absolute_error: 67887.609375, mean_q: 82032.320312
  121828/5000000: episode: 2098, duration: 1.261s, episode steps: 25, steps per second: 20, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 2.200 [0.000, 5.000], mean observation: 0.072 [0.000, 24.000], loss: 1277765120.000000, mean_absolute_error: 71812.492188, mean_q: 86966.179688
  121899/5000000: episode: 2099, duration: 3.515s, episode steps: 71,

  123876/5000000: episode: 2123, duration: 4.736s, episode steps: 91, steps per second: 19, episode reward: -1.000, mean reward: -0.011 [-1.000, 0.000], mean action: 2.385 [0.000, 5.000], mean observation: 0.073 [0.000, 24.000], loss: 1930273920.000000, mean_absolute_error: 76606.132812, mean_q: 92624.132812
  123974/5000000: episode: 2124, duration: 4.625s, episode steps: 98, steps per second: 21, episode reward: -1.000, mean reward: -0.010 [-1.000, 0.000], mean action: 2.316 [0.000, 5.000], mean observation: 0.061 [0.000, 24.000], loss: 1138994816.000000, mean_absolute_error: 72600.914062, mean_q: 87601.726562
  124027/5000000: episode: 2125, duration: 2.578s, episode steps: 53, steps per second: 21, episode reward: -1.000, mean reward: -0.019 [-1.000, 0.000], mean action: 2.245 [0.000, 5.000], mean observation: 0.064 [0.000, 24.000], loss: 1149771520.000000, mean_absolute_error: 73419.093750, mean_q: 88814.757812
  124090/5000000: episode: 2126, duration: 2.989s, episode steps: 63, 

  125756/5000000: episode: 2150, duration: 1.377s, episode steps: 25, steps per second: 18, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 2.040 [0.000, 5.000], mean observation: 0.072 [0.000, 24.000], loss: 3182950400.000000, mean_absolute_error: 80817.328125, mean_q: 97944.539062
  125811/5000000: episode: 2151, duration: 2.685s, episode steps: 55, steps per second: 20, episode reward: -1.000, mean reward: -0.018 [-1.000, 0.000], mean action: 2.382 [0.000, 5.000], mean observation: 0.064 [0.000, 24.000], loss: 1945446272.000000, mean_absolute_error: 74712.265625, mean_q: 90274.406250
  125864/5000000: episode: 2152, duration: 2.559s, episode steps: 53, steps per second: 21, episode reward: -1.000, mean reward: -0.019 [-1.000, 0.000], mean action: 2.472 [0.000, 5.000], mean observation: 0.070 [0.000, 24.000], loss: 2133671936.000000, mean_absolute_error: 75564.039062, mean_q: 91307.148438
  125980/5000000: episode: 2153, duration: 5.538s, episode steps: 116,

  127563/5000000: episode: 2177, duration: 1.377s, episode steps: 26, steps per second: 19, episode reward: -1.000, mean reward: -0.038 [-1.000, 0.000], mean action: 2.731 [0.000, 5.000], mean observation: 0.071 [0.000, 24.000], loss: 4002885376.000000, mean_absolute_error: 77310.265625, mean_q: 93186.906250
  127595/5000000: episode: 2178, duration: 1.575s, episode steps: 32, steps per second: 20, episode reward: -1.000, mean reward: -0.031 [-1.000, 0.000], mean action: 2.281 [0.000, 5.000], mean observation: 0.070 [0.000, 24.000], loss: 3474752256.000000, mean_absolute_error: 82016.039062, mean_q: 99118.757812
  127620/5000000: episode: 2179, duration: 1.167s, episode steps: 25, steps per second: 21, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 2.800 [0.000, 5.000], mean observation: 0.065 [0.000, 24.000], loss: 784787136.000000, mean_absolute_error: 78134.882812, mean_q: 94534.000000
  127655/5000000: episode: 2180, duration: 1.728s, episode steps: 35, s

  129486/5000000: episode: 2204, duration: 2.064s, episode steps: 43, steps per second: 21, episode reward: -1.000, mean reward: -0.023 [-1.000, 0.000], mean action: 1.884 [0.000, 5.000], mean observation: 0.070 [0.000, 24.000], loss: 3559604224.000000, mean_absolute_error: 86296.078125, mean_q: 103856.468750
  129512/5000000: episode: 2205, duration: 1.221s, episode steps: 26, steps per second: 21, episode reward: -1.000, mean reward: -0.038 [-1.000, 0.000], mean action: 2.615 [0.000, 5.000], mean observation: 0.071 [0.000, 24.000], loss: 356098336.000000, mean_absolute_error: 80555.125000, mean_q: 97303.953125
  129553/5000000: episode: 2206, duration: 2.008s, episode steps: 41, steps per second: 20, episode reward: -1.000, mean reward: -0.024 [-1.000, 0.000], mean action: 2.488 [0.000, 5.000], mean observation: 0.069 [0.000, 24.000], loss: 1658797440.000000, mean_absolute_error: 74536.593750, mean_q: 89984.164062
  129586/5000000: episode: 2207, duration: 1.561s, episode steps: 33, 

  130803/5000000: episode: 2231, duration: 1.326s, episode steps: 25, steps per second: 19, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 2.400 [0.000, 5.000], mean observation: 0.072 [0.000, 24.000], loss: 1035378624.000000, mean_absolute_error: 83653.914062, mean_q: 101197.476562
  130828/5000000: episode: 2232, duration: 1.230s, episode steps: 25, steps per second: 20, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 2.680 [0.000, 5.000], mean observation: 0.072 [0.000, 24.000], loss: 672488960.000000, mean_absolute_error: 88950.523438, mean_q: 107831.257812
  130853/5000000: episode: 2233, duration: 1.316s, episode steps: 25, steps per second: 19, episode reward: -1.000, mean reward: -0.040 [-1.000, 0.000], mean action: 2.400 [0.000, 5.000], mean observation: 0.072 [0.000, 24.000], loss: 3348037888.000000, mean_absolute_error: 81222.109375, mean_q: 98250.359375
  130972/5000000: episode: 2234, duration: 5.737s, episode steps: 119