In [1]:
#Pygame imports
import pygame
import pygame_menu
import pygame.freetype

#Standard imports
import sys
import random
import math
import numpy as np
import os
import os.path

#OpenAI gym
from gym import Env
from gym.spaces import Discrete, Box

#Keras RL
from rl.agents import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory
from tensorflow.keras.backend import clear_session
from tensorflow.keras.layers import LeakyReLU
from rl.policy import LinearAnnealedPolicy, EpsGreedyQPolicy
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Conv2D, MaxPooling2D
from tensorflow.keras.optimizers import Adam

#Import local modules
#The notebook's working directory must be importable so that the ipynb
#loader can resolve the Games package. The membership test keeps re-runs
#of this cell from appending the same entry to sys.path again and again.
if os.getcwd() not in sys.path:
    sys.path.append(os.getcwd())
from ipynb.fs.full.Games.SpaceInvaders import Space_Invaders
from ipynb.fs.full.Games.Asteroids import Asteroids
pygame.init()

#Global window length: number of consecutive frames stacked into a single
#observation. Used both as the first dimension of the network input and as
#the window_length of the replay memory.
WINDOW_LENGTH = 3

pygame 2.0.1 (SDL 2.0.14, Python 3.6.10)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [3]:
class Custom_OpenAI_Env(Env):
    """Gym environment adapter around one of the pygame games.

    The wrapped ``game`` object must provide ``get_state()``,
    ``execute_action(action)``, ``update()``, ``calculate_reward()``,
    ``render()``, ``reset()`` and a ``done`` attribute; those are the only
    members this class touches.
    """

    def __init__(self, screen_width, screen_height, game, 
                 action_space = None):
        """Wrap ``game`` and declare the action/observation spaces.

        Args:
            screen_width: Width used for the observation space shape.
            screen_height: Height used for the observation space shape.
            game: Game instance to drive (see class docstring).
            action_space: Gym space of valid actions. Defaults to a fresh
                ``Discrete(5)`` per environment, so that environments do
                not share one space object created at definition time.
        """
        # Assign action and observation space
        if action_space is None:
            action_space = Discrete(5)
        self.action_space = action_space
        self.observation_space = Box(0, 255, shape=(1, screen_width, screen_height, ))

        #Initialise the game
        self.game = game

        #Get the initial game state and record it 
        self.state = self.game.get_state()
        self.start_state = self.state
    
        print("initialisation complete")
        
        # NOTE(review): not read anywhere in this class.
        self.delay = 1000
        
    def step(self, action):
        """Apply ``action``, advance the game one update and report the result.

        Returns:
            Tuple ``(state, reward, done, info)`` where ``info`` is always
            an empty dict.
        """
        # Apply action
        self.game.execute_action(action)
        
        #Call the update loop before getting the state
        self.game.update()
        self.state = self.game.get_state()
        
        #Calculate step-based reward
        reward = self.game.calculate_reward()
        done = self.game.done
        
        # Set placeholder for info (required for AI env superclass step method)
        info = {}
        # Return step information
        return self.state, reward, done, info
    
    def render(self, mode='human'):
        """Draw the current frame by delegating to the game.

        ``mode`` is accepted for API compatibility and ignored; it now has
        a default so that a bare ``env.render()`` call also works.
        """
        self.game.render()
        
    def reset(self):
        """Restart the game and return its first observation.

        ``self.state`` is refreshed from the restarted game so that it
        always matches the observation handed back to the caller, instead
        of pointing at the stale state captured at construction time.
        """
        #Restart the game
        self.game.reset()
        self.state = self.game.get_state()
        return self.state
    
    

In [4]:
#Game menu class controlling the functionality of the entire framework
class Game_Menu:
    """Menu-driven front end for playing, training and testing the games.

    The class owns the pygame window, builds the pygame-menu screens, keeps
    the currently selected option *indices* (into the value lists defined in
    ``__init__``) and wires those selections into the game constructors and
    the keras-rl DQN agent.
    """

    def __init__(self, width, height):
        """Set default options, open the window and show the main menu.

        Args:
            width: Requested window width.
            height: Requested window height.

        NOTE(review): ``width``/``height`` are not read below; the window
        and every menu are created at a fixed 800x600. They are kept so
        existing callers continue to work.

        The constructor ends by calling ``main_menu()``, which enters the
        menu's main loop.
        """
        self.menu = None
        self.surface = None
        self.game = None
        self.icon_surface = None
        #Last game mode passed to set_standard_parameters (0 asteroids, 1 space invaders)
        self.gamemode = 0
        
        #Set standard network parameters
        self.learning_rates = [0.0001, 0.000001, 0.0000001]
        self.episodes = [1, 10, 100]
        self.steps = [3000, 50000, 100000]

        #Standard parameter indices
        self.learning_rate = 0
        self.episode = 0
        self.step = 0
        self.visualize = False

        #Asteroid specific parameters
        #Each scale entry is [width, height, scale factor passed to the game]
        self.scales = [[800,600, 1.0], [200,200, 0.65], [150,150, 0.45]]
        self.intensity_rates = [0, 1, 3]
        self.player_speeds = [1, 2, 3]

        #Scale, player speed and colour are common to both games
        self.scale = 0
        self.intensity = 0
        self.player_speed = 0
        self.homogenous_controls = False
        self.colour = False
        
        #Space invaders specific parameters
        self.enemy_speeds = [0.5, 1.0, 2.0]
        self.enemy_speed = 0
        
        #Transfer mode: -1 none, 0 space invaders -> asteroids, 1 asteroids -> space invaders
        self.transfer = -1
        #Requested test environment: -1 none, 0 asteroids, 1 space invaders
        self.test_env = -1
        
        #Initialise pygame
        pygame.display.init()
        self.surface = pygame.display.set_mode((800, 600))
        self.icon_surface = pygame.image.load(os.path.join(os.getcwd(), "Dependencies/Resources", "Masterslogo.png"))
        pygame.display.set_icon(self.icon_surface)
        pygame.display.set_caption("Master's Project")
        
        #Start the main menu
        self.main_menu()

    @staticmethod
    def _selected_index(arg, current, count):
        """Return the option index a selector callback reports.

        pygame-menu passes the selector value as a ``(item, index)`` pair in
        the first callback argument, so ``arg[1]`` is the position the user
        actually landed on (whether they moved left or right). If ``arg``
        does not have that shape, fall back to advancing ``current`` by one
        with wrap-around, which matches a single "right" key press.
        """
        try:
            index = int(arg[1])
        except (TypeError, IndexError, KeyError, ValueError):
            return (current + 1) % count
        return index % count

    def _open_menu(self, title):
        """Disable the active menu (if any) and create a new 800x600 dark menu."""
        if self.menu:
            self.menu.disable()
        self.menu = pygame_menu.Menu(600, 800, title,
                                     theme=pygame_menu.themes.THEME_DARK)
        return self.menu

    def _scale_items(self):
        """Selector entries for the scale option, labelled from ``self.scales``."""
        return [('({}, {})'.format(entry[0], entry[1]), 0) for entry in self.scales]
        
    def reset_parameters(self):
        """Reset all adjustable variables when returning to the main menu.

        This includes the transfer mode and the pending test environment;
        otherwise a finished transfer or test run would leak into the next,
        unrelated training run.
        """
        self.enemy_speed = 0
        self.scale = 0
        self.intensity = 0
        self.player_speed = 0
        self.homogenous_controls = False
        self.colour = False
        self.learning_rate = 0
        self.episode = 0
        self.step = 0
        self.visualize = False
        self.transfer = -1
        self.test_env = -1
        
    def main_menu(self):
        """Reset every option, restore the 800x600 window and show the main menu."""
        self.reset_parameters()
        self.surface = pygame.display.set_mode((800, 600))
        #Disable any menu if it exists, then initialise the main menu interface
        self._open_menu('Main Menu')
        self.menu.add_button('Space Invaders', self.start_space_invaders)
        self.menu.add_button('Space Invaders - Training', self.set_game_mode)
        self.menu.add_button('Asteroids', self.start_asteroids)
        self.menu.add_button('Asteroids - Training', self.set_standard_parameters)
        self.menu.add_button('Transfer Learning', self.transfer_menu)
        self.menu.add_button('Test - Space Invaders', self.set_test_space_invaders)
        self.menu.add_button('Test - Asteroids', self.set_test_asteroids)
        self.menu.add_button('Quit', pygame_menu.events.EXIT)
        
        self.menu.mainloop(self.surface)

    def transfer_menu(self):
        """Show the menu for choosing the transfer-learning direction."""
        self._open_menu('Transfer Learning')
        self.menu.add_button('Space Invaders -> Asteroids', self.set_transfer_SA)
        self.menu.add_button('Asteroids -> Space Invaders', self.set_transfer_AS)
        self.menu.add_button('Back', self.main_menu)
        
        self.menu.mainloop(self.surface)
    
    
    #Common DQN network settings
    def set_standard_param(self, arg, param):
        """Selector callback for the network settings menu.

        Args:
            arg: Selector value supplied by pygame-menu; its second element
                is taken as the selected option index.
            param: Which setting changed: 0 learning rate, 1 steps,
                2 episodes, 3 visualize. Other values are ignored.
        """
        #Learning rate 
        if param == 0:
            print("changing learning rate")
            self.learning_rate = self._selected_index(
                arg, self.learning_rate, len(self.learning_rates))
        #Steps
        elif param == 1:
            self.step = self._selected_index(arg, self.step, len(self.steps))
        #Episodes
        elif param == 2:
            self.episode = self._selected_index(arg, self.episode, len(self.episodes))
        #Visualize (index 0 is 'False', index 1 is 'True')
        elif param == 3:
            self.visualize = bool(self._selected_index(arg, int(bool(self.visualize)), 2))
            
    def set_game_mode(self):
        """Open the network settings menu for Space Invaders training."""
        self.set_standard_parameters(1)
        
    def set_test_asteroids(self):
        """Request an Asteroids test environment and run the test."""
        self.test_env = 0
        self.test_model()
    
    def set_test_space_invaders(self):
        """Request a Space Invaders test environment and run the test."""
        self.test_env = 1
        self.test_model()
        
    #Set transfer mode: Space invaders to Asteroids
    def set_transfer_SA(self):
        """Select Space Invaders -> Asteroids transfer and continue to the settings."""
        self.transfer = 0
        self.set_standard_parameters()
        
    #Set transfer mode: Asteroids to Space Invaders
    def set_transfer_AS(self):
        """Select Asteroids -> Space Invaders transfer and continue to the settings."""
        self.transfer = 1
        self.set_standard_parameters(1)
    
    #Setup menu for standard network parameters
    def set_standard_parameters(self, gamemode = 0):
        """Show the network settings menu.

        Args:
            gamemode: 0 continues to the Asteroids options, any other value
                continues to the Space Invaders options.
        """
        print("standard params called")
        self.gamemode = gamemode
        self._open_menu('Select Variables')

        self.menu.add_selector('Learning rate :', [('1e-4', 0),
                                                   ('1e-6', 0),
                                                   ('1e-7', 0)],
                                                   onchange=self.set_standard_param)
        self.menu.add_selector('Steps :', [('3000', 1),
                                           ('50,000', 1),
                                           ('100,000', 1),],
                                           onchange=self.set_standard_param)
        self.menu.add_selector('Episodes: ', [('1', 2),
                                           ('10', 2),
                                           ('100', 2),],
                                           onchange=self.set_standard_param)
        self.menu.add_selector('Visualize', [('False', 3), ('True', 3)],
                               onchange=self.set_standard_param)
        self.menu.add_button('Back', self.main_menu)
        
        #Continue based on what game mode is selected
        if gamemode == 0:
            self.menu.add_button('Continue - Asteroids', self.set_asteroids_parameters)
        else:
            self.menu.add_button('Continue - Space Invaders', self.set_space_invaders_parameters)
        
        self.menu.mainloop(self.surface)
    
    def set_mode_parameter(self, arg, param):
        """Selector callback for the gameplay options menus.

        Args:
            arg: Selector value supplied by pygame-menu; its second element
                is taken as the selected option index.
            param: 0 scale, 1 intensity, 2 player speed, 3 homogenous
                controls, 4 colour, 5 enemy speed. Other values are ignored.

        The stored index follows the selector (including wrap-around), so
        the value used for training is always the one shown on screen.
        """
        if param == 0:
            self.scale = self._selected_index(arg, self.scale, len(self.scales))
        elif param == 1:
            self.intensity = self._selected_index(
                arg, self.intensity, len(self.intensity_rates))
        elif param == 2:
            self.player_speed = self._selected_index(
                arg, self.player_speed, len(self.player_speeds))
        elif param == 3:
            self.homogenous_controls = bool(
                self._selected_index(arg, int(bool(self.homogenous_controls)), 2))
        elif param == 4:
            self.colour = bool(self._selected_index(arg, int(bool(self.colour)), 2))
        elif param == 5:
            self.enemy_speed = self._selected_index(
                arg, self.enemy_speed, len(self.enemy_speeds))
            
    #Menu for setting asteroids game mode specific parameters
    def set_asteroids_parameters(self):
        """Show the Asteroids gameplay options.

        Option labels are generated from the value lists so the text shown
        always equals the value that will be applied.
        """
        self._open_menu('Select Gameplay Variables')

        self.menu.add_selector('Scale :', self._scale_items(),
                               onchange=self.set_mode_parameter)
        self.menu.add_selector('Intensity:',
                               [(str(rate), 1) for rate in self.intensity_rates],
                               onchange=self.set_mode_parameter)
        self.menu.add_selector('Player Speed :',
                               [(str(speed), 2) for speed in self.player_speeds],
                               onchange=self.set_mode_parameter)
        self.menu.add_selector('Colour Input: ', [('False', 4), ('True', 4)],
                        onchange=self.set_mode_parameter)
        self.menu.add_button('Start Training', self.start_asteroids_training)
        self.menu.add_button('Back', self.main_menu)
        
        self.menu.mainloop(self.surface)
    
    #Menu for setting Space invaders game mode specific parameters
    def set_space_invaders_parameters(self):
        """Show the Space Invaders gameplay options.

        Option labels are generated from the value lists so the text shown
        always equals the value that will be applied.
        """
        self._open_menu('Select Gameplay Variables')

        self.menu.add_selector('Scale :', self._scale_items(),
                               onchange=self.set_mode_parameter)
        self.menu.add_selector('Enemy Speed:',
                               [(str(speed), 5) for speed in self.enemy_speeds],
                               onchange=self.set_mode_parameter)
        self.menu.add_selector('Player Speed :',
                               [(str(speed), 2) for speed in self.player_speeds],
                               onchange=self.set_mode_parameter)
        self.menu.add_selector('Colour Input: ', [('False', 4), ('True', 4)],
                        onchange=self.set_mode_parameter)
        self.menu.add_selector('Homogenous Controls: ', [('False', 3), ('True', 3)],
                onchange=self.set_mode_parameter)
        self.menu.add_button('Start Training', self.start_space_invaders_training)
        self.menu.add_button('Back', self.main_menu)
        
        self.menu.mainloop(self.surface)

    #Start asteroids as a normal player
    def start_asteroids(self):
        """Create a human-playable Asteroids game on the menu surface."""
        self.game = Asteroids(800, 600, self.surface, False, 0.125)

    #Start space invaders as a normal player
    def start_space_invaders(self):
        """Create a human-playable Space Invaders game on the menu surface."""
        self.game = Space_Invaders(800, 600, self.surface, False, 1.0)
        
    def start_asteroids_training(self):
        """Build an Asteroids environment from the selections, train, save, return to menu."""
        self.menu.disable()
        print("starting asteroid training")
        width, height, factor = self.scales[self.scale]
        training_game = Asteroids(width, height, 
                                  self.surface, True, factor,
                                  player_speed = self.player_speeds[self.player_speed],
                                  player_rtspeed = self.player_speeds[self.player_speed], 
                                  intensity_modifier = self.intensity_rates[self.intensity],
                                  has_colour = self.colour)
        
        self.game = Custom_OpenAI_Env(width, height, training_game)
        self.build_model()
        self.train_model()
        #Plain training and transfer training are stored under different names
        if self.transfer == -1:
            self.save("asteroids")
        else:
            self.save("Space-Asteroids-Transfer")
        self.main_menu()
    
    def start_space_invaders_training(self):
        """Build a Space Invaders environment from the selections, train, save, return to menu."""
        self.menu.disable()
        print("Starting space invaders training")
        width, height, factor = self.scales[self.scale]
        training_game = Space_Invaders(width, height,
                                       self.surface, True, factor,
                                       enemy_speed = self.enemy_speeds[self.enemy_speed],
                                       player_speed = self.player_speeds[self.player_speed],
                                       game_intensity_modifier = self.intensity_rates[self.intensity],
                                       homogenous_controls = self.homogenous_controls,
                                       has_colour = self.colour)
        
        #NOTE(review): the action space is Discrete(5) regardless of the
        #homogenous controls setting.
        action_space= Discrete(5)
        self.game = Custom_OpenAI_Env(width, height,
                                     training_game,
                                     action_space = action_space)
        self.build_model()
        self.train_model()
        #Plain training and transfer training are stored under different names
        if self.transfer == -1:
            self.save("space-invaders")
        else:
            self.save("Asteroids-Space-Transfer")
        self.main_menu()
        
    def build_model(self, metrics = ['mae']):
        """Create the network and DQN agent for ``self.game``.

        In transfer mode the weights of the source game are loaded into the
        new network when the weight file exists; otherwise the freshly
        initialised network is used.

        Args:
            metrics: Metrics handed to the agent's ``compile``. A copy is
                passed on so the shared default list can never be modified
                by the callee.
        """
        print("building model")
        
        #Initialise state/action arrays
        states = self.game.observation_space.shape
        states = (WINDOW_LENGTH, states[1], states[2])
        print(states)
        self.actions = self.game.action_space.n
        
        #Always start from a fresh network; transfer only overlays weights
        self.model = self.build_network(states, self.actions)

        #Load the source game's weights exactly once. load() reports a
        #missing file itself and leaves the fresh network untouched.
        #NOTE(review): save() stores the agent's weights while load() fills
        #self.model; with the dueling head enabled these may not have the
        #same final layer shape - confirm that loading actually succeeds.
        if self.transfer == 0:
            print("loading space invaders original model.")
            self.load("space-invaders")
        elif self.transfer == 1:
            print("loading asteroids original model")
            self.load("asteroids")
        
        self.dqn = self.build_agent(self.model, self.actions)
        self.dqn.compile(Adam(beta_1 = 0.99, lr=self.learning_rates[self.learning_rate]),
                         metrics=list(metrics))
        
    def train_model(self, episodes = 5, verbose = 1):
        """Train the agent on ``self.game`` for the selected number of steps, then test it.

        NOTE(review): ``episodes`` and ``verbose`` are not used (training
        length comes from the selected step count and verbosity is fixed at
        2); the selected episode count is only printed.
        """
        #Debug info
        print("parameters: ", "\nlearning rate: ", self.learning_rates[self.learning_rate],
             "\nepisodes: ",  self.episodes[self.episode],
              "\nsteps: ", self.steps[self.step],
              "\nvisualize: ", self.visualize,
              "\nscale: ", self.scales[self.scale],
              "\nintensity (asteroids only): ", self.intensity_rates[self.intensity],
              "\nplayer speed : ", self.player_speeds[self.player_speed],
              "\nhomogenous controls (space invaders only): ", self.homogenous_controls,
              "\ncolour: ", self.colour)

        print("beginning training")
        #Fit with openAI gym
        self.dqn.fit(self.game, nb_steps=self.steps[self.step], visualize=self.visualize, verbose=2)

        print("training complete")
        self.test_model()

    
    def test_model(self):
        """Run 100 visualised test episodes and print the mean episode reward.

        When a test environment was requested from the main menu, a new
        environment at the smallest scale and a new model are built first;
        otherwise the current ``self.game`` and ``self.dqn`` are tested.
        The request is consumed here so that it cannot trigger a model
        rebuild during a later training run.
        """
        requested_env = self.test_env
        self.test_env = -1
        action_space= Discrete(5)
        width, height, factor = self.scales[2]
        
        #NOTE(review): the colour option used to be passed to
        #Custom_OpenAI_Env, which has no such parameter (TypeError). It is
        #not forwarded anywhere now - confirm whether the games should
        #receive it as has_colour.
        if requested_env == 1:
            training_game = Space_Invaders(width, height,
                                    self.surface, True, factor,
                                    homogenous_controls = self.homogenous_controls)
            self.game = Custom_OpenAI_Env(width, height,
                                         training_game,
                                         action_space = action_space)
        elif requested_env == 0:
            training_game = Asteroids(width, height, 
                                self.surface, True, factor,
                                intensity_modifier = self.intensity_rates[self.intensity])
            self.game = Custom_OpenAI_Env(width, height,
                                           training_game)
        
        #NOTE(review): outside transfer mode build_model() does not load any
        #saved weights, so a menu-launched test evaluates a new network.
        if requested_env != -1:
            self.build_model()
            
        scores = self.dqn.test(self.game, nb_episodes=100, visualize=True)
        print(np.mean(scores.history['episode_reward']))
        
    def save(self, name='default-model'):
        """Write the agent's weights to ``<name>.hdf5``, overwriting any existing file."""
        self.dqn.save_weights(name + ".hdf5", overwrite=True)
        
    def load(self, name):
        """Load ``<name>.hdf5`` into ``self.model`` if the file exists.

        Returns:
            The result of ``self.model.load_weights`` when the file exists,
            otherwise ``False`` (after printing a notice).
        """
        weights_file = name + '.hdf5'
        if os.path.isfile(weights_file):
            return self.model.load_weights(weights_file)
        print("file not found, creating default model.")
        return False
    
    def build_network(self, states, actions):
        """Build the convolutional Q-network.

        Args:
            states: Input shape of one observation window.
            actions: Number of outputs (one per action).

        Returns:
            The uncompiled Sequential model.

        NOTE(review): ``states`` is (WINDOW_LENGTH, width, height) while
        Conv2D is used with its default data format; the recorded summary
        shows a first-layer output of (None, 3, 150, 16), i.e. the frame
        stack is treated as image height. Confirm this is intended.
        """
        model = Sequential()
        print("shape : ", len(states), actions)
        #Convolutional layers
        model.add(Conv2D(16, (10,10), activation='relu', input_shape=states, padding='same'))
        model.add(MaxPooling2D((2, 2), padding = 'same'))
        model.add(LeakyReLU(alpha=0.1))
        model.add(Conv2D(32, (5,5), activation='relu', input_shape=states, padding='same'))
        model.add(MaxPooling2D((2, 2), padding = 'same'))
        model.add(LeakyReLU(alpha=0.1))
        model.add(Flatten())
        #Fully connected layers
        model.add(Dense(256, activation='relu'))
        model.add(LeakyReLU(alpha=0.1))
        model.add(Dense(32, activation='relu'))
        model.add(LeakyReLU(alpha=0.1))
        model.add(Dense(32, activation='relu'))
        model.add(LeakyReLU(alpha=0.1))
        model.add(Dense(16, activation='relu'))
        model.add(LeakyReLU(alpha=0.1))
        #Output size comes from the argument rather than instance state, so
        #the method also works before self.actions has been assigned
        model.add(Dense(actions, activation='relu'))
        #Debug summary of the model built (summary() prints by itself)
        model.summary()
        return model

    #Function to create DQN model 
    def build_agent(self, model, actions, policy_type = 0):
        """Create the dueling DQN agent for ``model``.

        Args:
            model: Network returned by ``build_network``.
            actions: Number of discrete actions.
            policy_type: 0 selects the Boltzmann policy; any other value
                selects a linearly annealed epsilon-greedy policy.

        Returns:
            The uncompiled DQNAgent.
        """
        if policy_type == 0:
            #Explorative policy
            policy = BoltzmannQPolicy()
        else:
            #Exploitative policy
            policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1., value_min=.1, value_test=.2, nb_steps=10000)
        
        memory = SequentialMemory(limit=1000, window_length=WINDOW_LENGTH)
        dqn = DQNAgent(model=model, memory=memory, policy=policy,
                      enable_dueling_network=True, dueling_type='avg', 
                       nb_actions=actions, nb_steps_warmup=1000
                      )
        return dqn

In [None]:
#Launch the framework: constructing the menu opens the window and shows the main menu
Game = Game_Menu(width=800, height=600)

standard params called
changing learning rate
changing learning rate
starting asteroid training
initialisation complete
building model
(3, 150, 150)
shape :  3 5
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 3, 150, 16)        240016    
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 2, 75, 16)         0         
_________________________________________________________________
leaky_re_lu (LeakyReLU)      (None, 2, 75, 16)         0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 2, 75, 32)         12832     
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 1, 38, 32)         0         
_________________________________________________________________
leaky_re_lu_1 (LeakyReLU) 

  1831/100000: episode: 23, duration: 11.701s, episode steps:  65, steps per second:   6, episode reward:  2.000, mean reward:  0.031 [ 0.000,  1.000], mean action: 1.923 [0.000, 4.000],  loss: 5.614073, mae: 5.574471, mean_q: 9.529521
  1872/100000: episode: 24, duration: 7.324s, episode steps:  41, steps per second:   6, episode reward:  1.000, mean reward:  0.024 [ 0.000,  1.000], mean action: 1.951 [0.000, 4.000],  loss: 6.114672, mae: 5.413899, mean_q: 9.266161
  1945/100000: episode: 25, duration: 12.767s, episode steps:  73, steps per second:   6, episode reward:  1.000, mean reward:  0.014 [ 0.000,  1.000], mean action: 1.945 [0.000, 4.000],  loss: 7.323336, mae: 5.631390, mean_q: 9.578229
  2041/100000: episode: 26, duration: 16.749s, episode steps:  96, steps per second:   6, episode reward:  1.000, mean reward:  0.010 [ 0.000,  1.000], mean action: 1.802 [0.000, 4.000],  loss: 6.286077, mae: 5.862550, mean_q: 9.978385
  2237/100000: episode: 27, duration: 33.980s, episode st

  5460/100000: episode: 58, duration: 7.392s, episode steps:  42, steps per second:   6, episode reward:  1.000, mean reward:  0.024 [ 0.000,  1.000], mean action: 2.238 [0.000, 4.000],  loss: 7.903409, mae: 5.894479, mean_q: 9.797974
  5593/100000: episode: 59, duration: 23.082s, episode steps: 133, steps per second:   6, episode reward:  1.000, mean reward:  0.008 [ 0.000,  1.000], mean action: 1.699 [0.000, 4.000],  loss: 6.681366, mae: 6.021786, mean_q: 10.012145
  5666/100000: episode: 60, duration: 12.719s, episode steps:  73, steps per second:   6, episode reward:  1.000, mean reward:  0.014 [ 0.000,  1.000], mean action: 2.192 [0.000, 4.000],  loss: 7.383044, mae: 6.896339, mean_q: 11.451265
  5753/100000: episode: 61, duration: 15.168s, episode steps:  87, steps per second:   6, episode reward:  2.000, mean reward:  0.023 [ 0.000,  1.000], mean action: 2.218 [0.000, 4.000],  loss: 7.390929, mae: 7.161745, mean_q: 11.892189
  5906/100000: episode: 62, duration: 26.610s, episode

  8696/100000: episode: 93, duration: 7.880s, episode steps:  45, steps per second:   6, episode reward:  1.000, mean reward:  0.022 [ 0.000,  1.000], mean action: 1.756 [0.000, 4.000],  loss: 8.717838, mae: 8.097464, mean_q: 13.531757
  8746/100000: episode: 94, duration: 8.719s, episode steps:  50, steps per second:   6, episode reward:  1.000, mean reward:  0.020 [ 0.000,  1.000], mean action: 2.520 [0.000, 4.000],  loss: 9.825883, mae: 7.997077, mean_q: 13.258331
  8793/100000: episode: 95, duration: 8.233s, episode steps:  47, steps per second:   6, episode reward:  1.000, mean reward:  0.021 [ 0.000,  1.000], mean action: 1.404 [0.000, 4.000],  loss: 7.921573, mae: 7.792444, mean_q: 12.966427
  8866/100000: episode: 96, duration: 12.736s, episode steps:  73, steps per second:   6, episode reward:  2.000, mean reward:  0.027 [ 0.000,  1.000], mean action: 2.192 [0.000, 4.000],  loss: 8.771295, mae: 7.467428, mean_q: 12.395802
  8942/100000: episode: 97, duration: 13.263s, episode 

 11430/100000: episode: 128, duration: 8.446s, episode steps:  48, steps per second:   6, episode reward:  1.000, mean reward:  0.021 [ 0.000,  1.000], mean action: 2.333 [0.000, 4.000],  loss: 6.836053, mae: 6.303256, mean_q: 10.422276
 11522/100000: episode: 129, duration: 16.012s, episode steps:  92, steps per second:   6, episode reward:  1.000, mean reward:  0.011 [ 0.000,  1.000], mean action: 1.870 [0.000, 4.000],  loss: 6.444311, mae: 6.280526, mean_q: 10.366632
 11628/100000: episode: 130, duration: 18.438s, episode steps: 106, steps per second:   6, episode reward:  1.000, mean reward:  0.009 [ 0.000,  1.000], mean action: 2.302 [0.000, 4.000],  loss: 6.323218, mae: 6.176347, mean_q: 10.176941
 11804/100000: episode: 131, duration: 30.514s, episode steps: 176, steps per second:   6, episode reward:  2.000, mean reward:  0.011 [ 0.000,  1.000], mean action: 1.972 [0.000, 4.000],  loss: 6.486554, mae: 6.904442, mean_q: 11.286271
 11903/100000: episode: 132, duration: 17.224s, e

 14613/100000: episode: 163, duration: 10.514s, episode steps:  60, steps per second:   6, episode reward:  2.000, mean reward:  0.033 [ 0.000,  1.000], mean action: 1.817 [0.000, 4.000],  loss: 5.680422, mae: 6.066197, mean_q: 9.998399
 14955/100000: episode: 164, duration: 59.039s, episode steps: 342, steps per second:   6, episode reward:  1.000, mean reward:  0.003 [ 0.000,  1.000], mean action: 2.386 [0.000, 4.000],  loss: 5.961192, mae: 6.457204, mean_q: 10.731126
 14994/100000: episode: 165, duration: 6.906s, episode steps:  39, steps per second:   6, episode reward:  1.000, mean reward:  0.026 [ 0.000,  1.000], mean action: 2.051 [0.000, 4.000],  loss: 6.413867, mae: 7.509166, mean_q: 12.546541
 15041/100000: episode: 166, duration: 8.246s, episode steps:  47, steps per second:   6, episode reward:  2.000, mean reward:  0.043 [ 0.000,  1.000], mean action: 2.000 [0.000, 4.000],  loss: 6.457944, mae: 7.225306, mean_q: 12.093929
 15168/100000: episode: 167, duration: 22.029s, epi

 18094/100000: episode: 198, duration: 39.399s, episode steps: 228, steps per second:   6, episode reward:  2.000, mean reward:  0.009 [ 0.000,  1.000], mean action: 2.254 [0.000, 4.000],  loss: 6.401051, mae: 7.526092, mean_q: 12.382620
 18167/100000: episode: 199, duration: 12.692s, episode steps:  73, steps per second:   6, episode reward:  2.000, mean reward:  0.027 [ 0.000,  1.000], mean action: 2.041 [0.000, 4.000],  loss: 6.650132, mae: 7.414240, mean_q: 12.319550
 18255/100000: episode: 200, duration: 15.288s, episode steps:  88, steps per second:   6, episode reward:  1.000, mean reward:  0.011 [ 0.000,  1.000], mean action: 2.000 [0.000, 4.000],  loss: 7.156430, mae: 7.773210, mean_q: 12.841800
 18310/100000: episode: 201, duration: 9.603s, episode steps:  55, steps per second:   6, episode reward:  1.000, mean reward:  0.018 [ 0.000,  1.000], mean action: 2.055 [0.000, 4.000],  loss: 7.967523, mae: 8.030756, mean_q: 13.224784
 18508/100000: episode: 202, duration: 34.207s, e

 20733/100000: episode: 233, duration: 6.711s, episode steps:  38, steps per second:   6, episode reward:  2.000, mean reward:  0.053 [ 0.000,  1.000], mean action: 1.816 [0.000, 4.000],  loss: 4.779156, mae: 5.372038, mean_q: 8.931964
 20856/100000: episode: 234, duration: 21.286s, episode steps: 123, steps per second:   6, episode reward:  2.000, mean reward:  0.016 [ 0.000,  1.000], mean action: 2.065 [0.000, 4.000],  loss: 5.433859, mae: 5.566198, mean_q: 9.237506
 20992/100000: episode: 235, duration: 23.530s, episode steps: 136, steps per second:   6, episode reward:  3.000, mean reward:  0.022 [ 0.000,  1.000], mean action: 2.544 [0.000, 4.000],  loss: 6.057072, mae: 5.544964, mean_q: 9.213945
 21114/100000: episode: 236, duration: 21.042s, episode steps: 122, steps per second:   6, episode reward:  1.000, mean reward:  0.008 [ 0.000,  1.000], mean action: 2.213 [0.000, 4.000],  loss: 6.068618, mae: 5.712463, mean_q: 9.623606
 21169/100000: episode: 237, duration: 9.619s, episod

 23703/100000: episode: 268, duration: 14.904s, episode steps:  86, steps per second:   6, episode reward:  1.000, mean reward:  0.012 [ 0.000,  1.000], mean action: 2.081 [0.000, 4.000],  loss: 5.349967, mae: 7.051135, mean_q: 11.712268
 23797/100000: episode: 269, duration: 16.270s, episode steps:  94, steps per second:   6, episode reward:  1.000, mean reward:  0.011 [ 0.000,  1.000], mean action: 2.223 [0.000, 4.000],  loss: 5.173464, mae: 6.908716, mean_q: 11.509309
 23904/100000: episode: 270, duration: 18.513s, episode steps: 107, steps per second:   6, episode reward:  1.000, mean reward:  0.009 [ 0.000,  1.000], mean action: 2.486 [0.000, 4.000],  loss: 5.656140, mae: 7.279716, mean_q: 12.080683
 24014/100000: episode: 271, duration: 18.977s, episode steps: 110, steps per second:   6, episode reward:  1.000, mean reward:  0.009 [ 0.000,  1.000], mean action: 2.118 [0.000, 4.000],  loss: 5.763171, mae: 7.336510, mean_q: 12.162008
 24052/100000: episode: 272, duration: 6.680s, e

 26542/100000: episode: 303, duration: 9.599s, episode steps:  55, steps per second:   6, episode reward:  2.000, mean reward:  0.036 [ 0.000,  1.000], mean action: 1.982 [0.000, 4.000],  loss: 4.778219, mae: 5.985465, mean_q: 9.652264
 26580/100000: episode: 304, duration: 6.685s, episode steps:  38, steps per second:   6, episode reward:  2.000, mean reward:  0.053 [ 0.000,  1.000], mean action: 2.000 [0.000, 4.000],  loss: 4.879708, mae: 5.738923, mean_q: 9.239436
 26753/100000: episode: 305, duration: 29.851s, episode steps: 173, steps per second:   6, episode reward:  1.000, mean reward:  0.006 [ 0.000,  1.000], mean action: 2.046 [0.000, 4.000],  loss: 4.681337, mae: 5.440833, mean_q: 8.809487
 26815/100000: episode: 306, duration: 10.806s, episode steps:  62, steps per second:   6, episode reward:  1.000, mean reward:  0.016 [ 0.000,  1.000], mean action: 2.323 [0.000, 4.000],  loss: 3.881965, mae: 4.838525, mean_q: 8.028243
 26871/100000: episode: 307, duration: 9.820s, episode

 29208/100000: episode: 338, duration: 8.734s, episode steps:  50, steps per second:   6, episode reward:  2.000, mean reward:  0.040 [ 0.000,  1.000], mean action: 2.000 [0.000, 4.000],  loss: 5.334792, mae: 6.209164, mean_q: 10.131711
 29292/100000: episode: 339, duration: 14.566s, episode steps:  84, steps per second:   6, episode reward:  1.000, mean reward:  0.012 [ 0.000,  1.000], mean action: 2.619 [0.000, 4.000],  loss: 5.177652, mae: 6.145382, mean_q: 10.066989
 29598/100000: episode: 340, duration: 52.758s, episode steps: 306, steps per second:   6, episode reward:  2.000, mean reward:  0.007 [ 0.000,  1.000], mean action: 2.288 [0.000, 4.000],  loss: 5.693681, mae: 7.333718, mean_q: 12.056602
 29690/100000: episode: 341, duration: 15.897s, episode steps:  92, steps per second:   6, episode reward:  1.000, mean reward:  0.011 [ 0.000,  1.000], mean action: 2.402 [0.000, 4.000],  loss: 6.468036, mae: 8.288663, mean_q: 13.655005
 29735/100000: episode: 342, duration: 7.864s, ep

 32258/100000: episode: 373, duration: 9.233s, episode steps:  53, steps per second:   6, episode reward:  1.000, mean reward:  0.019 [ 0.000,  1.000], mean action: 1.528 [0.000, 4.000],  loss: 4.202546, mae: 6.129601, mean_q: 9.992194
 32346/100000: episode: 374, duration: 15.261s, episode steps:  88, steps per second:   6, episode reward:  1.000, mean reward:  0.011 [ 0.000,  1.000], mean action: 1.784 [0.000, 4.000],  loss: 4.109660, mae: 6.031889, mean_q: 9.836317
 32448/100000: episode: 375, duration: 17.624s, episode steps: 102, steps per second:   6, episode reward:  2.000, mean reward:  0.020 [ 0.000,  1.000], mean action: 1.990 [0.000, 4.000],  loss: 4.316694, mae: 6.287016, mean_q: 10.242890
 32597/100000: episode: 376, duration: 25.716s, episode steps: 149, steps per second:   6, episode reward:  1.000, mean reward:  0.007 [ 0.000,  1.000], mean action: 2.268 [0.000, 4.000],  loss: 4.379536, mae: 6.410317, mean_q: 10.516931
 32658/100000: episode: 377, duration: 10.593s, epi

 35717/100000: episode: 408, duration: 23.997s, episode steps: 139, steps per second:   6, episode reward:  2.000, mean reward:  0.014 [ 0.000,  1.000], mean action: 2.439 [0.000, 4.000],  loss: 5.573839, mae: 8.730906, mean_q: 14.218329
 35752/100000: episode: 409, duration: 6.153s, episode steps:  35, steps per second:   6, episode reward:  1.000, mean reward:  0.029 [ 0.000,  1.000], mean action: 2.314 [0.000, 4.000],  loss: 4.979828, mae: 8.749703, mean_q: 14.308395
 35792/100000: episode: 410, duration: 7.045s, episode steps:  40, steps per second:   6, episode reward:  1.000, mean reward:  0.025 [ 0.000,  1.000], mean action: 1.850 [0.000, 4.000],  loss: 5.201047, mae: 8.431284, mean_q: 13.734650
 35845/100000: episode: 411, duration: 9.266s, episode steps:  53, steps per second:   6, episode reward:  1.000, mean reward:  0.019 [ 0.000,  1.000], mean action: 2.245 [0.000, 4.000],  loss: 4.839299, mae: 7.812365, mean_q: 12.792150
 35885/100000: episode: 412, duration: 7.047s, epis

 38851/100000: episode: 443, duration: 38.997s, episode steps: 226, steps per second:   6, episode reward:  2.000, mean reward:  0.009 [ 0.000,  1.000], mean action: 2.235 [0.000, 4.000],  loss: 5.099897, mae: 6.304586, mean_q: 10.495872
 39031/100000: episode: 444, duration: 30.963s, episode steps: 180, steps per second:   6, episode reward:  4.000, mean reward:  0.022 [ 0.000,  1.000], mean action: 2.211 [0.000, 4.000],  loss: 4.696270, mae: 6.563058, mean_q: 10.884044
 39079/100000: episode: 445, duration: 8.383s, episode steps:  48, steps per second:   6, episode reward:  1.000, mean reward:  0.021 [ 0.000,  1.000], mean action: 1.917 [0.000, 4.000],  loss: 4.703920, mae: 6.642570, mean_q: 10.862515
 39128/100000: episode: 446, duration: 8.553s, episode steps:  49, steps per second:   6, episode reward:  2.000, mean reward:  0.041 [ 0.000,  1.000], mean action: 1.612 [0.000, 4.000],  loss: 4.206356, mae: 6.322611, mean_q: 10.471395
 39224/100000: episode: 447, duration: 16.606s, ep

 42250/100000: episode: 478, duration: 6.836s, episode steps:  39, steps per second:   6, episode reward:  1.000, mean reward:  0.026 [ 0.000,  1.000], mean action: 2.513 [0.000, 4.000],  loss: 6.294784, mae: 9.233840, mean_q: 15.015364
 42298/100000: episode: 479, duration: 8.402s, episode steps:  48, steps per second:   6, episode reward:  1.000, mean reward:  0.021 [ 0.000,  1.000], mean action: 2.104 [0.000, 4.000],  loss: 6.865736, mae: 9.239221, mean_q: 14.990410
 42365/100000: episode: 480, duration: 11.662s, episode steps:  67, steps per second:   6, episode reward:  2.000, mean reward:  0.030 [ 0.000,  1.000], mean action: 2.418 [0.000, 4.000],  loss: 7.704737, mae: 9.350788, mean_q: 15.062372
 42496/100000: episode: 481, duration: 22.661s, episode steps: 131, steps per second:   6, episode reward:  3.000, mean reward:  0.023 [ 0.000,  1.000], mean action: 2.046 [0.000, 4.000],  loss: 6.339449, mae: 8.732029, mean_q: 14.183603
 42549/100000: episode: 482, duration: 9.258s, epi

 45051/100000: episode: 513, duration: 6.520s, episode steps:  37, steps per second:   6, episode reward:  1.000, mean reward:  0.027 [ 0.000,  1.000], mean action: 2.216 [0.000, 4.000],  loss: 4.585954, mae: 6.855047, mean_q: 11.406037
 45106/100000: episode: 514, duration: 9.589s, episode steps:  55, steps per second:   6, episode reward:  1.000, mean reward:  0.018 [ 0.000,  1.000], mean action: 2.055 [0.000, 4.000],  loss: 5.430320, mae: 7.110699, mean_q: 11.773421
 45156/100000: episode: 515, duration: 8.758s, episode steps:  50, steps per second:   6, episode reward:  1.000, mean reward:  0.020 [ 0.000,  1.000], mean action: 2.160 [0.000, 4.000],  loss: 5.948578, mae: 7.386378, mean_q: 12.169890
 45216/100000: episode: 516, duration: 10.450s, episode steps:  60, steps per second:   6, episode reward:  1.000, mean reward:  0.017 [ 0.000,  1.000], mean action: 1.983 [0.000, 4.000],  loss: 5.648504, mae: 7.448042, mean_q: 12.260904
 45294/100000: episode: 517, duration: 13.586s, epi

 48255/100000: episode: 548, duration: 19.733s, episode steps: 114, steps per second:   6, episode reward:  1.000, mean reward:  0.009 [ 0.000,  1.000], mean action: 2.439 [0.000, 4.000],  loss: 4.982746, mae: 5.963404, mean_q: 10.053222
 48303/100000: episode: 549, duration: 8.430s, episode steps:  48, steps per second:   6, episode reward:  1.000, mean reward:  0.021 [ 0.000,  1.000], mean action: 1.896 [0.000, 4.000],  loss: 5.575634, mae: 5.726823, mean_q: 9.695603
 48394/100000: episode: 550, duration: 15.783s, episode steps:  91, steps per second:   6, episode reward:  1.000, mean reward:  0.011 [ 0.000,  1.000], mean action: 2.242 [0.000, 4.000],  loss: 4.853025, mae: 5.772946, mean_q: 9.846055
 48433/100000: episode: 551, duration: 6.863s, episode steps:  39, steps per second:   6, episode reward:  1.000, mean reward:  0.026 [ 0.000,  1.000], mean action: 2.256 [0.000, 4.000],  loss: 4.857255, mae: 6.132282, mean_q: 10.541663
 48575/100000: episode: 552, duration: 24.549s, epis

 51636/100000: episode: 583, duration: 9.477s, episode steps:  54, steps per second:   6, episode reward:  1.000, mean reward:  0.019 [ 0.000,  1.000], mean action: 2.463 [0.000, 4.000],  loss: 4.373816, mae: 6.574643, mean_q: 11.105409
 51686/100000: episode: 584, duration: 8.751s, episode steps:  50, steps per second:   6, episode reward:  1.000, mean reward:  0.020 [ 0.000,  1.000], mean action: 2.040 [0.000, 4.000],  loss: 4.617615, mae: 6.763862, mean_q: 11.403211
 51725/100000: episode: 585, duration: 6.863s, episode steps:  39, steps per second:   6, episode reward:  1.000, mean reward:  0.026 [ 0.000,  1.000], mean action: 2.821 [0.000, 4.000],  loss: 4.798488, mae: 6.993961, mean_q: 11.864976
 51830/100000: episode: 586, duration: 18.249s, episode steps: 105, steps per second:   6, episode reward:  2.000, mean reward:  0.019 [ 0.000,  1.000], mean action: 1.905 [0.000, 4.000],  loss: 4.560119, mae: 6.747497, mean_q: 11.473760
 51902/100000: episode: 587, duration: 12.541s, epi

 54895/100000: episode: 618, duration: 11.996s, episode steps:  69, steps per second:   6, episode reward:  1.000, mean reward:  0.014 [ 0.000,  1.000], mean action: 2.072 [0.000, 4.000],  loss: 3.644506, mae: 5.394797, mean_q: 9.088399
 54952/100000: episode: 619, duration: 9.939s, episode steps:  57, steps per second:   6, episode reward:  1.000, mean reward:  0.018 [ 0.000,  1.000], mean action: 1.807 [0.000, 4.000],  loss: 3.356549, mae: 5.345754, mean_q: 9.019026
 55020/100000: episode: 620, duration: 11.859s, episode steps:  68, steps per second:   6, episode reward:  2.000, mean reward:  0.029 [ 0.000,  1.000], mean action: 2.088 [0.000, 4.000],  loss: 4.055529, mae: 5.651812, mean_q: 9.426076
 55089/100000: episode: 621, duration: 12.028s, episode steps:  69, steps per second:   6, episode reward:  2.000, mean reward:  0.029 [ 0.000,  1.000], mean action: 2.043 [0.000, 4.000],  loss: 3.710574, mae: 5.259604, mean_q: 8.787171
 55358/100000: episode: 622, duration: 46.306s, episo

 57686/100000: episode: 653, duration: 24.257s, episode steps: 140, steps per second:   6, episode reward:  2.000, mean reward:  0.014 [ 0.000,  1.000], mean action: 2.079 [0.000, 4.000],  loss: 4.890100, mae: 6.856921, mean_q: 11.513121
 57733/100000: episode: 654, duration: 8.251s, episode steps:  47, steps per second:   6, episode reward:  1.000, mean reward:  0.021 [ 0.000,  1.000], mean action: 1.915 [0.000, 4.000],  loss: 5.261283, mae: 7.504519, mean_q: 12.538408
 57843/100000: episode: 655, duration: 19.038s, episode steps: 110, steps per second:   6, episode reward:  1.000, mean reward:  0.009 [ 0.000,  1.000], mean action: 2.336 [0.000, 4.000],  loss: 5.954219, mae: 7.667253, mean_q: 12.858974
 57930/100000: episode: 656, duration: 15.111s, episode steps:  87, steps per second:   6, episode reward:  1.000, mean reward:  0.011 [ 0.000,  1.000], mean action: 2.322 [0.000, 4.000],  loss: 6.533198, mae: 8.079665, mean_q: 13.653781
 57972/100000: episode: 657, duration: 7.399s, ep

 61401/100000: episode: 688, duration: 29.981s, episode steps: 174, steps per second:   6, episode reward:  1.000, mean reward:  0.006 [ 0.000,  1.000], mean action: 2.339 [0.000, 4.000],  loss: 4.461294, mae: 7.552257, mean_q: 12.732701
 61445/100000: episode: 689, duration: 7.719s, episode steps:  44, steps per second:   6, episode reward:  2.000, mean reward:  0.045 [ 0.000,  1.000], mean action: 1.977 [0.000, 4.000],  loss: 5.433994, mae: 8.038675, mean_q: 13.591266
 61487/100000: episode: 690, duration: 7.365s, episode steps:  42, steps per second:   6, episode reward:  1.000, mean reward:  0.024 [ 0.000,  1.000], mean action: 1.881 [0.000, 4.000],  loss: 5.831982, mae: 7.899106, mean_q: 13.335277
 61698/100000: episode: 691, duration: 36.403s, episode steps: 211, steps per second:   6, episode reward:  1.000, mean reward:  0.005 [ 0.000,  1.000], mean action: 2.227 [0.000, 4.000],  loss: 5.733128, mae: 7.804675, mean_q: 12.914020
 61818/100000: episode: 692, duration: 20.766s, ep

 64286/100000: episode: 723, duration: 20.268s, episode steps: 117, steps per second:   6, episode reward:  1.000, mean reward:  0.009 [ 0.000,  1.000], mean action: 2.265 [0.000, 4.000],  loss: 4.956530, mae: 7.072650, mean_q: 11.758455
 64340/100000: episode: 724, duration: 9.451s, episode steps:  54, steps per second:   6, episode reward:  3.000, mean reward:  0.056 [ 0.000,  1.000], mean action: 2.037 [0.000, 4.000],  loss: 4.863966, mae: 7.096488, mean_q: 11.824705
 64425/100000: episode: 725, duration: 14.754s, episode steps:  85, steps per second:   6, episode reward:  1.000, mean reward:  0.012 [ 0.000,  1.000], mean action: 2.400 [0.000, 4.000],  loss: 5.468111, mae: 7.232872, mean_q: 12.095207
 64471/100000: episode: 726, duration: 8.052s, episode steps:  46, steps per second:   6, episode reward:  1.000, mean reward:  0.022 [ 0.000,  1.000], mean action: 2.261 [0.000, 4.000],  loss: 5.667532, mae: 7.291343, mean_q: 12.324327
 64512/100000: episode: 727, duration: 7.213s, epi

 66979/100000: episode: 758, duration: 10.165s, episode steps:  58, steps per second:   6, episode reward:  2.000, mean reward:  0.034 [ 0.000,  1.000], mean action: 1.914 [0.000, 4.000],  loss: 5.997587, mae: 6.874319, mean_q: 11.803789
 67065/100000: episode: 759, duration: 15.013s, episode steps:  86, steps per second:   6, episode reward:  2.000, mean reward:  0.023 [ 0.000,  1.000], mean action: 2.267 [0.000, 4.000],  loss: 6.190030, mae: 6.729432, mean_q: 11.441424
 67127/100000: episode: 760, duration: 10.836s, episode steps:  62, steps per second:   6, episode reward:  1.000, mean reward:  0.016 [ 0.000,  1.000], mean action: 2.323 [0.000, 4.000],  loss: 6.448819, mae: 6.778102, mean_q: 11.604216
 67197/100000: episode: 761, duration: 12.203s, episode steps:  70, steps per second:   6, episode reward:  2.000, mean reward:  0.029 [ 0.000,  1.000], mean action: 2.443 [0.000, 4.000],  loss: 5.709737, mae: 6.632299, mean_q: 11.409390
 67257/100000: episode: 762, duration: 10.504s, 

 69798/100000: episode: 793, duration: 10.132s, episode steps:  58, steps per second:   6, episode reward:  2.000, mean reward:  0.034 [ 0.000,  1.000], mean action: 2.241 [0.000, 4.000],  loss: 4.910751, mae: 6.954809, mean_q: 11.664351
 69868/100000: episode: 794, duration: 12.182s, episode steps:  70, steps per second:   6, episode reward:  1.000, mean reward:  0.014 [ 0.000,  1.000], mean action: 2.143 [0.000, 4.000],  loss: 5.012158, mae: 6.728694, mean_q: 11.168017
 69905/100000: episode: 795, duration: 6.547s, episode steps:  37, steps per second:   6, episode reward:  1.000, mean reward:  0.027 [ 0.000,  1.000], mean action: 2.270 [0.000, 4.000],  loss: 5.046446, mae: 7.163979, mean_q: 11.983988
 69970/100000: episode: 796, duration: 11.345s, episode steps:  65, steps per second:   6, episode reward:  1.000, mean reward:  0.015 [ 0.000,  1.000], mean action: 2.385 [0.000, 4.000],  loss: 5.082349, mae: 6.903931, mean_q: 11.409124
 70020/100000: episode: 797, duration: 8.759s, ep

 73347/100000: episode: 828, duration: 9.245s, episode steps:  53, steps per second:   6, episode reward:  1.000, mean reward:  0.019 [ 0.000,  1.000], mean action: 2.528 [0.000, 4.000],  loss: 6.510339, mae: 7.152411, mean_q: 12.652527
 73455/100000: episode: 829, duration: 18.710s, episode steps: 108, steps per second:   6, episode reward:  1.000, mean reward:  0.009 [ 0.000,  1.000], mean action: 2.361 [0.000, 4.000],  loss: 6.986250, mae: 7.062313, mean_q: 12.469654
 73500/100000: episode: 830, duration: 7.908s, episode steps:  45, steps per second:   6, episode reward:  1.000, mean reward:  0.022 [ 0.000,  1.000], mean action: 2.111 [0.000, 4.000],  loss: 6.810327, mae: 7.430865, mean_q: 13.085510
 73612/100000: episode: 831, duration: 19.439s, episode steps: 112, steps per second:   6, episode reward:  1.000, mean reward:  0.009 [ 0.000,  1.000], mean action: 2.473 [0.000, 4.000],  loss: 6.282714, mae: 7.435585, mean_q: 13.067787
 73656/100000: episode: 832, duration: 7.715s, epi

 76022/100000: episode: 863, duration: 8.257s, episode steps:  47, steps per second:   6, episode reward:  1.000, mean reward:  0.021 [ 0.000,  1.000], mean action: 1.830 [0.000, 4.000],  loss: 5.886426, mae: 6.957636, mean_q: 11.679954
 76080/100000: episode: 864, duration: 10.181s, episode steps:  58, steps per second:   6, episode reward:  2.000, mean reward:  0.034 [ 0.000,  1.000], mean action: 2.621 [0.000, 4.000],  loss: 5.267107, mae: 6.500436, mean_q: 10.968993
 76124/100000: episode: 865, duration: 7.744s, episode steps:  44, steps per second:   6, episode reward:  1.000, mean reward:  0.023 [ 0.000,  1.000], mean action: 1.932 [0.000, 4.000],  loss: 5.824551, mae: 6.457438, mean_q: 10.934913
 76225/100000: episode: 866, duration: 17.570s, episode steps: 101, steps per second:   6, episode reward:  1.000, mean reward:  0.010 [ 0.000,  1.000], mean action: 2.416 [0.000, 4.000],  loss: 5.710565, mae: 6.335122, mean_q: 10.828985
 76520/100000: episode: 867, duration: 50.995s, ep

 79131/100000: episode: 898, duration: 41.288s, episode steps: 240, steps per second:   6, episode reward:  1.000, mean reward:  0.004 [ 0.000,  1.000], mean action: 2.538 [0.000, 4.000],  loss: 4.835907, mae: 7.339449, mean_q: 12.952716
 79198/100000: episode: 899, duration: 11.678s, episode steps:  67, steps per second:   6, episode reward:  1.000, mean reward:  0.015 [ 0.000,  1.000], mean action: 1.925 [0.000, 4.000],  loss: 5.881157, mae: 7.502548, mean_q: 13.489013
 79253/100000: episode: 900, duration: 9.627s, episode steps:  55, steps per second:   6, episode reward:  1.000, mean reward:  0.018 [ 0.000,  1.000], mean action: 2.455 [0.000, 4.000],  loss: 5.924403, mae: 7.251412, mean_q: 12.899413
 79310/100000: episode: 901, duration: 9.979s, episode steps:  57, steps per second:   6, episode reward:  1.000, mean reward:  0.018 [ 0.000,  1.000], mean action: 2.351 [0.000, 4.000],  loss: 5.521254, mae: 7.250319, mean_q: 13.047626
 79355/100000: episode: 902, duration: 7.902s, epi

 81761/100000: episode: 933, duration: 7.951s, episode steps:  45, steps per second:   6, episode reward:  1.000, mean reward:  0.022 [ 0.000,  1.000], mean action: 2.467 [0.000, 4.000],  loss: 3.813923, mae: 5.474324, mean_q: 9.325624
 81809/100000: episode: 934, duration: 8.432s, episode steps:  48, steps per second:   6, episode reward:  1.000, mean reward:  0.021 [ 0.000,  1.000], mean action: 2.271 [0.000, 4.000],  loss: 4.484770, mae: 5.335425, mean_q: 9.044568
 81937/100000: episode: 935, duration: 22.184s, episode steps: 128, steps per second:   6, episode reward:  2.000, mean reward:  0.016 [ 0.000,  1.000], mean action: 2.312 [0.000, 4.000],  loss: 4.182367, mae: 5.404627, mean_q: 9.146534
 82042/100000: episode: 936, duration: 18.241s, episode steps: 105, steps per second:   6, episode reward:  2.000, mean reward:  0.019 [ 0.000,  1.000], mean action: 2.019 [0.000, 4.000],  loss: 3.826770, mae: 5.514097, mean_q: 9.442077
 82129/100000: episode: 937, duration: 15.159s, episod

 84753/100000: episode: 968, duration: 7.220s, episode steps:  41, steps per second:   6, episode reward:  1.000, mean reward:  0.024 [ 0.000,  1.000], mean action: 2.171 [0.000, 4.000],  loss: 6.085099, mae: 6.643352, mean_q: 11.163088
 84822/100000: episode: 969, duration: 12.031s, episode steps:  69, steps per second:   6, episode reward:  1.000, mean reward:  0.014 [ 0.000,  1.000], mean action: 2.957 [0.000, 4.000],  loss: 5.009725, mae: 6.701506, mean_q: 11.292746
 84879/100000: episode: 970, duration: 9.987s, episode steps:  57, steps per second:   6, episode reward:  2.000, mean reward:  0.035 [ 0.000,  1.000], mean action: 2.737 [0.000, 4.000],  loss: 6.320929, mae: 6.295943, mean_q: 10.669348
 84957/100000: episode: 971, duration: 13.592s, episode steps:  78, steps per second:   6, episode reward:  1.000, mean reward:  0.013 [ 0.000,  1.000], mean action: 2.282 [0.000, 4.000],  loss: 5.416831, mae: 6.145088, mean_q: 10.496780
 85060/100000: episode: 972, duration: 17.861s, ep

 87818/100000: episode: 1003, duration: 17.652s, episode steps: 102, steps per second:   6, episode reward:  2.000, mean reward:  0.020 [ 0.000,  1.000], mean action: 2.137 [0.000, 4.000],  loss: 3.660694, mae: 5.898588, mean_q: 10.328044
 87873/100000: episode: 1004, duration: 9.608s, episode steps:  55, steps per second:   6, episode reward:  1.000, mean reward:  0.018 [ 0.000,  1.000], mean action: 2.127 [0.000, 4.000],  loss: 3.759126, mae: 6.013584, mean_q: 10.548348
 88049/100000: episode: 1005, duration: 30.361s, episode steps: 176, steps per second:   6, episode reward:  3.000, mean reward:  0.017 [ 0.000,  1.000], mean action: 2.239 [0.000, 4.000],  loss: 3.405863, mae: 5.680010, mean_q: 9.964133
 88154/100000: episode: 1006, duration: 18.180s, episode steps: 105, steps per second:   6, episode reward:  1.000, mean reward:  0.010 [ 0.000,  1.000], mean action: 2.352 [0.000, 4.000],  loss: 3.762881, mae: 5.722743, mean_q: 10.050332
 88199/100000: episode: 1007, duration: 7.882s

 90831/100000: episode: 1038, duration: 7.755s, episode steps:  44, steps per second:   6, episode reward:  1.000, mean reward:  0.023 [ 0.000,  1.000], mean action: 2.477 [0.000, 4.000],  loss: 4.525116, mae: 5.657946, mean_q: 9.512589
 90942/100000: episode: 1039, duration: 19.271s, episode steps: 111, steps per second:   6, episode reward:  2.000, mean reward:  0.018 [ 0.000,  1.000], mean action: 1.874 [0.000, 4.000],  loss: 4.892728, mae: 5.818607, mean_q: 9.794332
 91012/100000: episode: 1040, duration: 12.241s, episode steps:  70, steps per second:   6, episode reward:  2.000, mean reward:  0.029 [ 0.000,  1.000], mean action: 2.729 [0.000, 4.000],  loss: 4.955667, mae: 5.776595, mean_q: 9.779156
 91052/100000: episode: 1041, duration: 7.048s, episode steps:  40, steps per second:   6, episode reward:  1.000, mean reward:  0.025 [ 0.000,  1.000], mean action: 2.150 [0.000, 4.000],  loss: 5.026218, mae: 5.840978, mean_q: 9.811907
 91246/100000: episode: 1042, duration: 33.552s, e

 94766/100000: episode: 1073, duration: 7.719s, episode steps:  44, steps per second:   6, episode reward:  2.000, mean reward:  0.045 [ 0.000,  1.000], mean action: 2.568 [0.000, 4.000],  loss: 4.255746, mae: 7.015725, mean_q: 12.233463
 94810/100000: episode: 1074, duration: 7.723s, episode steps:  44, steps per second:   6, episode reward:  1.000, mean reward:  0.023 [ 0.000,  1.000], mean action: 2.114 [0.000, 4.000],  loss: 4.107853, mae: 6.762183, mean_q: 11.738068
 95002/100000: episode: 1075, duration: 33.134s, episode steps: 192, steps per second:   6, episode reward:  2.000, mean reward:  0.010 [ 0.000,  1.000], mean action: 2.458 [0.000, 4.000],  loss: 3.891752, mae: 6.707613, mean_q: 11.678528
 95053/100000: episode: 1076, duration: 8.922s, episode steps:  51, steps per second:   6, episode reward:  1.000, mean reward:  0.020 [ 0.000,  1.000], mean action: 2.235 [0.000, 4.000],  loss: 3.568377, mae: 6.476455, mean_q: 11.281519
 95111/100000: episode: 1077, duration: 10.122s

 97954/100000: episode: 1108, duration: 10.667s, episode steps:  61, steps per second:   6, episode reward:  1.000, mean reward:  0.016 [ 0.000,  1.000], mean action: 2.377 [0.000, 4.000],  loss: 4.903727, mae: 6.661449, mean_q: 11.174904
 98106/100000: episode: 1109, duration: 26.289s, episode steps: 152, steps per second:   6, episode reward:  1.000, mean reward:  0.007 [ 0.000,  1.000], mean action: 2.066 [0.000, 4.000],  loss: 4.529207, mae: 6.405660, mean_q: 10.777441
 98249/100000: episode: 1110, duration: 24.738s, episode steps: 143, steps per second:   6, episode reward:  1.000, mean reward:  0.007 [ 0.000,  1.000], mean action: 2.490 [0.000, 4.000],  loss: 4.647287, mae: 6.377871, mean_q: 10.709054
 98314/100000: episode: 1111, duration: 11.310s, episode steps:  65, steps per second:   6, episode reward:  1.000, mean reward:  0.015 [ 0.000,  1.000], mean action: 2.615 [0.000, 4.000],  loss: 4.580628, mae: 6.352193, mean_q: 10.823589
 98375/100000: episode: 1112, duration: 10.6

Episode 91: reward: 1.000, steps: 55
Episode 92: reward: 1.000, steps: 107
Episode 93: reward: 1.000, steps: 73
Episode 94: reward: 1.000, steps: 64
Episode 95: reward: 1.000, steps: 83
Episode 96: reward: 1.000, steps: 86
Episode 97: reward: 1.000, steps: 92
Episode 98: reward: 1.000, steps: 99
Episode 99: reward: 1.000, steps: 255
Episode 100: reward: 1.000, steps: 56
0.99
