In [66]:
# CartPole Env
import gym
import matplotlib as plt

class CartPoleEnv:
    def __init__(self):
        self.fps = 30
        self.env = gym.make('CartPole-v0')  # environment interface to game
        self.env.seed(42)
        self.env.reset()
        self.is_show = False
        self.current_sate = []
        self.is_done = False

    def render(self):
        self.env.render()
        
    def reset(self, is_show = False):
        state = self.env.reset()
        state = state.tolist()
        self.current_sate = state
        self.is_done = False
        self.is_show = is_show
        if self.is_show:
            self.env.render()

        return state
    
    def act(self, action):
        if self.is_show:
            self.env.render()
        # return observation, reward, done, info
        state, reward, is_done, info = self.env.step(self.get_action_set()[action])
        state = state.tolist()
        self.current_sate = state
        self.is_done = is_done
        return state, reward, is_done, info

    def get_num_actions(self):
        return 2

    def get_action_set(self):
        return [0, 1]

    def get_screen_rgb(self):
        pass

    def get_screen_gray(self):
        pass

    def get_num_state_features(self):
        return 4

    def get_state(self):
        return self.current_sate

    def is_over(self):
        return self.is_done


In [67]:
# !pip install pygame
# !git clone https://github.com/ntasfi/PyGame-Learning-Environment.git
# !pip install -e ./PyGame-Learning-Environment

In [68]:
from ple.games.flappybird import FlappyBird as flappyBird
from ple import PLE

class FlappyBirdEnv:
    def __init__(self):
        self.fps = 30
        self.game = flappyBird()
        self.env = PLE(self.game, fps=self.fps, display_screen=False)  # environment interface to game
        self.env.reset_game()

    def reset(self, is_show = False):
        self.env = PLE(self.game, fps=self.fps, display_screen=is_show)  # environment interface to game
        self.env.reset_game()
        state = self.get_state()

        return state
    
    def act(self, action):
        # return state_prime, reward, done, info
        reward = self.env.act(self.env.getActionSet()[action])
#         print(reward)
        
        # Survive reward +1
#         reward += 1
        
        # Get closer to the middle of top and bottom pipe and get more reward
#         state = self.game.getGameState()
        
#         next_dis_to_mid = abs((state['next_pipe_top_y'] + state['next_pipe_bottom_y']) / 2 - state['player_y'])
#         print('State')
#         print(state)
#         print('Mid')
#         print((state['next_pipe_top_y'] + state['next_pipe_bottom_y']) / 2)
#         print('next_dis_to_mid')
#         print(abs((state['next_pipe_top_y'] + state['next_pipe_bottom_y']) / 2 - state['player_y']))
#         next_next_dis_to_mid = abs((state['next_next_pipe_top_y'] + state['next_next_pipe_bottom_y']) / 2 - state['player_y'])
#         dis_reward_coef = 0.01
        
#         reward += dis_reward_coef * ((-next_dis_to_mid) + 0.5 * (-next_next_dis_to_mid))
        
        state_prime = self.get_state()
        is_done = self.is_over()
        info = ""
        return state_prime, reward, is_done, info

    def get_num_actions(self):
        return len(self.env.getActionSet())

    def get_action_set(self):
        return self.env.getActionSet()

    def get_screen_rgb(self):
        return self.env.getScreenRGB()

    def get_screen_gray(self):
        return self.env.getScreenGrayscale()

    def get_num_state_features(self):
        return len(self.game.getGameState())

    def get_state(self):
        # dict
        #     * player y position.
        #     * players velocity.
        #     * next pipe distance to player
        #     * next pipe top y position
        #     * next pipe bottom y position
        #     * next next pipe distance to player
        #     * next next pipe top y position
        #     * next next pipe bottom y position

        # state = {
        #     "player_y": self.player.pos_y,
        #     "player_vel": self.player.vel,
            
        #     "next_pipe_dist_to_player": next_pipe.x - self.player.pos_x,
        #     "next_pipe_top_y": next_pipe.gap_start,
        #     "next_pipe_bottom_y": next_pipe.gap_start+self.pipe_gap, 
            
        #     "next_next_pipe_dist_to_player": next_next_pipe.x - self.player.pos_x,
        #     "next_next_pipe_top_y": next_next_pipe.gap_start,
        #     "next_next_pipe_bottom_y": next_next_pipe.gap_start+self.pipe_gap 
        # }
        state = self.game.getGameState()
        state['next_pipe_top_y'] -= state['player_y']
        state['next_pipe_bottom_y'] -= state['player_y']
        state['next_next_pipe_top_y'] -= state['player_y']
        state['next_next_pipe_bottom_y'] -= state['player_y']
        return list(state.values())

    def is_over(self):
        return self.env.game_over()


In [69]:
import numpy as np

class EpsilonGreedy:
    def __init__(self, epsilon, num_action, min_epsilon = 0.01, decay = 0.99):
        self.epsilon = epsilon
        self.num_action = num_action
        self.min_epsilon = min_epsilon
        self.decay = decay
        self.action_time = 0

    def select_action(self):
        self.action_time += 1
        if np.random.rand() < self.epsilon:
            self.update_epsilon()
            return np.random.choice(self.num_action)
        else: 
            self.update_epsilon()
            return -1
    
    def update_epsilon(self):
        self.epsilon = max(self.min_epsilon, min(0.5, 0.99**(self.action_time / 30)))
    
    def shutdown_explore(self):
        self.epsilon = 0

In [70]:
# ddqn.py
# Implement DDQN with Delay Network, History Replay, Epsilon Greedy
# 2020/08/10 SYC

import tensorflow as tf
import numpy as np
# import matplotlib as plt
# import policiesPractice.models.expStrategy as stg

class Agent:
    def __init__(self, state_size, num_action, delay_update_every_iter, reward_discount, learning_rate, exploration_strategy):
        self.state_size = state_size
        self.num_action = num_action
        self.reward_discount = reward_discount
        self.exploration_strategy = exploration_strategy
        self.delay_update_every_iter = delay_update_every_iter
        self.iter = 0
        self.eps = 0
        self.data_type = tf.float32
        self.batch_size = 32
        self.optimizer = tf.keras.optimizers.Adam(learning_rate = learning_rate)
        self.avg_loss = tf.keras.metrics.Mean(name = 'loss')
        self.online_model = self.build_model('online')
        self.target_model = self.build_model('target')
        self.is_shutdown_explore = False

        self.buffer = []
        self.buffer_size = 50000
    
    def build_model(self, name):
        nn_input = tf.keras.Input(shape = self.state_size, dtype = self.data_type)

        x = tf.keras.layers.Dense(units = 128)(nn_input)
        x = tf.keras.layers.ReLU()(x)
        x = tf.keras.layers.Dense(units = 128)(x)
        x = tf.keras.layers.ReLU()(x)
        nn_output = tf.keras.layers.Dense(units = self.num_action)(x)

        model = tf.keras.Model(name = name, inputs = nn_input, outputs = nn_output)

        return model

    def predict(self, state):
        # Online Model
        return self.online_model(tf.convert_to_tensor(state, self.data_type))

    def max_q(self, state_primes):
        # Target Model
        return tf.reduce_max(self.target_model(tf.convert_to_tensor(state_primes, self.data_type)), axis = 1)

    def loss(self, states, actions, rewards, state_primes):
        predicts = self.predict(states)
        indice = tf.stack([tf.range(len(actions)), actions], axis = 1)
        predict_qs = tf.gather_nd(predicts, indice)

        target_qs = self.max_q(state_primes)
        # Compute loss as formular: loss = E((r + gamma * max(Q(s', a'| Theta')) - Q(s, a, | Theta))^2)
        # Update model with a batch Every time. As a result, we compute the Expectation(E) of the total loss of a batch.
        return tf.reduce_mean(tf.square(rewards + self.reward_discount * target_qs - predict_qs))

    def get_metrics_loss(self):
        return self.avg_loss.result()
    
    def reset_metrics_loss(self):
        self.avg_loss.reset_states()

    def select_action(self, state):
        # Assume using Epsilon Greedy Strategy
        action = self.exploration_strategy.select_action()
        act_qs = self.predict([state])
        # If the index of action (return value) is -1, choose the action with highest probability that online_model predict
        if action == -1:
            return tf.argmax(act_qs, axis = 1)[0], act_qs
        else:
            # If the index of action (return value) is != -1, act randomly    
            return action, act_qs

    def shutdown_explore(self):
        self.is_shutdown_explore = True

    def update(self, loss, tape):
        gradients = tape.gradient(loss, self.online_model.trainable_variables)
        # gradients = [gradients if gradients is not None else tf.zeros_like(var) for var, grad in zip(self.model.trainable_variables, gradients)]
        self.optimizer.apply_gradients(zip(gradients, self.online_model.trainable_variables))
        self.avg_loss.update_state(loss)

        # Update exploration rate of Epsilon Greedy Strategy
        self.exploration_strategy.update_epsilon()
        
        # Deplay Update
        is_update_target = False
        if self.iter % self.delay_update_every_iter == 0:
            self.target_model.set_weights(self.online_model.get_weights())
            is_update_target = True

        return is_update_target
    
#     def preprocess_state(self, env_state):
#         # Preprocess SINGLE state
#         return list(env_state.values())

#     def preprocess_states(self, env_states):
#         # Preprocess MULTIPLE states
#         state_list = []
#         for env_state in env_states:
#             state_list.append(list(env_state.values()))

#         return state_list

    def add_buffer(self, new_state, new_action, new_reward, new_state_prime):
        # Add ONE action-state pair every time
        self.buffer.append({'state': new_state, 'action': new_action, 'reward': new_reward, 'state_prime': new_state_prime})
        if len(self.buffer) > self.buffer_size:
            self.buffer.pop(0)
    
    def sample(self, num_sample):
        if num_sample >= len(self.buffer):
            idx_samples = np.random.choice(len(self.buffer), num_sample).tolist()
        else:
            idx_samples = np.random.choice(len(self.buffer), num_sample, replace = False).tolist()

        states = []
        actions = []
        rewards = []
        state_primes = []
#         act_qs = []

        for idx in idx_samples:
            states.append(self.buffer[idx]['state'])
            actions.append(self.buffer[idx]['action'])
            rewards.append(self.buffer[idx]['reward'])
            state_primes.append(self.buffer[idx]['state_prime'])
#             act_qs.append(self.buffer[idx]['act_qs'])

        return states, actions, rewards, state_primes #, act_qs
    
    def train_on_env(self, env, is_show = False):
        episode_reward = 0
        episode_loss = 0
        count = 0
        state = env.reset(is_show)

        while not env.is_over():
            action, act_qs = self.select_action(state)
            state_prime, reward, is_done, info = env.act(action)

            # print(f'B State: {state}, Action: {action}, Reward: {reward}, State_Prime: {state_prime}')

            self.add_buffer(state, action, reward, state_prime)
            sample_states, sample_actions, sample_rewards, sample_state_primes = self.sample(self.batch_size)
            #  Update model with TD
            with tf.GradientTape() as tape:
                tape.watch(self.online_model.trainable_variables)
                loss = self.loss(sample_states, sample_actions, sample_rewards, sample_state_primes)
                is_update_target = self.update(loss, tape)

                self.iter += 1
                count += 1

                state = state_prime
                episode_reward += reward
                episode_loss += loss
                
        env.reset()
        self.eps += 1

        return episode_reward, (episode_loss / count)

In [None]:
# FlappyBird-DDQN Experiment
# 2020/08/11 SYC 

# import models.A2C as A2C
# import models.expStrategy.epsilonGreedy as EPSG
# import envs.cartPole as cartPole
import models.util as Util
import logging
import matplotlib.pyplot as plt
from matplotlib.pylab import figure
import os
import numpy as np
# To run tqdm on notebook, import tqdm.notebook
from tqdm.notebook import tqdm
# Run on pure python
# from tqdm import tqdm

# Config Logging format
# logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO)
# Config logging module to enable on notebook
# logger = logging.getLogger()
# logger.setLevel(logging.DEBUG)

# Block any pop-up windows
os.environ['SDL_VIDEODRIVER'] = 'dummy'

# Test GPU and show the available logical & physical GPUs
Util.test_gpu()

env = FlappyBirdEnv()
# env = CartPoleEnv()
NUM_STATE_FEATURES = env.get_num_state_features()
NUM_ACTIONS = env.get_num_actions()
EPISODE_NUM = 10000
PRINT_EVERY_EPISODE = 50
LEARNING_RATE = 0.003
REWARD_DISCOUNT = 0.99
DELAY_EVERY_ITER = 1000

exp_stg = EpsilonGreedy(0.2, NUM_ACTIONS)
agent = Agent((NUM_STATE_FEATURES, ), NUM_ACTIONS, DELAY_EVERY_ITER, REWARD_DISCOUNT, LEARNING_RATE, exp_stg)

state = env.reset()
accum_reward = 0

# tqdm progress bar
bar = []
# Reward & LossHistory
r_his = []
avg_r_his = [0]
loss_his = []
episode_reward = 0

print("Episode 1")
for episode in range(1, EPISODE_NUM + 1):
    if episode % PRINT_EVERY_EPISODE == 1:
        if episode > 1:
            bar.close()
            print("Avgerage Accumulated Reward: {} | Loss: {}".format(round(accum_reward / PRINT_EVERY_EPISODE), agent.get_metrics_loss()))
            print("Episode {}".format(episode))
            agent.reset_metrics_loss()
            avg_r_his.append(round(accum_reward / PRINT_EVERY_EPISODE))
            accum_reward = 0
        bar = tqdm(total = PRINT_EVERY_EPISODE)

    episode_reward, episode_loss = agent.train_on_env(env)
    accum_reward += episode_reward
    r_his.append(episode_reward)
    loss_his.append(episode_loss)
    
    episode_reward = 0

    bar.update(1)        
    env.reset()

bar.close()    
print("Accumulated Reward: {} | Loss: {}".format(round(accum_reward / PRINT_EVERY_EPISODE), agent.get_metrics_loss()))
avg_r_his.append(round(accum_reward / PRINT_EVERY_EPISODE))
agent.reset_metrics_loss()

# Evaluate the model
agent.shutdown_explore()
agent.reset_metrics_loss()
# Reset Game
env_state = env.reset()
accum_reward = 0

while not env.is_over():
    # env.render()
    action, act_log_prob = agent.select_action(state)
    state_prime, reward, is_done, info = env.act(action)

    state = state_prime
    accum_reward += reward

print("Evaluate")
print("Accumulated Reward: {}".format(round(accum_reward)))

# Plot Reward History
# figure(num=None, figsize=(24, 6), dpi=80)
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(24, 6), dpi=80)
fig.suptitle(f'FlappyBird DDQN Result (Evaluate Reward: {round(accum_reward)})')
x_datas = range(0, len(r_his))
avg_x_datas = range(0, EPISODE_NUM + 1, PRINT_EVERY_EPISODE)

ax1.plot(x_datas, r_his, color='blue')
ax1.plot(avg_x_datas, avg_r_his, color='red')
ax1.set_xlabel('Episodes')
ax1.set_ylabel('Reward / Episode')
ax1.grid()

ax2.plot(x_datas, loss_his, color='orange')
ax2.set_xlabel('Episodes')
ax2.set_ylabel('Loss / Episode')
ax2.grid()

plt.savefig('flappyBird-DDQN-res.svg')
plt.show()

4 Physical GPUs, 1 Logical GPUs
Episode 1


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -5 | Loss: 13.07143783569336
Episode 51


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -5 | Loss: 1.8814830780029297
Episode 101


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -5 | Loss: 2.406684637069702
Episode 151


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -5 | Loss: 4.795338153839111
Episode 201


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -5 | Loss: 3.5003223419189453
Episode 251


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -5 | Loss: 4.842352867126465
Episode 301


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -5 | Loss: 3.590871572494507
Episode 351


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -5 | Loss: 4.247480392456055
Episode 401


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -5 | Loss: 5.6505513191223145
Episode 451


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -5 | Loss: 5.840968608856201
Episode 501


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -5 | Loss: 7.314149379730225
Episode 551


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -5 | Loss: 18.18058204650879
Episode 601


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -5 | Loss: 16.049421310424805
Episode 651


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -5 | Loss: 7.476778030395508
Episode 701


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -5 | Loss: 6.231672286987305
Episode 751


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -5 | Loss: 7.6787238121032715
Episode 801


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -5 | Loss: 4.7448225021362305
Episode 851


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -5 | Loss: 7.798046112060547
Episode 901


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -5 | Loss: 5.158641815185547
Episode 951


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -5 | Loss: 9.776373863220215
Episode 1001


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -5 | Loss: 6.288321018218994
Episode 1051


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -5 | Loss: 2.7699506282806396
Episode 1101


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -5 | Loss: 1.617060899734497
Episode 1151


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -5 | Loss: 1.9096837043762207
Episode 1201


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -5 | Loss: 2.0969722270965576
Episode 1251


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -5 | Loss: 1.623816967010498
Episode 1301


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -5 | Loss: 1.557982325553894
Episode 1351


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -5 | Loss: 1.4874756336212158
Episode 1401


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -5 | Loss: 1.2645293474197388
Episode 1451


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -5 | Loss: 1.236274003982544
Episode 1501


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -5 | Loss: 1.4282358884811401
Episode 1551


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -5 | Loss: 1.5292607545852661
Episode 1601


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -4 | Loss: 1.9284616708755493
Episode 1651


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -4 | Loss: 2.314467191696167
Episode 1701


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -5 | Loss: 2.1109583377838135
Episode 1751


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -5 | Loss: 1.714040994644165
Episode 1801


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -5 | Loss: 1.5188148021697998
Episode 1851


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -5 | Loss: 1.264337420463562
Episode 1901


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -4 | Loss: 1.1767215728759766
Episode 1951


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -4 | Loss: 0.9964500665664673
Episode 2001


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -2 | Loss: 0.9112913608551025
Episode 2051


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -3 | Loss: 1.0108236074447632
Episode 2101


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -4 | Loss: 1.089985966682434
Episode 2151


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -5 | Loss: 0.9630497694015503
Episode 2201


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -4 | Loss: 1.1375681161880493
Episode 2251


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -4 | Loss: 1.1195056438446045
Episode 2301


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -4 | Loss: 0.9513604044914246
Episode 2351


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -3 | Loss: 0.8562552332878113
Episode 2401


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -4 | Loss: 0.9215745329856873
Episode 2451


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -5 | Loss: 0.6665618419647217
Episode 2501


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -5 | Loss: 0.805955708026886
Episode 2551


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -5 | Loss: 1.160130500793457
Episode 2601


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -5 | Loss: 1.3908895254135132
Episode 2651


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -5 | Loss: 1.4405453205108643
Episode 2701


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -5 | Loss: 1.4799304008483887
Episode 2751


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -5 | Loss: 1.4471639394760132
Episode 2801


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -4 | Loss: 0.7744407653808594
Episode 2901


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -4 | Loss: 0.8235609531402588
Episode 2951


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -4 | Loss: 1.0144259929656982
Episode 3001


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -4 | Loss: 1.6961697340011597
Episode 3051


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -4 | Loss: 1.556186318397522
Episode 3101


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -4 | Loss: 1.222946286201477
Episode 3151


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -4 | Loss: 1.5153964757919312
Episode 3201


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -4 | Loss: 1.5242582559585571
Episode 3251


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -4 | Loss: 0.911913275718689
Episode 3301


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -5 | Loss: 0.9526376128196716
Episode 3351


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -5 | Loss: 0.8475018739700317
Episode 3401


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -5 | Loss: 1.4894357919692993
Episode 3451


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -5 | Loss: 1.9805737733840942
Episode 3501


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -5 | Loss: 1.4395755529403687
Episode 3551


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -4 | Loss: 2.0070159435272217
Episode 3601


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -4 | Loss: 1.33761465549469
Episode 3651


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -5 | Loss: 1.890002727508545
Episode 3701


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -5 | Loss: 1.9637559652328491
Episode 3751


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -5 | Loss: 2.9070839881896973
Episode 3801


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -4 | Loss: 3.6458535194396973
Episode 3851


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -2 | Loss: 3.7281174659729004
Episode 3901


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -4 | Loss: 2.089149236679077
Episode 3951


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -4 | Loss: 1.85068678855896
Episode 4001


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -4 | Loss: 1.6698994636535645
Episode 4051


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -3 | Loss: 3.0963072776794434
Episode 4101


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -3 | Loss: 4.365659713745117
Episode 4151


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -4 | Loss: 3.570122718811035
Episode 4201


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -4 | Loss: 4.393612384796143
Episode 4251


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -5 | Loss: 1.6451107263565063
Episode 4301


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -5 | Loss: 1.6401164531707764
Episode 4351


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -5 | Loss: 1.3425432443618774
Episode 4401


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -5 | Loss: 1.530549168586731
Episode 4451


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -5 | Loss: 1.1989011764526367
Episode 4501


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -5 | Loss: 1.2917654514312744
Episode 4551


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -5 | Loss: 1.607459545135498
Episode 4601


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -5 | Loss: 2.1509780883789062
Episode 4651


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -5 | Loss: 1.9597071409225464
Episode 4701


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -5 | Loss: 2.7018113136291504
Episode 4751


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -5 | Loss: 2.1540884971618652
Episode 4801


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -5 | Loss: 1.7633302211761475
Episode 4851


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -5 | Loss: 1.3567256927490234
Episode 4901


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -4 | Loss: 1.244228720664978
Episode 4951


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -4 | Loss: 1.4006562232971191
Episode 5001


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -4 | Loss: 1.4206677675247192
Episode 5051


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -4 | Loss: 1.4805175065994263
Episode 5101


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -3 | Loss: 1.632175087928772
Episode 5151


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -3 | Loss: 2.519871950149536
Episode 5201


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -4 | Loss: 2.2691705226898193
Episode 5251


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -4 | Loss: 1.781538724899292
Episode 5301


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -4 | Loss: 1.966525673866272
Episode 5351


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -5 | Loss: 1.6690917015075684
Episode 5401


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -4 | Loss: 1.35860276222229
Episode 5451


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -4 | Loss: 0.9251344203948975
Episode 5501


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -4 | Loss: 0.7628498077392578
Episode 5551


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -4 | Loss: 0.6201433539390564
Episode 5601


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -4 | Loss: 0.5875346660614014
Episode 5651


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -4 | Loss: 0.6540151238441467
Episode 5701


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -4 | Loss: 0.5255524516105652
Episode 5751


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -5 | Loss: 0.4607388973236084
Episode 5801


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -5 | Loss: 0.4240669310092926
Episode 5851


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -5 | Loss: 0.3823837339878082
Episode 5901


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -5 | Loss: 0.4437709450721741
Episode 5951


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -5 | Loss: 0.4887484014034271
Episode 6001


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -5 | Loss: 0.5060444474220276
Episode 6051


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -4 | Loss: 0.5539207458496094
Episode 6101


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -4 | Loss: 0.7758709192276001
Episode 6151


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -4 | Loss: 0.73581463098526
Episode 6201


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -5 | Loss: 1.031221628189087
Episode 6251


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -5 | Loss: 1.0788319110870361
Episode 6301


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -5 | Loss: 1.553877830505371
Episode 6351


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Avgerage Accumulated Reward: -5 | Loss: 1.5797085762023926
Episode 6401


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))