In [1]:
import numpy as np
import gym
import gym_chrome_dino
import tensorflow as tf
import time
import random
import os
import cv2
from PIL import Image
from tensorflow.python.keras import callbacks
from tqdm import tqdm
from collections import deque

In [2]:
# ---------------------------------------------------------------------------
# Hyper-parameters and run settings (tune here, nowhere else)
# ---------------------------------------------------------------------------

# Run identity / length
MODEL_NAME = 'Dino_run'          # used in the TensorBoard log dir and saved-model file names
EPISODES = 20_000                # number of episodes the training loop iterates over

# Q-learning
DISCOUNT = 0.99                  # weight applied to the max future Q value in the target
UPDATE_TARGET_EVERY = 5          # target-network sync threshold, counted in terminal states

# Replay memory
REPLAY_MEMORY_SIZE = 50_000      # maxlen of the replay deque
MIN_REPLAY_MEMORY_SIZE = 1_000   # training is skipped until this many transitions are stored
MINI_BATCH_SIZE = 64             # transitions sampled per training call

# Exploration (epsilon-greedy); epsilon is mutated by the training loop
epsilon = 1
EPSILON_DECAY = 0.99975          # epsilon is multiplied by this once per episode
MIN_EPSILON = 0.001              # lower bound for epsilon

# Reporting / checkpointing
AGGREGATE_STATS_EVERY = 50       # episodes between stats aggregation (and window size)
MIN_REWARD = -200                # model is saved only if the window's min reward >= this

# Not referenced by any code visible in this notebook.
MEMORY_FRACTION = 0.2

In [3]:
# Per-episode reward history. It is pre-seeded with one placeholder value of
# -200; the training loop appends real episode rewards after it, so the
# placeholder is part of the first aggregation window(s).
ep_rewards = [-200]

# Seed every random number generator used in this notebook (Python's
# `random`, NumPy and TensorFlow) so runs are as repeatable as possible.
random.seed(1)
np.random.seed(1)
tf.random.set_seed(1)

# Create the model folder. `exist_ok=True` makes the call idempotent and
# avoids the check-then-create race of a separate `os.path.isdir` test.
os.makedirs('./models', exist_ok=True)

In [4]:
from tensorflow.keras.callbacks import TensorBoard

class ModifiedTensorBoard(TensorBoard):
    """TensorBoard callback that logs through a single, long-lived writer.

    The callback keeps its own summary writer (created once in ``__init__``)
    and its own ``step`` attribute, which is used as the x-axis value for
    every scalar written by :meth:`update_stats`. Callers are expected to set
    ``step`` themselves before logging.

    ``set_model`` is overridden without calling the parent implementation,
    and the per-batch / end-of-training hooks are no-ops.
    NOTE(review): presumably this avoids the stock callback creating a new
    log file on every ``fit()`` call -- confirm against the installed Keras
    version, since it relies on private attributes (``_train_counter``,
    ``_test_counter``, ``_log_write_dir``).
    """

    def __init__(self, **kwargs):
        """Forward ``kwargs`` to ``TensorBoard`` and create the shared writer."""
        super().__init__(**kwargs)
        self.step = 1
        self.writer = tf.summary.create_file_writer(self.log_dir)
        self._log_write_dir = self.log_dir

    def set_model(self, model):
        """Store ``model`` and set the directory/step attributes directly.

        The parent ``set_model`` is intentionally not called.
        """
        self.model = model

        self._train_dir = os.path.join(self._log_write_dir, 'train')
        self._train_step = self.model._train_counter

        self._val_dir = os.path.join(self._log_write_dir, 'validation')
        self._val_step = self.model._test_counter

        self._should_write_train_graph = False

    def on_epoch_end(self, epoch, logs=None):
        """Write the epoch's metrics as scalars at the current ``self.step``.

        ``logs`` may be ``None`` (it is the declared default); that is treated
        as "nothing to log" instead of raising ``TypeError`` on ``**None``.
        """
        self.update_stats(**(logs or {}))

    def on_batch_end(self, batch, logs=None):
        """No-op: nothing is logged per batch."""
        pass

    def on_train_end(self, _=None):
        """No-op: the writer stays open after a ``fit()`` call finishes."""
        pass

    def update_stats(self, **stats):
        """Write each ``name=value`` pair as a scalar summary at ``self.step``.

        The writer is flushed once after all scalars have been written.
        """
        with self.writer.as_default():
            for key, value in stats.items():
                tf.summary.scalar(key, value, step=self.step)
            self.writer.flush()

In [7]:
from tensorflow.keras import layers, Sequential

# Create agent
class DQNAgent:
    """Deep Q-learning agent with a main network, a target network and replay memory.

    The class reads the module-level ``env`` (for input shape and action
    count) and the module-level hyper-parameters, so both must exist before
    an instance is created.

    Replay transitions are tuples of
    ``(current_state, action, reward, new_state, done)``.
    """

    def __init__(self):
        """Build both networks, the replay buffer and the TensorBoard callback."""
        # main model: trained on every step
        self.model = self.create_model()
        # target model: used for future-Q predictions, synced periodically
        self.target_model = self.create_model()
        self.target_model.set_weights(self.model.get_weights())

        # Bounded buffer holding the most recent transitions
        self.replay_memory = deque(maxlen=REPLAY_MEMORY_SIZE)

        self.tensorboard = ModifiedTensorBoard(log_dir="logs/{}-{}".format(MODEL_NAME, int(time.time())))

        # Number of terminal states seen since the last target-network sync
        self.target_update_counter = 0

    def create_model(self):
        """Return a compiled convolutional Q-network.

        The input shape is taken from ``env.render().shape`` and the output
        layer has one linear unit per action in ``env.action_space``.

        NOTE(review): the recorded run of this notebook ends in a GPU OOM on
        the first conv layer's activation (shape [64, 256, 148, 598]), i.e.
        256 filters over a full-resolution frame with a batch of 64.
        Downscaling / grayscaling frames or reducing the filter count would
        be needed to train on that hardware; not changed here because it
        would alter the stored state format.
        """
        model = Sequential([
            layers.Conv2D(256, 3, activation='relu', input_shape=(env.render().shape)),
            layers.MaxPool2D(),
            layers.BatchNormalization(),
            layers.Dropout(0.2),
            layers.Conv2D(128, 3, activation='relu'),
            layers.MaxPool2D(),
            layers.BatchNormalization(),
            layers.Dropout(0.2),
            layers.Conv2D(64, 3, activation='relu'),
            layers.MaxPool2D(),
            layers.BatchNormalization(),
            layers.Dropout(0.2),
            layers.Flatten(),
            layers.Dense(32, activation='relu'),
            # linear output: raw Q values, the action is chosen with np.argmax
            layers.Dense(env.action_space.n, activation='linear')])

        model.compile(loss='mse',
                      optimizer='adam',
                      metrics=['mae'])
        return model

    def update_replay_memory(self, transition):
        """Append one transition to the replay memory."""
        self.replay_memory.append(transition)

    def train(self, terminal_state, step):
        """Run one training step on a random mini batch from replay memory.

        Args:
            terminal_state: truthy when the step that was just taken ended the
                episode. Controls TensorBoard logging and target-sync counting.
            step: step index within the episode (currently unused; kept for
                interface compatibility).

        Returns:
            None. Returns early, without training, while fewer than
            ``MIN_REPLAY_MEMORY_SIZE`` transitions are stored.
        """
        # start training only if a certain number of samples is already saved
        if len(self.replay_memory) < MIN_REPLAY_MEMORY_SIZE:
            return

        # random mini batch of transitions from the replay memory
        mini_batch = random.sample(self.replay_memory, MINI_BATCH_SIZE)

        # Q values of the sampled current states, from the main network
        current_states = np.array([transition[0] for transition in mini_batch])/255
        current_qs_list = self.model.predict(current_states, verbose=0)

        # Q values of the sampled next states, from the target network
        new_current_states = np.array([transition[3] for transition in mini_batch])/255
        future_qs_list = self.target_model.predict(new_current_states, verbose=0)

        # Overwrite, in place, the Q value of the action actually taken with
        # its target; the other actions keep the network's own prediction.
        for idx, (_, action, reward, _, done) in enumerate(mini_batch):
            if done:
                # terminal transition: no future reward
                new_q = reward
            else:
                new_q = reward + DISCOUNT * np.max(future_qs_list[idx])
            current_qs_list[idx][action] = new_q

        # Fit on all samples as one batch, log only on terminal state.
        # `current_states` is already the normalised batch in mini-batch
        # order, so it is reused rather than rebuilt and rescaled.
        self.model.fit(current_states,
                       current_qs_list,
                       batch_size=MINI_BATCH_SIZE,
                       verbose=0,
                       shuffle=False,
                       callbacks=[self.tensorboard] if terminal_state else None)

        if terminal_state:
            self.target_update_counter += 1

            # Sync the target network once UPDATE_TARGET_EVERY terminal states
            # have been seen (`>=`, so the period is exactly that many).
            if self.target_update_counter >= UPDATE_TARGET_EVERY:
                self.target_model.set_weights(self.model.get_weights())
                self.target_update_counter = 0

    def get_qs(self, state):
        """Return the main network's Q values for a single ``state``.

        ``state`` is reshaped to a batch of one and divided by 255 before
        prediction; the first (only) row of the result is returned.
        """
        return self.model.predict(np.array(state).reshape(-1, *state.shape)/255, verbose=0)[0]

env = gym.make('ChromeDino-v0')

# Everything after environment creation runs under try/finally so the
# environment is closed even when training aborts (e.g. with the GPU OOM
# recorded in this notebook's output).
try:
    agent = DQNAgent()

    for episode in tqdm(range(1, EPISODES+1), ascii=True, unit='episodes'):

        # Scalars logged during this episode are written at x = episode
        agent.tensorboard.step = episode

        episode_reward = 0
        step = 1

        current_state = env.reset()

        done = False
        while not done:
            # epsilon-greedy: exploit the network with probability 1 - epsilon,
            # otherwise pick a uniformly random action
            if np.random.random() > epsilon:
                action = np.argmax(agent.get_qs(current_state))
            else:
                action = np.random.randint(0, env.action_space.n)

            new_state, reward, done, _ = env.step(action)

            episode_reward += reward

            # store the transition, then train on a sampled mini batch
            agent.update_replay_memory((current_state, action, reward, new_state, done))
            agent.train(done, step)

            current_state = new_state
            step += 1

        ep_rewards.append(episode_reward)
        if not episode % AGGREGATE_STATS_EVERY or episode == 1:
            # stats over the most recent AGGREGATE_STATS_EVERY recorded rewards
            recent_rewards = ep_rewards[-AGGREGATE_STATS_EVERY:]
            average_reward = sum(recent_rewards)/len(recent_rewards)
            min_reward = min(recent_rewards)
            max_reward = max(recent_rewards)
            agent.tensorboard.update_stats(reward_avg=average_reward, reward_min=min_reward, reward_max=max_reward, epsilon=epsilon)

            # checkpoint only when the worst recent episode clears the threshold
            if min_reward >= MIN_REWARD:
                agent.model.save(f'models/{MODEL_NAME}__{max_reward:_>7.2f}max_{average_reward:_>7.2f}avg_{min_reward:_>7.2f}min__{int(time.time())}.model')

        # decay exploration once per episode, never below MIN_EPSILON
        if epsilon > MIN_EPSILON:
            epsilon *= EPSILON_DECAY
            epsilon = max(MIN_EPSILON, epsilon)
finally:
    env.close()

  0%|          | 0/20000 [00:00<?, ?episodes/s]

INFO:tensorflow:Assets written to: models/Dino_run____31.60max__-12.81avg_-200.00min__1638695336.model\assets


  0%|          | 4/20000 [00:24<31:37:38,  5.69s/episodes]



  0%|          | 4/20000 [00:39<55:30:33,  9.99s/episodes]


ResourceExhaustedError:  OOM when allocating tensor with shape[64,256,148,598] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[node sequential_4/conv2d_12/Relu
 (defined at C:\Users\dddru\AppData\Local\Programs\Python\Python37\lib\site-packages\keras\backend.py:4867)
]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.
 [Op:__inference_train_function_12325]

Errors may have originated from an input operation.
Input Source operations connected to node sequential_4/conv2d_12/Relu:
In[0] sequential_4/conv2d_12/BiasAdd (defined at C:\Users\dddru\AppData\Local\Programs\Python\Python37\lib\site-packages\keras\layers\convolutional.py:265)

Operation defined at: (most recent call last)
>>>   File "C:\Users\dddru\AppData\Local\Programs\Python\Python37\lib\runpy.py", line 193, in _run_module_as_main
>>>     "__main__", mod_spec)
>>> 
>>>   File "C:\Users\dddru\AppData\Local\Programs\Python\Python37\lib\runpy.py", line 85, in _run_code
>>>     exec(code, run_globals)
>>> 
>>>   File "C:\Users\dddru\AppData\Local\Programs\Python\Python37\lib\site-packages\ipykernel_launcher.py", line 16, in <module>
>>>     app.launch_new_instance()
>>> 
>>>   File "C:\Users\dddru\AppData\Local\Programs\Python\Python37\lib\site-packages\traitlets\config\application.py", line 846, in launch_instance
>>>     app.start()
>>> 
>>>   File "C:\Users\dddru\AppData\Local\Programs\Python\Python37\lib\site-packages\ipykernel\kernelapp.py", line 677, in start
>>>     self.io_loop.start()
>>> 
>>>   File "C:\Users\dddru\AppData\Local\Programs\Python\Python37\lib\site-packages\tornado\platform\asyncio.py", line 199, in start
>>>     self.asyncio_loop.run_forever()
>>> 
>>>   File "C:\Users\dddru\AppData\Local\Programs\Python\Python37\lib\asyncio\base_events.py", line 538, in run_forever
>>>     self._run_once()
>>> 
>>>   File "C:\Users\dddru\AppData\Local\Programs\Python\Python37\lib\asyncio\base_events.py", line 1782, in _run_once
>>>     handle._run()
>>> 
>>>   File "C:\Users\dddru\AppData\Local\Programs\Python\Python37\lib\asyncio\events.py", line 88, in _run
>>>     self._context.run(self._callback, *self._args)
>>> 
>>>   File "C:\Users\dddru\AppData\Local\Programs\Python\Python37\lib\site-packages\ipykernel\kernelbase.py", line 457, in dispatch_queue
>>>     await self.process_one()
>>> 
>>>   File "C:\Users\dddru\AppData\Local\Programs\Python\Python37\lib\site-packages\ipykernel\kernelbase.py", line 446, in process_one
>>>     await dispatch(*args)
>>> 
>>>   File "C:\Users\dddru\AppData\Local\Programs\Python\Python37\lib\site-packages\ipykernel\kernelbase.py", line 353, in dispatch_shell
>>>     await result
>>> 
>>>   File "C:\Users\dddru\AppData\Local\Programs\Python\Python37\lib\site-packages\ipykernel\kernelbase.py", line 648, in execute_request
>>>     reply_content = await reply_content
>>> 
>>>   File "C:\Users\dddru\AppData\Local\Programs\Python\Python37\lib\site-packages\ipykernel\ipkernel.py", line 353, in do_execute
>>>     res = shell.run_cell(code, store_history=store_history, silent=silent)
>>> 
>>>   File "C:\Users\dddru\AppData\Local\Programs\Python\Python37\lib\site-packages\ipykernel\zmqshell.py", line 533, in run_cell
>>>     return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
>>> 
>>>   File "C:\Users\dddru\AppData\Local\Programs\Python\Python37\lib\site-packages\IPython\core\interactiveshell.py", line 2902, in run_cell
>>>     raw_cell, store_history, silent, shell_futures)
>>> 
>>>   File "C:\Users\dddru\AppData\Local\Programs\Python\Python37\lib\site-packages\IPython\core\interactiveshell.py", line 2947, in _run_cell
>>>     return runner(coro)
>>> 
>>>   File "C:\Users\dddru\AppData\Local\Programs\Python\Python37\lib\site-packages\IPython\core\async_helpers.py", line 68, in _pseudo_sync_runner
>>>     coro.send(None)
>>> 
>>>   File "C:\Users\dddru\AppData\Local\Programs\Python\Python37\lib\site-packages\IPython\core\interactiveshell.py", line 3173, in run_cell_async
>>>     interactivity=interactivity, compiler=compiler, result=result)
>>> 
>>>   File "C:\Users\dddru\AppData\Local\Programs\Python\Python37\lib\site-packages\IPython\core\interactiveshell.py", line 3364, in run_ast_nodes
>>>     if (await self.run_code(code, result,  async_=asy)):
>>> 
>>>   File "C:\Users\dddru\AppData\Local\Programs\Python\Python37\lib\site-packages\IPython\core\interactiveshell.py", line 3444, in run_code
>>>     exec(code_obj, self.user_global_ns, self.user_ns)
>>> 
>>>   File "C:\Users\dddru\AppData\Local\Temp/ipykernel_11284/134644314.py", line 122, in <module>
>>>     agent.train(done, step)
>>> 
>>>   File "C:\Users\dddru\AppData\Local\Temp/ipykernel_11284/134644314.py", line 86, in train
>>>     callbacks=[self.tensorboard] if terminal_state else None)
>>> 
>>>   File "C:\Users\dddru\AppData\Local\Programs\Python\Python37\lib\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
>>>     return fn(*args, **kwargs)
>>> 
>>>   File "C:\Users\dddru\AppData\Local\Programs\Python\Python37\lib\site-packages\keras\engine\training.py", line 1216, in fit
>>>     tmp_logs = self.train_function(iterator)
>>> 
>>>   File "C:\Users\dddru\AppData\Local\Programs\Python\Python37\lib\site-packages\keras\engine\training.py", line 878, in train_function
>>>     return step_function(self, iterator)
>>> 
>>>   File "C:\Users\dddru\AppData\Local\Programs\Python\Python37\lib\site-packages\keras\engine\training.py", line 867, in step_function
>>>     outputs = model.distribute_strategy.run(run_step, args=(data,))
>>> 
>>>   File "C:\Users\dddru\AppData\Local\Programs\Python\Python37\lib\site-packages\keras\engine\training.py", line 860, in run_step
>>>     outputs = model.train_step(data)
>>> 
>>>   File "C:\Users\dddru\AppData\Local\Programs\Python\Python37\lib\site-packages\keras\engine\training.py", line 808, in train_step
>>>     y_pred = self(x, training=True)
>>> 
>>>   File "C:\Users\dddru\AppData\Local\Programs\Python\Python37\lib\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
>>>     return fn(*args, **kwargs)
>>> 
>>>   File "C:\Users\dddru\AppData\Local\Programs\Python\Python37\lib\site-packages\keras\engine\base_layer.py", line 1083, in __call__
>>>     outputs = call_fn(inputs, *args, **kwargs)
>>> 
>>>   File "C:\Users\dddru\AppData\Local\Programs\Python\Python37\lib\site-packages\keras\utils\traceback_utils.py", line 92, in error_handler
>>>     return fn(*args, **kwargs)
>>> 
>>>   File "C:\Users\dddru\AppData\Local\Programs\Python\Python37\lib\site-packages\keras\engine\sequential.py", line 373, in call
>>>     return super(Sequential, self).call(inputs, training=training, mask=mask)
>>> 
>>>   File "C:\Users\dddru\AppData\Local\Programs\Python\Python37\lib\site-packages\keras\engine\functional.py", line 452, in call
>>>     inputs, training=training, mask=mask)
>>> 
>>>   File "C:\Users\dddru\AppData\Local\Programs\Python\Python37\lib\site-packages\keras\engine\functional.py", line 589, in _run_internal_graph
>>>     outputs = node.layer(*args, **kwargs)
>>> 
>>>   File "C:\Users\dddru\AppData\Local\Programs\Python\Python37\lib\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
>>>     return fn(*args, **kwargs)
>>> 
>>>   File "C:\Users\dddru\AppData\Local\Programs\Python\Python37\lib\site-packages\keras\engine\base_layer.py", line 1083, in __call__
>>>     outputs = call_fn(inputs, *args, **kwargs)
>>> 
>>>   File "C:\Users\dddru\AppData\Local\Programs\Python\Python37\lib\site-packages\keras\utils\traceback_utils.py", line 92, in error_handler
>>>     return fn(*args, **kwargs)
>>> 
>>>   File "C:\Users\dddru\AppData\Local\Programs\Python\Python37\lib\site-packages\keras\layers\convolutional.py", line 273, in call
>>>     return self.activation(outputs)
>>> 
>>>   File "C:\Users\dddru\AppData\Local\Programs\Python\Python37\lib\site-packages\keras\activations.py", line 311, in relu
>>>     return backend.relu(x, alpha=alpha, max_value=max_value, threshold=threshold)
>>> 
>>>   File "C:\Users\dddru\AppData\Local\Programs\Python\Python37\lib\site-packages\keras\backend.py", line 4867, in relu
>>>     x = tf.nn.relu(x)
>>> 

In [None]:
# Inspection only: bare expression that shows the agent's TensorBoard callback
# object when the cell is run. Requires `agent` from the training cell.
agent.tensorboard

1