<a href="https://colab.research.google.com/github/FreeOnel/Deep-RL-Pacman/blob/main/16_04_Breakout_DQN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
""" REFERENCES
General structure from - https://keras.io/examples/rl/deep_q_network_breakout/
Prioritised replay from - https://github.com/cocolico14/N-step-Dueling-DDQN-PER-Pacman
Preprocessing wrapper from - https://github.com/openai/gym/blob/master/gym/wrappers/atari_preprocessing.py
DeepMind Rainbow DQN paper - https://arxiv.org/pdf/1710.02298.pdf
Atari environment - https://gym.openai.com/envs/Breakout-v0/
"""

In [None]:
# Weights & Biases client; wandb.init / wandb.log are used later in this notebook
!pip install wandb
# System packages (xvfb, python-opengl, ffmpeg); all output of this command is discarded
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1
# NOTE(review): the colabgymrender Recorder import and its usage are commented out
# in the cells shown, so this install may no longer be needed - confirm
!pip install -U colabgymrender

Requirement already up-to-date: colabgymrender in /usr/local/lib/python3.7/dist-packages (1.0.8)


In [None]:
# Import necessary modules

import wandb
from wandb.keras import WandbCallback
import gym
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import cv2
from PIL import Image
import heapq
from collections import deque
import random
from itertools import count
#from colabgymrender.recorder import Recorder

In [None]:
# Set parameters
# All values below are module-level globals read by the functions and the training loop.


gamma = 0.99                                          # Discount factor applied to the bootstrapped next-state value
epsilon = 1.0                                         # Initial exploration rate. NOTE(review): epsilon_greedy computes its own local epsilon from the step count, so this global looks unused in the cells shown - confirm
epsilon_min = 0.1                                     # Floor of the exploration rate once decay has finished
epsilon_max = 1.0                                     # Exploration rate at step 0
epsilon_interval = (epsilon_max - epsilon_min)        # Total amount epsilon drops over the decay schedule (a range, not a per-step rate)
epsilon_decay_steps = 300000                          # Number of steps over which epsilon decays linearly from max to min
batch_size = 32                                       # Number of transitions used in each replay update
max_steps_per_episode = 10000                         # Upper bound on the per-episode step loop, to prevent episodes running too long
replay_steps = 4                                      # A replay update runs whenever total_steps is a multiple of this value
target_update = 10000                                  # Target model receives the main model's weights whenever total_steps is a multiple of this value
max_buffer = 50000                                    # Maximum number of transitions kept in the replay buffer
alpha = 0.6                                           # Fraction of each replay batch taken from the highest-priority transitions

In [None]:
from gym.wrappers import AtariPreprocessing

# Instantiate environment

env = gym.make("BreakoutNoFrameskip-v4")

# Number of discrete actions, read from the environment instead of being hard-coded,
# so the network output size and the random-action sampling always match the game.
action_space = int(env.action_space.n)

# Built-in preprocessing wrapper converts observations (grayscale, downsize).
# NOTE(review): the networks in this notebook take a single (84, 84, 1) frame, so no
# frame stacking appears to be in effect; per the gym documentation this wrapper
# performs frame *skipping*, and stacking would need a separate wrapper - confirm.
# Print np.shape(env.reset()) before and after to see changes
env = AtariPreprocessing(env)


In [None]:
# Create VANILLA Deep Q-Network

def create_model():
    """Build the vanilla convolutional Q-network.

    The network maps one 84x84 single-channel frame to one linear output per
    action (`action_space` outputs).
    NOTE(review): the original comment attributes this architecture to a
    DeepMind paper without giving the year - confirm the citation.

    Returns:
        An uncompiled keras.Model taking inputs of shape (84, 84, 1).
    """
    frame = layers.Input(shape=(84, 84, 1,))

    # Convolutional stack, all ReLU: (filters, kernel size, stride) per layer
    features = frame
    for filters, kernel, stride in ((32, 8, 4), (64, 4, 2), (64, 3, 1)):
        features = layers.Conv2D(filters, kernel, strides=stride, activation="relu")(features)

    features = layers.Flatten()(features)

    # Fully connected head: one hidden ReLU layer, then linear action values
    hidden = layers.Dense(512, activation="relu")(features)
    action_values = layers.Dense(action_space, activation="linear")(hidden)

    return keras.Model(inputs=frame, outputs=action_values)


In [None]:
# Create DUELLING Deep Q-Network
# Defined under its own name so that it neither breaks the cell (the previous version
# was wrapped in an unterminated triple-quoted string) nor silently shadows the
# vanilla create_model(). To train with it, build the networks with
# create_duelling_model() instead of create_model().

def create_duelling_model():
    """Build the duelling variant of the convolutional Q-network.

    Shares the convolutional stack of the vanilla network, then splits into a
    scalar state-value stream and a per-action advantage stream that are
    recombined as Q = V + (A - mean(A)).

    Returns:
        An uncompiled keras.Model taking inputs of shape (84, 84, 1) and
        producing `action_space` action values.
    """
    inputs = layers.Input(shape=(84, 84, 1,))

    # 3 convolutional layers with relu activation
    layer_1 = layers.Conv2D(32, 8, strides=4, activation="relu")(inputs)
    layer_2 = layers.Conv2D(64, 4, strides=2, activation="relu")(layer_1)
    layer_3 = layers.Conv2D(64, 3, strides=1, activation="relu")(layer_2)

    flat_layer_3 = layers.Flatten()(layer_3)

    # Duelling network: separate value and advantage heads on the shared features
    value_layer = layers.Dense(1, activation='linear')(flat_layer_3)
    advantage_layer = layers.Dense(action_space, activation='linear')(flat_layer_3)

    # Subtracting the mean advantage keeps the value/advantage split identifiable
    Q = value_layer + tf.subtract(advantage_layer, tf.reduce_mean(advantage_layer, axis=1, keepdims=True))

    # Input is a state representation, output is action values
    return keras.Model(inputs=inputs, outputs=Q)

In [None]:
# Create main and target q-networks

model = create_model()
target_model = create_model()

# Each create_model() call initialises its own random weights. Copy the main network's
# weights into the target network so the bootstrap targets used before the first
# periodic sync come from the same parameters as the network being trained.
target_model.set_weights(model.get_weights())

In [None]:
# Adam optimiser performs the gradient updates
optimizer = keras.optimizers.Adam(learning_rate=2.5e-4)

# Huber loss between predicted and target action values
loss_function = keras.losses.Huber()

# Attach the optimiser and loss to both the main and the target network
model.compile(optimizer=optimizer, loss=loss_function)
target_model.compile(optimizer=optimizer, loss=loss_function)

In [None]:
# Define epsilon greedy step

def epsilon_greedy(action, step):
  """Return either the given greedy action or a uniformly random one.

  The exploration rate decays linearly with `step` from epsilon_max and is
  clamped at epsilon_min. It is printed whenever `step` is a multiple of 1000.

  Args:
    action: The greedy action to use when not exploring.
    step: Global step count that drives the decay schedule.

  Returns:
    `action`, or a random integer in [0, action_space) when exploring.
  """
  # Linear schedule, clamped at the configured minimum
  decayed = epsilon_max - epsilon_interval * step/epsilon_decay_steps
  exploration_rate = decayed if decayed > epsilon_min else epsilon_min

  if step % 1000 == 0:
    print("epsilon = ", exploration_rate, "at step ", step)

  explore = np.random.rand() < exploration_rate
  return np.random.choice(action_space) if explore else action

In [None]:
 # Calculate TD error for use in prioritised replay
 
 def calculate_td_error(transition):
         """Compute the TD error of one transition for use as a replay priority.

         The target is the reward alone for terminal transitions; otherwise it is
         reward + gamma * Q_target(next_state, argmax_a Q_main(next_state, a)).

         Args:
             transition: (state, action, reward, next_state, Terminal) tuple. `state`
                 is passed to the model as-is; `next_state` gets a leading axis added here.

         Returns:
             Q_main(state, action) - target, as a float32 NumPy value.
         """
         state, action, reward, next_state, Terminal = transition

         if Terminal:
             # No bootstrapping past the end of an episode
             target_q = reward
         else:
             next_batch = tf.expand_dims(next_state, 0)
             # Main network picks the action, target network evaluates it
             online_next_q = model(next_batch, training=False)
             best_next_action = np.argmax(online_next_q[0])
             bootstrap = target_model(next_batch, training=False)[0][best_next_action]
             target_q = (reward + gamma * bootstrap)

         current_q = model(state, training=False)[0][action]

         return tf.cast(current_q - target_q, dtype=tf.float32).numpy()

In [None]:
# Define replay

def replay(batch_size):
  """Sample a semi-prioritised batch from `buffer` and fit the main network on it.

  int(batch_size * alpha) transitions are the buffer entries with the largest
  stored priority; the rest are drawn uniformly at random from the whole buffer,
  so a transition can appear in both parts. Targets follow Double DQN: the main
  network selects the next action and the target network evaluates it. Only the
  taken action's value is replaced by the target; the other outputs keep the
  network's own predictions and so contribute no loss.

  Args:
    batch_size: Total number of transitions to train on. random.sample raises
      ValueError if the buffer holds fewer entries than the uniform part needs.
  """
  # Semi Stochastic Prioritization
  n_prioritised = int(batch_size * alpha)
  sampled = heapq.nlargest(n_prioritised, buffer) + random.sample(buffer, batch_size - n_prioritised)
  transitions = [transition for (_, _, transition) in sampled]
  n_samples = len(transitions)
  if n_samples == 0:
    return

  # Stack the batch into arrays. Each frame is reshaped individually, so stored states
  # with or without a leading batch axis are accepted, and the batch dimension follows
  # the actual sample count instead of a hard-coded 32.
  states = np.array([np.asarray(s).reshape(84, 84, 1) for s, _, _, _, _ in transitions])
  next_states = np.array([np.asarray(ns).reshape(84, 84, 1) for _, _, _, ns, _ in transitions])
  actions = np.array([int(a) for _, a, _, _, _ in transitions])
  rewards = np.array([r for _, _, r, _, _ in transitions], dtype=np.float32)
  terminals = np.array([bool(t) for _, _, _, _, t in transitions])

  rows = np.arange(n_samples)

  # One batched forward pass per network instead of several passes per transition.
  # np.array(...) yields a writable copy of the predictions.
  targets = np.array(model(states, training=False))

  # Double DQN: action chosen by the main network, valued by the target network
  best_next_actions = np.argmax(np.asarray(model(next_states, training=False)), axis=1)
  next_q = np.asarray(target_model(next_states, training=False))[rows, best_next_actions]

  # Terminal transitions do not bootstrap
  targets[rows, actions] = np.where(terminals, rewards, rewards + gamma * next_q)

  model.fit(states, targets, batch_size=batch_size, epochs=1, verbose=0)

  return

In [None]:
# Record training with the wandb (Weights & Biases) API
# `wb` is also checked in the training loop before each wandb.log call,
# so setting it to False disables both initialisation and logging.
wb = True
if wb:
  wandb.init(project='Breakout', entity='pacman_dqn')

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [None]:
tf.config.run_functions_eagerly(True)

num_episodes = 20000                                  # Number of training episodes

# Replay buffer kept as a min-heap of (squared TD error, insertion counter, transition).
# The counter breaks ties so that transitions themselves are never compared.
buffer = []
total_steps = 0
total_episodes = 0
Terminal = False
tiebreaker = count()
#directory = './video'
#env = Recorder(env, directory)

# Inclusive upper bound so that exactly num_episodes episodes run, matching the log message
for ep in range(1, num_episodes + 1):

  state = env.reset()
  episodic_reward = 0

  for step in range(1, max_steps_per_episode + 1):

    # Add a leading batch axis; the state is stored in the transition in this form
    state = tf.expand_dims(state, 0)
    all_actions = model(state, training=False)
    max_action = np.argmax(all_actions[0])
    action = epsilon_greedy(max_action, total_steps)

    next_state, reward, Terminal, _ = env.step(action)

    transition = (state, action, reward, next_state, Terminal)
    td_error = calculate_td_error(transition)

    # Use square of td_error to account for negatives
    square_error = td_error * td_error

    # Push onto the heap (O(log n)) instead of re-heapifying the whole buffer every step
    heapq.heappush(buffer, (square_error, next(tiebreaker), transition))

    episodic_reward += reward
    total_steps += 1
    state = next_state

    # Perform replay every replay_steps steps once the buffer can fill a batch
    if len(buffer) > batch_size and total_steps % replay_steps == 0 :
      replay(batch_size)

    # Update target model
    if total_steps % target_update == 0:
      target_model.set_weights(model.get_weights())

    # Keep buffer to specified length: pop the heap root, i.e. the transition with the
    # smallest squared TD error, which keeps the heap invariant intact
    if len(buffer) > max_buffer:
      heapq.heappop(buffer)

    if Terminal:
      if wb:
        # NOTE(review): the 'episodes:' key carries a trailing colon; left unchanged so
        # that existing W&B runs keep the same metric name - confirm before renaming
        wandb.log({'episodes:': ep, 'episodic_reward': episodic_reward, 'steps': step, 'average_steps_per_episode': total_steps/ep})
      print("episode: {}/{}, reward: {}".format(ep, num_episodes, episodic_reward))
      break




epsilon =  1.0 at step  0
episode: 1/20000, reward: 0.0
episode: 2/20000, reward: 0.0
episode: 3/20000, reward: 1.0
episode: 4/20000, reward: 0.0
episode: 5/20000, reward: 0.0
episode: 6/20000, reward: 0.0
epsilon =  0.997 at step  1000
episode: 7/20000, reward: 1.0
episode: 8/20000, reward: 3.0
episode: 9/20000, reward: 2.0
episode: 10/20000, reward: 1.0
episode: 11/20000, reward: 0.0
episode: 12/20000, reward: 1.0
epsilon =  0.994 at step  2000
episode: 13/20000, reward: 1.0
episode: 14/20000, reward: 1.0
episode: 15/20000, reward: 0.0
episode: 16/20000, reward: 0.0
episode: 17/20000, reward: 0.0
episode: 18/20000, reward: 0.0
epsilon =  0.991 at step  3000
episode: 19/20000, reward: 2.0
episode: 20/20000, reward: 0.0
episode: 21/20000, reward: 1.0
episode: 22/20000, reward: 0.0
episode: 23/20000, reward: 0.0
episode: 24/20000, reward: 2.0
epsilon =  0.988 at step  4000
episode: 25/20000, reward: 2.0
episode: 26/20000, reward: 1.0
episode: 27/20000, reward: 2.0
episode: 28/20000, rew