This notebook is set up to run on Google Colab because of the high processing requirements of DQNs.

We therefore start by mounting Google Drive.

In [None]:
# Mount Google Drive at /content/gdrive/ so the working folder configured below
# can be reached from this runtime.
from google.colab import drive
drive.mount('/content/gdrive/')

Mounted at /content/gdrive/


Here we set the working folder. To run this notebook from your own Google Drive, change this path to a folder that exists in your Drive.

In [None]:
# Root folder for saved models, test videos and CSV logs. Keep the trailing
# slash: later cells build paths by plain string concatenation with this value.
working_folder = '/content/gdrive/MyDrive/462/ResearchProject/'

In [None]:
import sys
# Add the working folder to the module search path so Python files stored
# there can be imported from this notebook.
sys.path.append(working_folder)

Loading the Atari ROMs.

In [None]:
!wget http://www.atarimania.com/roms/Roms.rar 
!unrar x -o+ /content/Roms.rar >/dev/nul
!python -m atari_py.import_roms /content/ROMS >/dev/nul

--2022-05-20 18:30:04--  http://www.atarimania.com/roms/Roms.rar
Resolving www.atarimania.com (www.atarimania.com)... 195.154.81.199
Connecting to www.atarimania.com (www.atarimania.com)|195.154.81.199|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 19583716 (19M) [application/x-rar-compressed]
Saving to: ‘Roms.rar’


2022-05-20 18:31:45 (189 KB/s) - ‘Roms.rar’ saved [19583716/19583716]



Defining a separate class for the Replay Memory structure.

In [None]:
import numpy as np


class Memory:
    """Fixed-size ring buffer of frames, actions, rewards and done flags.

    Each call to ``add_memory`` fills one slot. The training loop in this
    notebook stores the frame observed *before* acting together with the
    action taken on it and the reward / done flag that the action produced.
    The transition that ends at slot ``i`` is therefore::

        (state(i - 1), actions[i - 1], rewards[i - 1], state(i), dones[i - 1])

    where ``state(k)`` stacks ``frames[k], frames[k - 1], ...`` (newest frame
    in channel 0).

    Parameters
    ----------
    size : int
        Total number of frames that can be stored.
    rows, cols : int
        Height and width of each stored frame.
    history : int
        Number of frames stacked into one state.
    batch_size : int
        Number of transitions returned by ``get_minibatch``.
    """

    # Upper bound on re-sampling rounds so a buffer with no valid index
    # raises instead of spinning forever.
    _MAX_RESAMPLE_ROUNDS = 1000

    def __init__(self, size, rows, cols, history, batch_size):
        self.size = size  # Total number of frames allowed to be stored
        self.rows = rows  # Number of rows in each frame
        self.cols = cols  # Number of cols in each frame
        self.history = history  # Number of frames to be kept in a state
        self.batch_size = batch_size  # Number of states to be sent to learn

        # Pre-allocating space for the state information
        self.frames = np.empty((self.size, self.rows, self.cols), dtype=np.uint8)
        self.actions = np.empty(self.size, dtype=np.uint8)
        self.rewards = np.empty(self.size, dtype=np.float32)
        self.dones = np.empty(self.size, dtype=bool)

        self.count = 0  # Next slot to be written
        self.filled = False  # True once the buffer has wrapped around

    def add_memory(self, frame, action, reward, done):
        """Store one step in the next slot, overwriting the oldest when full."""
        self.frames[self.count] = frame
        self.actions[self.count] = action
        self.rewards[self.count] = reward
        self.dones[self.count] = done

        # Advance the write pointer; wrapping to 0 means every slot is in use
        self.count = (self.count + 1) % self.size
        if self.count == 0:
            self.filled = True

    def get_random_indices(self):
        """Draw ``batch_size`` candidate indices from the written slots.

        The lower bound is ``history`` so the frame window ending at an index
        never reaches below slot 0.

        Raises
        ------
        ValueError
            If fewer than ``history + 1`` frames have been stored.
        """
        upper = self.size if self.filled else self.count
        if upper <= self.history:
            raise ValueError(
                'Replay memory holds {} frames; at least {} are needed to sample'.format(
                    upper, self.history + 1))
        return np.random.randint(self.history, upper, size=self.batch_size)

    def create_state(self, index):
        """Stack the ``history`` frames ending at ``index`` (newest first).

        Returns an array of shape ``(1, rows, cols, history)``.
        """
        state = np.empty((self.rows, self.cols, self.history))
        for i in range(self.history):
            state[:, :, i] = self.frames[index - i]
        return np.expand_dims(state, axis=0)

    def _invalid_mask(self, indices):
        """Return a boolean mask of indices that cannot form a transition.

        A transition ending at ``i`` reads frames ``i - history .. i``. It is
        rejected when that window
        * straddles the write pointer (mixes the newest and oldest data), or
        * crosses an episode boundary. ``dones[j]`` marks ``frames[j + 1]`` as
          the start of a new game, so ``dones[i - history .. i - 2]`` must all
          be False. ``dones[i - 1]`` may be True: that is a terminal
          transition whose next state is ignored by the learner.
        """
        straddles = (indices - self.history < self.count) & (self.count <= indices)
        offsets = np.arange(2, self.history + 1)
        crosses = self.dones[indices[:, None] - offsets].any(axis=1)
        return straddles | crosses

    def get_minibatch(self):
        """Sample a batch of valid transitions.

        Returns
        -------
        tuple
            ``(states, actions, rewards, next_states, dones)`` where the
            state arrays have shape ``(batch_size, rows, cols, history)`` and
            the action, reward and done arrays are aligned with ``states``.

        Raises
        ------
        ValueError
            If too few frames are stored (see ``get_random_indices``).
        RuntimeError
            If no valid batch is found after repeated re-sampling.
        """
        indices = self.get_random_indices()
        for _ in range(self._MAX_RESAMPLE_ROUNDS):
            invalid = self._invalid_mask(indices)
            if not invalid.any():
                break
            # Only redraw the rejected positions
            indices[invalid] = self.get_random_indices()[invalid]
        else:
            raise RuntimeError('Could not sample a valid minibatch from replay memory')

        # Allocating the memory for the batches
        states = np.empty((self.batch_size, self.rows, self.cols, self.history))
        next_states = np.empty((self.batch_size, self.rows, self.cols, self.history))
        for row, index in enumerate(indices):
            states[row] = self.create_state(index - 1)
            # The next state ends on the sampled index, i.e. the newest frame
            next_states[row] = self.create_state(index)

        # Action, reward and done flag all belong to the step taken from `states`
        taken = indices - 1
        return states, self.actions[taken], self.rewards[taken], next_states, self.dones[taken]

In [None]:
import os

# Set before TensorFlow/Keras are imported below.
# NOTE(review): presumably pins the run to the first GPU — confirm on the target runtime.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
import random
import gym
import numpy as np
import pandas as pd
from keras.models import Model, load_model, clone_model
from keras.layers import Input, Dense, Conv2D, Flatten, Input, Lambda
import tensorflow as tf
import cv2

# Turn off eager execution for everything that follows in this notebook.
tf.compat.v1.disable_eager_execution()


# Function used to combine the value and advantage streams in the dueling
# architecture
def combineAdvantageValue(args):
    """Merge the value and advantage streams into Q-values.

    ``args`` is a ``(value, advantage)`` pair. The advantage is centred by
    subtracting its mean over axis 1 before being added to the value.
    """
    state_value, action_advantage = args
    centred_advantage = action_advantage - tf.reduce_mean(action_advantage, axis=1, keepdims=True)
    return state_value + centred_advantage

# Function to create the dueling model
def Dueling_Model(input_shape, action_space):
    """Build, compile and summarise the dueling Q-network.

    Three bias-free convolutions feed a flattened feature vector into two
    linear heads (advantage with ``action_space`` outputs, value with one
    output) which are merged by ``combineAdvantageValue``.

    Returns the compiled Keras model (Huber loss, Adam optimiser).
    """
    weight_init = tf.keras.initializers.VarianceScaling(scale=2)

    # Convolutional trunk: (filters, kernel size, stride) per layer
    frames_in = Input(shape=input_shape)
    features = frames_in
    for filters, kernel, stride in ((32, 8, 4), (64, 4, 2), (64, 3, 1)):
        features = Conv2D(filters, kernel, stride, activation="relu", padding="valid",
                          kernel_initializer=weight_init, use_bias=False)(features)
    features = Flatten()(features)

    # Two heads on the shared features, then recombine them into Q-values
    advantage_stream = Dense(action_space, kernel_initializer=weight_init)(features)
    value_stream = Dense(1, kernel_initializer=weight_init)(features)
    q_values = Lambda(combineAdvantageValue)([value_stream, advantage_stream])

    network = Model(frames_in, q_values)
    network.compile(loss=tf.keras.losses.Huber(), optimizer=tf.keras.optimizers.Adam(learning_rate=0.0000625))
    network.summary()
    return network

# Function to create the standard DQN network architecture
def Normal_Model(input_shape, action_space):
    """Build, compile and summarise the standard (non-dueling) Q-network.

    Three bias-free convolutions, a 512-unit ReLU layer and a linear output
    with one unit per action.

    Returns the compiled Keras model (Huber loss, Adam optimiser).
    """
    weight_init = tf.keras.initializers.VarianceScaling(scale=2)
    conv_options = dict(activation="relu", padding="valid", kernel_initializer=weight_init, use_bias=False)

    # Layers are listed in the order they are applied
    network = tf.keras.models.Sequential([
        Conv2D(32, 8, 4, input_shape=input_shape, **conv_options),
        Conv2D(64, 4, 2, **conv_options),
        Conv2D(64, 3, 1, **conv_options),
        Flatten(),
        Dense(512, activation="relu", kernel_initializer=weight_init),
        Dense(action_space, activation="linear", kernel_initializer=weight_init),
    ])

    network.compile(loss=tf.keras.losses.Huber(), optimizer=tf.keras.optimizers.Adam(learning_rate=0.0000625))
    network.summary()
    return network

# Class to define the DQN algorithm which we use
class DQN:
    """DQN agent for ``PongDeterministic-v4`` with optional Double-DQN and dueling variants.

    Parameters
    ----------
    file_name : str
        Prefix used for the saved model (``<file_name>.h5``), the test videos
        and the CSV logs, all written under the global ``working_folder``.
    DDQN : bool
        When True the online network selects the next action and the target
        network evaluates it; otherwise the target network's maximum is used.
    Dueling : bool
        When True the network is built with ``Dueling_Model``; otherwise with
        ``Normal_Model``.
    """

    def __init__(self, file_name, DDQN=False, Dueling=False):
        # Defining our gym environment
        self.env = gym.make('PongDeterministic-v4')

        # Setting the input parameters: the file name, and whether we are
        # using DDQN and dueling
        self.file_name = file_name
        self.DDQN = DDQN
        self.Dueling = Dueling

        # Getting the number of actions for the environment
        self.action_size = self.env.action_space.n
        # Mnih et al. used 10,000,000 frames, tested every 250,000 frames with
        # 150,000 testing frames; in the interest of time these are reduced:
        self.MAX_FRAMES = 4000000
        self.EPISODE_FRAMES = 150000
        self.TESTING_FRAMES = 50000

        # Defining the size of the memory
        self.memory_size = 1000000
        # How many frames we store before the models start learning
        self.FILL_MEMORY = 50000
        # How many steps we take before we update the second network
        self.target_updates = 10000
        # How many steps before we learn from the replay memory
        self.param_updates = 4

        self.gamma = 0.99  # discount rate
        # Values for the epsilon greedy implementation
        self.epsilon = 1
        self.epsilon_min = 0.1
        self.epsilon_decay = -(1-0.1)/self.memory_size # The rate of change of epsilon for the first 1,000,000 frames
        self.epsilon_decay_2 = -(0.1-0.01)/self.MAX_FRAMES # The rate of change of epsilon for the rest of training

        self.batch_size = 32
        self.rows = 84 # rows of each frame
        self.cols = 84 # cols of each frame
        self.history = 4

        # Allocating storage for the current state for training
        self.current_state = np.zeros((self.rows, self.cols, self.history))
        self.state_size = (self.rows, self.cols, self.history)

        # Initialising the replay memory
        self.memory = Memory(self.memory_size, self.rows, self.cols, self.history, self.batch_size)

        # Create the main model, choosing the architecture based on the Dueling flag
        if self.Dueling:
            self.model = Dueling_Model(input_shape=(self.state_size), action_space=self.action_size)
        else:
            self.model = Normal_Model(input_shape=(self.state_size), action_space=self.action_size)

        # Creating the second network model by cloning the main model
        self.target_model = clone_model(self.model)
        self.target_model.build(input_shape=(self.state_size,))
        self.target_model.compile(loss=tf.keras.losses.Huber(), optimizer=tf.keras.optimizers.Adam(learning_rate=0.0000625))
        self.target_model.set_weights(self.model.get_weights())

        # Pandas data frames storing the rewards from each game in training and testing
        self.training_data = pd.DataFrame(columns=['Episode', 'Reward'])
        self.testing_data = pd.DataFrame(columns=['Episode', 'Reward'])

    def image_pre_process(self, state):
        """Crop a raw RGB frame to the playing area, grey-scale it and resize to ``(rows, cols)``."""
        # Cropping the image to just the playing space
        state = state[34:194, 0:161, :]
        # Making the image grayscale
        state = cv2.cvtColor(state, cv2.COLOR_RGB2GRAY)
        # Resizing the image to the chosen size
        state = cv2.resize(state, (self.cols, self.rows), interpolation=cv2.INTER_CUBIC)
        return state

    def add_history(self, state):
        """Push ``state`` into channel 0 of the frame stack, dropping the oldest frame.

        Returns the updated stack with a leading batch axis of size 1.
        """
        self.current_state = np.roll(self.current_state, 1, axis=2)
        self.current_state[:, :, 0] = state
        return np.expand_dims(self.current_state, axis=0)

    def reset(self):
        """Reset the environment and fill the frame stack with the first frame.

        Returns ``(stacked_state, processed_frame, raw_frame)``.
        """
        # NOTE(review): relies on env.reset() returning only the observation — confirm gym version
        frame_clean = self.env.reset()
        # Pre-process the starting frame
        frame = self.image_pre_process(frame_clean)
        # Filling the current state with the starting frame
        for i in range(self.history):
            state_converted = self.add_history(frame)
        return state_converted, frame, frame_clean

    def step(self, action):
        """Take ``action`` in the environment.

        Returns ``(next_state, next_frame, next_frame_clean, reward, done, info)``
        where ``next_frame_clean`` is the unprocessed frame used for the test videos.
        """
        next_frame, reward, done, info = self.env.step(action)
        # Keeping the unprocessed frame for when we create a video in testing
        next_frame_clean = next_frame
        next_frame = self.image_pre_process(next_frame)

        next_state = self.add_history(next_frame)

        return next_state, next_frame, next_frame_clean, reward, done, info

    def act(self, state, step_count):
        """Choose an action with epsilon-greedy exploration.

        Before ``FILL_MEMORY`` steps every action is random. Afterwards
        ``self.epsilon`` is updated from ``step_count`` and a random action is
        taken with that probability; otherwise the main network's best action
        is returned.
        """
        # Checking if the memory has enough frames to start exploiting
        if step_count < self.FILL_MEMORY:
            return random.randrange(self.action_size)

        # First line segment: from 1 down to epsilon_min
        if self.epsilon > self.epsilon_min:
            self.epsilon = self.epsilon_decay*step_count + 1
        # Second line segment once epsilon_min has been reached.
        # NOTE(review): this line is anchored at step 0, not at the step where
        # the first segment ends, so with the constants above epsilon drops
        # from 0.1 to about 0.0775 when the switch happens. Left unchanged so
        # the training schedule stays the same.
        if self.epsilon <= self.epsilon_min:
            self.epsilon = self.epsilon_decay_2*step_count + 0.1

        # Explore with probability epsilon
        if np.random.random() <= self.epsilon:
            return random.randrange(self.action_size)
        # Otherwise choose the best action
        return np.argmax(self.model.predict(state/255))

    def update_target_model(self):
        """Copy the main network's weights into the second (target) network."""
        self.target_model.set_weights(self.model.get_weights())

    def replay(self):
        """Fit the main network on one minibatch using Bellman targets."""
        # Gets a minibatch from replay memory
        state, action, reward, next_state, done = self.memory.get_minibatch()

        # Normalising the values in the frames
        state = state / 255
        next_state = next_state / 255

        # Getting the main network's predictions for the current states
        target = self.model.predict(state)

        # Getting a main network prediction of next states as well if DDQN
        if self.DDQN:
            target_next_arg = self.model.predict(next_state)

        # Getting a second network prediction of the next states
        target_next = self.target_model.predict(next_state)
        # Iterating through each state in the batch
        for i in range(self.batch_size):
            # If the state is a terminal state, set the Q-value to the reward received
            if done[i]:
                target[i][action[i]] = reward[i]
            else:
                if self.DDQN:
                    # Bellman equation for DDQN
                    model_one_prediction = np.argmax(target_next_arg[i])
                    target[i][action[i]] = reward[i] + self.gamma * (target_next[i][model_one_prediction])
                else:
                    # Bellman equation for DQN
                    target[i][action[i]] = reward[i] + self.gamma * (np.amax(target_next[i]))

        # Train the neural network with batches
        self.model.fit(state, target, batch_size=self.batch_size, verbose=0)

    def save(self, name):
        """Save the main network to ``working_folder + name``."""
        self.model.save(working_folder + name)

    def clip_reward(self, reward):
        """Return the sign of ``reward`` as -1, 0 or 1.

        Any positive reward maps to 1; the previous version sent positive
        rewards below 1 to -1.
        """
        if reward > 0:
            return 1
        if reward < 0:
            return -1
        return 0

    @staticmethod
    def _append_rewards(log, episode_count, rewards):
        """Return ``log`` with one extra row holding ``rewards`` for ``episode_count``.

        Uses ``pd.concat`` because ``DataFrame.append`` no longer exists in
        pandas 2.x. The whole reward list is kept in a single cell so
        ``save_data`` can explode it later.
        """
        row = pd.DataFrame({'Episode': [episode_count], 'Reward': [rewards]})
        if log.empty:
            return row
        return pd.concat([log, row], ignore_index=True)

    def test(self, episode_count):
        """Play greedy games until ``TESTING_FRAMES`` steps have been taken.

        Records each game to a video, prints the average reward and appends
        the per-game rewards to ``self.testing_data``.
        """
        test_step_count = 0
        test_rewards = []
        fourcc = cv2.VideoWriter_fourcc(*'XVID')
        # NOTE(review): the path depends only on file_name and episode_count,
        # so every game in this test pass writes to the same file and only the
        # last game's video is kept.
        video_path = working_folder+'videos/{}_Pong_Ep_{}.avi'.format(self.file_name, episode_count)
        # Iterating until the number of testing frames is reached
        while test_step_count < self.TESTING_FRAMES:
            count = 0
            done = False
            game_reward = 0
            # Creating a video to store the frames of the test to view later
            video = cv2.VideoWriter(video_path, fourcc, 25, (160, 210))
            try:
                # Resetting the game
                state, frame, frame_clean = self.reset()
                while not done:
                    if count < self.history:
                        action = 1
                    else:
                        # Getting the best action each step
                        action = np.argmax(self.model.predict(state/255))

                    # Writing the clean frame straight to the video (OpenCV expects BGR)
                    video.write(cv2.cvtColor(frame_clean, cv2.COLOR_RGB2BGR))

                    # Taking the step with the action chosen
                    next_state, next_frame, next_frame_clean, reward, done, info = self.step(action)

                    game_reward += reward

                    state = next_state
                    frame_clean = next_frame_clean

                    count += 1
                    test_step_count += 1
            finally:
                # Always close the writer, even if a step fails mid-game
                video.release()
            test_rewards.append(game_reward)

        # Outputting to the console, then saving the test data to the data frame
        print('##############################################')
        print('TESTING')
        print('Average Test Results: {}'.format(np.mean(test_rewards)))
        self.testing_data = self._append_rewards(self.testing_data, episode_count, test_rewards)

    def random(self):
        """Run the random-action baseline using the same epoch/test structure as ``run``.

        Actions are sampled uniformly during the training phase and nothing is
        stored or learned. ``test`` is still called after each epoch and picks
        actions with ``self.model``.
        """
        step_count = 0
        games_played = 0
        episode_count = 0
        # Stopping when number of steps taken exceeds max
        while step_count < self.MAX_FRAMES:
            current_steps = 0
            episode_rewards = []

            # Number of steps taken before each test is taken
            while current_steps < self.EPISODE_FRAMES:
                game_reward = 0
                self.reset()
                done = False
                while not done:
                    # Each action is chosen randomly from the possible actions available
                    action = self.env.action_space.sample()
                    _, _, _, reward, done, info = self.step(action)

                    game_reward += reward

                    current_steps += 1
                    step_count += 1

                games_played += 1

                episode_rewards.append(game_reward)
                print('Game Count: {}, Current Reward: {}, Current Steps: {}, Epsilon: {}'.format(games_played, game_reward, step_count, self.epsilon))

            # Outputting to console
            print('##############################################')
            print('TRAINING')
            print('Episode: {}, Average Reward: {}'.format(episode_count, np.mean(episode_rewards)))
            print("Saving trained model as {}.h5".format(self.file_name))
            # Saving the current model network
            self.save("{}.h5".format(self.file_name))
            # Running the test
            self.test(episode_count)
            # Saving the current epoch training data
            self.training_data = self._append_rewards(self.training_data, episode_count, episode_rewards)

            episode_count += 1
        # After all steps taken, save data to csv
        self.save_data()

    def run(self):
        """Main training loop.

        Plays games until ``MAX_FRAMES`` steps have been taken. Every step is
        stored in replay memory; once ``FILL_MEMORY`` steps have passed the
        main network is trained every ``param_updates`` steps and the target
        network is synchronised every ``target_updates`` steps. After each
        epoch of ``EPISODE_FRAMES`` steps the model is saved and tested.
        """
        step_count = 0
        games_played = 0
        episode_count = 0
        # Stopping after steps taken exceeds max
        while step_count < self.MAX_FRAMES:
            current_steps = 0
            episode_rewards = []
            # Stop loop to test after amount of frames per epoch
            while current_steps < self.EPISODE_FRAMES:
                game_reward = 0
                state, frame, frame_clean = self.reset()
                done = False
                while not done:
                    # Choosing action via epsilon-greedy
                    action = self.act(state, step_count)
                    # Taking step with chosen action
                    next_state, next_frame, next_frame_clean, reward, done, info = self.step(action)

                    # Updating total reward for this game
                    game_reward += reward

                    # Making sure reward is between -1 and 1
                    reward = self.clip_reward(reward)

                    # Storing the frame the action was chosen on, with that
                    # action's reward and done flag
                    self.memory.add_memory(frame, action, reward, done)
                    state = next_state
                    frame = next_frame

                    learning = step_count >= self.FILL_MEMORY
                    # Checking if we should update the second network
                    if learning and step_count % self.target_updates == 0:
                        self.update_target_model()

                    # Checking if we should update the main network
                    if learning and step_count % self.param_updates == 0:
                        self.replay()

                    current_steps += 1
                    step_count += 1

                games_played += 1

                episode_rewards.append(game_reward)
                print('Game Count: {}, Current Reward: {}, Current Steps: {}, Epsilon: {}'.format(games_played, game_reward, step_count, self.epsilon))

            # Output to console at end of each epoch
            print('##############################################')
            print('TRAINING')
            print('Episode: {}, Average Reward: {}'.format(episode_count, np.mean(episode_rewards)))
            print("Saving trained model as {}.h5".format(self.file_name))
            # Save current network weights
            self.save("{}.h5".format(self.file_name))
            # Perform test
            self.test(episode_count)
            # Saving current training data
            self.training_data = self._append_rewards(self.training_data, episode_count, episode_rewards)

            episode_count += 1
        # Saving final training data to csv
        self.save_data()

    def save_data(self):
        """Explode the per-epoch reward lists to one row per game and write both logs to CSV."""
        self.training_data = self.training_data.explode('Reward')
        self.testing_data = self.testing_data.explode('Reward')

        self.training_data.to_csv(working_folder+'{}_train.csv'.format(self.file_name))
        self.testing_data.to_csv(working_folder+'{}_test.csv'.format(self.file_name))

# Normal DQN

In [None]:
# Baseline configuration: standard network with the plain DQN target.
# file_name is the prefix for the saved model, test videos and CSV logs.
file_name = 'NDQN'
DDQN = False
Dueling = False

# Constructing the agent builds the environment, replay memory and both
# networks; run() then trains until MAX_FRAMES steps have been taken.
pongNDQN = DQN(file_name, DDQN, Dueling)
pongNDQN.run()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_3 (Conv2D)           (None, 20, 20, 32)        8192      
                                                                 
 conv2d_4 (Conv2D)           (None, 9, 9, 64)          32768     
                                                                 
 conv2d_5 (Conv2D)           (None, 7, 7, 64)          36864     
                                                                 
 flatten_1 (Flatten)         (None, 3136)              0         
                                                                 
 dense_2 (Dense)             (None, 512)               1606144   
                                                                 
 dense_3 (Dense)             (None, 6)                 3078      
                                                                 
Total params: 1,687,046
Trainable params: 1,687,046
No

  updates=self.state_updates,


Game Count: 57, Current Reward: -20.0, Current Steps: 50509, Epsilon: 0.9545428
Game Count: 58, Current Reward: -20.0, Current Steps: 51540, Epsilon: 0.9536149
Game Count: 59, Current Reward: -21.0, Current Steps: 52332, Epsilon: 0.9529021
Game Count: 60, Current Reward: -19.0, Current Steps: 53318, Epsilon: 0.9520147
Game Count: 61, Current Reward: -21.0, Current Steps: 54202, Epsilon: 0.9512191
Game Count: 62, Current Reward: -21.0, Current Steps: 55189, Epsilon: 0.9503308
Game Count: 63, Current Reward: -20.0, Current Steps: 56090, Epsilon: 0.9495199
Game Count: 64, Current Reward: -21.0, Current Steps: 57064, Epsilon: 0.9486433
Game Count: 65, Current Reward: -21.0, Current Steps: 57889, Epsilon: 0.9479008
Game Count: 66, Current Reward: -21.0, Current Steps: 58799, Epsilon: 0.9470818
Game Count: 67, Current Reward: -18.0, Current Steps: 60000, Epsilon: 0.9460009
Game Count: 68, Current Reward: -21.0, Current Steps: 60942, Epsilon: 0.9451531
Game Count: 69, Current Reward: -20.0, C

# DDQN

In [None]:
# Double DQN: standard network, but replay() lets the main network pick the
# next action and the second network supply its value.
file_name = 'DDQN'
DDQN = True
Dueling = False

pongDDQN = DQN(file_name, DDQN, Dueling)
pongDDQN.run()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_3 (Conv2D)           (None, 20, 20, 32)        8192      
                                                                 
 conv2d_4 (Conv2D)           (None, 9, 9, 64)          32768     
                                                                 
 conv2d_5 (Conv2D)           (None, 7, 7, 64)          36864     
                                                                 
 flatten_1 (Flatten)         (None, 3136)              0         
                                                                 
 dense_2 (Dense)             (None, 512)               1606144   
                                                                 
 dense_3 (Dense)             (None, 6)                 3078      
                                                                 
Total params: 1,687,046
Trainable params: 1,687,046
No

  updates=self.state_updates,


Game Count: 55, Current Reward: -21.0, Current Steps: 50453, Epsilon: 0.9545932
Game Count: 56, Current Reward: -21.0, Current Steps: 51333, Epsilon: 0.9538012
Game Count: 57, Current Reward: -20.0, Current Steps: 52263, Epsilon: 0.9529642
Game Count: 58, Current Reward: -18.0, Current Steps: 53358, Epsilon: 0.9519787
Game Count: 59, Current Reward: -21.0, Current Steps: 54122, Epsilon: 0.9512911
Game Count: 60, Current Reward: -21.0, Current Steps: 54946, Epsilon: 0.9505495
Game Count: 61, Current Reward: -19.0, Current Steps: 55995, Epsilon: 0.9496054
Game Count: 62, Current Reward: -20.0, Current Steps: 56833, Epsilon: 0.9488512
Game Count: 63, Current Reward: -20.0, Current Steps: 57671, Epsilon: 0.948097
Game Count: 64, Current Reward: -20.0, Current Steps: 58560, Epsilon: 0.9472969
Game Count: 65, Current Reward: -20.0, Current Steps: 59541, Epsilon: 0.946414
Game Count: 66, Current Reward: -19.0, Current Steps: 60668, Epsilon: 0.9453997
Game Count: 67, Current Reward: -21.0, Cur

# NDQN - Dueling


In [None]:
# Plain DQN target with the dueling network architecture (Dueling_Model).
file_name = 'NDQN_Dueling'
DDQN = False
Dueling = True

pongNDQN_Dueling = DQN(file_name, DDQN, Dueling)
pongNDQN_Dueling.run()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 84, 84, 4)]  0           []                               
                                                                                                  
 conv2d (Conv2D)                (None, 20, 20, 32)   8192        ['input_1[0][0]']                
                                                                                                  
 conv2d_1 (Conv2D)              (None, 9, 9, 64)     32768       ['conv2d[0][0]']                 
                                                                                                  
 conv2d_2 (Conv2D)              (None, 7, 7, 64)     36864       ['conv2d_1[0][0]']               
                                                                                              

  updates=self.state_updates,


Game Count: 56, Current Reward: -21.0, Current Steps: 50719, Epsilon: 0.9543538
Game Count: 57, Current Reward: -21.0, Current Steps: 51596, Epsilon: 0.9535645
Game Count: 58, Current Reward: -21.0, Current Steps: 52422, Epsilon: 0.9528211
Game Count: 59, Current Reward: -21.0, Current Steps: 53360, Epsilon: 0.9519769
Game Count: 60, Current Reward: -20.0, Current Steps: 54342, Epsilon: 0.9510931
Game Count: 61, Current Reward: -19.0, Current Steps: 55567, Epsilon: 0.9499906
Game Count: 62, Current Reward: -20.0, Current Steps: 56492, Epsilon: 0.9491581
Game Count: 63, Current Reward: -19.0, Current Steps: 57533, Epsilon: 0.9482212
Game Count: 64, Current Reward: -20.0, Current Steps: 58464, Epsilon: 0.9473833
Game Count: 65, Current Reward: -20.0, Current Steps: 59428, Epsilon: 0.9465157
Game Count: 66, Current Reward: -20.0, Current Steps: 60330, Epsilon: 0.9457039
Game Count: 67, Current Reward: -20.0, Current Steps: 61200, Epsilon: 0.9449209
Game Count: 68, Current Reward: -21.0, C

# DDQN - Dueling

In [None]:
# Double DQN target combined with the dueling network architecture.
file_name = 'DDQN-Dueling'
DDQN = True
Dueling = True

pongDDQN_Dueling = DQN(file_name, DDQN, Dueling)
pongDDQN_Dueling.run()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 84, 84, 4)]  0           []                               
                                                                                                  
 conv2d (Conv2D)                (None, 20, 20, 32)   8192        ['input_1[0][0]']                
                                                                                                  
 conv2d_1 (Conv2D)              (None, 9, 9, 64)     32768       ['conv2d[0][0]']                 
                                                                                                  
 conv2d_2 (Conv2D)              (None, 7, 7, 64)     36864       ['conv2d_1[0][0]']               
                                                                                              

  updates=self.state_updates,


Game Count: 54, Current Reward: -20.0, Current Steps: 50557, Epsilon: 0.9544996
Game Count: 55, Current Reward: -19.0, Current Steps: 51687, Epsilon: 0.9534826
Game Count: 56, Current Reward: -21.0, Current Steps: 52632, Epsilon: 0.9526321
Game Count: 57, Current Reward: -20.0, Current Steps: 53498, Epsilon: 0.9518527
Game Count: 58, Current Reward: -21.0, Current Steps: 54393, Epsilon: 0.9510472
Game Count: 59, Current Reward: -21.0, Current Steps: 55185, Epsilon: 0.9503344
Game Count: 60, Current Reward: -21.0, Current Steps: 55977, Epsilon: 0.9496216
Game Count: 61, Current Reward: -20.0, Current Steps: 57092, Epsilon: 0.9486181
Game Count: 62, Current Reward: -21.0, Current Steps: 58032, Epsilon: 0.9477721
Game Count: 63, Current Reward: -21.0, Current Steps: 58796, Epsilon: 0.9470845
Game Count: 64, Current Reward: -19.0, Current Steps: 60063, Epsilon: 0.9459442
Game Count: 65, Current Reward: -20.0, Current Steps: 61205, Epsilon: 0.9449164
Game Count: 66, Current Reward: -20.0, C

# Random Agent

In [None]:
# Random baseline. The constructor still builds the (untrained) networks;
# random() samples every training action from the environment's action space.
# NOTE(review): the test pass inside random() picks actions with the untrained
# network rather than at random — confirm this is the intended baseline.
file_name = 'Random'
DDQN = False
Dueling = False

pong_random = DQN(file_name, DDQN, Dueling)
pong_random.random()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 20, 20, 32)        8192      
                                                                 
 conv2d_1 (Conv2D)           (None, 9, 9, 64)          32768     
                                                                 
 conv2d_2 (Conv2D)           (None, 7, 7, 64)          36864     
                                                                 
 flatten (Flatten)           (None, 3136)              0         
                                                                 
 dense (Dense)               (None, 512)               1606144   
                                                                 
 dense_1 (Dense)             (None, 6)                 3078      
                                                                 
Total params: 1,687,046
Trainable params: 1,687,046
Non-

  updates=self.state_updates,


##############################################
TESTING
Average Test Results: -21.0
Game Count: 220, Current Reward: -20.0, Current Steps: 201503, Epsilon: 1
Game Count: 221, Current Reward: -20.0, Current Steps: 202433, Epsilon: 1
Game Count: 222, Current Reward: -19.0, Current Steps: 203455, Epsilon: 1
Game Count: 223, Current Reward: -21.0, Current Steps: 204326, Epsilon: 1
Game Count: 224, Current Reward: -21.0, Current Steps: 205178, Epsilon: 1
Game Count: 225, Current Reward: -21.0, Current Steps: 205942, Epsilon: 1
Game Count: 226, Current Reward: -21.0, Current Steps: 206813, Epsilon: 1
Game Count: 227, Current Reward: -20.0, Current Steps: 207743, Epsilon: 1
Game Count: 228, Current Reward: -21.0, Current Steps: 208595, Epsilon: 1
Game Count: 229, Current Reward: -20.0, Current Steps: 209498, Epsilon: 1
Game Count: 230, Current Reward: -21.0, Current Steps: 210291, Epsilon: 1
Game Count: 231, Current Reward: -20.0, Current Steps: 211349, Epsilon: 1
Game Count: 232, Current Rewa