In [1]:
#import tensorflow as tf
#from keras import Sequential
#from keras.layers import Conv2D, Dense, Dropout, MaxPooling2D, Activation, Flatten, Input
#from keras.activations import relu
#from keras.callbacks import TensorBoard
#from keras.optimizers import Adam
import numpy as np
import gymnasium as gym
from gymnasium.wrappers import GrayScaleObservation, ResizeObservation, FrameStack
#import matplotlib.pyplot as plt
from collections import deque, namedtuple
from typing import NamedTuple, Type
import random

In [1]:
import gymnasium as gym
env = gym.make('ALE/Freeway-v5', render_mode='human')
#env.reset()
#env.render()
#env = GrayScaleObservation(gym.make('ALE/Breakout-v5', render_mode='human'))
#env = ResizeObservation(env, (84, 84))
#env = FrameStack(env, 4)

#observation, info = env.reset()
#env.render()

A.L.E: Arcade Learning Environment (version 0.8.0+919230b)
[Powered by Stella]


## Preprocess image utility function

In [9]:
# Create the buffer of frame to give to the Neural Network
num_frames = 4 # (Tau)
state_buffer = deque(maxlen=num_frames)
next_state_buffer = deque(maxlen=num_frames)

In [22]:
class ExperienceReplay:
    def __init__(self, memory_size=50000, burn_in=10000):
        """
        Construct a new buffer of replay experience
        :param memory_size: maximum size of the memory
        :param burn_in: used to initialise the memory.
                        At tbe beginning of training, the agent will take a max 'burn_in' number of completely random step to populate the buffer with.
                        This allows to have sufficient values to train on. E.g.: if burn_in is 10,000, then the memory will be filled with 10,000 random steps.
                        The burn_in cannot be lower than memory size
        """
        self.memory_size = memory_size
        self.burn_in = burn_in
        self.Buffer = namedtuple('Buffer', ['state', 'action', 'reward', 'done', 'next_state',])
        self.replay_memory = deque(maxlen=memory_size) # Store namedtuple Buffer

    def sample_batch(self, batch_size=32):
        """
        Randomly selects a batch of data stored in the memory
        :param batch_size: the number of data to select
        :return: I am not sure
        """
        # This return a random list of indexes from the buffer
        samples = np.random.choice(len(self.replay_memory), batch_size, replace=False)

        # The below line of code is really not clear at all, I don't know who said python is simpler to read but seriously, this is horrible
        # Basically, the part '*expression' unpack the value in the iterable container (the deque)
        # Which creates a 'list' of arguments that can be passed to a function.
        # e.g.:
        # assume an array of tuple [(5, 2), (6, 2), (3, 7)]
        # If we unpack this tuple '*[(5, 2), (6, 2), (3, 7)]', this will create a list of argument to be passed to a function, so:
        # zip(*[(5, 2), (6, 2), (3, 7)]) is similar to zip((5, 2), (6, 2), (3, 7)). We are basically removing the stuff related to the array like [].
        batch = zip(*[self.replay_memory[i] for i in samples])
        # zip returns an iterator of tuple where the first item in each passed iterator is paired together, and then the second items are paired together, etc...
        # https://www.w3schools.com/python/ref_func_zip.asp
        return batch

    def append(self, state, action, reward, done, next_state):
        """
        Add data to the memory buffer
        :param state: the current observation state (as ima array)
        :param action: the action that is taken according to the current state
        :param reward: the reward following the taken action (number)
        :param done: if the current step is terminated following the action
        :param next_state: the next state that follows the current one after the taken action
        """
        self.replay_memory.append(
            self.Buffer(state, action, reward, done, next_state))

    def burn_in_capacity(self):
        """
        Calculate the ratio between the data in the buffer and the burn in limit
        :return: Return the ratio from 0 to 1. value > 1 means that the burn in limit has been reached
        """
        return len(self.replay_memory) / self.burn_in

In [25]:
class DQNAgent:
    def __init__(self, env):
        self.env = env

        self.max_steps_per_episode = 10000

        # Get trained every step (.fit())
        self.model = self.create_q_model()
        # The nodel we .predict() against every step
        self.target_model = self.create_q_model()

        self.optimizer = tf.keras.optimizers.Adam(learning_rate=0.00025, clipnorm=1.0)

        # Experience replay buffers
        self.action_history = []
        self.state_history = []
        self.state_next_history = []
        self.rewards_history = []
        self.done_history = []
        self.episode_reward_history = []
        # Maximum replay length
        # Note: The Deepmind paper suggests 1000000 however this causes memory issues
        self.MAX_MEMORY_LENGTH = 100000

        self.running_reward = 0
        self.episode_count = 0
        self.frame_count = 0

        # Number of frames to take random action and observe output
        self.EPSILON_RANDOM_FRAMES = 50000
        # Number of frames for exploration
        self.EPSILON_GREEDY_FRAMES = 1000000.0

        self.epsilon = 1 # Current exploration rate
        self.EPSILON_MIN = 0.1 # Minimum exploration rate
        self.EPSILON_MAX = 1.0 # Maximum exploration rate
        self.epsilon_decay = (self.EPSILON_MAX - self.EPSILON_MIN) # Rate of decay for the exploration rate

        self.UPDATE_AFTER_ACTIONS = 4
        # How often to update the target network
        self.UPDATE_TARGET_NETWORK = 10000

        self.discount_rate = 0.99

    def create_q_model(self):
        # Network defined by the Deepmind paper
        inputs = Input(shape=(84, 84, 4,))

        # Convolutions on the frames on the screen
        layer1 = Conv2D(32, 8, strides=4, activation="relu")(inputs)
        layer2 = Conv2D(64, 4, strides=2, activation="relu")(layer1)
        layer3 = Conv2D(64, 3, strides=1, activation="relu")(layer2)

        layer4 = Flatten()(layer3)

        layer5 = Dense(512, activation="relu")(layer4)
        action = Dense(self.env.action_space.n, activation="linear")(layer5)

        return tf.keras.Model(inputs=inputs, outputs=action)

    def train(self, batch_size=32):
        self.initialise_replay_memory()

        self.episode_count = 0

        while True:
            state, info = self.env.reset()
            episode_reward = 0

            for timestep in range(1, self.max_steps_per_episode):
                self.frame_count += 1

                action = self.get_action(state)

                # Decay probability of taking random action to balance exploitation vs exploration
                self.epsilon -= self.epsilon_decay / self.EPSILON_GREEDY_FRAMES
                self.epsilon = max(self.epsilon, self.EPSILON_MIN)

                # Apply the action in the environment
                next_state, reward, terminated, truncated, _ = self.env.step(action)
                done = terminated or truncated

                episode_reward += reward

                # Save actions and states in replay buffer
                self.action_history.append(action)
                self.state_history.append(state)
                self.state_next_history.append(next_state)
                self.done_history.append(done)
                self.rewards_history.append(reward)
                state = next_state

                # Update every fourth frame
                if self.frame_count % self.UPDATE_AFTER_ACTIONS == 0 and len(self.done_history) > batch_size:
                    batch = self.replay_memory.sample_batch()
                    states_sample, actions_sample, rewards_sample, dones_sample, next_states_sample = np.array([i for i in batch])

                    future_rewards = self.target_model.predict(next_states_sample)
                    updated_q_values = rewards_sample + self.discount_rate * tf.reduce_max(future_rewards, axis=1)
                    
                    # if final frame set the last value to -1
                    updated_q_values = updated_q_values * (1 - dones_sample) - dones_sample
                    
                    # Create a mask to only calculate loss on the updated Q Values
                    masks = tf.one_hot(actions_sample, self.env.action_space.n)
                    
                    with tf.GradientTape() as tape:
                        # Train the model on the states and updated Q-values
                        q_values = self.model(states_sample)
                        
                        # Apply the masks to the Q-values to get the Q-value for action taken
                        q_action = tf.reduce_sum(tf.multiply(q_values, masks), axis=1)
                        # Calculate loss between new Q-value and old Q-value
                        loss = tf.keras.losses.Huber(updated_q_values, q_action)

                    # Backpropagation
                    grads = tape.gradient(loss, self.model.trainable_variables)
                    self.optimizer.apply_gradients(zip(grads, self.model.trainable_variables))

                if frame_count % self.UPDATE_TARGET_NETWORK == 0:
                    self.target_model.set_weights(self.model.get_weights())
                    template = "running reward: {:.2f} at episode {}, frame count {}"
                    print(template.format(self.rewards, self.episode_count, frame_count))

                if done:
                    break


            # Update running reward to check condition for solving
            self.episode_reward_history.append(episode_reward)
            if len(self.episode_reward_history) > 100:
                del self.episode_reward_history[:1]
            self.rewards = np.mean(self.episode_reward_history)

            self.episode_count += 1

            if self.rewards > 40:  # Condition to consider the task solved
                print("Solved at episode {}!".format(self.episode_count))
                break

    def initialise_replay_memory(self):
        self.env.reset()
        state_img = self.env.render()
        state = preprocess_observation(state_img)

        # Fill the memory replay buffer
        while self.replay_memory.burn_in_capacity() < 1:
            action = env.action_space.sample()
            next_state, reward, terminated, truncated, _ = self.env.step(action)
            state_img = self.env.render()
            next_state = preprocess_observation(state_img)

            done = terminated or truncated

            self.rewards += reward
            self.replay_memory.append(state, action, reward, done, next_state)
            state = next_state
            if done:
                self.env.reset()
                state_img = self.env.render()
                state = preprocess_observation(state_img)

    def get_action(self, state):
        if random.uniform(0, 1) < self.epsilon:
            # Take random action
            action = env.action_space.sample()
        else:
            # Predict action Q-values from env state
            state_tensor = tf.convert_to_tensor(state)
            state_tensor = tf.expand_dims(state_tensor, 0)
            action_probabilities = self.model(state_tensor, trainable=False)
            action = tf.argmax(action_probabilities[0]).numpy()

        return action

In [26]:
agent = DQNAgent(env, ExperienceReplay())
agent.train()

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 2 dimensions. The detected shape was (5, 32) + inhomogeneous part.