In [2]:
import gymnasium as gym
import numpy as np
import tensorflow as tf
from tensorflow import keras
import logging
from DQN import DQN

from gym.wrappers.monitoring.video_recorder import VideoRecorder
import os

# Only let TensorFlow's Python logger emit messages at ERROR level or above
tf.get_logger().setLevel(logging.ERROR)

# Build the environment inside-out: raw Breakout -> Atari preprocessing
# (frame skip of 4, grayscale, scaled observations) -> stack of 4 observations.
env = gym.wrappers.FrameStack(
    gym.wrappers.AtariPreprocessing(
        gym.make('BreakoutNoFrameskip-v4', render_mode='rgb_array'),
        frame_skip=4,
        grayscale_obs=True,
        scale_obs=True,
    ),
    num_stack=4,
)

# Reorder the observation shape so that axis 0 of the environment's shape
# becomes the last axis of the network input shape.
state_dim = env.observation_space.shape
state_dim = tuple(state_dim[axis] for axis in (1, 2, 0))
num_actions = env.action_space.n
print(f"State dimension: {state_dim}", f"Number of actions: {num_actions}")

State dimension: (84, 84, 4) Number of actions: 4


In [4]:
# The online model predicts the Q-values that are used to choose an action.
model = DQN(state_dim, num_actions).model
# The target model predicts the future rewards used to build the training
# targets. Its weights are only refreshed periodically by the training loop,
# so the target Q-values stay stable while the loss is computed.
model_target = DQN(state_dim, num_actions).model
# Both networks are constructed independently, so without this copy the target
# network would produce targets from unrelated random weights until the first
# periodic synchronisation in the training loop.
model_target.set_weights(model.get_weights())

2023-10-30 09:38:57.458217: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-10-30 09:38:57.586944: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1956] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [5]:
# Experience replay buffer
class ReplayBuffer:
    """Fixed-capacity ring buffer of experience transitions.

    Each entry is a ``(state, action, reward, next_state, done)`` tuple. Once
    the buffer holds ``size`` entries, every new entry overwrites the oldest
    one.
    """

    def __init__(self, size=10000):
        """Create an empty buffer.

        Args:
            size: Maximum number of transitions kept; must be positive.

        Raises:
            ValueError: If ``size`` is not positive.
        """
        if size <= 0:
            raise ValueError("ReplayBuffer size must be positive")
        self._storage = []
        self._maxsize = size
        # Slot that the next call to add() writes to.
        self._next_idx = 0

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self._storage)

    def add(self, state, action, reward, next_state, done):
        """Store one transition, overwriting the oldest one when full."""
        data = (state, action, reward, next_state, done)

        if self._next_idx >= len(self._storage):
            self._storage.append(data)
        else:
            self._storage[self._next_idx] = data

        self._next_idx = (self._next_idx + 1) % self._maxsize

    def _encode_sample(self, idxes):
        """Gather the transitions at ``idxes`` into batched arrays.

        The stacked states and next states are 4-D; axis 1 of the stacked
        batch is moved to the last position, so each stored state must be a
        3-D array.

        Returns:
            ``(states, actions, rewards, next_states, dones)`` where ``dones``
            is a float32 TensorFlow tensor and the rest are NumPy arrays.
        """
        states, actions, rewards, next_states, dones = [], [], [], [], []

        for i in idxes:
            state, action, reward, next_state, done = self._storage[i]
            # np.asarray avoids a copy when possible. np.array(..., copy=False)
            # raises on NumPy 2 whenever a copy cannot be avoided.
            states.append(np.asarray(state))
            actions.append(np.asarray(action))
            rewards.append(reward)
            next_states.append(np.asarray(next_state))
            dones.append(done)

        states = np.transpose(np.array(states), (0, 2, 3, 1))
        next_states = np.transpose(np.array(next_states), (0, 2, 3, 1))
        rewards = np.array(rewards)
        actions = np.array(actions)
        dones = tf.convert_to_tensor(dones, dtype=tf.float32)

        return states, actions, rewards, next_states, dones

    def sample(self, batch_size):
        """Draw ``batch_size`` transitions uniformly at random, with replacement.

        Raises:
            ValueError: If the buffer is empty.
        """
        if not self._storage:
            raise ValueError("Cannot sample from an empty ReplayBuffer")
        idxes = [np.random.randint(0, len(self._storage)) for _ in range(batch_size)]
        return self._encode_sample(idxes)

    def limit_size(self, size):
        """Change the capacity to ``size``, keeping the most recent transitions.

        Raises:
            ValueError: If ``size`` is not positive.
        """
        if size <= 0:
            raise ValueError("ReplayBuffer size must be positive")
        # Unroll the ring into oldest -> newest order. When the buffer has not
        # wrapped yet, _next_idx equals len(_storage) and this is a no-op.
        ordered = self._storage[self._next_idx:] + self._storage[:self._next_idx]
        self._storage = ordered[-size:]
        self._maxsize = size
        # If the buffer is now full, the oldest entry sits at index 0 and is
        # the next one to be overwritten; otherwise keep appending at the end.
        self._next_idx = len(self._storage) % size

In [7]:
# Configuration parameters for the whole setup

# --- Exploration (epsilon-greedy) ---
epsilon_min = 0.1  # Minimum epsilon greedy parameter
epsilon_max = 1.0  # Maximum epsilon greedy parameter
epsilon = 1.0  # Current epsilon greedy parameter
# Total amount epsilon decays; the training loop subtracts
# epsilon_interval / epsilon_greedy_frames on every frame.
epsilon_interval = epsilon_max - epsilon_min
# Number of frames to take random action and observe output
epsilon_random_frames = 50000
# Number of frames for exploration
epsilon_greedy_frames = 100000.0

# --- Learning ---
gamma = 0.99  # Discount factor for past rewards
batch_size = 32  # Size of batch taken from replay buffer
max_steps_per_episode = 10000
# How often to update the target network
update_target_network = 10000
# Max no-op steps to perform at the start of an episode
max_no_op_steps = 30
# Use MSE loss
loss_function = keras.losses.MSE
# Using Adam instead of RMSProp
optimizer = keras.optimizers.Adam(learning_rate=0.00025, clipnorm=1.0)

# --- Experience replay ---
# Maximum replay length
# Note: The Deepmind paper suggests 1000000 however this causes memory issues
max_memory_length = 100000
# NOTE(review): ReplayBuffer() is created with its default capacity (10000),
# which is smaller than max_memory_length -- confirm which limit is intended.
replay_buffer = ReplayBuffer()

# --- Running statistics ---
episode_reward_history = []
running_reward = 0
episode_count = 0
frame_count = 0
loss = 0

In [8]:
# Main DQN training loop: one iteration of the outer loop is one episode.
while True:
    # reset() returns a tuple; its first element is the initial observation.
    state = np.array(env.reset()[0])
    episode_reward = 0
    no_op_steps = 0

    print(f"Episode: {episode_count}",
          f"Frame count: {frame_count}\n",
          f"Average reward: {running_reward}\n",
          f"Epsilon: {epsilon}",
          f"Loss: {loss}\n",)

    for timestep in range(1, max_steps_per_episode):
        frame_count += 1
        # Use epsilon-greedy for exploration
        if frame_count < epsilon_random_frames or epsilon > np.random.rand(1)[0]:
            if no_op_steps > max_no_op_steps:
                # No-op budget used up: choose only between actions 1-3
                action = np.random.choice([1, 2, 3])
            else:
                action = np.random.choice(num_actions)
                if action == 0:
                    no_op_steps += 1
        else:
            # Predict action Q-values
            state_tensor = tf.convert_to_tensor(state)
            state_tensor = tf.expand_dims(state_tensor, 0)
            # Move axis 1 to the end, matching what ReplayBuffer does to batches
            state_tensor = tf.transpose(state_tensor, perm=[0, 2, 3, 1])
            action_probs = model(state_tensor, training=False)
            # Take best action
            action = tf.argmax(action_probs[0]).numpy()

        # Decay probability of taking random action
        epsilon -= epsilon_interval / epsilon_greedy_frames
        epsilon = max(epsilon, epsilon_min)

        # Apply the sampled action in our environment.
        # step() returns (obs, reward, terminated, truncated, info); an episode
        # is over when either flag is set.
        state_next, reward, terminated, truncated, _ = env.step(action)
        state_next = np.array(state_next)
        done = terminated or truncated

        episode_reward += reward

        # Save actions and states in replay buffer. Only a real termination is
        # stored as "done", so a truncated transition still bootstraps from
        # its next state.
        replay_buffer.add(state, action, reward, state_next, terminated)
        state = state_next

        if len(replay_buffer) > batch_size:
            # Get samples from replay buffer
            state_sample, action_sample, rewards_sample, state_next_sample, done_sample = replay_buffer.sample(
                batch_size
            )

            # Build the updated Q-values for the sampled future states.
            # Calling the model directly avoids the per-call overhead of
            # predict() on such a small batch.
            future_rewards = model_target(state_next_sample, training=False)
            updated_q_values = rewards_sample + gamma * tf.reduce_max(
                future_rewards, axis=1
            )
            # For terminal transitions replace the target with -1
            updated_q_values = updated_q_values * (1 - done_sample) - done_sample

            # Create a mask so we only calculate loss on the updated Q-values
            masks = tf.one_hot(action_sample, num_actions)
            with tf.GradientTape() as tape:
                q_values = model(state_sample)
                # Apply the masks to the Q-values to get the Q-value for action taken
                q_action = tf.reduce_sum(tf.multiply(q_values, masks), axis=1)
                loss = loss_function(updated_q_values, q_action)

            # Backpropagation
            grads = tape.gradient(loss, model.trainable_variables)
            optimizer.apply_gradients(zip(grads, model.trainable_variables))

        # Periodically copy the online weights into the target network
        if frame_count % update_target_network == 0:
            model_target.set_weights(model.get_weights())

        # Limit the history
        if len(replay_buffer) > max_memory_length + 5000:
            replay_buffer.limit_size(max_memory_length)

        if done:
            break

    episode_reward_history.append(episode_reward)
    # NOTE(review): this averages over every episode so far, not a recent window
    running_reward = np.mean(episode_reward_history)

    if running_reward > 40:
        print("Solved at episode {}!".format(episode_count))
        break

    episode_count += 1

Episode: 0 Frame count: 0
 Average reward: 0
 Epsilon: 1.0 Loss: 0

Episode: 1 Frame count: 220
 Average reward: 2.0
 Epsilon: 0.9980199999999919 Loss: 0.034463200718164444

Episode: 2 Frame count: 356
 Average reward: 1.0
 Epsilon: 0.9967959999999869 Loss: 0.024348225444555283

Episode: 3 Frame count: 533
 Average reward: 1.0
 Epsilon: 0.9952029999999804 Loss: 0.00018115987768396735

Episode: 4 Frame count: 782
 Average reward: 1.5
 Epsilon: 0.9929619999999713 Loss: 0.00027264998061582446

Episode: 5 Frame count: 975
 Average reward: 1.4
 Epsilon: 0.9912249999999642 Loss: 0.0007751152152195573

Episode: 6 Frame count: 1149
 Average reward: 1.3333333333333333
 Epsilon: 0.9896589999999578 Loss: 0.0006592734716832638

Episode: 7 Frame count: 1415
 Average reward: 1.5714285714285714
 Epsilon: 0.987264999999948 Loss: 0.0009473289828747511

Episode: 8 Frame count: 1688
 Average reward: 1.75
 Epsilon: 0.984807999999938 Loss: 0.034237928688526154

Episode: 9 Frame count: 1839
 Average reward:

KeyboardInterrupt: 

In [9]:
# Save model weights; create the target directory first so the write does not
# fail on a fresh checkout.
os.makedirs('models', exist_ok=True)
model.save_weights('models/model_weights.h5')

Save video with the trained agent:

In [10]:
def create_video(env, model, folder="videos", name="video", max_steps=None):
    """Record one episode played greedily by ``model`` to ``<folder>/<name>.mp4``.

    Args:
        env: Environment to play in; its observations are passed to the model
            after adding a batch axis and moving axis 1 to the end.
        model: Callable returning Q-values for a batch of observations.
        folder: Directory the video is written to; created if missing.
        name: File name of the video, without the ``.mp4`` extension.
        max_steps: Optional upper bound on the number of environment steps.
            ``None`` (the default) plays until the episode ends.
    """
    if folder:
        os.makedirs(folder, exist_ok=True)
    video_recorder = VideoRecorder(env, path=os.path.join(folder, name + ".mp4"))

    try:
        s, _ = env.reset()
        done = False
        steps = 0

        while not done and (max_steps is None or steps < max_steps):
            video_recorder.capture_frame()
            s = tf.expand_dims(np.array(s), 0)
            s = tf.transpose(s, perm=[0, 2, 3, 1])
            action_probs = model(s, training=False)
            # Always take the action with the highest predicted Q-value
            action = tf.argmax(action_probs[0]).numpy()
            # The episode is over on termination OR truncation; ignoring the
            # truncation flag could keep this loop running indefinitely.
            s, _, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            steps += 1
    finally:
        # Finalise the video file even if the rollout fails or is interrupted
        video_recorder.close()

In [13]:
# Record one episode of the model acting greedily and write it to videos/video.mp4
create_video(env, model, folder="videos", name="video")

Moviepy - Building video videos/video.mp4.
Moviepy - Writing video videos/video.mp4



                                                                  

Moviepy - Done !
Moviepy - video ready videos/video.mp4


KeyboardInterrupt: 