* Human-level control through deep reinforcement learning - https://www.nature.com/articles/nature14236
* YouTube video - https://www.youtube.com/watch?v=atqneVERSMg&list=PLgMYKvjKE10UZNku-Qx7-z2PEC-7KLiUn&index=9

In [2]:
import torch
import torch.nn as nn
from tqdm import tqdm
import numpy as np
import random
import gymnasium as gym
import ale_py
gym.register_envs(ale_py)
import matplotlib.pyplot as plt
from stable_baselines3.common.buffers import ReplayBuffer
from stable_baselines3.common.atari_wrappers import MaxAndSkipEnv

In [3]:
class DQN(nn.Module):
    """Convolutional Q-network mapping a stack of 4 frames to one Q-value per action.

    The layer sizes assume 84x84 inputs: the first convolution (8x8, stride 4)
    produces 16x20x20 feature maps and the second (4x4, stride 2) produces
    32x9x9 = 2592 features, which is exactly what the first linear layer expects.

    Args:
        nb_actions: Number of discrete actions, i.e. the size of the output layer.
    """

    def __init__(self, nb_actions):
        super(DQN, self).__init__()
        self.network = nn.Sequential(
            nn.Conv2d(4, 16, 8, stride=4),
            nn.ReLU(),
            # Stride 2 is required here: it yields 9x9 maps (32 * 9 * 9 = 2592),
            # matching the in_features of the linear layer below.
            nn.Conv2d(16, 32, 4, stride=2),
            nn.ReLU(),
            nn.Flatten(),
            nn.Linear(2592, 256),
            nn.ReLU(),
            nn.Linear(256, nb_actions)
        )

    def forward(self, x):
        """Return Q-values of shape (batch, nb_actions) for a batch of frame stacks.

        Args:
            x: Tensor of shape (batch, 4, 84, 84) holding pixel intensities in
                [0, 255]; it is scaled to [0, 1] before entering the network.
        """
        return self.network(x / 255.0)  # Normalize input as Uint8

In [4]:
def Deep_Q_Learning(
        env, 
        batch_size = 32,
        M = 30_000_000,  #  48 to 72 hours of training on a modern GPU to converge (30_000_000)
        epsilon_start = 1.0,  # Random Action
        epsilon_end = 0.01,  # Best Action
        nb_exploration_steps = 1_000_000, 
        buffer_size=1_000_000,  # This one is important.
        gamma = 0.99,
        update_frequency = 4,
        training_start_it = 80_000,  # Don't train directly, let agent explore first with a random policy
        C = 10_000,
        device = 'cuda'
        ):
    """Train a DQN agent with experience replay and a periodically synced target network.

    Args:
        env: Gymnasium environment with a discrete action space whose ``info``
            dict exposes a ``'lives'`` entry (read after every step).
        batch_size: Number of transitions sampled from the replay buffer per
            gradient step.
        M: Environment-step budget. New episodes are started while the step
            counter is below ``M``; the last episode always runs to its end,
            so the final count may exceed ``M`` slightly.
        epsilon_start: Exploration rate at step 0.
        epsilon_end: Floor of the exploration rate.
        nb_exploration_steps: Number of steps over which epsilon is annealed
            linearly from ``epsilon_start`` to ``epsilon_end``.
        buffer_size: Capacity of the replay buffer.
        gamma: Discount factor used in the TD target.
        update_frequency: A gradient step is taken every ``update_frequency``
            environment steps.
        training_start_it: No gradient step is taken until the step counter
            exceeds this value.
        C: The target network is overwritten with the online network every
            ``C`` environment steps.
        device: Torch device used for both networks and sampled batches.

    Returns:
        None. As side effects the function shows a reward plot every 50,000
        steps and writes a checkpoint to the working directory each time an
        episode beats the best total reward seen so far.
    """
    
    # Initialize replay memory D to capacity N
    rb = ReplayBuffer(buffer_size,
                      env.observation_space,
                      env.action_space,
                      device=device,
                      n_envs=1,
                      optimize_memory_usage = True,
                      handle_timeout_termination = False)
    
    # Initialize action-value function Q with random weights
    q_network = DQN(env.action_space.n).to(device)
    # Initialize target action-value function Q_hat
    target_q_network = DQN(env.action_space.n).to(device)
    target_q_network.load_state_dict(q_network.state_dict())

    optimizer = torch.optim.Adam(q_network.parameters(), lr=1.25e-4)

    # To plot later
    smoothed_rewards = []
    rewards = []
    max_reward = 0

    epoch = 0

    progress_bar = tqdm(total=M)
    try:
        while epoch < M:

            # Initialize sequence s1 = {x1} and preprocessed sequence φ1 = φ(s1)
            # reset() returns (observation, info); keep them separate so that
            # `state` is always an array the network and the buffer can consume.
            state, info = env.reset()
            dead = False
            total_rewards = 0

            # Press action 1 a random number of times (1 to 30) before handing
            # control to the agent. Presumably action 1 is FIRE for this game;
            # confirm against the environment's action meanings.
            for _ in range(random.randint(1, 30)):
                state, _, terminated, truncated, info = env.step(1)
                if terminated or truncated:
                    state, info = env.reset()

            # For t=1, T do
            while not dead:
                epsilon = max((epsilon_end - epsilon_start) * epoch / nb_exploration_steps + epsilon_start, epsilon_end)

                if random.random() < epsilon: # With probability ε select a random action
                    action = np.array(env.action_space.sample())
                else: # Otherwise select the best action a = max_a Q(φ(s), a; θ)
                    with torch.no_grad():
                        state_tensor = torch.as_tensor(np.asarray(state), device=device).unsqueeze(0)
                        q = q_network(state_tensor)
                        # Wrap in an ndarray so both branches hand the same
                        # type to the replay buffer.
                        action = np.array(torch.argmax(q, dim = 1).item())

                # Execute action a in emulator and observe reward r and image x2
                current_life = info['lives']
                obs, reward, terminated, truncated, info = env.step(int(action))
                # The episode loop stops on termination and on truncation alike.
                dead = terminated or truncated

                # A lost life is stored as a terminal transition so the TD
                # target does not bootstrap across it.
                done = terminated or info['lives'] < current_life

                total_rewards += reward
                reward = np.sign(reward)  # Clip rewards to -1, 0, 1

                # set st+1 = st, at, xt+1 and preprocess φt+1 = φ(st+1)
                next_state = np.array(obs)

                # Store transition (φt, at, rt, φt+1) in D
                # The full frame stack is stored (not a single frame) so that
                # sampled observations have the shape the network expects.
                rb.add(np.asarray(state), next_state, action, reward, done, [info])

                if epoch > training_start_it and epoch % update_frequency == 0:
                    # Sample random minibatch of transitions (φj, aj, rj, φj+1) from D
                    batch = rb.sample(batch_size)
                    with torch.no_grad():
                        max_q_value_next_state = target_q_network(batch.next_observations).max(dim=1).values
                        y_j = batch.rewards.squeeze(-1) + gamma * max_q_value_next_state * (1 - batch.dones.squeeze(-1).float())

                    current_q_values = q_network(batch.observations).gather(1, batch.actions).squeeze(-1)

                    # Conventional (input, target) order; the loss value is
                    # the same either way because Huber loss is symmetric.
                    loss = torch.nn.functional.huber_loss(current_q_values, y_j)

                    # Perform a gradient descent step according to equation 3
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

                if (epoch % 50_000 == 0) and epoch > 0:
                    # Skip the data point when no episode finished in this
                    # window; averaging an empty list would yield NaN.
                    if rewards:
                        smoothed_rewards.append(np.mean(rewards))
                        rewards = []
                    plt.plot(smoothed_rewards)
                    plt.title("Average Reward on Breakout")
                    plt.xlabel("Training Epochs")
                    plt.ylabel("Average Reward per Episode")
                    plt.show()

                epoch += 1
                if epoch % C == 0:
                    target_q_network.load_state_dict(q_network.state_dict())

                state = obs
                progress_bar.update(1)

            rewards.append(total_rewards)

            if total_rewards > max_reward:
                max_reward = total_rewards
                # f-string so every new best score gets its own checkpoint file.
                torch.save(q_network.cpu(), f'target_q_network_{epoch}_{max_reward}.pth')
                q_network.to(device)
    finally:
        progress_bar.close()

In [None]:
# Build the Breakout environment.
# frameskip=1 disables the emulator's built-in frame skipping so that
# MaxAndSkipEnv is the only place where frames are skipped; it is applied to
# the raw frames, before resizing, grayscaling and frame stacking.
env = gym.make("ALE/Breakout-v5", 
               render_mode='human',
               frameskip=1,
               )
env = gym.wrappers.RecordEpisodeStatistics(env)
env = MaxAndSkipEnv(env, skip = 4)
env = gym.wrappers.ResizeObservation(env, (84, 84))
env = gym.wrappers.GrayscaleObservation(env)
env = gym.wrappers.FrameStackObservation(env, 4) # It was frame stack in video

# Fall back to the CPU when no CUDA device is available.
train_device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Short smoke run: with M = 1000 the step counter never exceeds the default
# training_start_it (80_000), so no gradient step is taken.
try:
    Deep_Q_Learning(env, device = train_device,
                    M = 1000,
                    buffer_size = 100_000)
finally:
    # Release the emulator and the render window even if training fails.
    env.close()

: 