In [1]:
# Introduction
# In this project, I implement the Deep Q-Learning (DQN) algorithm to train an RL agent to play Breakout.
# We use the OpenAI Gym environment and PyTorch for this implementation.


In [2]:
# Setup Environment
import gym
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
from collections import deque
import matplotlib.pyplot as plt

In [3]:

# Select an Atari game
env = gym.make('Breakout-v0')

  logger.warn(


In [4]:
# Inspect the type and content of the initial state
state = env.reset()
print(f'Type of initial state: {type(state)}, Initial state: {state}')

Type of initial state: <class 'tuple'>, Initial state: (array([[[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        ...,
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]],

       [[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        ...,
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]],

       [[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        ...,
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]],

       ...,

       [[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        ...,
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]],

       [[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        ...,
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]],

       [[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        ...,
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]]], dtype=uint8), {'lives': 5, 'episode_frame_number': 0, 'frame_number': 0})


In [5]:
# Implement DQN
class DQN(nn.Module):
    def __init__(self, input_shape, num_actions):
        super(DQN, self).__init__()
        self.conv1 = nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)
        self.fc1 = nn.Linear(64 * 7 * 7, 512)
        self.fc2 = nn.Linear(512, num_actions)

    def forward(self, x):
        x = torch.relu(self.conv1(x))
        x = torch.relu(self.conv2(x))
        x = torch.relu(self.conv3(x))
        x = x.view(x.size(0), -1)
        x = torch.relu(self.fc1(x))
        return self.fc2(x)


In [6]:
class ReplayBuffer:
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        state, action, reward, next_state, done = zip(*random.sample(self.buffer, batch_size))
        return np.stack(state), action, reward, np.stack(next_state), done

    def __len__(self):
        return len(self.buffer)

In [9]:
def train_dqn(env, model, buffer, optimizer, batch_size, gamma, epsilon, epsilon_decay, min_epsilon, num_episodes):
    episode_rewards = []

    for episode in range(num_episodes):
        state = env.reset()
        if isinstance(state, tuple):  # Unpack if it's a tuple
            state, _ = state

        if len(state.shape) == 3 and state.shape[-1] in [1, 3]:  # If state is in (H, W, C) format
            state = np.transpose(state, (2, 0, 1))  # Convert to (C, H, W) format
        state = torch.FloatTensor(state).unsqueeze(0)
        total_reward = 0

        while True:
            if random.random() < epsilon:
                action = env.action_space.sample()
            else:
                with torch.no_grad():
                    q_values = model(state)
                    action = q_values.max(1)[1].item()

            next_state, reward, done, *info = env.step(action)  # Unpack info as well
            if isinstance(next_state, tuple):  # Unpack if it's a tuple
                next_state, _ = next_state

            if len(next_state.shape) == 3 and next_state.shape[-1] in [1, 3]:
                next_state = np.transpose(next_state, (2, 0, 1))  # Convert to (C, H, W) format
            next_state = torch.FloatTensor(next_state).unsqueeze(0)
            buffer.push(state, action, reward, next_state, done)

            if len(buffer) > batch_size:
                states, actions, rewards, next_states, dones = buffer.sample(batch_size)
                states = torch.FloatTensor(states)
                actions = torch.LongTensor(actions)
                rewards = torch.FloatTensor(rewards)
                next_states = torch.FloatTensor(next_states)
                dones = torch.FloatTensor(dones)

                # Ensure the sampled states have the correct shape
                states = states.squeeze(1)  # Remove the extra dimension
                next_states = next_states.squeeze(1)  # Remove the extra dimension

                current_q_values = model(states).gather(1, actions.unsqueeze(1)).squeeze(1)
                next_q_values = model(next_states).max(1)[0]
                target_q_values = rewards + gamma * next_q_values * (1 - dones)

                loss = nn.MSELoss()(current_q_values, target_q_values)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            state = next_state
            total_reward += reward

            if done:
                break

        episode_rewards.append(total_reward)
        epsilon = max(min_epsilon, epsilon * epsilon_decay)
        print(f'Episode: {episode}, Reward: {total_reward}, Epsilon: {epsilon}')

    return episode_rewards


In [10]:
# Training and Evaluation
num_actions = env.action_space.n
input_shape = (3, 210, 160)  # Shape of the input frame
gamma = 0.99
epsilon = 1.0
epsilon_decay = 0.995
min_epsilon = 0.01
batch_size = 32
num_episodes = 500
buffer_capacity = 10000

model = DQN(input_shape, num_actions)
buffer = ReplayBuffer(buffer_capacity)
optimizer = optim.Adam(model.parameters())

rewards = train_dqn(env, model, buffer, optimizer, batch_size, gamma, epsilon, epsilon_decay, min_epsilon, num_episodes)

plt.plot(rewards)
plt.xlabel('Episodes')
plt.ylabel('Rewards')
plt.title('Training Progress')
plt.show()


RuntimeError: view size is not compatible with input tensor's size and stride (at least one dimension spans across two contiguous subspaces). Use .reshape(...) instead.

In [None]:
# Conclusion
# In this project, I successfully implemented the DQN algorithm and trained an agent to play Breakout.
# The agent's performance improved over time as observed in the rewards plot.