In [1]:
#Importing libraries

import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import random
import matplotlib.pyplot as plt
from collections import deque
from torchvision import transforms



In [2]:
# Create the Atari environment
env = gym.make('Breakout-v0')

  logger.warn(


In [11]:
class DQN(nn.Module):
    def __init__(self, input_shape, num_actions):
        super(DQN, self).__init__()
        self.features = nn.Sequential(
            nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU()
        )
        self.fc = nn.Sequential(
            nn.Linear(self.feature_size(input_shape), 512),
            nn.ReLU(),
            nn.Linear(512, num_actions)
        )

    def forward(self, x):
        x = self.features(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        return x

    def feature_size(self, input_shape):
        return self.features(torch.zeros(1, *input_shape)).view(1, -1).size(1)


In [12]:
class ReplayBuffer:
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        state, action, reward, next_state, done = zip(*random.sample(self.buffer, batch_size))
        return np.array(state), action, reward, np.array(next_state), done

    def __len__(self):
        return len(self.buffer)

In [13]:
class DQNAgent:
    def __init__(self, model, num_actions, replay_buffer, batch_size=32, gamma=0.99, lr=0.0001):
        self.model = model
        self.num_actions = num_actions
        self.replay_buffer = replay_buffer
        self.batch_size = batch_size
        self.gamma = gamma
        self.optimizer = optim.Adam(model.parameters(), lr=lr)
        self.loss_fn = nn.MSELoss()

    def select_action(self, state, epsilon):
        if random.random() > epsilon:
            with torch.no_grad():
                return self.model(state).argmax(dim=1).item()
        else:
            return random.randrange(self.num_actions)

    def update(self):
        if len(self.replay_buffer) < self.batch_size:
            return

        states, actions, rewards, next_states, dones = self.replay_buffer.sample(self.batch_size)
        states = torch.FloatTensor(states).squeeze(1).to(device)
        actions = torch.LongTensor(actions).to(device)
        rewards = torch.FloatTensor(rewards).to(device)
        next_states = torch.FloatTensor(next_states).squeeze(1).to(device)
        dones = torch.FloatTensor(dones).to(device)

        states = states.view(self.batch_size, 4, 84, 84)
        next_states = next_states.view(self.batch_size, 4, 84, 84)

        q_values = self.model(states)
        next_q_values = self.model(next_states)

        q_value = q_values.gather(1, actions.unsqueeze(1)).squeeze(1)
        next_q_value = next_q_values.max(1)[0]
        expected_q_value = rewards + self.gamma * next_q_value * (1 - dones)

        loss = self.loss_fn(q_value, expected_q_value)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()


In [14]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
env = gym.make('Breakout-v0')
env.reset()

(array([[[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0],
         ...,
         [0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0],
         ...,
         [0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0],
         ...,
         [0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        ...,
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0],
         ...,
         [0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0],
         ...,
         [0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0],
         ...,
         [0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]]], dtype=uint8),
 {'lives': 5, 'episode_frame_number': 0, 'frame_number': 0})

In [17]:
input_shape = (4, 84, 84)  # The input is a stack of 4 frames
nb_actions = env.action_space.n
model = DQN(input_shape, nb_actions).to(device)

replay_buffer = ReplayBuffer(10000)
agent = DQNAgent(model, nb_actions, replay_buffer)

RuntimeError: Calculated padded input size per channel: (4 x 84 x 84). Kernel size: (8 x 8 x 8). Kernel size can't be greater than actual input size

In [8]:
def preprocess_frame(frame):
    transform = transforms.Compose([
        transforms.ToPILImage(),
        transforms.Grayscale(),
        transforms.Resize((84, 84)),
        transforms.ToTensor()
    ])
    return transform(frame).numpy()

In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [20]:
print(torch.cuda.is_available==True)

False


In [10]:
# Training parameters
num_episodes = 500
epsilon_start = 1.0
epsilon_final = 0.1
epsilon_decay = 50000
epsilon_by_frame = lambda frame_idx: epsilon_final + (epsilon_start - epsilon_final) * np.exp(-1. * frame_idx / epsilon_decay)


In [53]:
# Training loop
all_rewards = []
episode_reward = 0

# Handle env.reset() properly
initial_state = env.reset()
state = preprocess_frame(initial_state[0] if isinstance(initial_state, tuple) else initial_state)
state = np.stack([state] * 4, axis=0)  # Shape: (4, 84, 84)
state = torch.FloatTensor(state).unsqueeze(0).to(device)  # Shape: (1, 4, 84, 84)

for frame_idx in range(1, num_episodes * 10000 + 1):
    epsilon = epsilon_by_frame(frame_idx)
    action = agent.select_action(state, epsilon)
    
    # Unpack only the required values from env.step(action)
    step_result = env.step(action)
    next_frame = step_result[0]
    reward = step_result[1]
    done = step_result[2]
    
    next_state = preprocess_frame(next_frame[0] if isinstance(next_frame, tuple) else next_frame)
    next_state = np.append(state.cpu().numpy()[0, 1:, :, :], np.expand_dims(next_state, 0), axis=0)  # Shape: (4, 84, 84)
    
    replay_buffer.push(state.cpu().numpy(), action, reward, next_state, done)
    
    state = torch.FloatTensor(next_state).unsqueeze(0).to(device)  # Shape: (1, 4, 84, 84)
    episode_reward += reward

    if done:
        all_rewards.append(episode_reward)
        episode_reward = 0
        initial_state = env.reset()
        state = preprocess_frame(initial_state[0] if isinstance(initial_state, tuple) else initial_state)
        state = np.stack([state] * 4, axis=0)  # Shape: (4, 84, 84)
        state = torch.FloatFloatTensor(state).unsqueeze(0).to(device)  # Shape: (1, 4, 84, 84)

    agent.update()

    if frame_idx % 10000 == 0:
        print(f'Frame: {frame_idx}, Reward: {np.mean(all_rewards[-10:]):.2f}, Epsilon: {epsilon:.2f}')

RuntimeError: Expected 3D (unbatched) or 4D (batched) input to conv2d, but got input of size: [32, 4, 1, 84, 84]

In [None]:

# Plotting rewards
plt.plot(all_rewards)
plt.xlabel('Episode')
plt.ylabel('Reward')
plt.show()