In [8]:
# Importing  necessary libraries
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import random
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
from torchvision import transforms

In [2]:
# Define the DQN Network
class DQN(nn.Module):
    def __init__(self, input_shape, num_actions):
        super(DQN, self).__init__()
        self.conv1 = nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)
        
        self.fc1 = nn.Linear(self.feature_size(input_shape), 512)
        self.fc2 = nn.Linear(512, num_actions)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        return self.fc2(x)
    
    def feature_size(self, input_shape):
        return self.conv3(self.conv2(self.conv1(torch.zeros(1, *input_shape)))).view(1, -1).size(1)


In [3]:

# Replay Buffer
class ReplayBuffer:
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)
    
    def add(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))
    
    def sample(self, batch_size):
        return random.sample(self.buffer, batch_size)


In [4]:

# Initialize environment, model, and replay buffer
env = gym.make('Breakout-v0')
input_shape = (4, 84, 84)  # example input dimensions, you may need to preprocess the input
num_actions = env.action_space.n
model = DQN(input_shape, num_actions)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
replay_buffer = ReplayBuffer(10000)


  logger.warn(


In [5]:

# Training function
def train_model(batch_size):
    if len(replay_buffer.buffer) < batch_size:
        return
    minibatch = replay_buffer.sample(batch_size)
    for state, action, reward, next_state, done in minibatch:
        state = torch.tensor(state, dtype=torch.float32)
        next_state = torch.tensor(next_state, dtype=torch.float32)
        action = torch.tensor([action], dtype=torch.long)
        reward = torch.tensor([reward], dtype=torch.float32)
        done = torch.tensor([done], dtype=torch.float32)

        q_values = model(state)
        next_q_values = model(next_state)
        q_value = q_values.gather(1, action.unsqueeze(1)).squeeze(1)
        next_q_value = next_q_values.max(1)[0]
        expected_q_value = reward + 0.99 * next_q_value * (1 - done)  # discount factor of 0.99

        loss = F.mse_loss(q_value, expected_q_value.detach())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()


In [7]:

# Main training loop
num_episodes = 500
batch_size = 32
episode_rewards = []

for episode in range(num_episodes):
    state = env.reset()
    total_reward = 0
    done = False
    while not done:
        action = env.action_space.sample()  
        next_state, reward, done, info = env.step(action)  
        replay_buffer.add(state, action, reward, next_state, done)
        state = next_state
        total_reward += reward
        train_model(batch_size)
    
    episode_rewards.append(total_reward)
    print(f"Episode {episode + 1}: Total Reward = {total_reward}")


ValueError: too many values to unpack (expected 4)

In [None]:

# Plot results
plt.plot(episode_rewards)
plt.xlabel('Episode')
plt.ylabel('Reward')
plt.title('Training Progress')
plt.show()
