In [2]:
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import random
from collections import deque
import matplotlib.pyplot as plt

# Select an Atari game (e.g., Breakout)
env_name = "Breakout-v0"
env = gym.make(env_name)

# Set random seed for reproducibility
seed = 42
env.seed(seed)
np.random.seed(seed)
random.seed(seed)
torch.manual_seed(seed)

# Define hyperparameters
learning_rate = 0.001
discount_factor = 0.99
batch_size = 32
memory_size = 10000
epsilon_initial = 1.0
epsilon_decay = 0.995
epsilon_min = 0.01

# Define the neural network model for DQN
class DQN(nn.Module):
    def __init__(self, num_actions):
        super(DQN, self).__init__()
        self.fc1 = nn.Linear(env.observation_space.shape[0], 64)
        self.fc2 = nn.Linear(64, 64)
        self.output_layer = nn.Linear(64, num_actions)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.output_layer(x)

# Initialize the DQN model
num_actions = env.action_space.n
model = DQN(num_actions)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Initialize experience replay buffer
replay_buffer = deque(maxlen=memory_size)

# Initialize epsilon for epsilon-greedy exploration
epsilon = epsilon_initial

# Define function to select an action using epsilon-greedy policy
def select_action(state):
    if np.random.rand() < epsilon:
        return np.random.choice(num_actions)
    else:
        state_tensor = torch.tensor(state, dtype=torch.float32)
        q_values = model(state_tensor)
        return torch.argmax(q_values).item()

# Define function to update the target network
def update_target_network(target_model, model):
    target_model.load_state_dict(model.state_dict())

# Define function to train the DQN agent
def train_dqn():
    target_model = DQN(num_actions)
    update_target_network(target_model, model)
    rewards = []

    for episode in range(1000):
        state = env.reset()
        total_reward = 0

        while True:
            result = env.step(action)
            if len(result) == 4:
                next_state, reward, done, _ = result
            else:
                next_state, reward, done = result
            total_reward += reward

            replay_buffer.append((state, action, reward, next_state, done))
            state = next_state

            if len(replay_buffer) >= batch_size:
                minibatch = random.sample(replay_buffer, batch_size)
                states, actions, rewards, next_states, dones = zip(*minibatch)

                states_tensor = torch.tensor(states, dtype=torch.float32)
                next_states_tensor = torch.tensor(next_states, dtype=torch.float32)
                rewards_tensor = torch.tensor(rewards, dtype=torch.float32)
                dones_tensor = torch.tensor(dones, dtype=torch.float32)

                target_q_values = rewards_tensor + discount_factor * torch.max(target_model(next_states_tensor), dim=1).values * (1 - dones_tensor)

                q_values = model(states_tensor)
                q_values = q_values.gather(1, torch.tensor(actions).unsqueeze(1)).squeeze()

                loss = nn.MSELoss()(q_values, target_q_values)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            if done:
                break

        rewards.append(total_reward)
        if epsilon > epsilon_min:
            epsilon *= epsilon_decay

        if episode % 10 == 0:
            update_target_network(target_model, model)
            print(f"Episode {episode}, Reward: {total_reward}, Epsilon: {epsilon:.4f}")

    return rewards

# Train the DQN agent
rewards = train_dqn()

# Plot the training progress
def plot_rewards(rewards):
    plt.plot(rewards)
    plt.xlabel('Episode')
    plt.ylabel('Total Reward')
    plt.title('Training Progress')
    plt.show()

plot_rewards(rewards)


  logger.warn(
  if not isinstance(terminated, (bool, np.bool8)):


ValueError: too many values to unpack (expected 4)