# Deep Q-Networks (DQN) Implementation in Python

#### Deep Q-Networks (DQN)
Deep Q-Networks (DQN) is a reinforcement learning algorithm that combines Q-learning with deep neural networks. It uses a neural network to approximate the Q-values for each action in a given state, allowing it to handle environments with high-dimensional and continuous state spaces. DQN uses experience replay (storing past experiences and training on random batches) and a target network to stabilise training.

In [6]:
import gymnasium as gym 
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
from collections import deque

# Define the DQN model
class DQN(nn.Module):
    """Fully connected Q-network: input -> 64 -> 32 -> one value per action.

    Args:
        input_size: Number of features in a single observation.
        output_size: Number of outputs (one Q-value per action).
    """

    def __init__(self, input_size, output_size):
        super().__init__()
        first_width, second_width = 64, 32
        self.linear1 = nn.Linear(input_size, first_width)
        self.linear2 = nn.Linear(first_width, second_width)
        self.linear3 = nn.Linear(second_width, output_size)

    def forward(self, x):
        """Return the raw (unactivated) output of the final layer for ``x``."""
        # Both hidden layers are followed by a ReLU; the output layer is not.
        for hidden_layer in (self.linear1, self.linear2):
            x = torch.relu(hidden_layer(x))
        return self.linear3(x)
    
# Training configuration
env_name = "CartPole-v1"   # environment id passed to gym.make
gamma = 0.99               # discount factor applied to the next-state value
epsilon = 0.1              # probability of taking a random action
learning_rate = 1e-3       # step size for the Adam optimizer
batch_size = 32            # transitions sampled per optimisation step
buffer_size = 10_000       # maximum transitions kept in the replay buffer
target_update_freq = 100   # optimisation steps between target-network syncs

# Create the environment and read the network dimensions from its spaces
env = gym.make(env_name)
input_size = env.observation_space.shape[0]
output_size = env.action_space.n

# Run on the GPU when PyTorch can see one, otherwise on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Online network (trained) and target network (starts as a copy of the online one)
policy_net = DQN(input_size, output_size).to(device)
target_net = DQN(input_size, output_size).to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

# Only the online network's parameters are handed to the optimizer
optimizer = optim.Adam(params=policy_net.parameters(), lr=learning_rate)
criterion = nn.MSELoss()

# Bounded experience replay buffer: the oldest transitions drop off once it is full
replay_buffer = deque([], maxlen=buffer_size)
 
def train(num_episodes):
    """Run DQN training on the module-level ``env`` for ``num_episodes`` episodes.

    At every environment step an epsilon-greedy action is chosen from
    ``policy_net`` and the transition is appended to ``replay_buffer``. Once
    the buffer holds at least ``batch_size`` transitions, one optimisation
    step is performed on a uniformly sampled minibatch. ``target_net`` is
    re-synchronised with ``policy_net`` every ``target_update_freq``
    optimisation steps.

    Args:
        num_episodes: Number of episodes to run.

    Returns:
        None. The total reward of each episode is printed when it finishes.
    """
    step_count = 0  # counts optimisation steps, not environment steps

    for episode in range(num_episodes):
        # Newer APIs return (observation, info); older ones return the
        # observation alone.
        reset_result = env.reset()
        if isinstance(reset_result, tuple):
            state, _ = reset_result
        else:
            state = reset_result

        state = np.array(state)
        total_reward = 0
        episode_over = False

        while not episode_over:
            # Epsilon-greedy action selection
            if random.random() < epsilon:
                action = env.action_space.sample()
            else:
                with torch.no_grad():
                    state_tensor = torch.tensor(state, dtype=torch.float32, device=device)
                    action = policy_net(state_tensor).argmax().item()

            step_result = env.step(action)

            if len(step_result) == 5:
                next_state, reward, terminated, truncated, _ = step_result
            else:
                # The 4-tuple API exposes a single flag, so it is used as the
                # terminal signal as-is.
                next_state, reward, terminated, _ = step_result
                truncated = False

            # The episode loop ends on either signal...
            episode_over = terminated or truncated

            next_state = np.array(next_state)
            total_reward += reward

            # ...but only a true termination is stored. A truncated episode
            # (e.g. a time limit) did not reach a terminal state, so the
            # target below must still bootstrap from next_state.
            replay_buffer.append((state, action, reward, next_state, terminated))

            state = next_state

            if len(replay_buffer) < batch_size:
                continue

            batch = random.sample(replay_buffer, batch_size)
            states, actions, rewards, next_states, terminals = zip(*batch)

            states = torch.tensor(np.array(states), dtype=torch.float32, device=device)
            actions = torch.tensor(actions, dtype=torch.long, device=device)
            rewards = torch.tensor(rewards, dtype=torch.float32, device=device)
            next_states = torch.tensor(np.array(next_states), dtype=torch.float32, device=device)
            terminals = torch.tensor(terminals, dtype=torch.float32, device=device)

            # Q(s, a) of the actions that were actually taken
            current_q_values = policy_net(states).gather(1, actions.unsqueeze(1))

            # Target: r + gamma * max_a' Q_target(s', a'), with the bootstrap
            # term zeroed for terminal transitions. No gradient flows through
            # the target network.
            with torch.no_grad():
                next_q_values = target_net(next_states).max(1)[0]
            target_q_values = rewards + gamma * next_q_values * (1 - terminals)

            loss = criterion(current_q_values, target_q_values.unsqueeze(1))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Periodically copy the online weights into the target network
            step_count += 1
            if step_count % target_update_freq == 0:
                target_net.load_state_dict(policy_net.state_dict())

        print(f"Episode {episode+1}: Total reward = {total_reward}")

# Start training for 1000 episodes
train(1000)

Episode 1: Total reward = 10.0
Episode 2: Total reward = 10.0
Episode 3: Total reward = 12.0
Episode 4: Total reward = 9.0
Episode 5: Total reward = 11.0
Episode 6: Total reward = 8.0
Episode 7: Total reward = 10.0
Episode 8: Total reward = 10.0
Episode 9: Total reward = 9.0
Episode 10: Total reward = 10.0
Episode 11: Total reward = 10.0
Episode 12: Total reward = 11.0
Episode 13: Total reward = 10.0
Episode 14: Total reward = 11.0
Episode 15: Total reward = 10.0
Episode 16: Total reward = 11.0
Episode 17: Total reward = 10.0
Episode 18: Total reward = 9.0
Episode 19: Total reward = 12.0
Episode 20: Total reward = 10.0
Episode 21: Total reward = 11.0
Episode 22: Total reward = 10.0
Episode 23: Total reward = 9.0
Episode 24: Total reward = 10.0
Episode 25: Total reward = 9.0
Episode 26: Total reward = 9.0
Episode 27: Total reward = 10.0
Episode 28: Total reward = 10.0
Episode 29: Total reward = 10.0
Episode 30: Total reward = 9.0
Episode 31: Total reward = 10.0
Episode 32: Total reward 