In [30]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import gym
from collections import deque
import random

In [31]:
# Define our neural network architecture for DQN
class DQN(nn.Module):
    """
    Deep Q-Network: maps state vectors to one Q-value per discrete action.

    The model is a multilayer perceptron with two ReLU hidden layers of equal width.
    In CartPole, we have 4 state values and 2 possible actions (left/right).

    Args:
        input_size: Number of features in a single state vector.
        output_size: Number of discrete actions; one Q-value is produced for each.
        hidden_size: Width of both hidden layers. Defaults to 64.
    """
    def __init__(self, input_size, output_size, hidden_size=64):
        super().__init__()
        # The layer order is kept fixed so the ``network.<index>`` state-dict keys
        # stay the same regardless of the chosen hidden width.
        self.network = nn.Sequential(
            nn.Linear(input_size, hidden_size),   # First hidden layer
            nn.ReLU(),                            # Activation function
            nn.Linear(hidden_size, hidden_size),  # Second hidden layer
            nn.ReLU(),
            nn.Linear(hidden_size, output_size),  # Output layer (Q-values for each action)
        )

    def forward(self, x):
        """Return the Q-values for ``x``; the last dimension of ``x`` must be ``input_size``."""
        return self.network(x)


In [32]:
class ReplayMemory:
    """
    Fixed-capacity buffer of past transitions used for experience replay.

    Drawing training batches uniformly at random from this buffer breaks the
    correlation between consecutive environment steps and stabilizes learning.
    """
    def __init__(self, capacity):
        # A bounded deque drops its oldest entry automatically once `capacity` is reached
        self.memory = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        # Each transition is kept as one (state, action, reward, next_state, done) tuple
        transition = (state, action, reward, next_state, done)
        self.memory.append(transition)

    def sample(self, batch_size):
        # Uniform draw without replacement; random.sample raises ValueError
        # when fewer than `batch_size` transitions are stored
        minibatch = random.sample(self.memory, batch_size)
        return minibatch

    def __len__(self):
        # Number of transitions currently held (never more than the capacity)
        stored = len(self.memory)
        return stored


In [33]:
class DQNAgent:
    """
    DQN Agent that interacts with and learns from the environment.

    It combines an epsilon-greedy behaviour policy, a replay memory of past
    transitions, and a separate target network that is only refreshed when
    `update_target_network` is called.

    Args:
        state_size: Number of features in a state vector.
        action_size: Number of discrete actions.
        gamma: Discount factor for future rewards.
        epsilon: Initial exploration rate.
        epsilon_min: Lower bound for the exploration rate.
        epsilon_decay: Multiplicative decay applied to epsilon after every
            optimisation step performed by `train`.
        learning_rate: Learning rate for the Adam optimizer.
        batch_size: Number of transitions sampled per optimisation step.
        memory_capacity: Maximum number of transitions kept in replay memory.
    """
    def __init__(self, state_size, action_size, gamma=0.99, epsilon=1.0,
                 epsilon_min=0.01, epsilon_decay=0.995, learning_rate=0.001,
                 batch_size=64, memory_capacity=10000):
        self.state_size = state_size
        self.action_size = action_size

        # Hyperparameters
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.learning_rate = learning_rate
        self.batch_size = batch_size

        # Create main network and target network (for stable learning);
        # the target starts as an exact copy of the policy network
        self.policy_net = DQN(state_size, action_size)
        self.target_net = DQN(state_size, action_size)
        self.target_net.load_state_dict(self.policy_net.state_dict())

        # Only the policy network is optimised; the target network is updated by copying
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=self.learning_rate)

        # Initialize replay memory
        self.memory = ReplayMemory(memory_capacity)

    @staticmethod
    def _to_tensor(values, dtype):
        """Stack a sequence of per-transition values into one tensor of the given NumPy dtype."""
        # Going through a single contiguous ndarray avoids the slow element-by-element
        # path PyTorch takes when handed a sequence of separate ndarrays.
        return torch.from_numpy(np.asarray(values, dtype=dtype))

    def select_action(self, state):
        """
        Select action using epsilon-greedy policy:
        - With probability epsilon: select random action (explore)
        - Otherwise: select best action according to policy network (exploit)

        Args:
            state: A single state as an array-like of `state_size` numbers.

        Returns:
            The chosen action index as an int.
        """
        if random.random() < self.epsilon:
            return random.randrange(self.action_size)

        with torch.no_grad():
            # Add a leading batch dimension of size 1 before the forward pass
            state_batch = self._to_tensor(state, np.float32).unsqueeze(0)
            q_values = self.policy_net(state_batch)
            return int(q_values.argmax(dim=1).item())

    def train(self):
        """
        Run one optimisation step on a batch sampled from replay memory.

        Nothing happens (and None is returned) while the memory holds fewer than
        `batch_size` transitions. Otherwise the policy network takes one gradient
        step on the MSE between Q(s, a) and the one-step Bellman target, and
        epsilon is decayed once.

        Returns:
            The loss of this step as a float, or None if no step was taken.
        """
        if len(self.memory) < self.batch_size:
            return None

        # Sample random batch from memory
        transitions = self.memory.sample(self.batch_size)

        # Convert batch-array of transitions to transition of batch-arrays
        states, actions, rewards, next_states, dones = zip(*transitions)

        # Column vectors (batch, 1) line up with the output of gather below
        states = self._to_tensor(states, np.float32)
        actions = self._to_tensor(actions, np.int64).unsqueeze(1)
        rewards = self._to_tensor(rewards, np.float32).unsqueeze(1)
        next_states = self._to_tensor(next_states, np.float32)
        dones = self._to_tensor(dones, np.float32).unsqueeze(1)

        # Q-value of the action that was actually taken in each transition
        current_q_values = self.policy_net(states).gather(1, actions)

        # Bellman target from the target network; no gradient flows through it.
        # (1 - dones) removes the bootstrap term for transitions flagged as done.
        with torch.no_grad():
            max_next_q_values = self.target_net(next_states).max(1)[0].unsqueeze(1)
            expected_q_values = rewards + (1 - dones) * self.gamma * max_next_q_values

        # Compute loss (Mean Squared Error between current and expected Q values)
        loss = nn.functional.mse_loss(current_q_values, expected_q_values)

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Decay epsilon for less exploration over time (once per optimisation step)
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

        return loss.item()

    def update_target_network(self):
        """Update target network by copying parameters from policy network"""
        self.target_net.load_state_dict(self.policy_net.state_dict())


In [34]:
def _reset_env(env, seed=None):
    """Reset `env` and return only the observation, for both legacy and current Gym APIs."""
    if seed is None:
        outcome = env.reset()
    else:
        try:
            outcome = env.reset(seed=seed)
        except TypeError:
            # Legacy Gym: reset() takes no seed argument, seeding goes through env.seed()
            env.seed(seed)
            outcome = env.reset()
    # Current API returns (observation, info); legacy API returns the observation alone
    if isinstance(outcome, tuple) and len(outcome) == 2 and isinstance(outcome[1], dict):
        return outcome[0]
    return outcome


def _step_env(env, action):
    """Step `env` and return (next_state, reward, terminated, truncated) for both Gym APIs."""
    outcome = env.step(action)
    if len(outcome) == 5:
        next_state, reward, terminated, truncated, _ = outcome
    else:
        # Legacy 4-tuple: the TimeLimit wrapper marks time-outs in the info dict
        next_state, reward, done, info = outcome
        truncated = bool(info.get("TimeLimit.truncated", False))
        terminated = bool(done) and not truncated
    return next_state, reward, bool(terminated), bool(truncated)


def train_agent(n_episodes=500, max_steps=500, target_update=10, seed=None):
    """
    Main training loop: train a DQN agent on CartPole-v1.

    Args:
        n_episodes: Number of episodes to run.
        max_steps: Upper bound on the number of steps taken per episode.
        target_update: The target network is synced every `target_update` episodes.
        seed: Optional seed applied to `random`, NumPy, PyTorch and the first
            environment reset. None leaves all random state untouched.

    Returns:
        List with the total reward collected in each episode.
    """
    if seed is not None:
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)

    # Create CartPole environment
    env = gym.make('CartPole-v1')

    try:
        # Initialize agent
        state_size = env.observation_space.shape[0]  # 4 for CartPole
        action_size = env.action_space.n             # 2 for CartPole
        agent = DQNAgent(state_size, action_size)

        # Lists to store rewards for plotting
        episode_rewards = []

        # Training loop
        for episode in range(n_episodes):
            # Seeding only the first reset keeps later episodes from replaying the same start
            state = _reset_env(env, seed if episode == 0 else None)
            total_reward = 0

            # Episode loop
            for _ in range(max_steps):
                # Select and perform action
                action = agent.select_action(state)
                next_state, reward, terminated, truncated = _step_env(env, action)

                # Store only true termination as the terminal flag: a time-limit
                # cut-off is not a terminal state, so its target must still bootstrap
                agent.memory.push(state, action, reward, next_state, terminated)

                # Train the network
                agent.train()

                # Update state and reward
                state = next_state
                total_reward += reward

                if terminated or truncated:
                    break

            # Update target network periodically
            if episode % target_update == 0:
                agent.update_target_network()

            episode_rewards.append(total_reward)

            # Print progress
            if (episode + 1) % 10 == 0:
                avg_reward = np.mean(episode_rewards[-10:])
                print(f"Episode {episode + 1}/{n_episodes}, Average Reward: {avg_reward:.2f}")
    finally:
        # Release the environment even if training is interrupted or raises
        env.close()

    return episode_rewards

# Entry point: run the full training loop when this cell/script is executed directly
# and keep the per-episode rewards returned by train_agent() in `rewards`.
if __name__ == "__main__":
    rewards = train_agent()

Episode 10/500, Average Reward: 21.30
Episode 20/500, Average Reward: 13.10
Episode 30/500, Average Reward: 11.70
Episode 40/500, Average Reward: 12.70
Episode 50/500, Average Reward: 66.80
Episode 60/500, Average Reward: 149.10
Episode 70/500, Average Reward: 171.80
Episode 80/500, Average Reward: 169.70
Episode 90/500, Average Reward: 173.80
Episode 100/500, Average Reward: 187.70
Episode 110/500, Average Reward: 167.50
Episode 120/500, Average Reward: 173.30
Episode 130/500, Average Reward: 188.80
Episode 140/500, Average Reward: 162.00
Episode 150/500, Average Reward: 174.40
Episode 160/500, Average Reward: 191.50
Episode 170/500, Average Reward: 269.90
Episode 180/500, Average Reward: 242.40
Episode 190/500, Average Reward: 206.20
Episode 200/500, Average Reward: 301.30
Episode 210/500, Average Reward: 138.60
Episode 220/500, Average Reward: 250.70
Episode 230/500, Average Reward: 253.70
Episode 240/500, Average Reward: 228.00
Episode 250/500, Average Reward: 207.30
Episode 260/50