1.DQN for CartPole

In [1]:
import gym
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
from collections import deque

In [3]:
#  Define the Q-Network
class DQN(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(DQN, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(state_dim, 24),
            nn.ReLU(),
            nn.Linear(24, 24),
            nn.ReLU(),
            nn.Linear(24, action_dim)
        )
        
    def forward(self, x):
        return self.fc(x)

In [5]:
# Experience Replay Buffer
class ReplayBuffer:
    def __init__(self, capacity=10000):
        self.buffer = deque(maxlen=capacity)

    def push(self, transition):
        self.buffer.append(transition)

    def sample(self, batch_size):
        return random.sample(self.buffer, batch_size)
    def __len__(self):
        return len(self.buffer)


In [6]:

# Hyperparameters
env = gym.make("CartPole-v1")
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n

gamma = 0.99
epsilon = 1.0
epsilon_decay = 0.995
epsilon_min = 0.01
lr = 1e-3
batch_size = 64
episodes = 500

In [7]:
# Initialize
policy_net = DQN(state_dim, action_dim)
target_net = DQN(state_dim, action_dim)
target_net.load_state_dict(policy_net.state_dict())

optimizer = optim.Adam(policy_net.parameters(), lr=lr)
replay_buffer = ReplayBuffer()

def choose_action(state):
    if random.random() < epsilon:
        return env.action_space.sample()
    else:
        state_tensor = torch.FloatTensor(state).unsqueeze(0)
        return policy_net(state_tensor).argmax().item()


In [None]:
# Training loop
for episode in range(episodes):
    state = env.reset()[0]
    total_reward = 0

    for t in range(200):
        action = choose_action(state)
        next_state, reward, done, _, _ = env.step(action)
        replay_buffer.push((state, action, reward, next_state, done))
        state = next_state
        total_reward += reward

        if len(replay_buffer) > batch_size:
            batch = replay_buffer.sample(batch_size)
            states, actions, rewards, next_states, dones = zip(*batch)

            states = torch.FloatTensor(states)
            actions = torch.LongTensor(actions).unsqueeze(1)
            rewards = torch.FloatTensor(rewards).unsqueeze(1)
            next_states = torch.FloatTensor(next_states)
            dones = torch.BoolTensor(dones).unsqueeze(1)

            q_values = policy_net(states).gather(1, actions)
            next_q_values = target_net(next_states).max(1)[0].unsqueeze(1)
            target_q = rewards + gamma * next_q_values * (~dones)

            loss = nn.MSELoss()(q_values, target_q)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        if done:
            break

    # Update epsilon
    epsilon = max(epsilon_min, epsilon * epsilon_decay)

    # Update target network
    if episode % 10 == 0:
        target_net.load_state_dict(policy_net.state_dict())

    print(f"Episode {episode}, Total Reward: {total_reward}, Epsilon: {epsilon:.2f}")

env.close()

  if not isinstance(terminated, (bool, np.bool8)):
  states = torch.FloatTensor(states)


Episode 0, Total Reward: 82.0, Epsilon: 0.99
Episode 1, Total Reward: 29.0, Epsilon: 0.99
Episode 2, Total Reward: 18.0, Epsilon: 0.99
Episode 3, Total Reward: 20.0, Epsilon: 0.98
Episode 4, Total Reward: 13.0, Epsilon: 0.98
Episode 5, Total Reward: 16.0, Epsilon: 0.97
Episode 6, Total Reward: 16.0, Epsilon: 0.97
Episode 7, Total Reward: 27.0, Epsilon: 0.96
Episode 8, Total Reward: 16.0, Epsilon: 0.96
Episode 9, Total Reward: 16.0, Epsilon: 0.95
Episode 10, Total Reward: 10.0, Epsilon: 0.95
Episode 11, Total Reward: 21.0, Epsilon: 0.94
Episode 12, Total Reward: 42.0, Epsilon: 0.94
Episode 13, Total Reward: 25.0, Epsilon: 0.93
Episode 14, Total Reward: 11.0, Epsilon: 0.93
Episode 15, Total Reward: 13.0, Epsilon: 0.92
Episode 16, Total Reward: 25.0, Epsilon: 0.92
Episode 17, Total Reward: 17.0, Epsilon: 0.91
Episode 18, Total Reward: 21.0, Epsilon: 0.91
Episode 19, Total Reward: 12.0, Epsilon: 0.90
Episode 20, Total Reward: 15.0, Epsilon: 0.90
Episode 21, Total Reward: 15.0, Epsilon: 0.9