# CartPole Agent using Deep Q-Learning



In [None]:
# Install required packages
!pip install gym torch numpy matplotlib

In [None]:
import gym
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
from collections import deque
import matplotlib.pyplot as plt

In [None]:
# Define Q-network
class DQN(nn.Module):
    """Small fully connected network mapping a state vector to per-action Q-values.

    The network is two hidden layers of 24 units with ReLU activations,
    followed by a linear output layer with one unit per action.
    """

    def __init__(self, state_dim, action_dim):
        """Build the layer stack.

        Args:
            state_dim: Number of input features per state.
            action_dim: Number of outputs (one Q-value per action).
        """
        super().__init__()
        hidden_units = 24
        # Layers are created in input-to-output order so parameter names
        # (fc.0, fc.2, fc.4) and initialisation order stay stable.
        layers = [
            nn.Linear(state_dim, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, action_dim),
        ]
        self.fc = nn.Sequential(*layers)

    def forward(self, x):
        """Return the Q-value estimates produced by the layer stack for ``x``."""
        q_values = self.fc(x)
        return q_values

In [None]:
# Training loop: hyperparameters, replay buffer, and epsilon-greedy DQN updates
def _replay(model, optimizer, criterion, minibatch, gamma):
    """Take one optimizer step per sampled transition toward its TD target."""
    for state_b, action_b, reward_b, next_state_b, done_b in minibatch:
        # The target is a constant for the regression, so it is built without
        # recording an autograd graph.
        with torch.no_grad():
            target_f = model(state_b).clone()
            target = reward_b
            if not done_b:
                # Bootstrap from the best next-state value for non-terminal steps.
                target = reward_b + gamma * torch.max(model(next_state_b)).item()
            # Only the taken action's Q-value is pushed toward the target; the
            # other outputs keep their current predictions (zero error).
            target_f[0][action_b] = target

        output = model(state_b)
        loss = criterion(output, target_f)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()


def train(
    *,
    episodes=500,
    gamma=0.99,
    epsilon=1.0,
    epsilon_min=0.01,
    epsilon_decay=0.995,
    batch_size=64,
    memory_size=10000,
    max_steps=200,
    lr=0.001,
):
    """Train a DQN agent on CartPole-v1 with epsilon-greedy exploration.

    Each episode collects transitions into a bounded replay memory; after the
    episode, one minibatch is sampled and the network is updated once per
    sampled transition. Progress is printed every 10 episodes.

    Args:
        episodes: Number of episodes to run.
        gamma: Discount factor applied to the bootstrapped next-state value.
        epsilon: Initial probability of taking a random action.
        epsilon_min: Lower bound for epsilon.
        epsilon_decay: Multiplicative decay applied to epsilon after each episode.
        batch_size: Number of transitions sampled per replay phase.
        memory_size: Maximum number of transitions kept in the replay memory.
        max_steps: Maximum number of environment steps per episode.
        lr: Learning rate for the Adam optimizer.

    Returns:
        A list with the total reward obtained in each episode.
    """
    scores = []
    env = gym.make("CartPole-v1")
    try:
        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.n

        model = DQN(state_dim, action_dim)
        optimizer = optim.Adam(model.parameters(), lr=lr)
        criterion = nn.MSELoss()
        model.train()

        memory = deque(maxlen=memory_size)

        for e in range(episodes):
            state = env.reset()[0]
            state = torch.FloatTensor(state).unsqueeze(0)
            total_reward = 0

            for _ in range(max_steps):
                if random.random() < epsilon:
                    action = env.action_space.sample()
                else:
                    with torch.no_grad():
                        action = torch.argmax(model(state)).item()

                next_state, reward, terminated, truncated, _ = env.step(action)
                next_state_tensor = torch.FloatTensor(next_state).unsqueeze(0)
                # Store only true termination as the terminal flag: a
                # time-limit truncation should still bootstrap from next_state.
                memory.append(
                    (state, action, reward, next_state_tensor, terminated)
                )
                state = next_state_tensor
                total_reward += reward

                # Stop on either signal; the env must be reset before it is
                # stepped again.
                if terminated or truncated:
                    break

            scores.append(total_reward)

            # Clamp so the decay never undershoots the configured floor.
            if epsilon > epsilon_min:
                epsilon = max(epsilon_min, epsilon * epsilon_decay)

            if len(memory) > batch_size:
                minibatch = random.sample(memory, batch_size)
                _replay(model, optimizer, criterion, minibatch, gamma)

            if e % 10 == 0:
                print(f"Episode {e}, Score: {total_reward}, Epsilon: {epsilon:.2f}")
    finally:
        # Release the environment even if training fails part-way through.
        env.close()

    return scores

# Run training when this cell executes and keep the per-episode total rewards
# returned by train() for the plot below.
scores = train()

In [None]:
# Plot the performance
# Draw the per-episode scores on the current axes, label them, then display.
ax = plt.gca()
ax.plot(scores)
ax.set_title('CartPole DQN Performance')
ax.set_xlabel('Episode')
ax.set_ylabel('Score')
ax.grid(True)
plt.show()