In [None]:
# dqn_cartpole.py
import random
import math
import collections
from dataclasses import dataclass

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

# gym vs gymnasium compatibility:
try:
    import gymnasium as gym
except Exception:
    import gym

# -----------------------
# Config / Hyperparams
# -----------------------
@dataclass
class Config:
    """Hyperparameters and run settings for the DQN script.

    All fields carry defaults, so ``Config()`` yields the stock setup; the
    module-level ``cfg`` instance below is built that way.
    """

    # Environment and reproducibility
    env_name: str = "CartPole-v1"
    seed: int = 0

    # Learning
    gamma: float = 0.99
    lr: float = 1e-3
    batch_size: int = 64

    # Replay buffer sizing
    replay_size: int = 10_000
    min_replay_size: int = 1_000

    # Epsilon-greedy exploration
    eps_start: float = 1.0
    eps_end: float = 0.02
    eps_decay: int = 10_000  # number of steps over which epsilon decays

    # Schedules
    target_update_freq: int = 1_000  # counted in steps
    max_steps: int = 200_000
    eval_every: int = 5_000

    # Compute device (resolved once, when the class body is executed) and model width
    device: str = "cuda" if torch.cuda.is_available() else "cpu"
    hidden_size: int = 128


cfg = Config()

# -----------------------
# Utilities
# -----------------------
def set_seed(env, seed):
    """Seed the Python, NumPy and PyTorch RNGs and the given environment.

    Args:
        env: Environment object exposing ``reset`` and ``action_space``.
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    try:
        env.reset(seed=seed)
    except TypeError:
        # reset() does not accept ``seed``; use the legacy env.seed() hook
        # when the environment provides one.
        legacy_seed = getattr(env, "seed", None)
        if callable(legacy_seed):
            legacy_seed(seed)

    # Seeded separately so a failing reset(seed=...) cannot leave
    # env.action_space.sample() unseeded.
    try:
        env.action_space.seed(seed)
    except TypeError:
        # Best effort: the space's seed() does not take this argument.
        pass


In [None]:
# dqn_cartpole.py
import random
import math
import collections
from dataclasses import dataclass

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

# gym vs gymnasium compatibility:
try:
    import gymnasium as gym
except Exception:
    import gym

# -----------------------
# Config / Hyperparams
# -----------------------
@dataclass
class Config:
    """Hyperparameters and run settings for the DQN script.

    All fields carry defaults, so ``Config()`` yields the stock setup; the
    module-level ``cfg`` instance below is built that way.
    """

    # Environment and reproducibility
    env_name: str = "CartPole-v1"
    seed: int = 0

    # Learning
    gamma: float = 0.99
    lr: float = 1e-3
    batch_size: int = 64

    # Replay buffer sizing
    replay_size: int = 10_000
    min_replay_size: int = 1_000

    # Epsilon-greedy exploration
    eps_start: float = 1.0
    eps_end: float = 0.02
    eps_decay: int = 10_000  # number of steps over which epsilon decays

    # Schedules
    target_update_freq: int = 1_000  # counted in steps
    max_steps: int = 200_000
    eval_every: int = 5_000

    # Compute device (resolved once, when the class body is executed) and model width
    device: str = "cuda" if torch.cuda.is_available() else "cpu"
    hidden_size: int = 128


cfg = Config()

# -----------------------
# Utilities
# -----------------------
def set_seed(env, seed):
    """Seed the Python, NumPy and PyTorch RNGs and the given environment.

    Note that the global generators are reseeded on every call, so when this
    is called for several environments the last call determines their state.

    Args:
        env: Environment object exposing ``reset`` and ``action_space``.
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    try:
        env.reset(seed=seed)
    except TypeError:
        # reset() does not accept ``seed``; use the legacy env.seed() hook
        # when the environment provides one instead of leaving it unseeded.
        legacy_seed = getattr(env, "seed", None)
        if callable(legacy_seed):
            legacy_seed(seed)

    # Seeded in its own try block: previously a TypeError from reset() skipped
    # this call too, leaving env.action_space.sample() unseeded.
    try:
        env.action_space.seed(seed)
    except TypeError:
        # Best effort: the space's seed() does not take this argument.
        pass

# Simple replay buffer
# One stored environment step: state, action, reward, next state, done flag.
Transition = collections.namedtuple("Transition", "s a r s2 done")


class ReplayBuffer:
    """Bounded FIFO store of ``Transition`` records with uniform sampling.

    Backed by a ``deque`` with ``maxlen=capacity``, so appending to a full
    buffer discards the oldest entry.
    """

    def __init__(self, capacity):
        self.buffer = collections.deque(maxlen=capacity)

    def __len__(self):
        return len(self.buffer)

    def push(self, *args):
        """Append one transition; ``args`` are the ``Transition`` fields in order."""
        record = Transition(*args)
        self.buffer.append(record)

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions and return them column-wise.

        The result is a single ``Transition`` whose fields are tuples holding
        that field's value for every sampled record.
        """
        picked = random.sample(self.buffer, batch_size)
        columns = zip(*picked)
        return Transition(*columns)

# -----------------------
# Q-network
# -----------------------
class QNetwork(nn.Module):
    """MLP with two hidden ReLU layers that outputs one value per action.

    Args:
        obs_dim: Size of the input feature vector.
        n_actions: Number of outputs.
        hidden_size: Width of both hidden layers.
    """

    def __init__(self, obs_dim, n_actions, hidden_size=128):
        super().__init__()
        # Layers are created in forward order so parameter initialisation
        # consumes the RNG in the same sequence as the network is applied.
        layers = [
            nn.Linear(obs_dim, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, n_actions),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the stacked layers to ``x`` and return the result."""
        q_values = self.net(x)
        return q_values

# -----------------------
# Epsilon schedule
# -----------------------
class EpsilonGreedy:
    """Linearly interpolated epsilon schedule driven by an internal step counter.

    Epsilon moves from ``start`` to ``end`` over ``decay`` calls to
    ``step_inc`` and stays at ``end`` afterwards.

    Args:
        start: Epsilon at step 0.
        end: Epsilon once ``decay`` steps have elapsed.
        decay: Number of steps over which to interpolate. A value <= 0 means
            "no decay phase": ``value()`` returns ``end`` immediately.
    """

    def __init__(self, start, end, decay):
        self.start = start
        self.end = end
        self.decay = decay
        self.step = 0

    def value(self):
        """Return the epsilon for the current step count."""
        # Guard against division by zero (and a negative horizon, which would
        # push epsilon away from ``end``) when no decay period is configured.
        if self.decay <= 0:
            return self.end
        # linear decay, clamped once the horizon is reached
        frac = min(1.0, self.step / self.decay)
        return self.start + frac * (self.end - self.start)

    def step_inc(self):
        """Advance the schedule by one step."""
        self.step += 1

# -----------------------
# Training loop
# -----------------------
def compute_td_loss(batch, online_net, target_net, gamma, device):
    """Return the mean-squared one-step TD error for a sampled batch.

    Args:
        batch: Object with sequence attributes ``s``, ``a``, ``r``, ``s2`` and
            ``done`` (one entry per transition).
        online_net: Network being trained; supplies Q(s, a).
        target_net: Network used (without gradients) for the next-state value.
        gamma: Discount applied to the next-state value.
        device: Device on which the batch tensors are created.

    Returns:
        Scalar loss tensor attached to ``online_net``'s graph.
    """

    def as_states(values):
        # Stack through NumPy first, then convert once to a float32 tensor.
        return torch.tensor(np.array(values), dtype=torch.float32, device=device)

    def as_column(values, dtype):
        # Per-transition scalars become a (batch, 1) column.
        return torch.tensor(values, dtype=dtype, device=device).unsqueeze(-1)

    states = as_states(batch.s)
    actions = as_column(batch.a, torch.int64)
    rewards = as_column(batch.r, torch.float32)
    next_states = as_states(batch.s2)
    finished = as_column(batch.done, torch.float32)

    # Q(s, a) for the actions that were actually taken.
    predicted_q = online_net(states).gather(1, actions)

    with torch.no_grad():
        # Standard DQN: the target network both picks and scores the best
        # next action (Double DQN would select it with the online network).
        best_next_q = target_net(next_states).max(dim=1, keepdim=True).values
        # (1 - done) removes the bootstrap term for finished transitions.
        td_target = rewards + (1.0 - finished) * gamma * best_next_q

    return nn.functional.mse_loss(predicted_q, td_target)

def evaluate(env, net, episodes=5, device="cpu"):
    """Run greedy rollouts and return the mean undiscounted episode reward.

    Args:
        env: Environment whose ``reset()`` returns ``(obs, info)`` and whose
            ``step()`` returns ``(obs, reward, terminated, truncated, info)``.
        net: Callable mapping a ``(1, obs_dim)`` float tensor to action values.
        episodes: Number of episodes to average over.
        device: Device on which the observation tensor is created.

    Returns:
        Sum of all episode rewards divided by ``episodes``.
    """
    score_sum = 0.0
    for _ in range(episodes):
        observation, _info = env.reset()
        episode_return = 0.0
        # Every episode takes at least one step, so loop until it ends.
        while True:
            with torch.no_grad():
                state = torch.tensor(
                    observation, dtype=torch.float32, device=device
                ).unsqueeze(0)
                # Always take the action with the highest estimated value.
                greedy_action = int(net(state).argmax(dim=1).item())
            observation, reward, terminated, truncated, _info = env.step(greedy_action)
            episode_return += reward
            if terminated or truncated:
                break
        score_sum += episode_return
    return score_sum / episodes

def main():
    """Train a DQN agent on ``cfg.env_name``, log progress and save the weights.

    Workflow: create a training and an evaluation environment, build the
    online and target networks, pre-fill the replay buffer with
    ``cfg.min_replay_size`` random transitions, then run ``cfg.max_steps``
    epsilon-greedy steps with one gradient update per step. Every finished
    episode is logged; every 10th one also reports a 5-episode greedy
    evaluation. Finally the online network is evaluated over 20 episodes and
    its state dict is written to ``dqn_cartpole.pth``.
    """
    env = gym.make(cfg.env_name)
    test_env = gym.make(cfg.env_name)
    try:
        # set_seed() reseeds the global random/NumPy/PyTorch generators on
        # every call, so the training env is seeded last: that way cfg.seed
        # (not cfg.seed + 123) governs weight init and exploration.
        set_seed(test_env, cfg.seed + 123)
        set_seed(env, cfg.seed)

        obs_dim = env.observation_space.shape[0]
        n_actions = env.action_space.n

        online_net = QNetwork(obs_dim, n_actions, hidden_size=cfg.hidden_size).to(cfg.device)
        target_net = QNetwork(obs_dim, n_actions, hidden_size=cfg.hidden_size).to(cfg.device)
        # Start both networks from identical weights.
        target_net.load_state_dict(online_net.state_dict())

        optimizer = optim.Adam(online_net.parameters(), lr=cfg.lr)

        replay = ReplayBuffer(cfg.replay_size)
        eps_sched = EpsilonGreedy(cfg.eps_start, cfg.eps_end, cfg.eps_decay)

        # fill replay with random data
        obs, _ = env.reset()
        for _ in range(cfg.min_replay_size):
            action = env.action_space.sample()
            next_obs, reward, terminated, truncated, _ = env.step(action)
            # Store ``terminated`` rather than ``terminated or truncated``:
            # the stored flag switches off bootstrapping in the TD target,
            # which is only correct for real terminal states, not for
            # episodes cut short by a time limit.
            replay.push(obs, action, reward, next_obs, terminated)
            if terminated or truncated:
                obs, _ = env.reset()
            else:
                obs = next_obs

        obs, _ = env.reset()
        total_steps = 0
        episode_reward = 0.0
        episode = 0

        while total_steps < cfg.max_steps:
            eps = eps_sched.value()
            if random.random() < eps:
                action = env.action_space.sample()
            else:
                with torch.no_grad():
                    s = torch.tensor(obs, dtype=torch.float32, device=cfg.device).unsqueeze(0)
                    q = online_net(s)
                    action = int(q.argmax(dim=1).item())

            next_obs, reward, terminated, truncated, _ = env.step(action)
            # ``done`` ends the episode; only ``terminated`` is stored (see above).
            done = terminated or truncated
            replay.push(obs, action, reward, next_obs, terminated)
            episode_reward += reward
            obs = next_obs if not done else env.reset()[0]

            # learn (skipped until a full batch is available, which only
            # matters when cfg.min_replay_size < cfg.batch_size)
            if len(replay) >= cfg.batch_size:
                batch = replay.sample(cfg.batch_size)
                loss = compute_td_loss(batch, online_net, target_net, cfg.gamma, cfg.device)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            # update target
            if total_steps % cfg.target_update_freq == 0:
                target_net.load_state_dict(online_net.state_dict())

            eps_sched.step_inc()
            total_steps += 1

            if done:
                episode += 1
                if episode % 10 == 0:
                    avg_eval = evaluate(test_env, online_net, episodes=5, device=cfg.device)
                    print(f"Step {total_steps:6d}  Episode {episode:4d}  EpReward {episode_reward:.1f}  EvalAvg {avg_eval:.2f}  Eps {eps:.3f}")
                else:
                    print(f"Step {total_steps:6d}  Episode {episode:4d}  EpReward {episode_reward:.1f}  Eps {eps:.3f}")
                episode_reward = 0.0

        # final evaluation and save
        final_score = evaluate(test_env, online_net, episodes=20, device=cfg.device)
        print("Training finished. Final average score over 20 episodes:", final_score)
        torch.save(online_net.state_dict(), "dqn_cartpole.pth")
        print("Saved model to dqn_cartpole.pth")
    finally:
        # Release environment resources even if training is interrupted.
        env.close()
        test_env.close()

# Start training only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()
