## DQN

In [44]:
import sys
import os
project_root = os.popen('pwd').read().strip()
sys.path.append(project_root)

In [45]:
import torch
import gymnasium as gym
from model import DQN, Data
from const import *
from utils import set_random_seed
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm  # 引入 tqdm 进度条
import numpy as np
import csv

In [46]:
SAVE_PATH_PREFIX = "../log/dqn"
os.makedirs(SAVE_PATH_PREFIX, exist_ok=True); os.makedirs(f"{SAVE_PATH_PREFIX}/ckpt", exist_ok=True)


def train_dqn(env_name, episodes, batch_size, epsilon, min_epsilon, decay_rate):
    """
    使用 DQN 算法在指定环境中训练，并保存训练数据和模型
    """
    # 创建环境
    env = gym.make(env_name)
    num_states = env.observation_space.shape[0]
    num_actions = env.action_space.n

    # 设置随机种子
    set_random_seed(SEED)
    
    # 初始化 DQN 模型
    dqn = DQN(num_states, num_actions, DEVICE, LR, GAMMA, MEMORY_CAPACITY)
    writer = SummaryWriter(f'{SAVE_PATH_PREFIX}/{env_name}')  # TensorBoard SummaryWriter

    reward_list = []  # 用于保存每幕的奖励

    # 开始训练
    for episode in tqdm(range(episodes), desc=f"Training DQN on {env_name}", unit="episode"):
        state, _ = env.reset(seed=SEED)  # 重置环境
        total_reward = 0  # 每幕的总奖励

        while True:
            # epsilon-greedy 策略选择动作
            action = dqn.choose_action(state, epsilon)

            # 执行动作，获取下一个状态和奖励
            next_state, reward, done, _, _ = env.step(action)

            # 存储经验到经验池
            dqn.memory.set(Data(state, action, reward, next_state, done))

            # 累积奖励
            total_reward += reward

            # 开始学习
            if len(dqn.memory.buffer) >= MIN_CAPACITY:
                dqn.learn(batch_size)

            # 幕结束
            if done:
                writer.add_scalar('Reward', total_reward, global_step=episode)
                reward_list.append(total_reward)  # 记录奖励
                break

            # 更新状态
            state = next_state

        # 保存模型
        if (episode + 1) % SAVING_ITERATION == 0:
            dqn_path = f"{SAVE_PATH_PREFIX}/ckpt/dqn_{env_name}_{episode + 1}.pth"
            torch.save(dqn.eval_net.state_dict(), dqn_path)
            tqdm.write(f"Saved model: {dqn_path}")

        # 更新 epsilon
        epsilon = max(min_epsilon, epsilon * decay_rate)

    # 保存奖励数据为 CSV 文件
    csv_path = f"{SAVE_PATH_PREFIX}/reward_{env_name}.csv"
    with open(csv_path, mode='w', newline='') as f:
        csv_writer = csv.writer(f)  # 使用新的变量名 csv_writer
        csv_writer.writerow(["Episode", "Reward"])
        for i, reward in enumerate(reward_list):
            csv_writer.writerow([i + 1, reward])
    print(f"Reward data saved to {csv_path}")

    env.close()
    writer.close()  # 这里是 TensorBoard 的 SummaryWriter

In [47]:
# 训练 CartPole 环境
train_dqn(
    env_name="CartPole-v1",
    episodes=EPISODES,
    batch_size=BATCH_SIZE,
    epsilon=1.0,
    min_epsilon=EPSILON,
    decay_rate=0.995
)

model using device: cpu


Training DQN on CartPole-v1:  50%|█████     | 1000/2000 [08:53<07:03,  2.36episode/s] 

Saved model: ../log/dqn/ckpt/dqn_CartPole-v1_1000.pth


Training DQN on CartPole-v1:  67%|██████▋   | 1348/2000 [13:39<06:36,  1.64episode/s]  


KeyboardInterrupt: 

In [None]:
# 训练 MountainCar 环境
train_dqn(
    env_name="MountainCar-v0",
    episodes=EPISODES,
    batch_size=BATCH_SIZE,
    epsilon=1.0,
    min_epsilon=EPSILON,
    decay_rate=0.995
)

model using device: cpu


Training DQN on MountainCar-v0:   0%|          | 6/2000 [00:47<4:24:17,  7.95s/episode]


KeyboardInterrupt: 

In [None]:
# 训练 LunarLander 环境
train_dqn(
    env_name="LunarLander-v3",
    episodes=EPISODES,
    batch_size=BATCH_SIZE,
    epsilon=1.0,
    min_epsilon=EPSILON,
    decay_rate=0.995
)

model using device: cpu


Training DQN on LunarLander-v3:  50%|█████     | 1001/2000 [07:48<03:33,  4.67episode/s]  

Saved model: ../log/dqn/ckpt/dqn_LunarLander-v3_1000.pth


Training DQN on LunarLander-v3: 100%|██████████| 2000/2000 [10:16<00:00,  3.24episode/s]

Saved model: ../log/dqn/ckpt/dqn_LunarLander-v3_2000.pth





## Double DQN

In [None]:
import torch
import gymnasium as gym
from model import DoubleDQN, Data
from const import *
from utils import set_random_seed
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm
import numpy as np

In [None]:
SAVE_PATH_PREFIX = "../log/double_dqn"
os.makedirs(SAVE_PATH_PREFIX, exist_ok=True); os.makedirs(f"{SAVE_PATH_PREFIX}/ckpt", exist_ok=True)

def train_double_dqn(env_name, episodes, batch_size, epsilon, min_epsilon, decay_rate):
    """
    使用 Double DQN 算法在指定环境中训练，并保存训练数据和模型
    """
    # 创建环境
    env = gym.make(env_name)
    num_states = env.observation_space.shape[0]
    num_actions = env.action_space.n

    # 设置随机种子
    set_random_seed(SEED)

    # 初始化 Double DQN 模型
    double_dqn = DoubleDQN(num_states, num_actions, DEVICE, LR, GAMMA, MEMORY_CAPACITY)

    # 初始化 TensorBoard Writer
    writer = SummaryWriter(f'{SAVE_PATH_PREFIX}/{env_name}_double_dqn')

    reward_list = []  # 保存每幕的奖励

    # 开始训练
    for episode in tqdm(range(episodes), desc=f"Training Double DQN on {env_name}", unit="episode"):
        state, _ = env.reset(seed=SEED)
        total_reward = 0

        while True:
            # epsilon-greedy 策略选择动作
            action = double_dqn.choose_action(state, epsilon)

            # 执行动作
            next_state, reward, done, _, _ = env.step(action)

            # 存储经验
            double_dqn.memory.set(Data(state, action, reward, next_state, done))

            # 累积奖励
            total_reward += reward

            # 开始学习
            if len(double_dqn.memory.buffer) >= MIN_CAPACITY:
                double_dqn.learn(batch_size)

            if done:
                writer.add_scalar('Reward', total_reward, global_step=episode)
                reward_list.append(total_reward)  # 记录奖励
                break

            state = next_state

        # 保存模型
        if (episode + 1) % SAVING_ITERATION == 0:
            double_dqn_path = f"{SAVE_PATH_PREFIX}/ckpt/double_dqn_{env_name}_{episode + 1}.pth"
            torch.save(double_dqn.eval_net.state_dict(), double_dqn_path)
            tqdm.write(f"Saved model: {double_dqn_path}")

        # 更新 epsilon
        epsilon = max(min_epsilon, epsilon * decay_rate)
        
    # 保存奖励数据到 CSV 文件
    csv_path = f"{SAVE_PATH_PREFIX}/reward_{env_name}.csv"
    with open(csv_path, mode='w', newline='') as f:
        csv_writer = csv.writer(f)  # 使用新的变量名 csv_writer
        csv_writer.writerow(["Episode", "Reward"])
        for i, reward in enumerate(reward_list):
            csv_writer.writerow([i + 1, reward])
    print(f"Reward data saved to {csv_path}")
    
    env.close()
    writer.close()

In [None]:

# 训练 CartPole 环境
train_double_dqn(
    env_name="CartPole-v1",
    episodes=EPISODES,
    batch_size=BATCH_SIZE,
    epsilon=1.0,
    min_epsilon=EPSILON,
    decay_rate=0.995
)


model using device: cpu


Training Double DQN on CartPole-v1:  30%|███       | 607/2000 [03:48<08:44,  2.65episode/s]  


KeyboardInterrupt: 

In [None]:

# 训练 MountainCar 环境
train_double_dqn(
    env_name="MountainCar-v0",
    episodes=EPISODES,
    batch_size=BATCH_SIZE,
    epsilon=1.0,
    min_epsilon=EPSILON,
    decay_rate=0.995
)


In [None]:

# 训练 LunarLander 环境
train_double_dqn(
    env_name="LunarLander-v3",
    episodes=EPISODES,
    batch_size=BATCH_SIZE,
    epsilon=1.0,
    min_epsilon=EPSILON,
    decay_rate=0.995
)

## Dueling DQN

In [None]:
import torch
import gymnasium as gym
from src.model import DuelingDQN, Data
from src.const import *
from src.utils import set_random_seed
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm
import numpy as np

In [None]:
SAVE_PATH_PREFIX = "../log/dueling_dqn"
os.makedirs(SAVE_PATH_PREFIX, exist_ok=True); os.makedirs(f"{SAVE_PATH_PREFIX}/ckpt", exist_ok=True)
def train_dueling_dqn(env_name, episodes, batch_size, epsilon, min_epsilon, decay_rate):
    """
    使用 Dueling DQN 算法在指定环境中训练，并保存训练数据和模型
    """
    # 创建环境
    env = gym.make(env_name)
    num_states = env.observation_space.shape[0]
    num_actions = env.action_space.n

    # 设置随机种子
    set_random_seed(SEED)

    # 初始化 Dueling DQN 模型
    dueling_dqn = DuelingDQN(num_states, num_actions, DEVICE, LR, GAMMA, MEMORY_CAPACITY)

    # 初始化 TensorBoard Writer
    writer = SummaryWriter(f'{SAVE_PATH_PREFIX}/{env_name}_dueling_dqn')

    reward_list = []  # 记录奖励

    # 开始训练
    for episode in tqdm(range(episodes), desc=f"Training Dueling DQN on {env_name}", unit="episode"):
        state, _ = env.reset(seed=SEED)
        total_reward = 0

        while True:
            # epsilon-greedy 策略选择动作
            action = dueling_dqn.choose_action(state, epsilon)

            # 执行动作
            next_state, reward, done, _, _ = env.step(action)

            # 存储经验
            dueling_dqn.memory.set(Data(state, action, reward, next_state, done))

            # 累积奖励
            total_reward += reward

            # 开始学习
            if len(dueling_dqn.memory.buffer) >= MIN_CAPACITY:
                dueling_dqn.learn(batch_size)

            if done:
                writer.add_scalar('Reward', total_reward, global_step=episode)
                reward_list.append(total_reward)  # 记录奖励
                break

            state = next_state

        # 保存模型
        if (episode + 1) % SAVING_ITERATION == 0:
            dueling_dqn_path = f"{SAVE_PATH_PREFIX}/ckpt/dueling_dqn_{env_name}_{episode + 1}.pth"
            torch.save(dueling_dqn.eval_net.state_dict(), dueling_dqn_path)
            tqdm.write(f"Saved model: {dueling_dqn_path}")

        # 更新 epsilon
        epsilon = max(min_epsilon, epsilon * decay_rate)

    # 保存奖励数据到 CSV 文件
    csv_path = f"{SAVE_PATH_PREFIX}/reward_{env_name}.csv"
    with open(csv_path, mode='w', newline='') as f:
        csv_writer = csv.writer(f)  # 使用新的变量名 csv_writer
        csv_writer.writerow(["Episode", "Reward"])
        for i, reward in enumerate(reward_list):
            csv_writer.writerow([i + 1, reward])
    print(f"Reward data saved to {csv_path}")
    
    
    env.close()
    writer.close()


In [None]:
# 训练 CartPole 环境
train_dueling_dqn(
    env_name="CartPole-v1",
    episodes=EPISODES,
    batch_size=BATCH_SIZE,
    epsilon=1.0,
    min_epsilon=EPSILON,
    decay_rate=0.995
)


In [None]:

# 训练 MountainCar 环境
train_dueling_dqn(
    env_name="MountainCar-v0",
    episodes=EPISODES,
    batch_size=BATCH_SIZE,
    epsilon=1.0,
    min_epsilon=EPSILON,
    decay_rate=0.995
)


In [None]:

# 训练 LunarLander 环境
train_dueling_dqn(
    env_name="LunarLander-v3",
    episodes=EPISODES,
    batch_size=BATCH_SIZE,
    epsilon=1.0,
    min_epsilon=EPSILON,
    decay_rate=0.995
)

## Evaluate

In [None]:
from gymnasium.wrappers import RecordVideo

def evaluate_model(env_name, model_path, video_dir, num_episodes=10):
    """
    测试训练好的模型，并录制视频
    """
    # 创建环境并包装为视频录制
    env = gym.make(env_name, render_mode='rgb_array')
    env = RecordVideo(env, video_dir=video_dir, episode_trigger=lambda x: True)
    
    num_states = env.observation_space.shape[0]
    num_actions = env.action_space.n

    # 加载模型
    dqn = DQN(num_states, num_actions, DEVICE, LR, GAMMA, MEMORY_CAPACITY)
    dqn.eval_net.load_state_dict(torch.load(model_path))
    dqn.eval_net.eval()

    total_rewards = []

    # 测试模型
    for episode in range(num_episodes):
        state, _ = env.reset(seed=SEED)
        total_reward = 0

        while True:
            # 贪婪策略选择动作
            action = dqn.choose_action(state, epsilon=0.0)
            next_state, reward, done, _, _ = env.step(action)
            total_reward += reward

            if done:
                total_rewards.append(total_reward)
                break
            state = next_state

    env.close()

    # 打印平均奖励
    avg_reward = np.mean(total_rewards)
    print(f"Average reward over {num_episodes} episodes: {avg_reward}")
    return avg_reward