In [None]:
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import random
from collections import deque, namedtuple
import matplotlib.pyplot as plt
from gymnasium.wrappers import RecordVideo

# Dueling DQN 网络（无均值归一化）
class DuelingDQN(nn.Module):
    """Dueling Q-network whose output is the plain sum V(s) + A(s, a).

    A shared feature trunk feeds two heads: a scalar state-value head and a
    per-action advantage head. The two outputs are added directly; the
    advantage is deliberately NOT mean-centred before the addition.

    Args:
        state_dim: Size of the observation vector fed to the first layer.
        action_dim: Number of discrete actions (width of the advantage head).
    """

    def __init__(self, state_dim, action_dim):
        super().__init__()
        trunk_width = 128

        # Shared feature trunk.
        self.feature_layer = nn.Sequential(
            nn.Linear(state_dim, trunk_width),
            nn.ReLU(),
            nn.Linear(trunk_width, trunk_width),
            nn.ReLU(),
        )
        # State-value head V(s): one output per state.
        self.value_stream = self._make_head(trunk_width, 1)
        # Advantage head A(s, a): one output per action.
        self.advantage_stream = self._make_head(trunk_width, action_dim)

    @staticmethod
    def _make_head(in_features, out_features):
        """Build a two-layer head with a 64-unit hidden layer."""
        hidden_width = 64
        return nn.Sequential(
            nn.Linear(in_features, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, out_features),
        )

    def forward(self, x):
        """Return Q-values for ``x`` as ``V(s) + A(s, a)``."""
        shared = self.feature_layer(x)
        state_value = self.value_stream(shared)
        advantages = self.advantage_stream(shared)
        # The single-column value broadcasts across the action dimension.
        return state_value + advantages

# 优先经验回放缓冲区
class PrioritizedReplayBuffer:
    """Fixed-capacity replay buffer with priority-proportional sampling.

    Transitions and their priorities live in two parallel ``deque`` objects
    bounded by ``capacity``; once full, appending a new transition drops the
    oldest one from both.

    Note:
        The indices returned by :meth:`sample` are positions in the deques at
        sampling time. Pushing into a full buffer shifts every position, so
        :meth:`update_priorities` must be called before the next :meth:`push`.

    Args:
        capacity: Maximum number of stored transitions.
        alpha: Exponent applied to priorities when building the sampling
            distribution (0 makes sampling uniform).
    """

    def __init__(self, capacity, alpha=0.6):
        self.capacity = capacity
        self.alpha = alpha  # priority exponent
        self.buffer = deque(maxlen=capacity)
        self.priorities = deque(maxlen=capacity)
        # Write counter modulo capacity. It must exist before the first
        # push(), which increments it; leaving it unset raised AttributeError.
        self.pos = 0
        self.experience = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "next_state", "done"],
        )

    def push(self, state, action, reward, next_state, done):
        """Store one transition, giving it the current maximum priority.

        New transitions get the largest stored priority (1.0 when the buffer
        is empty).
        """
        max_priority = max(self.priorities) if self.buffer else 1.0
        experience = self.experience(state, action, reward, next_state, done)

        self.buffer.append(experience)
        self.priorities.append(max_priority)
        self.pos = (self.pos + 1) % self.capacity

    def sample(self, batch_size, beta=0.4):
        """Draw ``batch_size`` transitions with probability ∝ priority**alpha.

        Args:
            batch_size: Number of transitions to draw (with replacement).
            beta: Exponent of the importance-sampling correction.

        Returns:
            A tuple ``(states, actions, rewards, next_states, dones, indices,
            weights)`` of NumPy arrays. ``weights`` are float32
            importance-sampling weights scaled so the largest equals 1.
        """
        priorities = np.array(self.priorities, dtype=np.float32)
        probs = priorities ** self.alpha
        probs /= probs.sum()

        indices = np.random.choice(len(self.buffer), batch_size, p=probs)
        experiences = [self.buffer[idx] for idx in indices]

        # Importance-sampling weights, normalised by their maximum.
        weights = (len(self.buffer) * probs[indices]) ** (-beta)
        weights /= weights.max()

        state, action, reward, next_state, done = zip(*experiences)
        return (np.array(state), np.array(action), np.array(reward), np.array(next_state), np.array(done),
                indices, np.array(weights, dtype=np.float32))

    def update_priorities(self, indices, priorities):
        """Overwrite the priorities at ``indices`` with the given values."""
        for idx, priority in zip(indices, priorities):
            self.priorities[idx] = priority + 1e-5  # keep priorities strictly positive

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)

# Dueling DQN 代理（结合 Double DQN 和 PER）
class DuelingDQNAgent:
    """Dueling DQN agent with Double-DQN targets and prioritized replay.

    Holds an online network, a periodically synchronised target network, an
    Adam optimizer and a ``PrioritizedReplayBuffer``. Exploration is
    epsilon-greedy; epsilon and the importance-sampling exponent ``beta`` are
    annealed once per successful call to :meth:`update`.

    Args:
        state_dim: Size of the observation vector.
        action_dim: Number of discrete actions.
    """

    def __init__(self, state_dim, action_dim):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Online and target networks start from identical weights.
        self.q_network = DuelingDQN(state_dim, action_dim).to(self.device)
        self.target_network = DuelingDQN(state_dim, action_dim).to(self.device)
        self.target_network.load_state_dict(self.q_network.state_dict())
        self.optimizer = optim.Adam(self.q_network.parameters(), lr=0.0005)

        # Hyperparameters
        self.gamma = 0.99             # discount factor
        self.epsilon = 1.0            # current exploration rate
        self.epsilon_min = 0.01       # exploration floor
        self.epsilon_decay = 0.999    # multiplicative decay applied per update
        self.batch_size = 64
        self.memory = PrioritizedReplayBuffer(10000)
        self.target_update_freq = 10  # target sync period, in environment steps
        self.beta = 0.4               # initial importance-sampling exponent
        self.beta_increment = 0.001   # added to beta per update, capped at 1.0

    def select_action(self, state):
        """Pick an action epsilon-greedily for a single (unbatched) state.

        With probability ``epsilon`` a uniformly random action is returned;
        otherwise the argmax of the online network's Q-values.
        """
        if random.random() < self.epsilon:
            return random.randrange(self.action_dim)
        state = torch.FloatTensor(state).to(self.device)
        with torch.no_grad():
            q_values = self.q_network(state)
        return q_values.argmax().item()

    def update(self, step):
        """Run one optimisation step on a prioritized minibatch.

        Does nothing until the buffer holds at least ``batch_size``
        transitions. Otherwise: samples a batch, builds Double-DQN targets,
        writes the absolute TD errors back as new priorities, takes one Adam
        step on the importance-weighted squared error, anneals epsilon and
        beta, and syncs the target network when ``step`` is a multiple of
        ``target_update_freq``.

        Args:
            step: Global environment step counter supplied by the caller.
        """
        if len(self.memory) < self.batch_size:
            return

        # Sample from the prioritized replay buffer.
        state, action, reward, next_state, done, indices, weights = self.memory.sample(self.batch_size, self.beta)
        state = torch.FloatTensor(state).to(self.device)
        action = torch.LongTensor(action).to(self.device)
        reward = torch.FloatTensor(reward).to(self.device)
        next_state = torch.FloatTensor(next_state).to(self.device)
        done = torch.FloatTensor(done).to(self.device)
        weights = torch.FloatTensor(weights).to(self.device)

        # Q(s, a) of the taken actions, from the online network.
        q_values = self.q_network(state).gather(1, action.unsqueeze(1)).squeeze(1)

        # Double DQN: the online network picks the next action, the target
        # network scores it. The target is a constant for the loss, so build
        # it without autograd; otherwise backward() would traverse the target
        # network and pile up gradients on parameters no optimizer ever clears.
        with torch.no_grad():
            next_actions = self.q_network(next_state).argmax(1, keepdim=True)
            next_q_values = self.target_network(next_state).gather(1, next_actions).squeeze(1)
            target = reward + (1 - done) * self.gamma * next_q_values

        # Absolute TD errors become the new priorities of the sampled items.
        td_errors = (q_values - target).abs().detach().cpu().numpy()
        self.memory.update_priorities(indices, td_errors)

        # Squared error weighted by the importance-sampling weights.
        loss = (nn.MSELoss(reduction='none')(q_values, target) * weights).mean()
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Anneal exploration rate and beta.
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)
        self.beta = min(1.0, self.beta + self.beta_increment)

        # Periodically copy the online weights into the target network.
        if step % self.target_update_freq == 0:
            self.target_network.load_state_dict(self.q_network.state_dict())

    def save_model(self, path="improved_dueling_dqn_cartpole.pth"):
        """Write the online network's state dict to ``path``."""
        torch.save(self.q_network.state_dict(), path)

    def load_model(self, path="improved_dueling_dqn_cartpole.pth"):
        """Load weights from ``path`` into both the online and target networks."""
        # map_location lets a checkpoint saved on GPU load on a CPU-only host.
        state_dict = torch.load(path, map_location=self.device)
        self.q_network.load_state_dict(state_dict)
        self.target_network.load_state_dict(self.q_network.state_dict())

# 训练函数
def train_dueling_dqn():
    """Train a ``DuelingDQNAgent`` on LunarLander-v3 and plot the rewards.

    Runs 1000 episodes of at most 500 steps each, updating the agent after
    every environment step. When training finishes, the model is saved with
    the agent's default path and the per-episode reward curve is written to
    ``improved_dueling_dqn_training_rewards.png``.

    Returns:
        tuple: ``(agent, scores)`` where ``scores`` is the list of total
        rewards, one entry per episode.
    """
    env = gym.make("LunarLander-v3")
    agent = DuelingDQNAgent(state_dim=env.observation_space.shape[0], action_dim=env.action_space.n)

    num_episodes = 1000
    max_steps = 500
    scores = []
    step = 0  # global step counter, drives the target-network sync

    try:
        for episode in range(num_episodes):
            state, _ = env.reset()
            episode_reward = 0

            for _step_in_episode in range(max_steps):
                action = agent.select_action(state)
                next_state, reward, terminated, truncated, _info = env.step(action)
                # Store only true termination: a time-limit truncation is not
                # a terminal state, so the target must still bootstrap from
                # next_state.
                agent.memory.push(state, action, reward, next_state, terminated)
                agent.update(step)
                state = next_state
                episode_reward += reward
                step += 1

                # Either signal ends the episode.
                if terminated or truncated:
                    break

            scores.append(episode_reward)
            if episode % 100 == 0:
                print(f"Episode {episode}/{num_episodes}, Reward: {episode_reward}, Epsilon: {agent.epsilon:.3f}")
    finally:
        # Release the environment even if training is interrupted.
        env.close()

    # Save the trained model.
    agent.save_model()

    # Plot the per-episode training reward curve.
    plt.figure(figsize=(10, 5))
    plt.plot(scores)
    plt.title("Training Rewards Over Episodes (Improved Dueling DQN)")
    plt.xlabel("Episode")
    plt.ylabel("Total Reward")
    plt.grid()
    plt.savefig("improved_dueling_dqn_training_rewards.png")
    plt.close()

    return agent, scores

# 测试和可视化
def test_dueling_dqn(agent):
    """Evaluate ``agent`` greedily on LunarLander-v3 while recording video.

    Runs 10 episodes of at most 500 steps each with exploration disabled.
    Every episode is recorded into the ``videos`` folder. The agent's
    original epsilon is restored before returning.

    Args:
        agent: Object exposing ``select_action(state)`` and a writable
            ``epsilon`` attribute.

    Returns:
        list: Total reward of each test episode.
    """
    env = gym.make("LunarLander-v3", render_mode="rgb_array")
    env = RecordVideo(env, video_folder="videos", episode_trigger=lambda x: True)

    saved_epsilon = agent.epsilon
    agent.epsilon = 0.0  # disable exploration during evaluation

    num_test_episodes = 10
    test_rewards = []

    try:
        for episode in range(num_test_episodes):
            state, _ = env.reset()
            episode_reward = 0
            finished = False
            t = 0

            while not finished and t < 500:
                action = agent.select_action(state)
                state, reward, terminated, truncated, _ = env.step(action)
                finished = terminated or truncated
                episode_reward += reward
                t += 1

            test_rewards.append(episode_reward)
            print(f"Test Episode {episode+1}, Reward: {episode_reward}")
    finally:
        # Always give the caller back its exploration rate and close the
        # recording wrapper, even when an episode raises.
        agent.epsilon = saved_epsilon
        env.close()

    return test_rewards

# 主函数
if __name__ == "__main__":
    # Script entry point: train, evaluate with video recording, then report.
    print("开始训练改进后的 Dueling DQN...")
    agent, scores = train_dueling_dqn()

    # Greedy evaluation with video recording.
    print("\n开始测试和可视化...")
    test_rewards = test_dueling_dqn(agent)

    # Summary statistics.
    avg_reward = np.mean(test_rewards)
    std_reward = np.std(test_rewards)
    print("\n模型评估：")
    print(f"平均测试奖励：{avg_reward:.2f} ± {std_reward:.2f}")
    print(f"最后 100 回合平均训练奖励：{np.mean(scores[-100:]):.2f}")
    print("训练奖励曲线已保存为 'improved_dueling_dqn_training_rewards.png'")
    print("测试视频已保存至 'videos' 文件夹")

    # Rate the result on the LunarLander scale, where 200 points counts as
    # solved. The previous thresholds (475/300) and messages described
    # CartPole and mislabelled a solved lander as mediocre. The 100-point
    # middle band is an arbitrary "halfway there" marker.
    if avg_reward >= 200:
        print("表现优秀！代理的平均奖励达到了 LunarLander 的解决标准 200。")
    elif avg_reward >= 100:
        print("表现良好！代理已接近解决标准 200，但还有改进空间。")
    else:
        print("表现一般。代理需要更多训练以提升性能。")

开始训练改进后的 Dueling DQN...
Episode 0/1000, Reward: -204.12318096323662, Epsilon: 0.948
Episode 100/1000, Reward: 266.309430861578, Epsilon: 0.010
Episode 200/1000, Reward: 272.61855883034536, Epsilon: 0.010
Episode 300/1000, Reward: 273.69067030753513, Epsilon: 0.010
Episode 400/1000, Reward: 288.7087505637114, Epsilon: 0.010
Episode 500/1000, Reward: 275.0519178867045, Epsilon: 0.010
Episode 600/1000, Reward: 284.21615728420375, Epsilon: 0.010
Episode 700/1000, Reward: -55.43972370006725, Epsilon: 0.010
Episode 800/1000, Reward: 57.18698442436812, Epsilon: 0.010
Episode 900/1000, Reward: 293.54411886000304, Epsilon: 0.010

开始测试和可视化...


  logger.warn(


Test Episode 1, Reward: 273.1900862975189
Test Episode 2, Reward: 262.0296100464012
Test Episode 3, Reward: 215.9819266426177
Test Episode 4, Reward: 155.30221958491722
Test Episode 5, Reward: 268.07588474504547
Test Episode 6, Reward: 271.9226480280818
Test Episode 7, Reward: 269.4135527644853
Test Episode 8, Reward: 274.2472508760417
Test Episode 9, Reward: 275.4015099810341
Test Episode 10, Reward: 265.4954523342444

模型评估：
平均测试奖励：253.11 ± 36.56
最后 100 回合平均训练奖励：233.91
训练奖励曲线已保存为 'improved_dueling_dqn_training_rewards.png'
测试视频已保存至 'videos' 文件夹
表现一般。代理需要更多训练以提升性能。


In [None]:
# !pip install "gymnasium[other]"