# In [None]:
# Part 1: 导入必要的库
import torch
import numpy as np
import matplotlib.pyplot as plt
from env import SatelliteEnv # 使用 env1.py
from agents.mappo_agent import MAPPOAgent 
import pandas as pd

# Part 2: Set up the environment and the training configuration
# Multi-agent satellite environment configured for the delay-sensitive service.
env = SatelliteEnv(service_type='delay_sensitive', multi_agent=True)

# Read one observation up front to size the agents: each entry contributes
# one agent (one per region), sized by the length of its 'cache_states'.
observations = env.get_observation()
obs_sizes = [len(region_obs['cache_states']) for region_obs in observations]
n_agents = len(obs_sizes)
action_dim = env.k_paths  # per-agent action space size

# Network / optimizer hyperparameters
hidden_dim = 64
learning_rate = 0.001

# Training schedule
episodes = 1000
steps_per_episode = 50
batch_size = 128
gamma = 0.99

# Exploration schedule: epsilon goes from start towards end, multiplied by
# the decay factor after every episode.
epsilon_start, epsilon_end, epsilon_decay = 1.0, 0.01, 0.995

# Prefer the GPU when one is available, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Build the MAPPO agent from the values gathered above.
agent_config = dict(
    n_agents=n_agents,
    obs_sizes=obs_sizes,
    action_dim=action_dim,
    hidden_dim=hidden_dim,
    learning_rate=learning_rate,
    device=device,
)
mappo_agent = MAPPOAgent(**agent_config)

# Part 3: Training loop
# Runs `episodes` episodes of at most `steps_per_episode` environment steps.
# Every step: pick actions, step the environment, store the transition, and
# update the agent once the replay buffer holds at least `batch_size` samples.
# Per-episode total reward goes to `episode_rewards`; per-step QoS samples go
# to `metrics_history`.
epsilon = epsilon_start
episode_rewards = []
metrics_history = []

for episode in range(episodes):
    observations = env.reset()
    episode_reward = 0
    episode_metrics = []

    for step in range(steps_per_episode):
        # Select actions; epsilon is handed to the agent as the exploration rate.
        actions = mappo_agent.select_action(observations, epsilon)

        # Apply the actions to the environment.
        # NOTE(review): `done` is stored with the transition but never ends the
        # episode early, so the env keeps being stepped after it reports done.
        # Confirm this is intended for SatelliteEnv.
        next_observations, reward, done = env.step(actions)

        # Store the transition.
        mappo_agent.store_experience(
            (observations, actions, reward, next_observations, done)
        )

        # Learn only once enough samples have been collected.
        if len(mappo_agent.replay_buffer.buffer) >= batch_size:
            mappo_agent.update(batch_size, gamma)

        episode_reward += reward
        observations = next_observations

        # Record QoS metrics for the first candidate path, if there is one.
        # The candidate paths are queried exactly once per step so that the
        # emptiness check and the lookup are guaranteed to see the same result
        # (and the path computation is not repeated).
        candidate_paths = env.get_candidate_paths(env.src, env.dst)
        if len(candidate_paths) > 0:
            path = candidate_paths[0]
            delay, packet_loss, delivery = env.calculate_qos_metrics(path)
            episode_metrics.append({
                'delay': delay,
                'packet_loss': packet_loss,
                'delivery': delivery,
            })

    # Decay epsilon once per episode, never dropping below epsilon_end.
    epsilon = max(epsilon_end, epsilon * epsilon_decay)

    # Book-keeping for the plots produced after training.
    episode_rewards.append(episode_reward)
    metrics_history.extend(episode_metrics)

    # Progress report every 10 episodes (mean over the last 10 episode rewards).
    if (episode + 1) % 10 == 0:
        print(f'Episode {episode+1}/{episodes}')
        print(f'Average Reward: {np.mean(episode_rewards[-10:])}')
        print(f'Epsilon: {epsilon:.3f}')

# Part 4: Visualize training results
# Reward curve: one point per episode.
plt.figure(figsize=(10, 5))
plt.plot(episode_rewards)
plt.title('Training Rewards')
plt.xlabel('Episode')
plt.ylabel('Reward')
plt.show()

# QoS curves: one point per recorded step, across all episodes.
metrics_df = pd.DataFrame(metrics_history)

if metrics_df.empty:
    # With no recorded samples the frame has no columns, so indexing
    # metrics_df['delay'] would raise KeyError; skip the plots instead.
    print('No QoS metrics were recorded; skipping QoS plots.')
else:
    # (column in metrics_df, panel title, y-axis label), left to right.
    qos_panels = (
        ('delay', 'End-to-End Delay', 'Delay (s)'),
        ('packet_loss', 'Packet Loss Rate', 'Loss Rate'),
        ('delivery', 'Delivery Rate', 'Delivery Rate'),
    )

    plt.figure(figsize=(15, 5))
    for position, (column, title, ylabel) in enumerate(qos_panels, start=1):
        # One row, one panel per metric; subplot positions are 1-based.
        plt.subplot(1, len(qos_panels), position)
        plt.plot(metrics_df[column])
        plt.title(title)
        plt.xlabel('Step')
        plt.ylabel(ylabel)

    plt.tight_layout()
    plt.show()

# Save the model
# The checkpoint is a dict with the state_dict of every actor (in order, under
# 'actors') and the state_dict of the critic (under 'critic').
checkpoint_path = 'mappo_model_final.pth'
checkpoint = dict(
    actors=[net.state_dict() for net in mappo_agent.actors],
    critic=mappo_agent.critic.state_dict(),
)
torch.save(checkpoint, checkpoint_path)