In [None]:
# This Actor-Critic implementation is based on Monte Carlo returns (full-episode discounted returns, no bootstrapping).

# 导入依赖库
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
from torch.distributions.categorical import Categorical

# Select the compute device: CUDA when it is available, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Actor network: maps a state to a probability distribution over actions
class ActorNetwork(nn.Module):
    """Policy network: a three-layer MLP whose output is softmax-normalised.

    The layers live in ``self.network`` (an ``nn.Sequential``) in the order
    Linear -> ReLU -> Linear -> ReLU -> Linear -> Softmax, with widths
    ``input_dim -> hidden_dim -> hidden_dim // 2 -> output_dim``.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        half_dim = hidden_dim // 2
        layers = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, half_dim),
            nn.ReLU(),
            nn.Linear(half_dim, output_dim),
            # Normalise over the last axis so the outputs sum to one.
            nn.Softmax(dim=-1),
        ]
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Return the action probabilities produced for input ``x``."""
        probabilities = self.network(x)
        return probabilities

# Critic network: maps a state to a single scalar value estimate
class CriticNetwork(nn.Module):
    """Value network: a three-layer MLP with one unbounded output unit.

    The layers live in ``self.network`` (an ``nn.Sequential``) in the order
    Linear -> ReLU -> Linear -> ReLU -> Linear, with widths
    ``input_dim -> hidden_dim -> hidden_dim // 2 -> 1``.
    """

    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        half_dim = hidden_dim // 2
        layers = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, half_dim),
            nn.ReLU(),
            nn.Linear(half_dim, 1),
        ]
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Return the value estimate produced for input ``x``."""
        state_value = self.network(x)
        return state_value

# Actor-Critic agent class
class ActorCriticAgent:
    """Monte Carlo actor-critic agent for discrete-action Gymnasium tasks.

    The agent owns a training environment, separate actor and critic
    networks with one Adam optimizer each, and per-episode training
    statistics. All tensors and networks are placed on the module-level
    ``device``.

    Attributes:
        env_name: Environment id passed to ``gym.make``; reused by ``test``.
        env: Training environment (created without rendering).
        state_dim: Length of the 1-D observation vector.
        action_dim: Number of discrete actions.
        rewards_history: Undiscounted total reward of every training episode.
        actor_losses: Actor loss of every training episode.
        critic_losses: Critic loss of every training episode.
    """

    def __init__(self, env_name="Acrobot-v1", hidden_dim=256, lr_actor=0.0005, lr_critic=0.001):
        """Create the environment, the two networks and their optimizers.

        Args:
            env_name: Gymnasium environment id.
            hidden_dim: Width of the first hidden layer of both networks.
            lr_actor: Adam learning rate for the actor.
            lr_critic: Adam learning rate for the critic.
        """
        # Remember the id so that test() evaluates on the same task.
        self.env_name = env_name
        self.env = gym.make(env_name)
        self.state_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.n

        # Independent networks: the actor and critic share no parameters.
        self.actor = ActorNetwork(self.state_dim, hidden_dim, self.action_dim).to(device)
        self.critic = CriticNetwork(self.state_dim, hidden_dim).to(device)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=lr_actor)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=lr_critic)

        # Per-episode training statistics.
        self.rewards_history = []
        self.actor_losses = []
        self.critic_losses = []

    # ------------------------------------------------------------------
    # Persistence
    # ------------------------------------------------------------------
    def _checkpoint_state(self):
        """Return the dict of network and optimizer state dicts to persist."""
        return {
            'actor_state_dict': self.actor.state_dict(),
            'critic_state_dict': self.critic.state_dict(),
            'actor_optimizer': self.actor_optimizer.state_dict(),
            'critic_optimizer': self.critic_optimizer.state_dict()
        }

    def save_checkpoint(self, episode, filename_prefix="acrobot_ac_checkpoint"):
        """Save an intermediate checkpoint to ``{filename_prefix}_{episode}.pth``."""
        filename = f"{filename_prefix}_{episode}.pth"
        torch.save(self._checkpoint_state(), filename)
        print(f"Saved checkpoint: {filename}")

    def save_model(self, filename="acrobot_ac_final.pth"):
        """Save the final network and optimizer states to ``filename``."""
        torch.save(self._checkpoint_state(), filename)
        print(f"Final model weights saved to {filename}")

    def load_model(self, filename="acrobot_ac_final.pth"):
        """Load network and optimizer states from ``filename``.

        Returns:
            True when the file was loaded, False when it does not exist.
            Any other failure (corrupt file, mismatched architecture)
            propagates as an exception.
        """
        try:
            # NOTE(review): torch.load unpickles the file; only load
            # checkpoints that come from a trusted source.
            checkpoint = torch.load(filename, map_location=device)
        except FileNotFoundError:
            print(f"Error: Model weights file {filename} not found")
            return False
        self.actor.load_state_dict(checkpoint['actor_state_dict'])
        self.critic.load_state_dict(checkpoint['critic_state_dict'])
        self.actor_optimizer.load_state_dict(checkpoint['actor_optimizer'])
        self.critic_optimizer.load_state_dict(checkpoint['critic_optimizer'])
        print(f"Loaded model weights: {filename}")
        return True

    # ------------------------------------------------------------------
    # Training
    # ------------------------------------------------------------------
    def _collect_episode(self):
        """Run one episode with the current policy.

        Returns:
            ``(log_probs, values, rewards)``: per-step lists holding the log
            probability of the sampled action, the critic output for the
            visited state, and the environment reward.
        """
        state, _ = self.env.reset()
        log_probs, values, rewards = [], [], []
        done = False
        while not done:
            state_tensor = torch.as_tensor(state, dtype=torch.float32, device=device)
            policy = Categorical(self.actor(state_tensor))
            values.append(self.critic(state_tensor))
            action = policy.sample()
            log_probs.append(policy.log_prob(action))

            state, reward, terminated, truncated, _ = self.env.step(action.item())
            rewards.append(reward)
            done = terminated or truncated
        return log_probs, values, rewards

    @staticmethod
    def _discounted_returns(rewards, gamma):
        """Return the discounted return G_t for every step, in time order."""
        returns = []
        running = 0.0
        # Accumulate backwards, then reverse once: O(n) instead of the
        # O(n^2) cost of inserting at the front of a list on every step.
        for reward in reversed(rewards):
            running = reward + gamma * running
            returns.append(running)
        returns.reverse()
        return returns

    def _compute_losses(self, log_probs, values, rewards, gamma):
        """Build the actor and critic losses for one finished episode.

        Both losses are summed over the time steps of the episode.
        """
        returns = torch.tensor(self._discounted_returns(rewards, gamma),
                               device=device, dtype=torch.float32)
        if returns.numel() > 1:
            returns = (returns - returns.mean()) / (returns.std() + 1e-9)
        else:
            # The unbiased std of a single value is NaN and would poison
            # both losses; a one-step episode is only centred (to zero).
            returns = returns - returns.mean()

        log_probs = torch.stack(log_probs)   # one scalar per step -> (T,)
        values = torch.cat(values)           # one (1,) output per step -> (T,)

        # The advantage is detached so the actor loss sends no gradient
        # into the critic.
        advantages = returns - values.detach()
        actor_loss = -(log_probs * advantages).sum()
        # Squared error between the normalised return and the critic estimate.
        critic_loss = (returns - values).pow(2).sum()
        return actor_loss, critic_loss

    def train(self, n_episodes=2000, gamma=0.99):
        """Train the actor and critic with full-episode Monte Carlo returns.

        Clears the recorded statistics, runs ``n_episodes`` episodes with
        one update of each network per episode, prints progress every 50
        episodes, saves a checkpoint every 500 episodes and saves the final
        model at the end.

        Args:
            n_episodes: Number of training episodes.
            gamma: Discount factor used for the returns.
        """
        print("Starting training...")
        self.rewards_history = []
        self.actor_losses = []
        self.critic_losses = []

        for episode in range(n_episodes):
            log_probs, values, rewards = self._collect_episode()
            actor_loss, critic_loss = self._compute_losses(log_probs, values, rewards, gamma)

            # Update the actor.
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # Update the critic.
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            self.critic_optimizer.step()

            total_reward = sum(rewards)
            self.rewards_history.append(total_reward)
            self.actor_losses.append(actor_loss.item())
            self.critic_losses.append(critic_loss.item())

            # Periodic progress report.
            if episode % 50 == 0:
                print(f"Episode {episode:4d}: Reward = {total_reward:6.2f}, "
                      f"Avg Reward (last 50) = {np.mean(self.rewards_history[-50:]):6.2f}, "
                      f"Actor Loss = {actor_loss.item():.4f}, Critic Loss = {critic_loss.item():.4f}")

            # Periodic checkpoint (skipping episode 0).
            if episode % 500 == 0 and episode > 0:
                self.save_checkpoint(episode)

        # Save the final model.
        self.save_model()

    # ------------------------------------------------------------------
    # Reporting
    # ------------------------------------------------------------------
    @staticmethod
    def _moving_average(values, window):
        """Return ``(x, y)`` for the trailing moving average of ``values``.

        ``x`` holds the index of the last sample of each window, so the
        curve lines up with the raw series. Both arrays are empty when
        there are fewer than ``window`` samples.
        """
        series = np.asarray(values, dtype=float)
        if window < 1 or series.size < window:
            return np.empty(0), np.empty(0)
        smoothed = np.convolve(series, np.ones(window) / window, mode='valid')
        return np.arange(window - 1, series.size), smoothed

    def _plot_curve(self, position, values, label, title, ylabel, window=100):
        """Draw one raw series and its moving average in subplot ``position``."""
        plt.subplot(1, 3, position)
        plt.plot(values, label=label, alpha=0.5)
        xs, smoothed = self._moving_average(values, window)
        # With fewer than `window` points there is no valid average to draw.
        if smoothed.size:
            plt.plot(xs, smoothed, label=f"Moving Avg ({window})", color='red')
        plt.title(title)
        plt.xlabel("Episode")
        plt.ylabel(ylabel)
        plt.legend()

    def plot_training_results(self):
        """Plot reward, actor-loss and critic-loss curves and save them.

        The figure is written to ``training_results.png`` and then shown.
        """
        plt.figure(figsize=(15, 5))
        self._plot_curve(1, self.rewards_history, "Episode Reward",
                         "Training Reward Curve", "Total Reward")
        self._plot_curve(2, self.actor_losses, "Actor Loss",
                         "Actor Loss Curve", "Loss")
        self._plot_curve(3, self.critic_losses, "Critic Loss",
                         "Critic Loss Curve", "Loss")

        plt.tight_layout()
        plt.savefig("training_results.png")
        plt.show()
        print("Training results saved as training_results.png")

    @staticmethod
    def _report_performance(avg_reward):
        """Print the performance tier that ``avg_reward`` falls into."""
        if avg_reward > -100:
            print("Performance: Excellent - Solved Acrobot (threshold > -100)")
        elif avg_reward > -200:
            print("Performance: Good - Stable but not optimal")
        else:
            print("Performance: Needs improvement")

    def evaluate_training(self):
        """Print the average reward of the last 100 training episodes."""
        print("\nTraining Evaluation:")
        if not self.rewards_history:
            # np.mean of an empty list is NaN (with a warning); say so instead.
            print("No training episodes recorded; nothing to evaluate.")
            return
        avg_train_reward = np.mean(self.rewards_history[-100:])
        print(f"Average Reward (Last 100 Episodes): {avg_train_reward:.2f}")
        self._report_performance(avg_train_reward)

    def test(self, num_episodes=5):
        """Run the saved policy in a rendered environment and log every step.

        Loads the default final-model file, then plays ``num_episodes``
        episodes, sampling actions from the actor.

        Args:
            num_episodes: Number of test episodes to play.

        Returns:
            A list with the total reward of each episode, or an empty list
            when the model file could not be found.
        """
        # Load first so that no render window is opened (and leaked) when
        # there is nothing to test.
        if not self.load_model():
            return []

        test_env = gym.make(self.env_name, render_mode="human")
        self.actor.eval()
        self.critic.eval()
        episode_rewards = []
        print("\nStarting testing...")

        try:
            # Inference only: no autograd graph is needed.
            with torch.no_grad():
                for episode in range(num_episodes):
                    state, _ = test_env.reset()
                    done = False
                    total_reward = 0
                    step = 0
                    test_env.render()
                    print(f"\nTest Episode {episode}:")
                    print("Step | State (cosθ1, sinθ1, cosθ2, sinθ2) | Action | Prob Left | Prob None | Prob Right | Reward | State Value")
                    print("-" * 110)

                    while not done:
                        state_tensor = torch.as_tensor(state, dtype=torch.float32, device=device)
                        action_probs = self.actor(state_tensor)
                        value = self.critic(state_tensor)
                        action = Categorical(action_probs).sample()

                        next_state, reward, terminated, truncated, _ = test_env.step(action.item())
                        done = terminated or truncated
                        total_reward += reward

                        # Log the state the action was chosen in, so the row
                        # matches the probabilities and value shown with it.
                        state_text = ", ".join(f"{component:.3f}" for component in state[:4])
                        prob_text = " | ".join(f"{prob:.3f}" for prob in action_probs.tolist())
                        print(f"{step:4d} | {state_text} | "
                              f"{action.item()} | {prob_text} | "
                              f"{reward:.2f} | {value.item():.2f}")

                        state = next_state
                        step += 1
                        test_env.render()

                    episode_rewards.append(total_reward)
                    print(f"Test Episode {episode} Total Reward: {total_reward:.2f}")
        finally:
            # Always release the render window, even if an episode fails.
            test_env.close()

        return episode_rewards

    def evaluate_testing(self, test_rewards):
        """Print the average reward over ``test_rewards``.

        Prints nothing when ``test_rewards`` is empty.
        """
        if not test_rewards:
            return
        avg_test_reward = np.mean(test_rewards)
        print("\nTest Evaluation:")
        print(f"Average Reward (Test Episodes): {avg_test_reward:.2f}")
        self._report_performance(avg_test_reward)

In [None]:
# Create the agent with its default settings (this builds the environment,
# the actor/critic networks and their optimizers)
agent = ActorCriticAgent()

In [None]:
# Train the agent with the default episode count and discount factor
agent.train()


In [None]:
# Plot the reward and loss curves recorded during training
agent.plot_training_results()

In [None]:
# Report the average reward over the most recent training episodes
agent.evaluate_training()


In [None]:

# Run the saved model in a rendered environment; returns per-episode rewards
# (an empty list if the model file is missing)
test_rewards = agent.test()

In [None]:
# Report the average reward over the test episodes
agent.evaluate_testing(test_rewards)