In [1]:
# !pip install gym

In [2]:
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical
import matplotlib.pyplot as plt

In [3]:
env = gym.make("CartPole-v1")
env

<TimeLimit<OrderEnforcing<PassiveEnvChecker<CartPoleEnv<CartPole-v1>>>>>

In [4]:
# 策略网络
class PolicyNetwork(nn.Module):
    def __init__(self, input_dim, output_dim, hidden_dim=128):
        super(PolicyNetwork, self).__init__()
        self.network = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim),
            nn.Softmax(dim=-1)
        )
    
    def forward(self, x):
        return self.network(x)

In [None]:
# REINFORCE算法实现
class REINFORCE:
    def __init__(self, env_name, hidden_dim=128, lr=0.001, gamma=0.99):
        self.env = gym.make(env_name)
        self.gamma = gamma  # 折扣因子
        
        # 获取环境的状态空间和动作空间维度
        self.state_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.n
        
        # 初始化策略网络
        self.policy = PolicyNetwork(self.state_dim, self.action_dim, hidden_dim)
        self.optimizer = optim.Adam(self.policy.parameters(), lr=lr)
        
        # 记录训练数据
        self.rewards_history = []
    
    def select_action(self, state):
        """
        根据当前状态选择动作
        """
        state = torch.FloatTensor(state)
        probs = self.policy(state)
        # 创建一个类别分布
        dist = Categorical(probs)
        # 从分布中采样一个动作
        action = dist.sample()
        # 返回动作和对应的对数概率
        return action.item(), dist.log_prob(action)
    
    def compute_returns(self, rewards):
        """
        计算每一步的rewards
        """
        returns = []
        R = 0
        
        # 从后往前计算累积回报
        for r in reversed(rewards):
            R = r + self.gamma * R
            returns.insert(0, R)
        
        # 将回报转换为张量并标准化
        returns = torch.tensor(returns)
        returns = (returns - returns.mean()) / (returns.std() + 1e-9)
        
        return returns
    
    def update_policy(self, log_probs, returns):
        """更新策略网络"""
        # 计算策略梯度损失
        policy_loss = []
        for log_prob, R in zip(log_probs, returns):
            policy_loss.append(-log_prob * R)  # 负号是因为我们在最大化回报
        
        # 将损失列表转换为张量并求和
        policy_loss = torch.cat(policy_loss).sum()
        
        # 梯度清零
        self.optimizer.zero_grad()
        # 反向传播
        policy_loss.backward()
        # 更新参数
        self.optimizer.step()
    
    def train(self, num_episodes=1000, max_steps=1000):
        """训练策略网络"""
        for episode in range(num_episodes):
            state, _ = self.env.reset()
            rewards = []
            log_probs = []
            
            # 收集一条轨迹
            for _ in range(max_steps):
                # 选择动作
                action, log_prob = self.select_action(state)
                # 执行动作
                next_state, reward, done, truncated, _ = self.env.step(action)
                
                # 存储奖励和对数概率
                rewards.append(reward)
                log_probs.append(log_prob)
                
                state = next_state
                
                if done or truncated:
                    break
            
            # 计算回报
            returns = self.compute_returns(rewards)
            
            # 更新策略
            self.update_policy(log_probs, returns)
            
            # 记录每个episode的总奖励
            total_reward = sum(rewards)
            self.rewards_history.append(total_reward)
            
            # 打印训练进度
            if episode % 10 == 0:
                avg_reward = np.mean(self.rewards_history[-10:])
                print(f"Episode {episode}, Average Reward (last 10): {avg_reward:.2f}")
            
            # 如果平均回报足够高，可以提前终止训练
            if len(self.rewards_history) >= 100 and np.mean(self.rewards_history[-100:]) >= 195:
                print(f"Environment solved in {episode} episodes!")
                break
    
    def evaluate(self, num_episodes=10, render=False):
        """评估策略"""
        total_rewards = []
        
        for _ in range(num_episodes):
            state, _ = self.env.reset()
            episode_reward = 0
            done = False
            
            while not done:
                if render:
                    self.env.render()
                
                # 选择动作（评估模式下不需要记录log_prob）
                with torch.no_grad():
                    state_tensor = torch.FloatTensor(state)
                    probs = self.policy(state_tensor)
                    dist = Categorical(probs)
                    action = dist.sample().item()
                
                # 执行动作
                state, reward, done, truncated, _ = self.env.step(action)
                episode_reward += reward
                
                if truncated:
                    break
            
            total_rewards.append(episode_reward)
        
        avg_reward = np.mean(total_rewards)
        print(f"Average Reward over {num_episodes} episodes: {avg_reward:.2f}")
        return avg_reward
    
    def plot_rewards(self):
        """绘制奖励历史"""
        plt.figure(figsize=(10, 5))
        plt.plot(self.rewards_history)
        plt.title('Reward History')
        plt.xlabel('Episode')
        plt.ylabel('Total Reward')
        plt.grid(True)
        plt.savefig('reinforce_rewards.png')
        plt.show()
    
    def save_model(self, path):
        """保存模型"""
        torch.save(self.policy.state_dict(), path)
    
    def load_model(self, path):
        """加载模型"""
        self.policy.load_state_dict(torch.load(path))
        self.policy.eval()

In [6]:
# 使用示例
if __name__ == "__main__":
    # 创建REINFORCE代理
    agent = REINFORCE(env_name="CartPole-v1", hidden_dim=128, lr=0.001, gamma=0.99)
    
    # 训练代理
    agent.train(num_episodes=1000)
    
    # 绘制奖励历史
    agent.plot_rewards()
    
    # 评估代理
    agent.evaluate(num_episodes=10, render=True)
    
    # 保存模型
    agent.save_model("reinforce_policy.pth")

  if not isinstance(terminated, (bool, np.bool8)):


RuntimeError: zero-dimensional tensor (at position 0) cannot be concatenated