## Step1 初始化 Jupyter 环境 & 导入包

In [None]:
# 用于在 Jupyter 中强制刷新参数
%reset -f

# 导入相关的包
import os
import sys
from collections import deque
from pathlib import Path

import torch
import torch.nn.functional as F
import ale_py
import pygame
import imageio
import gymnasium as gym
import numpy as np

from tqdm.notebook import tqdm
from torchvision.transforms import v2
from loguru import logger

## Step2 设置相关参数

In [None]:
# Feature switches
is_training = 1                     # run the training cell when truthy
is_evaluate = 0                     # run the evaluation cell; without recording the game is rendered on screen
need_record = 0                     # record a video during evaluation (only effective when is_evaluate=1); the game window is not shown

# Log level
log_level = "INFO"
logger.remove()
logger.add(sys.stderr, level=log_level)

# Environment settings
env_id = "ALE/Galaxian-v5"          # Gymnasium environment id
env_height = 128                    # frame height used when reshaping preprocessed states
env_width = 128                     # frame width used when reshaping preprocessed states
max_steps = 10000                   # maximum number of environment steps per episode
render_mode = "rgb_array"           # render mode of the training env, e.g. "human" or "rgb_array"

# PPO hyper-parameters
frame_stack = 4                     # number of stacked frames (also how many times each chosen action is repeated)
gamma = 0.98                        # discount factor, weights the importance of future rewards
lmbda = 0.95                        # GAE decay parameter (lambda)
clip_eps = 0.2                      # PPO clipping range
epoch = 10                          # number of optimisation passes over each collected batch

# Training settings
num_train_episodes = 30000          # total number of training episodes
a_lr = 5e-6                         # actor learning rate
c_lr = 1e-5                         # critic learning rate
max_same_action = 60                # repeated-action count at which AleCustomEnv starts penalising, to discourage getting stuck on one action
timestep_reward = 300               # number of zero-reward steps since the last life change after which AleCustomEnv applies the idle penalty

# Evaluation settings
num_eval_episodes = 10              # number of evaluation episodes
reward_threshold = 1000             # episodes scoring at least this are logged at SUCCESS level, otherwise WARNING
is_sample_action = False           # sample actions during evaluation; when False the highest-logit action is used
eval_of_reward = 50                 # how many of the most recent episode rewards are kept for the running mean
eval_of_step = 50                   # how many of the most recent episode step counts are kept for the running mean

# Persistence settings
save_dir = "./Gym_ALE_Galaxian_PPO_FrameStack"                   # directory where checkpoints and videos are stored
save_freq = 100                                                  # save a checkpoint every this many episodes
max_checkpoints = 5                                              # maximum number of checkpoints to keep per network
checkpoint_perfix_A = "CheckPoint_Gym_ALE_Galaxian_A_"           # checkpoint file-name prefix for the actor
checkpoint_perfix_C = "CheckPoint_Gym_ALE_Galaxian_C_"           # checkpoint file-name prefix for the critic
evaluate_record_perfix = "Video_Gym_ALE_Galaxian_"               # file-name prefix for evaluation recordings
evaluate_record_fps = 30                                         # frame rate of evaluation recordings
evaluate_record_quality = 10                                     # quality of evaluation recordings, 0 ~ 10

# Remaining initialisation
# device = "cpu"
device = "cuda" if torch.cuda.is_available() else "cpu"
gym.register_envs(ale_py)            # Arcade Learning Environment (ALE) environments must be registered before use

## Step3 预处理函数 & 工具

In [None]:
def get_max_checkpoint_id(checkpoint_perfix, save_dir=save_dir):
    """
    Find the newest checkpoint file for the given prefix.

    Files are expected to be named ``<checkpoint_perfix><integer id>.pth``
    directly inside ``save_dir``. Files that match the prefix but carry a
    non-numeric id are skipped instead of aborting the lookup.

    :param checkpoint_perfix: file-name prefix identifying the network
    :param save_dir: directory that holds the checkpoints; it is created
                     when it does not exist yet
    :return: ``None`` when the directory was just created or holds no
             matching checkpoint, otherwise a dict with the keys
             ``"max_checkpoint_path"`` (absolute path) and
             ``"max_checkpoint_id"`` (int)
    """
    directory = Path(save_dir)

    # Nothing can be stored in a directory that did not exist: create it and bail out
    if not directory.exists():
        directory.mkdir(parents=True, exist_ok=True)
        logger.debug("The specified directory does not exist, will create this folder")
        return None

    # Collect (id, file) pairs so the returned path is the real file on disk
    found = []
    for entry in directory.iterdir():
        if not (entry.is_file() and entry.suffix == ".pth" and entry.name.startswith(checkpoint_perfix)):
            continue
        id_text = entry.name[len(checkpoint_perfix):].split(".")[0]
        try:
            found.append((int(id_text), entry))
        except ValueError:
            # e.g. "<prefix>best.pth": not a numbered checkpoint, ignore it
            logger.debug(f"Skip checkpoint file with non-numeric id: {entry.name}")

    if not found:
        logger.info(f"Not found any {checkpoint_perfix} files, will random initialization of network parameters")
        return None

    max_checkpoint_id, newest_entry = max(found, key=lambda pair: pair[0])
    max_checkpoint_path = os.path.abspath(newest_entry)
    logger.info(f"Found max checkpoints, max_checkpoint_id is {max_checkpoint_id}")
    return {"max_checkpoint_path" : max_checkpoint_path, "max_checkpoint_id" : max_checkpoint_id}

def del_old_checkpoint(checkpoint_perfix, save_dir=save_dir, max_checkpoints=max_checkpoints):
    """
    Delete old checkpoint files so that at most ``max_checkpoints`` remain.

    Only files named ``<checkpoint_perfix><integer id>.pth`` directly inside
    ``save_dir`` are considered; the ones with the smallest ids are removed
    first. Files with a non-numeric id are left untouched.

    :param checkpoint_perfix: file-name prefix identifying the network
    :param save_dir: directory that holds the checkpoints
    :param max_checkpoints: number of newest checkpoints to keep
    """
    directory = Path(save_dir)
    # A missing directory simply means there is nothing to clean up
    if not directory.exists():
        return

    found = []
    for entry in directory.iterdir():
        if not (entry.is_file() and entry.suffix == ".pth" and entry.name.startswith(checkpoint_perfix)):
            continue
        id_text = entry.name[len(checkpoint_perfix):].split(".")[0]
        try:
            found.append((int(id_text), entry))
        except ValueError:
            logger.debug(f"Skip checkpoint file with non-numeric id: {entry.name}")

    excess = len(found) - max_checkpoints
    if excess <= 0:
        return

    # Oldest (smallest id) first; drop everything beyond the retention limit
    found.sort(key=lambda pair: pair[0])
    for _, entry in found[:excess]:
        min_checkpoint_path = os.path.abspath(entry)
        os.remove(min_checkpoint_path)
        logger.warning(f"Delete old checkpoint file {min_checkpoint_path}")

## Step4 定义智能体

In [None]:
class ActorNet(torch.nn.Module):
    """
    Policy network.

    Two 3x3 convolutions (ReLU) followed by three fully connected layers
    (Mish on the first two). The input carries ``frame_stack`` channels and
    the output has ``action_dim`` values per sample (unnormalised scores).
    """
    def __init__(self, action_dim):
        super().__init__()
        nn = torch.nn
        narrow, wide = frame_stack * 2, frame_stack * 4
        # stride 1 + padding 1 with a 3x3 kernel keeps the spatial size unchanged
        self.conv1 = nn.Conv2d(frame_stack, narrow, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(narrow, wide, kernel_size=3, stride=1, padding=1)
        # Lazy layers infer their input width on the first forward pass
        self.fc1 = nn.LazyLinear(256)
        self.fc2 = nn.LazyLinear(256)
        self.fc3 = nn.LazyLinear(action_dim)

    def forward(self, x):
        for conv in (self.conv1, self.conv2):
            x = F.relu(conv(x))
        # Keep the batch dimension, merge everything else
        x = x.flatten(start_dim=1)
        for hidden in (self.fc1, self.fc2):
            x = F.mish(hidden(x))
        return self.fc3(x)


class CriticNet(torch.nn.Module):
    """
    Value network.

    Same trunk as the policy network (two 3x3 convolutions with ReLU, two
    hidden linear layers with Mish) but with a single output per sample.
    """
    def __init__(self):
        super().__init__()
        nn = torch.nn
        narrow, wide = frame_stack * 2, frame_stack * 4
        # stride 1 + padding 1 with a 3x3 kernel keeps the spatial size unchanged
        self.conv1 = nn.Conv2d(frame_stack, narrow, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(narrow, wide, kernel_size=3, stride=1, padding=1)
        # Lazy layers infer their input width on the first forward pass
        self.fc1 = nn.LazyLinear(256)
        self.fc2 = nn.LazyLinear(256)
        self.fc3 = nn.LazyLinear(1)

    def forward(self, x):
        for conv in (self.conv1, self.conv2):
            x = F.relu(conv(x))
        # Keep the batch dimension, merge everything else
        x = x.flatten(start_dim=1)
        for hidden in (self.fc1, self.fc2):
            x = F.mish(hidden(x))
        return self.fc3(x)

In [None]:
class Agent:
    """
    PPO agent: bundles the actor/critic networks, their optimisers, the
    frame preprocessing, the experience buffer and checkpoint handling.
    """
    def __init__(self, action_dim):
        # Global Args
        # Newest checkpoint per network, or None when starting from scratch
        self.max_checkpoint_a = get_max_checkpoint_id(checkpoint_perfix=checkpoint_perfix_A)
        self.max_checkpoint_c = get_max_checkpoint_id(checkpoint_perfix=checkpoint_perfix_C)
        # Transitions collected since the last call to update()
        self.memory = {"St":[], "At":[], "Rt":[], "St+1":[], "Done":[]}

        # Evaluation Args
        self.loss_a = []
        self.loss_c = []
        self.reward = deque(maxlen=eval_of_reward)
        self.step = deque(maxlen=eval_of_step)

        # Init Networks
        self.a_net = ActorNet(action_dim)
        self.c_net = CriticNet()
        if self.max_checkpoint_a is not None:
            self.a_net.load_state_dict(self._load_weights(self.max_checkpoint_a["max_checkpoint_path"]))
        if self.max_checkpoint_c is not None:
            self.c_net.load_state_dict(self._load_weights(self.max_checkpoint_c["max_checkpoint_path"]))
        self.a_net.to(device)
        self.c_net.to(device)

        # Init Optimizer
        self.a_optimizer = torch.optim.AdamW(self.a_net.parameters(), lr=a_lr)
        self.c_optimizer = torch.optim.AdamW(self.c_net.parameters(), lr=c_lr)

        # Transforms: to float image in [0, 1], single grey channel, resized to
        # the configured frame size (must agree with the reshape in processing_states)
        self.transform = v2.Compose([
            v2.ToImage(),
            v2.ToDtype(torch.float32, scale=True),
            v2.Grayscale(1),
            v2.Resize((env_height, env_width)),
        ])

    @staticmethod
    def _load_weights(path):
        """
        Read a saved state dict from ``path``.

        The tensors are mapped to the CPU so a checkpoint written on a GPU
        machine also loads on a CPU-only one (the networks are moved to the
        target device afterwards). ``weights_only=True`` restricts unpickling
        to plain tensor data.
        """
        return torch.load(path, map_location="cpu", weights_only=True)

    @staticmethod
    def _checkpoint_path(checkpoint_perfix, previous, episodes):
        """
        Build the absolute checkpoint path for the current save.

        The id continues from the checkpoint the run was resumed from
        (``previous``), or starts at ``episodes`` for a fresh run.
        """
        checkpoint_id = episodes
        if previous is not None:
            checkpoint_id += int(previous["max_checkpoint_id"])
        return os.path.abspath(f"{save_dir}/{checkpoint_perfix}{checkpoint_id}.pth")

    @staticmethod
    def _actor_loss(logits, actions, old_log_probs, advantage, clip_range, entropy_coef=0.01):
        """
        Clipped PPO surrogate loss with an entropy bonus.

        :param logits:        policy outputs, shape [N, action_dim]
        :param actions:       taken actions, shape [N, 1]
        :param old_log_probs: log-probabilities under the old policy, shape [N, 1]
        :param advantage:     advantage estimates, shape [N, 1]
        :param clip_range:    PPO clipping range
        :param entropy_coef:  weight of the entropy regulariser
        :return: scalar loss tensor
        """
        dist = torch.distributions.Categorical(logits=logits)
        # Keep the trailing dimension: a [N] ratio multiplied with a [N, 1]
        # advantage would silently broadcast to an [N, N] matrix
        log_probs = dist.log_prob(actions.squeeze(-1)).unsqueeze(-1)
        ratio = torch.exp(log_probs - old_log_probs)
        surr1 = ratio * advantage
        surr2 = torch.clamp(ratio, 1 - clip_range, 1 + clip_range) * advantage
        entropy = dist.entropy().mean() * entropy_coef
        return -torch.min(surr1, surr2).mean() - entropy

    def processing_states(self, frame_buffer):
        """
        Preprocess the frames in ``frame_buffer`` into a network input.

        :param frame_buffer: sequence of ``frame_stack`` raw frames
                             (the loops in this file pass env observations)
        :return: tensor of shape [1, frame_stack, env_height, env_width]
        """
        # A plain list is a container type the v2 transforms always traverse
        frames = self.transform(list(frame_buffer))
        # Each frame is [1, height, width]; stack and merge into the channel axis
        states = torch.stack(tuple(frames), dim=0)
        states = states.reshape(1, frame_stack, env_height, env_width)
        logger.debug(f"Processing states shape: {states.shape}")
        return states

    def select_action(self, state, sample=True):
        """
        Choose an action for ``state``.

        :param state:  network input of shape [batch_size, frame_stack, height, width]
        :param sample: when True draw the action from the categorical policy,
                       otherwise take the action with the highest logit
        :return: the action as a Python int
        """
        state = state.to(device)
        # Acting never needs gradients; skipping the graph saves memory and time
        with torch.no_grad():
            logits = self.a_net(state)
            if sample:
                # https://pytorch.ac.cn/docs/stable/distributions.html#categorical
                action = torch.distributions.Categorical(logits=logits).sample()
            else:
                action = logits.argmax(dim=1)
        return action.item()

    def compute_advantage(self, gamma, lmbda, td_err):
        """
        Generalized Advantage Estimation (GAE).

        The whole ``td_err`` sequence is treated as one contiguous trajectory.

        :param gamma:  discount factor (0 ~ 1)
        :param lmbda:  GAE decay parameter (0 ~ 1)
        :param td_err: tensor of TD errors in time order
        :return: float32 tensor of shape [N, 1] on the same device as ``td_err``
        """
        # Detach: the advantages are constants for the policy update
        deltas = td_err.detach().cpu().numpy()
        advantage_list = []
        advantage = 0.0
        # Walk backwards in time: A_t = gamma * lambda * A_{t+1} + delta_t
        for delta in deltas[::-1]:
            advantage = gamma * lmbda * advantage + delta
            advantage_list.append(advantage)
        # Restore chronological order
        advantage_list.reverse()
        # Going through one ndarray is much faster than converting a list of scalars
        advantages = np.array(advantage_list, dtype=np.float32)
        return torch.from_numpy(advantages).reshape(-1, 1).to(td_err.device)

    def update(self):
        """
        Run the PPO update on the buffered transitions, then clear the buffer.
        """
        if not self.memory["St"]:
            logger.debug("Experience buffer is empty, skip PPO update")
            return

        # Everything computed here is a constant for the optimisation below,
        # so no gradients are tracked (see https://github.com/boyu-ai/Hands-on-RL/issues/96)
        with torch.no_grad():
            # Shapes are fixed up front ([N, 1] for per-step scalars) to avoid accidental broadcasting
            states = torch.stack(self.memory["St"]).float().squeeze(1).to(device)
            actions = torch.tensor(self.memory["At"], dtype=torch.long).unsqueeze(1).to(device)
            rewards = torch.tensor(self.memory["Rt"], dtype=torch.float32).unsqueeze(1).to(device)
            next_states = torch.stack(self.memory["St+1"]).float().squeeze(1).to(device)
            dones = torch.tensor(self.memory["Done"], dtype=torch.float32).unsqueeze(1).to(device)
            logger.debug(f"states shape: {states.shape}, actions shape: {actions.shape}, rewards shape: {rewards.shape}, next_states shape: {next_states.shape}, dones shape: {dones.shape}")

            # Critic: TD target and TD error
            td_tgt = rewards + gamma * self.c_net(next_states) * (1 - dones)
            td_err = td_tgt - self.c_net(states)
            # GAE
            advantage = self.compute_advantage(gamma, lmbda, td_err)
            # Log-probabilities under the policy that collected the data, shape [N, 1]
            old_action_dists = torch.distributions.Categorical(logits=self.a_net(states))
            old_log_probs = old_action_dists.log_prob(actions.squeeze(-1)).unsqueeze(-1)

        # Optimisation passes over the same batch
        for _ in range(epoch):
            a_loss = self._actor_loss(self.a_net(states), actions, old_log_probs, advantage, clip_eps)
            c_loss = F.mse_loss(self.c_net(states), td_tgt.detach(), reduction='mean')

            self.a_optimizer.zero_grad()
            a_loss.backward()
            self.loss_a.append(a_loss.item())
            self.a_optimizer.step()

            self.c_optimizer.zero_grad()
            c_loss.backward()
            self.loss_c.append(c_loss.item())
            self.c_optimizer.step()

        logger.info(f"Mean Actor Loss: {np.mean(self.loss_a):.4f} | Mean Critic Loss: {np.mean(self.loss_c):.4f}")
        self.loss_a.clear()
        self.loss_c.clear()

        # Drop the consumed transitions
        for buffer in self.memory.values():
            buffer.clear()

    def save_model(self, episodes):
        """
        Save both networks and prune old checkpoints.

        :param episodes: number of episodes trained in this run; it is added
                         to the id of the checkpoint the run was resumed from
        """
        # The directory may have been removed since the agent was created
        os.makedirs(save_dir, exist_ok=True)
        max_checkpoint_path_a = self._checkpoint_path(checkpoint_perfix_A, self.max_checkpoint_a, episodes)
        max_checkpoint_path_c = self._checkpoint_path(checkpoint_perfix_C, self.max_checkpoint_c, episodes)

        # Save the parameters
        torch.save(self.a_net.state_dict(), max_checkpoint_path_a)
        torch.save(self.c_net.state_dict(), max_checkpoint_path_c)
        logger.info(f"Actor Model saved to {max_checkpoint_path_a}")
        logger.info(f"Critic Model saved to {max_checkpoint_path_c}")

        # Remove outdated checkpoints
        del_old_checkpoint(checkpoint_perfix=checkpoint_perfix_A)
        del_old_checkpoint(checkpoint_perfix=checkpoint_perfix_C)


## Step5 调整环境

In [None]:
# 定制环境
class AleCustomEnv(gym.Wrapper):
    """
    Reward-shaping wrapper for ALE environments.

    On top of the wrapped environment it
    - replaces the reward with -100 when a life is lost,
    - replaces the reward with -10 every ``timestep_reward`` zero-reward steps
      counted since the last life loss,
    - replaces the reward with -50 while the same action has been repeated
      ``max_same_action`` times or more.
    """
    def __init__(self, env):
        super().__init__(env)
        self.current_lives = 4              # remaining lives (re-synced from ``info`` on reset)
        self.live_time = 0                  # zero-reward steps since the last life loss
        self.previous_action = None         # action executed on the previous step
        self.same_action_count = 0          # how many times that action was repeated
        self.same_action_display = False    # whether the "too many times" message was already logged

    def reset(self, *, seed=None, options=None):
        """
        Reset the wrapped environment and the shaping counters.

        :param seed: forwarded to the wrapped environment
        :param options: forwarded to the wrapped environment
        :return: the ``(observation, info)`` pair of the wrapped environment
        """
        observation, info = self.env.reset(seed=seed, options=options)

        # Start from the life count the game reports, so the first step is
        # not mistaken for a life loss; fall back to 4 when it is not reported
        self.current_lives = info.get("lives", 4)
        self.live_time = 0
        self.previous_action = None
        self.same_action_count = 0
        self.same_action_display = False

        return observation, info

    def step(self, action):
        """
        Execute ``action`` and apply the reward shaping.

        :return: ``(observation, reward, terminated, truncated, info)`` where
                 ``reward`` may have been replaced by one of the penalties
        """
        observation, reward, terminated, truncated, info = self.env.step(action)

        lives = info.get("lives", self.current_lives)
        if lives < self.current_lives:
            # A life was lost: penalise and restart the idle counter
            self.current_lives = lives
            self.live_time = 0
            reward = -100
            logger.debug(f"lives -1, current lives: {self.current_lives}")
        else:
            # A gained life only re-syncs the counter, it is not punished
            self.current_lives = lives
            if reward == 0:
                # Penalise long stretches without scoring
                self.live_time += 1
                if self.live_time == timestep_reward:
                    self.live_time = 0
                    reward = -10
                    logger.debug("live_time -10")

        if self.previous_action == action:
            # Penalise repeating the same action for too long
            self.same_action_count += 1
            if self.same_action_count >= max_same_action:
                reward = -50
                if not self.same_action_display:
                    # Log once per streak instead of on every step
                    self.same_action_display = True
                    logger.error(f"same action too many times, same_action_count = {self.same_action_count}")
        else:
            streak = self.same_action_count
            self.same_action_count = 0
            self.previous_action = action
            if self.same_action_display:
                self.same_action_display = False
                logger.error(f"same action it's over, total {streak}")

        return observation, reward, terminated, truncated, info

## Step6 训练智能体

In [None]:
if is_training:
    # Main training environment
    env = gym.make(env_id, render_mode=render_mode)
    env = AleCustomEnv(env)

    # Release the emulator even when training fails or is interrupted
    try:
        # The agent only supports discrete action spaces
        if not isinstance(env.action_space, gym.spaces.Discrete):
            logger.error("Action space is not Discrete!")
            raise ValueError("Action space is not Discrete!")
        action_dim = env.action_space.n
        agent = Agent(action_dim=action_dim)

        for episode in tqdm(range(num_train_episodes)):
            # Reset the environment and pre-fill the frame buffer with the first observation
            state, info = env.reset()
            steps = 0
            total_reward = 0
            frame_buffer = deque([state] * frame_stack, maxlen=frame_stack)

            while steps < max_steps:
                # 1. Preprocess the current stack of frames
                state = agent.processing_states(frame_buffer)
                # 2. Choose one action per macro step
                action = agent.select_action(state)

                accumulated_reward = 0
                done = False
                # 3. Repeat the action frame_stack times, collecting the new frames
                for _ in range(frame_stack):
                    observation, reward, terminated, truncated, info = env.step(action)
                    total_reward += reward
                    accumulated_reward += reward
                    frame_buffer.append(observation)
                    steps += 1
                    # Stop the macro step as soon as the episode ends
                    if terminated or truncated:
                        done = True
                        break

                # 4. Preprocess the resulting state
                next_state = agent.processing_states(frame_buffer)
                # 5. Store the transition
                agent.memory["St"].append(state)
                agent.memory["At"].append(action)
                agent.memory["Rt"].append(accumulated_reward)
                agent.memory["St+1"].append(next_state)
                agent.memory["Done"].append(done)

                # 6. Episode finished: record statistics and report
                if done:
                    agent.reward.append(total_reward)
                    agent.step.append(steps)
                    summary = f"Episode {episode + 1} | Total steps {steps} | Total Reward: {total_reward} | Mean Step: {np.mean(agent.step):.2f} | Mean Reward: {np.mean(agent.reward):.2f}"
                    # The log level only decides the colour of the line
                    report = logger.success if total_reward >= reward_threshold else logger.warning
                    report(summary)
                    break

            # Update the networks with the transitions of this episode
            agent.update()

            # Periodically save the model (never right after the very first episode)
            if (episode + 1) % save_freq == 0 and episode != 0:
                episodes = episode + 1
                agent.save_model(episodes)
    finally:
        env.close()

## Step7 评估智能体

In [None]:
# Run the evaluation when it is enabled
if is_evaluate == 1:
    # Recording needs the frames as arrays; otherwise the game is shown on screen
    eval_env = gym.make(env_id, render_mode="rgb_array" if need_record else "human")
    eval_env = AleCustomEnv(eval_env)

    # Frames and reward of the best episode seen so far
    np_frame_record = None
    max_reward = None

    # Release the emulator and pygame even when the evaluation fails
    try:
        agent = Agent(action_dim=eval_env.action_space.n)

        for episode in tqdm(range(num_eval_episodes)):
            # Reset the environment and pre-fill the frame buffer with the first observation
            state, info = eval_env.reset()
            steps = 0
            total_reward = 0
            done = False
            # Start every episode with an empty recording so frames of earlier
            # episodes never end up in this episode's video
            frame_record = []
            frame_buffer = deque([state] * frame_stack, maxlen=frame_stack)

            while steps < max_steps and not done:
                # 1. Preprocess the current stack of frames
                state = agent.processing_states(frame_buffer)
                # 2. Choose one action per macro step
                action = agent.select_action(state, sample=is_sample_action)

                # 3. Repeat the action frame_stack times, collecting the new frames
                for _ in range(frame_stack):
                    observation, reward, terminated, truncated, info = eval_env.step(action)
                    total_reward += reward
                    frame_buffer.append(observation)
                    if need_record:
                        frame_record.append(eval_env.render())
                    steps += 1
                    # Stop the macro step as soon as the episode ends
                    if terminated or truncated:
                        done = True
                        break

            # 4. Episode finished: record statistics and report
            if done:
                agent.reward.append(total_reward)
                agent.step.append(steps)
                summary = f"Episode {episode + 1} | Total steps {steps} | Total Reward: {total_reward}"
                # The log level only decides the colour of the line
                report = logger.success if total_reward >= reward_threshold else logger.warning
                report(summary)

            # Keep only the recording of the best episode
            if need_record and frame_record and (max_reward is None or total_reward > max_reward):
                np_frame_record = np.array(frame_record)
                max_reward = total_reward

        # Write the video of the best episode
        if need_record:
            if np_frame_record is None:
                logger.warning("No frames were recorded, skip saving the evaluation video")
            else:
                record_file = f"{os.path.abspath(os.path.join(save_dir, evaluate_record_perfix))}{int(max_reward)}.mp4"
                imageio.mimsave(record_file, np_frame_record, fps=evaluate_record_fps, quality=evaluate_record_quality)
                logger.info(f"The best evaluation record is: {record_file}")
    finally:
        # Close the environment and shut pygame down
        eval_env.close()
        pygame.quit()