In [1]:
import numpy as np
import random
import os
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm
from collections import deque
import copy
import pygame
import torch.nn.functional as F

pygame 2.6.1 (SDL 2.28.4, Python 3.12.7)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [2]:
class GridWorld:
    """Grid navigation environment with static and moving obstacles.

    The map is a 2-D integer array in which ``1`` marks an obstacle and ``0``
    a free cell. The agent starts at ``start`` and has to visit the waypoints
    in ``goal_list`` in order (or the single ``goal`` when the list is empty).
    Positions are ``(row, col)`` tuples.

    The observation returned by :meth:`get_state` has 51 entries: distance to
    the current goal, angle to the current goal, and the flattened 7x7
    occupancy window centred on the agent.
    """

    # (row, col) displacement of each of the 9 discrete actions; the last one is "stay".
    ACTIONS = [(-1, 0), (1, 0), (0, -1), (0, 1), (-1, -1), (-1, 1), (1, -1), (1, 1), (0, 0)]
    # Unit (row, col) displacement for each dynamic-obstacle heading.
    DIRECTION_DELTAS = {'up': (-1, 0), 'down': (1, 0), 'left': (0, -1), 'right': (0, 1)}
    # Heading an obstacle switches to when its path is blocked.
    OPPOSITE_DIRECTION = {'up': 'down', 'down': 'up', 'left': 'right', 'right': 'left'}

    def __init__(self, grid_map, start, goal, max_steps=50000, goal_list=None):
        """Create the environment.

        :param grid_map: 2-D integer array, 1 = obstacle, 0 = free.
        :param start: ``(row, col)`` start cell, or ``None`` to pick a random
            free interior cell.
        :param goal: ``(row, col)`` goal cell, or ``None`` to use the first
            entry of ``goal_list`` (or a random free interior cell when the
            list is empty).
        :param max_steps: number of successful moves after which an episode
            is terminated with a penalty.
        :param goal_list: ordered waypoints; when non-empty, :meth:`reset`
            always restarts from its first entry.
        """
        self.grid_map = grid_map
        self.base_grid_map = grid_map.copy()  # pristine map without random obstacles
        self.rows, self.cols = grid_map.shape
        self.max_steps = max_steps
        self.goal_list = goal_list if goal_list else []

        # Defaults so step()/get_state() work even before the first reset().
        self.dynamic_obstacles = []
        self.current_goal_index = 0
        self.steps = 0

        if start is not None and goal is not None:
            self.start = start
            self.goal = goal
            self.agent_pos = tuple(self.start)
            self.distance = self._distance_to_goal()
        else:
            # The original code forwarded start/goal to reset_dynamic() as
            # obstacle counts, which always raised. Fill in the missing
            # endpoints first, then generate the obstacles.
            self.start = start if start is not None else self._random_free_cell()
            if goal is not None:
                self.goal = goal
            elif self.goal_list:
                self.goal = self.goal_list[0]
            else:
                self.goal = self._random_free_cell(exclude=(self.start,))
            self.reset_dynamic()

    def _random_free_cell(self, exclude=()):
        """Return a random free interior cell of the base map not in ``exclude``."""
        blocked = {tuple(cell) for cell in exclude}
        candidates = [
            (i, j) for i in range(1, self.rows - 1) for j in range(1, self.cols - 1)
            if self.base_grid_map[i, j] == 0 and (i, j) not in blocked
        ]
        if not candidates:
            raise ValueError("grid_map has no free interior cell to choose from")
        return random.choice(candidates)

    def _distance_to_goal(self):
        """Euclidean distance from the agent to the current goal."""
        return np.linalg.norm(np.array(self.agent_pos) - np.array(self.goal))

    def _in_bounds(self, pos):
        """Whether ``pos`` lies inside the map."""
        return 0 <= pos[0] < self.rows and 0 <= pos[1] < self.cols

    def _agent_hit_dynamic_obstacle(self):
        """Whether a dynamic obstacle currently occupies the agent's cell."""
        return any(self.agent_pos == obstacle['position'] for obstacle in self.dynamic_obstacles)

    def reset_dynamic(self, num_obstacles=30, num_dynamic_obstacles=10):
        """Regenerate random obstacles and put the agent back on the start cell.

        Static and dynamic obstacles are drawn from free interior cells of the
        base map. The start cell, the current goal and every waypoint in
        ``goal_list`` are never used, and a dynamic obstacle never starts on a
        static one. If fewer free cells exist than requested, as many
        obstacles as fit are placed.

        :param num_obstacles: number of random static obstacles.
        :param num_dynamic_obstacles: number of moving obstacles.
        """
        self.grid_map = self.base_grid_map.copy()

        reserved = {tuple(self.start), tuple(self.goal)}
        reserved.update(tuple(waypoint) for waypoint in self.goal_list)
        free_cells = [
            (i, j) for i in range(1, self.rows - 1) for j in range(1, self.cols - 1)
            if self.grid_map[i, j] == 0 and (i, j) not in reserved
        ]

        # Static obstacles.
        static_count = max(0, min(num_obstacles, len(free_cells)))
        static_cells = random.sample(free_cells, static_count)
        for x, y in static_cells:
            self.grid_map[x, y] = 1

        # Dynamic obstacles: distinct cells that are still free, so moving one
        # later can never erase a static obstacle from the map.
        taken = set(static_cells)
        free_cells = [cell for cell in free_cells if cell not in taken]
        dynamic_count = max(0, min(num_dynamic_obstacles, len(free_cells)))
        self.dynamic_obstacles = []
        for x, y in random.sample(free_cells, dynamic_count):
            self.grid_map[x, y] = 1  # visible in observations from the first step on
            self.dynamic_obstacles.append({
                'position': (x, y),
                'direction': random.choice(['up', 'down', 'left', 'right']),
                'speed': random.randint(1, 2)
            })

        # Free interior cells that remain after obstacle placement.
        self.valid_positions = [
            (i, j) for i in range(1, self.rows - 1) for j in range(1, self.cols - 1)
            if self.grid_map[i, j] == 0
        ]
        self.agent_pos = tuple(self.start)
        self.steps = 0
        self.distance = self._distance_to_goal()

    def reset(self, num_obstacles=30, num_dynamic_obstacles=10):
        """Start a new episode and return the initial observation.

        The goal is restored *before* the obstacles and the cached distance
        are regenerated, so the distance refers to the first waypoint rather
        than to the last goal of the previous episode. When ``goal_list`` is
        empty the goal passed to the constructor is kept.

        :param num_obstacles: forwarded to :meth:`reset_dynamic`.
        :param num_dynamic_obstacles: forwarded to :meth:`reset_dynamic`.
        :return: the observation of the start state.
        """
        self.goal_list_copy = self.goal_list.copy()
        self.current_goal_index = 0
        if self.goal_list:
            self.goal = self.goal_list[self.current_goal_index]
        self.reset_dynamic(num_obstacles, num_dynamic_obstacles)
        return self.get_state()

    def get_state(self, n_frames=4):
        """Build the observation for the agent's current position.

        :param n_frames: accepted for interface compatibility; not used.
        :return: 1-D array ``[distance, angle, *window]`` where ``window`` is
            the flattened 7x7 occupancy patch centred on the agent. Cells
            outside the map are reported as obstacles (1).
        """
        nearby_grid = np.ones((7, 7), dtype=int)
        x_min, x_max = max(0, self.agent_pos[0] - 3), min(self.rows, self.agent_pos[0] + 4)
        y_min, y_max = max(0, self.agent_pos[1] - 3), min(self.cols, self.agent_pos[1] + 4)
        # Where the visible part of the map lands inside the 7x7 window.
        r_min, r_max = 3 - (self.agent_pos[0] - x_min), 3 + (x_max - self.agent_pos[0])
        c_min, c_max = 3 - (self.agent_pos[1] - y_min), 3 + (y_max - self.agent_pos[1])

        nearby_grid[r_min:r_max, c_min:c_max] = self.grid_map[x_min:x_max, y_min:y_max]
        nearby_flat = nearby_grid.flatten()

        dx = self.goal[0] - self.agent_pos[0]
        dy = self.goal[1] - self.agent_pos[1]
        distance_to_goal = np.sqrt(dx**2 + dy**2)
        angle_to_goal = np.arctan2(dy, dx)

        return np.concatenate(([distance_to_goal, angle_to_goal], nearby_flat))

    def _move_dynamic_obstacles(self):
        """Advance every dynamic obstacle by its speed along its heading.

        An obstacle whose target cell is occupied or outside the map stays in
        place and reverses its heading. (Previously an out-of-map target left
        the heading unchanged, freezing the obstacle for good.)
        """
        for obstacle in self.dynamic_obstacles:
            x, y = obstacle['position']
            direction = obstacle['direction']
            d_row, d_col = self.DIRECTION_DELTAS[direction]
            speed = obstacle['speed']
            new_pos = (x + d_row * speed, y + d_col * speed)

            if not self._in_bounds(new_pos) or self.grid_map[new_pos[0], new_pos[1]] == 1:
                obstacle['direction'] = self.OPPOSITE_DIRECTION[direction]
            else:
                self.grid_map[x, y] = 0  # clear the old cell
                self.grid_map[new_pos[0], new_pos[1]] = 1  # occupy the new cell
                obstacle['position'] = new_pos

    def _min_obstacle_distance(self):
        """Euclidean distance from the agent to the nearest obstacle cell (inf if none)."""
        obstacle_cells = np.argwhere(self.grid_map == 1)
        if obstacle_cells.size == 0:
            return float('inf')
        offsets = obstacle_cells - np.array(self.agent_pos)
        return float(np.sqrt((offsets ** 2).sum(axis=1).min()))

    def step(self, action):
        """Apply one action and advance the environment.

        :param action: either a sequence of 9 non-negative weights from which
            the discrete action is sampled, or an integer action index.
        :return: ``(next_state, reward, done)``.
        :raises ValueError: if an integer ``action`` is not a valid index.
        """
        if np.ndim(action) == 0:
            action_index = int(action)
            if not 0 <= action_index < len(self.ACTIONS):
                raise ValueError(f"action index {action_index} is out of range")
        else:
            action_index = random.choices(range(len(self.ACTIONS)), weights=action, k=1)[0]
        delta = self.ACTIONS[action_index]
        next_pos = (self.agent_pos[0] + delta[0], self.agent_pos[1] + delta[1])

        # A dynamic obstacle already sits on the agent: the episode ends.
        if self._agent_hit_dynamic_obstacle():
            return self.get_state(), -3.0, True

        # Leaving the map or walking into any obstacle ends the episode.
        if not self._in_bounds(next_pos) or self.grid_map[next_pos] == 1:
            return self.get_state(), -3.0, True

        self.agent_pos = next_pos
        reward = -0.5
        done = False

        # Progress shaping: bonus for getting closer, penalty otherwise.
        next_distance = self._distance_to_goal()
        if self.distance > next_distance:
            reward += 0.6
        else:
            reward -= 0.4
        self.distance = next_distance

        # Alignment shaping: how well the move points at the goal.
        action_vector = np.array([delta[0], delta[1]])
        goal_vector = np.array([self.goal[0] - self.agent_pos[0], self.goal[1] - self.agent_pos[1]])
        goal_norm = np.linalg.norm(goal_vector)
        if goal_norm > 0.1:
            alignment_reward = np.dot(action_vector, goal_vector / (goal_norm + 1e-5))
        else:
            alignment_reward = 0
        reward += alignment_reward * 0.4

        # Waypoint reached: move on to the next one or finish the episode.
        if self.distance < 0.5:
            reward += 10
            if self.current_goal_index + 1 < len(self.goal_list):
                self.current_goal_index += 1
                self.goal = self.goal_list[self.current_goal_index]
                # Re-base the cached distance on the new goal; otherwise the
                # next move is compared with ~0 and always penalised.
                self.distance = self._distance_to_goal()
            else:
                done = True

        self._move_dynamic_obstacles()

        # Proximity penalty: the closer the nearest obstacle, the larger it is.
        min_distance_to_obstacle = self._min_obstacle_distance()
        if min_distance_to_obstacle <= 1.0:
            reward -= 2.0
        elif min_distance_to_obstacle <= 2.0:
            reward -= 1.5
        elif min_distance_to_obstacle <= 3.0:
            reward -= 0.2
        else:
            reward -= 0.01

        # A dynamic obstacle moved onto the agent.
        if self._agent_hit_dynamic_obstacle():
            reward -= 3.0
            done = True
            return self.get_state(), reward, done

        self.steps += 1
        if self.steps >= self.max_steps:
            done = True
            reward -= 10

        return self.get_state(), reward, done

In [3]:
class Actor(nn.Module):
    """Policy network mapping a state to a probability vector over actions.

    Two 256-unit ReLU hidden layers followed by a linear layer whose output is
    normalised with a softmax over the last dimension.
    """

    def __init__(self, state_dim, action_dim):
        """
        :param state_dim: number of input features.
        :param action_dim: number of output probabilities.
        """
        super().__init__()
        hidden_units = 256

        self.l1 = nn.Linear(state_dim, hidden_units)
        self.l2 = nn.Linear(hidden_units, hidden_units)
        self.l3 = nn.Linear(hidden_units, action_dim)

    def forward(self, state):
        """Return the softmax action probabilities for ``state``."""
        hidden = torch.relu(self.l1(state))
        hidden = torch.relu(self.l2(hidden))
        logits = self.l3(hidden)
        return torch.softmax(logits, dim=-1)

In [4]:
class Critic(nn.Module):
    """Twin Q-networks scoring a (state, action) pair.

    Both heads take the concatenated state and action and pass it through two
    256-unit ReLU layers and a scalar output layer. Layers ``l1``-``l3`` form
    the first head, ``l4``-``l6`` the second.
    """

    def __init__(self, state_dim, action_dim):
        """
        :param state_dim: number of state features.
        :param action_dim: number of action features.
        """
        super().__init__()
        input_dim = state_dim + action_dim
        hidden_units = 256

        # First Q head
        self.l1 = nn.Linear(input_dim, hidden_units)
        self.l2 = nn.Linear(hidden_units, hidden_units)
        self.l3 = nn.Linear(hidden_units, 1)

        # Second Q head
        self.l4 = nn.Linear(input_dim, hidden_units)
        self.l5 = nn.Linear(hidden_units, hidden_units)
        self.l6 = nn.Linear(hidden_units, 1)

    @staticmethod
    def _evaluate_head(joint_input, first, second, output):
        """Run one Q head (two ReLU layers, then a linear output) on ``joint_input``."""
        hidden = F.relu(first(joint_input))
        hidden = F.relu(second(hidden))
        return output(hidden)

    def forward(self, state, action):
        """Return ``(q1, q2)``, the estimates of both heads."""
        joint_input = torch.cat([state, action], 1)
        first_q = self._evaluate_head(joint_input, self.l1, self.l2, self.l3)
        second_q = self._evaluate_head(joint_input, self.l4, self.l5, self.l6)
        return first_q, second_q

    def Q1(self, state, action):
        """Return only the first head's estimate."""
        joint_input = torch.cat([state, action], 1)
        return self._evaluate_head(joint_input, self.l1, self.l2, self.l3)

In [5]:
class ReplayBuffer:
    """Fixed-capacity ring buffer of ``(state, action, reward, next_state, done)`` transitions."""

    def __init__(self, max_size=20000):
        """
        :param max_size: maximum number of stored transitions; once reached,
            the oldest entries are overwritten.
        :raises ValueError: if ``max_size`` is not positive.
        """
        if max_size <= 0:
            raise ValueError("max_size must be a positive integer")
        self.max_size = max_size
        self.storage = []
        self.ptr = 0  # index of the slot the next transition is written to

    def append(self, transition):
        """Store one transition, overwriting the oldest one when the buffer is full.

        Transitions containing NaN in the state, action, reward or next state
        are reported on stdout and dropped.

        :param transition: ``(state, action, reward, next_state, done)``.
        """
        state, action, reward, next_state, done = transition
        if np.any(np.isnan(state)) or np.any(np.isnan(action)) or np.isnan(reward) or np.any(np.isnan(next_state)):
            print("Invalid data detected in transition. Skipping storage:")
            print(f"State: {state}, Action: {action}, Reward: {reward}, Next State: {next_state}, Done: {done}")
            return

        # Grow the list until capacity is reached; afterwards slots are reused.
        if len(self.storage) < self.max_size:
            self.storage.append(None)

        self.storage[self.ptr] = (state, action, reward, next_state, done)
        self.ptr = (self.ptr + 1) % self.max_size

    def sample(self, batch_size):
        """Draw ``batch_size`` transitions uniformly at random, with replacement.

        :param batch_size: number of transitions to draw.
        :return: ``(states, actions, rewards, next_states, dones)`` as lists;
            states, actions and next states are NumPy arrays, rewards and
            done flags are kept as stored.
        :raises ValueError: if the buffer is empty.
        """
        if not self.storage:
            raise ValueError("cannot sample from an empty replay buffer")

        indices = np.random.randint(0, len(self.storage), size=batch_size)
        states, actions, rewards, next_states, dones = [], [], [], [], []

        for i in indices:
            state, action, reward, next_state, done = self.storage[i]
            # np.asarray avoids a copy when possible. np.array(..., copy=False)
            # raises under NumPy 2.x whenever a copy is required (e.g. lists).
            states.append(np.asarray(state))
            actions.append(np.asarray(action))
            rewards.append(reward)
            next_states.append(np.asarray(next_state))
            dones.append(done)

        return states, actions, rewards, next_states, dones

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.storage)

In [6]:
class TD3:
    """Actor-critic agent with twin critics and soft-updated target networks.

    NOTE(review): ``policy_noise``, ``noise_clip``, ``policy_freq`` and the
    ``epsilon*`` values are stored on the instance, but ``train`` and
    ``select_action`` do not use them: target-policy smoothing is disabled and
    the actor is updated on every training call. Confirm whether that is
    intended before relying on these hyperparameters.
    """

    def __init__(self, state_dim, action_dim, actor_lr=2e-4, critic_lr=2e-3, gamma=0.8, tau=0.005, device="cuda",policy_noise=0.2, epsilon=1.0, epsilon_min=0.001, epsilon_decay=0.95,
		noise_clip=0.5,
		policy_freq=2):
        """
        :param state_dim: size of the state vector.
        :param action_dim: size of the action (probability) vector.
        :param actor_lr: Adam learning rate of the actor.
        :param critic_lr: Adam learning rate of the critic.
        :param gamma: discount factor.
        :param tau: interpolation factor of the soft target update.
        :param device: torch device; a CUDA device falls back to CPU when
            CUDA is not available.
        :param policy_noise: stored only (see class note).
        :param epsilon: initial value decayed by :meth:`decay_epsilon`.
        :param epsilon_min: value below which epsilon is no longer decayed.
        :param epsilon_decay: multiplicative decay applied per call.
        :param noise_clip: stored only (see class note).
        :param policy_freq: stored only (see class note).
        """
        # Do not crash on CPU-only machines because of the "cuda" default.
        if str(device).startswith("cuda") and not torch.cuda.is_available():
            print("CUDA is not available; falling back to CPU.")
            device = "cpu"

        # Online and target networks; targets start as exact copies.
        self.actor = Actor(state_dim, action_dim).to(device)
        self.actor_target = Actor(state_dim, action_dim).to(device)
        self.actor_target.load_state_dict(self.actor.state_dict())

        self.critic = Critic(state_dim, action_dim).to(device)
        self.critic_target = Critic(state_dim, action_dim).to(device)
        self.critic_target.load_state_dict(self.critic.state_dict())

        # Optimizers
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=actor_lr)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=critic_lr)

        # Hyperparameters
        self.gamma = gamma
        self.tau = tau
        self.policy_noise = policy_noise
        self.noise_clip = noise_clip
        self.policy_freq = policy_freq
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay

        # Experience replay
        self.replay_buffer = ReplayBuffer(max_size=100000)

        self.device = device
        self.total_it = 0  # number of completed training iterations

    def select_action(self, state):
        """Return the actor's action probabilities for one state as a 1-D NumPy array."""
        state = torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(self.device)
        with torch.no_grad():  # inference only: no autograd graph needed
            probs = self.actor(state).cpu().numpy().flatten()
        return probs

    def store_transition(self, state, action, reward, next_state, done):
        """Append one transition to the replay buffer (``done`` is stored as a float)."""
        self.replay_buffer.append((state, action, reward, next_state, float(done)))

    def soft_update(self, source, target):
        """Move ``target`` parameters towards ``source``: ``tau * source + (1 - tau) * target``."""
        for target_param, source_param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_(self.tau * source_param.data + (1 - self.tau) * target_param.data)

    def train(self, batch_size=256):
        """Run one critic update, one actor update and one soft target update.

        :param batch_size: number of transitions sampled from the buffer.
        :return: ``(actor_loss, critic_loss)`` as floats, or ``(None, None)``
            while the buffer holds fewer than ``batch_size`` transitions.
        """
        if len(self.replay_buffer) < batch_size:
            return None, None

        self.total_it += 1

        states, actions, rewards, next_states, dones = self.replay_buffer.sample(batch_size)

        states = torch.tensor(np.array(states), dtype=torch.float32).to(self.device)
        actions = torch.tensor(np.array(actions), dtype=torch.float32).to(self.device)
        rewards = torch.tensor(np.array(rewards), dtype=torch.float32).unsqueeze(1).to(self.device)
        next_states = torch.tensor(np.array(next_states), dtype=torch.float32).to(self.device)
        dones = torch.tensor(np.array(dones), dtype=torch.float32).unsqueeze(1).to(self.device)

        # Critic target: clipped double-Q on the target actor's action.
        # Target-policy smoothing noise is intentionally not applied here.
        with torch.no_grad():
            next_actions = self.actor_target(next_states)
            target_Q1, target_Q2 = self.critic_target(next_states, next_actions)
            target_Q = torch.min(target_Q1, target_Q2)
            target_Q = rewards + (1 - dones) * self.gamma * target_Q

        current_Q1, current_Q2 = self.critic(states, actions)
        critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Actor update: maximise the first critic head's value of its actions.
        actor_loss = -self.critic.Q1(states, self.actor(states)).mean()

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Target networks track the online networks.
        self.soft_update(self.actor, self.actor_target)
        self.soft_update(self.critic, self.critic_target)

        return actor_loss.item(), critic_loss.item()

    def decay_epsilon(self):
        """Multiply epsilon by ``epsilon_decay`` while it is above ``epsilon_min``."""
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def save_model(self, actor_path="actor.pth", critic_path="critic.pth"):
        """Write the actor and critic state dicts to the given paths."""
        torch.save(self.actor.state_dict(), actor_path)
        torch.save(self.critic.state_dict(), critic_path)
        print(f"Actor model saved to {actor_path}")
        print(f"Critic model saved to {critic_path}")

    def load_model(self, actor_path="actor.pth", critic_path="critic.pth"):
        """Load actor and critic weights and re-synchronise the target networks.

        ``map_location`` lets checkpoints saved on a GPU load on this agent's
        device. The targets are copied from the loaded networks; otherwise
        they would keep the random weights they were constructed with.
        """
        self.actor.load_state_dict(torch.load(actor_path, map_location=self.device))
        self.critic.load_state_dict(torch.load(critic_path, map_location=self.device))
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.critic_target.load_state_dict(self.critic.state_dict())
        print(f"Actor model loaded from {actor_path}")
        print(f"Critic model loaded from {critic_path}")

In [7]:
def train_agent(env, agent, episodes=1000, max_steps=1000, actor_path="actor.pth", critic_path="critic.pth", log_dir="runs"):
    """Train ``agent`` on ``env`` and save its actor/critic weights.

    Each episode runs until the environment reports ``done`` or ``max_steps``
    steps were taken. After every environment step the transition is stored
    and one training iteration is run. The episode reward and the losses of
    the episode's last training iteration are logged to TensorBoard under a
    time-stamped sub-directory of ``log_dir``.

    :param env: environment exposing ``reset()`` and ``step(action)``.
    :param agent: agent exposing ``select_action``, ``store_transition``,
        ``train``, ``decay_epsilon`` and ``save_model``.
    :param episodes: number of episodes to run.
    :param max_steps: per-episode step cap enforced by this loop.
    :param actor_path: file the actor weights are saved to.
    :param critic_path: file the critic weights are saved to.
    :param log_dir: parent directory of the TensorBoard run directory.
    """
    run_id = time.strftime("%Y%m%d-%H%M%S")
    log_dir = os.path.join(log_dir, f"run_{run_id}")
    os.makedirs(log_dir, exist_ok=True)
    writer = SummaryWriter(log_dir)

    best_reward = -float('inf')

    # try/finally so the event file is flushed and closed even when training
    # is interrupted (e.g. KeyboardInterrupt from the notebook).
    try:
        with tqdm(total=episodes, desc="训练进度", unit="episode") as pbar:
            for episode in range(episodes):
                state = env.reset()
                done = False
                total_reward = 0
                episode_steps = 0
                actor_loss, critic_loss = None, None

                while not done:
                    action = agent.select_action(state)
                    next_state, reward, done = env.step(action)
                    agent.store_transition(state, action, reward, next_state, done)
                    actor_loss, critic_loss = agent.train(batch_size=256)
                    state = next_state

                    total_reward += reward
                    episode_steps += 1

                    if episode_steps >= max_steps:
                        done = True

                # Decay the agent's epsilon once per episode.
                agent.decay_epsilon()

                writer.add_scalar("Reward/Episode", total_reward, episode)
                if actor_loss is not None and critic_loss is not None:
                    writer.add_scalar("Loss/Actor", actor_loss, episode)
                    writer.add_scalar("Loss/Critic", critic_loss, episode)

                pbar.set_postfix({"总奖励": total_reward})
                pbar.update(1)

                # Checkpoint whenever the episode reward improves.
                if total_reward > best_reward:
                    best_reward = total_reward
                    agent.save_model(actor_path, critic_path)

            # NOTE(review): this final save writes to the same paths and thus
            # replaces the best-reward checkpoint saved above.
            agent.save_model(actor_path, critic_path)
    finally:
        writer.close()

    print(f"训练完成，Actor 模型保存至 {actor_path}，Critic 模型保存至 {critic_path}")


In [8]:
# Occupancy grid (1 = obstacle, 0 = free) and the ordered waypoints to visit.
grid_map = np.loadtxt("grid_map_final.txt", dtype=int)
nodes = [
    (51, 12),
    (55, 23),
    (55, 30),
    (32, 47),
    (9, 50),
    (9, 64),
]
env = GridWorld(grid_map, start=(58, 5), goal=nodes[0], goal_list=nodes)

# Agent dimensions: 2 goal features + 7x7 occupancy window, and 9 discrete moves.
state_dim, action_dim = 51, 9
agent = TD3(state_dim, action_dim)

# Run training; weights are written to the two .pth files below.
train_agent(
    env=env,
    agent=agent,
    episodes=10000,
    max_steps=50000,
    actor_path="ddpg_actor.pth",
    critic_path="ddpg_critic.pth",
    log_dir="runs",
)

训练进度:   0%|          | 1/10000 [00:00<44:20,  3.76episode/s, 总奖励=-6.2] 

Actor model saved to ddpg_actor.pth
Critic model saved to ddpg_critic.pth
Actor model saved to ddpg_actor.pth
Critic model saved to ddpg_critic.pth


训练进度:   0%|          | 7/10000 [00:01<18:24,  9.05episode/s, 总奖励=-16.4]

Actor model saved to ddpg_actor.pth
Critic model saved to ddpg_critic.pth


训练进度:   0%|          | 12/10000 [00:01<11:18, 14.73episode/s, 总奖励=-12.9]

Actor model saved to ddpg_actor.pth
Critic model saved to ddpg_critic.pth


训练进度:  12%|█▏        | 1151/10000 [00:13<01:42, 86.47episode/s, 总奖励=-3] 


KeyboardInterrupt: 