In [53]:
import numpy as np
import random
import os
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm
from collections import deque
import pygame

In [54]:
import numpy as np
import random

class GridWorld:
    def __init__(self, grid_map, start, goal, max_steps=50000, goal_list=None):
        """
        初始化 GridWorld 环境
        """
        self.grid_map = grid_map
        self.base_grid_map = grid_map.copy()  # 保存基础地图（无动态障碍物）
        self.rows, self.cols = grid_map.shape
        self.max_steps = max_steps
        self.goal_list = goal_list if goal_list else []  # 目标点列表
        
        if start is not None and goal is not None:
            self.start = start
            self.goal = goal
            self.agent_pos = self.start
            self.distance = np.linalg.norm(np.array(self.start) - np.array(self.goal))
        else:
            self.reset_dynamic(start, goal)

        # 定义可能的动作
        self.actions = [(-1, 0), (1, 0), (0, -1), (0, 1), (-1, -1), (-1, 1), (1, -1), (1, 1), (0 , 0)]

    def reset_dynamic(self, num_obstacles=30, num_dynamic_obstacles=10):
        """
        每次 reset 时动态生成障碍物，并使用传入的起点和目标，或者随机生成。
        """
        self.grid_map = self.base_grid_map.copy()

        # 固定障碍物初始化
        valid_positions = [
            (i, j) for i in range(1, self.rows - 1) for j in range(1, self.cols - 1)
            if self.grid_map[i, j] == 0
        ]
        random_obstacles = random.sample(valid_positions, num_obstacles)
        for x, y in random_obstacles:
            self.grid_map[x, y] = 1

        # 动态障碍物初始化
        self.dynamic_obstacles = []
        for _ in range(num_dynamic_obstacles):
            x, y = random.choice(valid_positions)
            direction = random.choice(['up', 'down', 'left', 'right'])
            speed = random.randint(1, 2)

            self.dynamic_obstacles.append({
                'position': (x, y),
                'direction': direction,
                'speed': speed
            })

        # 确保起点和目标点不在障碍物中
        self.valid_positions = [
            (i, j) for i in range(1, self.rows - 1) for j in range(1, self.cols - 1)
            if self.grid_map[i, j] == 0
        ]
        self.agent_pos = self.start
        self.steps = 0
        self.distance = np.linalg.norm(np.array(self.start) - np.array(self.goal))

    def reset(self):
        """
        重置环境到初始状态
        """
        self.reset_dynamic()
        self.goal_list_copy = self.goal_list.copy()
        self.current_goal_index = 0
        self.goal = self.goal_list[self.current_goal_index] if self.goal_list else None
        return self.get_state()

    def get_state(self):
        """
        获取当前帧的状态，不使用历史帧。
        """
        nearby_grid = np.ones((7, 7), dtype=int)
        x_min, x_max = max(0, self.agent_pos[0] - 3), min(self.rows, self.agent_pos[0] + 4)
        y_min, y_max = max(0, self.agent_pos[1] - 3), min(self.cols, self.agent_pos[1] + 4)
        r_min, r_max = 3 - (self.agent_pos[0] - x_min), 3 + (x_max - self.agent_pos[0])
        c_min, c_max = 3 - (self.agent_pos[1] - y_min), 3 + (y_max - self.agent_pos[1])

        nearby_grid[r_min:r_max, c_min:c_max] = self.grid_map[x_min:x_max, y_min:y_max]
        nearby_flat = nearby_grid.flatten()

        dx = self.goal[0] - self.agent_pos[0]
        dy = self.goal[1] - self.agent_pos[1]
        distance_to_goal = np.sqrt(dx**2 + dy**2)
        angle_to_goal = np.arctan2(dy, dx)

        # 当前帧的状态
        current_state = np.concatenate(([distance_to_goal, angle_to_goal], nearby_flat))
        return current_state

    def step(self, action):
        """
        执行动作并更新环境状态
        """
        state = self.get_state()
        distance_to_goal = state[0]  
        angle_to_goal = state[1]  
        nearby_flat = state[2:]

        actions = [(-1, 0), (1, 0), (0, -1), (0, 1), (-1, -1), (-1, 1), (1, -1), (1, 1), (0 , 0)]
        delta = actions[action]
        next_pos = (self.agent_pos[0] + delta[0], self.agent_pos[1] + delta[1])

        # 检查是否越界或碰到障碍物（包括动态障碍物和固定障碍物）
        for obstacle in self.dynamic_obstacles:
            if self.agent_pos == obstacle['position']:
                reward = -3.0
                done = True
                return self.get_state(), reward, done

        if not (0 <= next_pos[0] < self.rows and 0 <= next_pos[1] < self.cols) or self.grid_map[next_pos] == 1:
            reward = -3.0
            done = True
            return self.get_state(), reward, done

        self.agent_pos = next_pos
        reward = -0.5
        done = False

        next_distance = np.sqrt((self.goal[0] - self.agent_pos[0])**2 + (self.goal[1] - self.agent_pos[1])**2)
        if(self.distance > next_distance):
            reward += 0.6
        else:
            reward -= 0.4
        self.distance = next_distance

        action_vector = np.array([delta[0], delta[1]])
        goal_vector = np.array([self.goal[0] - self.agent_pos[0], self.goal[1] - self.agent_pos[1]])
        goal_vector_norm = goal_vector / (np.linalg.norm(goal_vector) + 1e-5)
        
        if np.linalg.norm(goal_vector) > 0.1:
            alignment_reward = np.dot(action_vector, goal_vector_norm)
        else:
            alignment_reward = 0
        
        reward += alignment_reward * 0.4

        if self.distance < 0.5:
            reward += 10
            if self.current_goal_index + 1 < len(self.goal_list):
                self.current_goal_index += 1
                self.goal = self.goal_list[self.current_goal_index]  
            else:
                done = True

        # 更新动态障碍物的位置
        for obstacle in self.dynamic_obstacles:
            x, y = obstacle['position']
            direction = obstacle['direction']
            speed = obstacle['speed']
            
            if direction == 'up':
                new_pos = (x - speed, y)
            elif direction == 'down':
                new_pos = (x + speed, y)
            elif direction == 'left':
                new_pos = (x, y - speed)
            elif direction == 'right':
                new_pos = (x, y + speed)
            
            if 0 <= new_pos[0] < self.rows and 0 <= new_pos[1] < self.cols:
                if self.grid_map[new_pos[0], new_pos[1]] == 1:
                    if direction == 'up':
                        obstacle['direction'] = 'down'
                    elif direction == 'down':
                        obstacle['direction'] = 'up'
                    elif direction == 'left':
                        obstacle['direction'] = 'right'
                    elif direction == 'right':
                        obstacle['direction'] = 'left'
                else:
                    # 更新障碍物的位置
                    self.grid_map[x, y] = 0  # 清除旧位置
                    self.grid_map[new_pos[0], new_pos[1]] = 1  # 设置新位置
                    obstacle['position'] = new_pos

        # 计算最小障碍物距离
        min_distance_to_obstacle = float('inf')
        for i in range(self.rows):
            for j in range(self.cols):
                if self.grid_map[i, j] == 1:  # 障碍物
                    distance_to_obstacle = np.linalg.norm(np.array(self.agent_pos) - np.array((i, j)))
                    min_distance_to_obstacle = min(min_distance_to_obstacle, distance_to_obstacle)

        # 奖励：距离障碍物越近，惩罚越大
        if min_distance_to_obstacle == 1.0:
            reward -= 2.0
        elif min_distance_to_obstacle <= 2.0:
            reward -= 1.5
        elif min_distance_to_obstacle <= 3.0:
            reward -= 0.2
        else :
            reward -= 0.01

        # 检查是否碰到动态障碍物
        for obstacle in self.dynamic_obstacles:
            if self.agent_pos == obstacle['position']:
                reward -= 3.0
                done = True
                return self.get_state(), reward, done

        self.steps += 1
        if self.steps >= self.max_steps:
            done = True
            reward -= 10

        return self.get_state(), reward, done


In [55]:
import numpy as np
import random

class reGridWorld:
    def __init__(self, grid_map, start, goal, max_steps=50000, goal_list=None):
        """
        初始化 GridWorld 环境
        """
        self.grid_map = grid_map
        self.base_grid_map = grid_map.copy()  # 保存基础地图（无动态障碍物）
        self.rows, self.cols = grid_map.shape
        self.max_steps = max_steps
        self.goal_list = goal_list if goal_list else []  # 目标点列表
        self.history = deque(maxlen=4)  # 保存过去3帧状态
        
        if start is not None and goal is not None:
            self.start = start
            self.goal = goal
            self.agent_pos = self.start
            self.distance = np.linalg.norm(np.array(self.start) - np.array(self.goal))
        else:
            self.reset_dynamic(start, goal)

    def reset_dynamic(self, num_obstacles=30, num_dynamic_obstacles=10):
        """
        每次 reset 时动态生成障碍物，并使用传入的起点和目标，或者随机生成。
        """
        self.grid_map = self.base_grid_map.copy()

        # 固定障碍物初始化
        valid_positions = [
            (i, j) for i in range(1, self.rows - 1) for j in range(1, self.cols - 1)
            if self.grid_map[i, j] == 0
        ]
        random_obstacles = random.sample(valid_positions, num_obstacles)
        for x, y in random_obstacles:
            self.grid_map[x, y] = 1

        # 动态障碍物初始化
        self.dynamic_obstacles = []
        for _ in range(num_dynamic_obstacles):
            x, y = random.choice(valid_positions)
            direction = random.choice(['up', 'down', 'left', 'right'])
            speed = random.randint(1, 2)

            self.dynamic_obstacles.append({
                'position': (x, y),
                'direction': direction,
                'speed': speed
            })

        # 确保起点和目标点不在障碍物中
        self.valid_positions = [
            (i, j) for i in range(1, self.rows - 1) for j in range(1, self.cols - 1)
            if self.grid_map[i, j] == 0
        ]
        self.agent_pos = self.start
        self.steps = 0
        self.distance = np.linalg.norm(np.array(self.start) - np.array(self.goal))

    def reset(self):
        """
        重置环境到初始状态
        """
        self.reset_dynamic()
        self.goal_list_copy = self.goal_list.copy()
        self.current_goal_index = 0
        self.goal = self.goal_list[self.current_goal_index] if self.goal_list else None
        return self.get_state()

    def get_state(self, n_frames=4):
        """
        获取过去几帧的状态，生成时间序列。
        """
        nearby_grid = np.ones((7, 7), dtype=int)
        x_min, x_max = max(0, self.agent_pos[0] - 3), min(self.rows, self.agent_pos[0] + 4)
        y_min, y_max = max(0, self.agent_pos[1] - 3), min(self.cols, self.agent_pos[1] + 4)
        r_min, r_max = 3 - (self.agent_pos[0] - x_min), 3 + (x_max - self.agent_pos[0])
        c_min, c_max = 3 - (self.agent_pos[1] - y_min), 3 + (y_max - self.agent_pos[1])

        nearby_grid[r_min:r_max, c_min:c_max] = self.grid_map[x_min:x_max, y_min:y_max]
        nearby_flat = nearby_grid.flatten()

        dx = self.goal[0] - self.agent_pos[0]
        dy = self.goal[1] - self.agent_pos[1]
        distance_to_goal = np.sqrt(dx**2 + dy**2)
        angle_to_goal = np.arctan2(dy, dx)

        # 当前帧的状态
        current_state = np.concatenate(([distance_to_goal, angle_to_goal], nearby_flat))

        # 更新历史状态队列
        self.history.append(current_state)

        # 如果历史帧数不足 n_frames，则用当前帧填充
        while len(self.history) < n_frames:
            self.history.appendleft(current_state)

        # 返回过去 n 帧的状态序列
        return np.array(self.history)  # 这里返回的是一个形状为 (n_frames, state_dim) 的二维数组
    
    def step(self, action):
        """
        执行动作并更新环境状态
        """
        state = self.get_state()
        distance_to_goal = state[0]  
        angle_to_goal = state[1]  
        nearby_flat = state[2:]

        actions = [(-1, 0), (1, 0), (0, -1), (0, 1), (-1, -1), (-1, 1), (1, -1), (1, 1), (0 , 0)]
        delta = actions[action]
        next_pos = (self.agent_pos[0] + delta[0], self.agent_pos[1] + delta[1])

        # 检查是否越界或碰到障碍物（包括动态障碍物和固定障碍物）
        for obstacle in self.dynamic_obstacles:
            if self.agent_pos == obstacle['position']:
                reward = -3.0
                done = True
                return self.get_state(), reward, done

        if not (0 <= next_pos[0] < self.rows and 0 <= next_pos[1] < self.cols) or self.grid_map[next_pos] == 1:
            reward = -3.0
            done = True
            return self.get_state(), reward, done

        self.agent_pos = next_pos
        reward = -0.5
        done = False

        next_distance = np.sqrt((self.goal[0] - self.agent_pos[0])**2 + (self.goal[1] - self.agent_pos[1])**2)
        if(self.distance > next_distance):
            reward += 0.6
        else:
            reward -= 0.4
        self.distance = next_distance

        action_vector = np.array([delta[0], delta[1]])
        goal_vector = np.array([self.goal[0] - self.agent_pos[0], self.goal[1] - self.agent_pos[1]])
        goal_vector_norm = goal_vector / (np.linalg.norm(goal_vector) + 1e-5)
        
        if np.linalg.norm(goal_vector) > 0.1:
            alignment_reward = np.dot(action_vector, goal_vector_norm)
        else:
            alignment_reward = 0
        
        reward += alignment_reward * 0.4

        if self.distance < 0.5:
            reward += 10
            if self.current_goal_index + 1 < len(self.goal_list):
                self.current_goal_index += 1
                self.goal = self.goal_list[self.current_goal_index]  
            else:
                done = True

        # 更新动态障碍物的位置
        for obstacle in self.dynamic_obstacles:
            x, y = obstacle['position']
            direction = obstacle['direction']
            speed = obstacle['speed']
            
            if direction == 'up':
                new_pos = (x - speed, y)
            elif direction == 'down':
                new_pos = (x + speed, y)
            elif direction == 'left':
                new_pos = (x, y - speed)
            elif direction == 'right':
                new_pos = (x, y + speed)
            
            if 0 <= new_pos[0] < self.rows and 0 <= new_pos[1] < self.cols:
                if self.grid_map[new_pos[0], new_pos[1]] == 1:
                    if direction == 'up':
                        obstacle['direction'] = 'down'
                    elif direction == 'down':
                        obstacle['direction'] = 'up'
                    elif direction == 'left':
                        obstacle['direction'] = 'right'
                    elif direction == 'right':
                        obstacle['direction'] = 'left'
                else:
                    # 更新障碍物的位置
                    self.grid_map[x, y] = 0  # 清除旧位置
                    self.grid_map[new_pos[0], new_pos[1]] = 1  # 设置新位置
                    obstacle['position'] = new_pos

        # 计算最小障碍物距离
        min_distance_to_obstacle = float('inf')
        for i in range(self.rows):
            for j in range(self.cols):
                if self.grid_map[i, j] == 1:  # 障碍物
                    distance_to_obstacle = np.linalg.norm(np.array(self.agent_pos) - np.array((i, j)))
                    min_distance_to_obstacle = min(min_distance_to_obstacle, distance_to_obstacle)

        # 奖励：距离障碍物越近，惩罚越大
        if min_distance_to_obstacle == 1.0:
            reward -= 2.0
        elif min_distance_to_obstacle <= 2.0:
            reward -= 1.5
        elif min_distance_to_obstacle <= 3.0:
            reward -= 0.2
        else :
            reward -= 0.01

        # 检查是否碰到动态障碍物
        for obstacle in self.dynamic_obstacles:
            if self.agent_pos == obstacle['position']:
                reward -= 3.0
                done = True
                return self.get_state(), reward, done

        self.steps += 1
        if self.steps >= self.max_steps:
            done = True
            reward -= 10

        return self.get_state(), reward, done

In [56]:
class DuelingDQN_with_LSTM(nn.Module):
    def __init__(self, input_dim, output_dim, hidden_dim=256, lstm_layers=1):
        super(DuelingDQN_with_LSTM, self).__init__()
        
        # 输入层到隐藏层的全连接
        self.fc1 = nn.Linear(input_dim, 128)  # 输入到128维的全连接层
        self.lstm = nn.LSTM(input_size=128, hidden_size=hidden_dim, num_layers=lstm_layers, batch_first=True)
        
        # 状态价值（V(s)）分支
        self.value_fc = nn.Linear(hidden_dim, 128)  # 将 hidden_dim 改为 128
        self.value_output = nn.Linear(128, 1)  # 状态的价值
        
        # 优势（A(s, a)）分支
        self.advantage_fc = nn.Linear(hidden_dim, 128)  # 将 hidden_dim 改为 128
        self.advantage_output = nn.Linear(128, output_dim)  # 输出维度

    def forward(self, x):
        batch_size, time_steps, _ = x.size()  # 获取输入的维度
        
        # 前向传播
        x = torch.relu(self.fc1(x))  # 经过第一层全连接层
        
        # LSTM层处理
        lstm_out, (h_n, c_n) = self.lstm(x)  # LSTM处理
        
        # 使用最后一个时间步的输出
        lstm_out_last = lstm_out[:, -1, :]  # 只取最后一个时间步的输出
        
        # 计算状态价值 V(s)
        value = torch.relu(self.value_fc(lstm_out_last))
        value = self.value_output(value)
        
        # 计算动作优势 A(s, a)
        advantage = torch.relu(self.advantage_fc(lstm_out_last))
        advantage = self.advantage_output(advantage)
        
        # 组合 V(s) 和 A(s, a) 得到 Q(s, a)
        q_values = value + (advantage - advantage.mean(dim=1, keepdim=True))  # 这里减去均值是为了稳定性
        
        return q_values  # 返回Q值

In [57]:
import torch
import torch.nn as nn

class DQN_with_LSTM(nn.Module):
    def __init__(self, input_dim, output_dim, hidden_dim=128, lstm_layers=1):
        super(DQN_with_LSTM, self).__init__()
        
        # 输入层到隐藏层的全连接
        self.fc1 = nn.Linear(input_dim, 128)  # 输入到128维的全连接层
        self.lstm = nn.LSTM(input_size=128, hidden_size=hidden_dim, num_layers=lstm_layers, batch_first=True)
        
        # Q值的输出层
        self.q_fc = nn.Linear(hidden_dim, output_dim)  # 最终的 Q 值输出，维度为动作空间大小

    def forward(self, x):
        batch_size, time_steps, _ = x.size()
        
        # 前向传播
        x = torch.relu(self.fc1(x))  # 经过第一层全连接层
        
        # LSTM层处理
        lstm_out, (h_n, c_n) = self.lstm(x)  # LSTM处理
        
        # 使用最后一个时间步的输出
        lstm_out_last = lstm_out[:, -1, :]  # 只取最后一个时间步的输出
        
        # 计算 Q 值
        q_values = self.q_fc(lstm_out_last)
        
        return q_values  # 返回Q值


In [58]:
class DQN(nn.Module):
    def __init__(self, input_dim, output_dim, hidden_dim=128):
        super(DQN, self).__init__()
        # Fully connected layers
        self.fc1 = nn.Linear(input_dim, hidden_dim)  # 输入层到隐藏层
        self.fc2 = nn.Linear(hidden_dim, output_dim)  # 隐藏层到输出层

    def forward(self, x):
        x = torch.relu(self.fc1(x))  # 通过ReLU激活函数
        q_values = self.fc2(x)  # 输出Q值
        return q_values

In [59]:
class RainbowDQN:
    def __init__(self, env, gamma=0.8, epsilon=1.0, epsilon_min=0.01, epsilon_decay=0.9, 
                learning_rate=4e-4, batch_size=64, memory_size=10000, device=None, alpha=0.6, beta=0.4,
                n_step=4, shared_replay_buffer=None):
        self.env = env
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.batch_size = batch_size
        self.alpha = alpha  # 优先级的重要性
        self.beta = beta    # 用于优先级采样的偏差修正
        self.n_step = n_step  # Multi-step 参数
        self.device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
        
        # 使用 Dueling DQN 网络
        self.q_network = DQN(51, 9).to(self.device)  # 输入51是特征维度，输出9是动作维度
        self.target_network = DQN(51, 9).to(self.device)
        self.optimizer = optim.Adam(self.q_network.parameters(), lr=learning_rate)

        # 使用共享缓冲区（如果提供），否则实例化私有缓冲区
        self.memory = shared_replay_buffer or deque(maxlen=memory_size)
        self.priority_sum = 0  # 初始化总优先级为 0
        self.update_target_network()
    
    def update_target_network(self):
        self.target_network.load_state_dict(self.q_network.state_dict())
    
    def select_action(self, state):
        if random.random() < self.epsilon:
            return random.randint(0, 8)
        
        state_tensor = torch.tensor(state, dtype=torch.float32).to(self.device)
        state_tensor = state_tensor.unsqueeze(0)  # (1, n_frames, state_dim) 转换为三维张量
        
        q_values = self.q_network(state_tensor)
        return torch.argmax(q_values).item()
    
    def store_transition(self, state, action, reward, next_state, done):
        # 计算多步TD误差
        td_error = 0.0  # 初始时，TD error 为 0

        if len(self.memory) > 0:
            state_tensor = torch.tensor(state, dtype=torch.float32).to(self.device).unsqueeze(0)  # (1, n_frames, state_dim)
            next_state_tensor = torch.tensor(next_state, dtype=torch.float32).to(self.device).unsqueeze(0)  # (1, n_frames, state_dim)
            
            # 计算 Q 值和 TD Error
            q_values = self.q_network(state_tensor)
            action_tensor = torch.tensor([action], dtype=torch.long).to(self.device)
            current_q_values = q_values.gather(1, action_tensor.view(-1, 1)).squeeze()
            next_q_values = self.target_network(next_state_tensor).max()
            td_error = abs(reward + self.gamma * next_q_values.item() - current_q_values.item())
        
        # 存储经验并附加优先级
        priority = (td_error + 1e-5) ** self.alpha  # 1e-5 防止优先级为零
        self.memory.append((state, action, reward, next_state, done, priority))
        self.priority_sum += priority

    def train(self):
        if len(self.memory) < self.batch_size:
            return

        # 使用均匀采样
        indices = random.sample(range(len(self.memory)), self.batch_size)
        batch = [self.memory[idx] for idx in indices]
        # 使用优先级采样
        # probabilities = np.array([transition[5] for transition in self.memory])
        # probabilities += 1e-5  # 防止某些优先级为零
        # probabilities /= probabilities.sum()  # 归一化概率总和为1
        # indices = np.random.choice(len(self.memory), self.batch_size, p=probabilities)
        # batch = [self.memory[idx] for idx in indices]
        states, actions, rewards, next_states, dones, priorities = zip(*batch)
        
        # 转换为 Tensor
        states = torch.tensor(np.array(states), dtype=torch.float32).to(self.device)
        next_states = torch.tensor(np.array(next_states), dtype=torch.float32).to(self.device)

        # 确保状态是三维的
        if states.dim() == 2:
            states = states.unsqueeze(0)  # (1, n_frames, state_dim)
        if next_states.dim() == 2:
            next_states = next_states.unsqueeze(0)  # (1, n_frames, state_dim)

        actions = torch.tensor(actions, dtype=torch.long).to(self.device)
        rewards = torch.tensor(rewards, dtype=torch.float32).to(self.device)
        dones = torch.tensor(dones, dtype=torch.float32).to(self.device)

        # 计算多步奖励目标
        target_q_values = rewards
        for i in range(1, self.n_step):
            target_q_values += (self.gamma ** i) * rewards

        next_q_values = self.target_network(next_states).max(1)[0]
        target_q_values += (self.gamma ** self.n_step) * next_q_values * (1 - dones)

        # 计算损失
        current_q_values = self.q_network(states).gather(1, actions.view(-1, 1)).squeeze()
        loss = nn.MSELoss()(current_q_values, target_q_values)

        # 反向传播
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # 更新目标网络
        self.update_target_network()

    def update(self):
        self.update_target_network()

    def decay_epsilon(self):
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def save_model(self, file_path="dqn_model.pth"):
        torch.save({
            'q_network': self.q_network.state_dict(),
            'target_network': self.target_network.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'epsilon': self.epsilon
        }, file_path)
        print(f"Model saved to {file_path}")
    
    def load_model(self, file_path="dqn_model.pth"):
        checkpoint = torch.load(file_path)
        self.q_network.load_state_dict(checkpoint['q_network'])
        self.target_network.load_state_dict(checkpoint['target_network'])
        self.optimizer.load_state_dict(checkpoint['optimizer'])
        self.epsilon = checkpoint['epsilon']
        print(f"Model loaded from {file_path}")
        

In [60]:
def visualize_training(env, agent, episodes=1000, max_steps=1000, save_path="dqn_model.pth"):
    import pygame
    import numpy as np

    # 初始化 Pygame
    pygame.init()
    grid_size = 10  # 每个格子的像素大小
    screen = pygame.display.set_mode((env.cols * grid_size, env.rows * grid_size))
    pygame.display.set_caption("Training Visualization")
    clock = pygame.time.Clock()

    # 初始化字体
    pygame.font.init()
    font = pygame.font.SysFont("Arial", 24)

    best_reward = -float('inf')

    for episode in range(episodes):
        env.reset_dynamic(num_obstacles=min(5 + episode // 100, 20))  # 动态调整障碍物数量
        state = env.reset()
        done = False
        total_reward = 0
        path = [env.agent_pos]  # 路径记录
        steps = 0

        while not done and steps < max_steps:
            # 渲染环境
            screen.fill((255, 255, 255))  # 清空屏幕，设置白色背景

            # 绘制栅格地图
            for i in range(env.rows):
                for j in range(env.cols):
                    color = (255, 255, 255)  # 默认白色
                    if env.grid_map[i, j] == 1:
                        color = (0, 0, 0)  # 黑色障碍物
                    pygame.draw.rect(screen, color, (j * grid_size, i * grid_size, grid_size, grid_size))

            # 绘制目标
            pygame.draw.rect(
                screen,
                (0, 0, 255),
                (env.goal[1] * grid_size, env.goal[0] * grid_size, grid_size, grid_size)
            )  # 蓝色目标点

            # 绘制路径轨迹
            for pos in path:
                pygame.draw.circle(
                    screen,
                    (200, 200, 200),  # 浅灰色轨迹
                    (int(pos[1] * grid_size + grid_size // 2), int(pos[0] * grid_size + grid_size // 2)),
                    5
                )

            # 绘制智能体
            car_x = int(env.agent_pos[1] * grid_size + grid_size // 2)
            car_y = int(env.agent_pos[0] * grid_size + grid_size // 2)
            car_radius = grid_size // 4
            pygame.draw.circle(screen, (255, 0, 0), (car_x, car_y), car_radius)

            # 智能体选择动作
            state_tensor = torch.tensor(state, dtype=torch.float32).to(agent.device)
            action = agent.select_action(state_tensor)
            next_state, reward, done = env.step(action)
            agent.store_transition(state, action, reward, next_state, done)
            agent.train()

            # 更新状态和累计奖励
            state = next_state
            total_reward += reward
            path.append(env.agent_pos)  # 记录路径
            steps += 1

            # 显示单步奖励
            reward_text = font.render(f"Episode: {episode}", True, (0, 0, 0))
            screen.blit(reward_text, (10, 10))  # 在屏幕左上角显示

            # 显示总奖励
            total_reward_text = font.render(f"Total Reward: {total_reward:.2f}", True, (0, 0, 0))
            screen.blit(total_reward_text, (10, 40))  # 显示总奖励

            pygame.display.flip()  # 更新显示

            # 检查是否按下退出事件
            for event in pygame.event.get():
                if event.type == pygame.QUIT:
                    pygame.quit()
                    return

            # 限制帧率
            clock.tick(60)

        # 记录最佳模型
        if total_reward > best_reward:
            best_reward = total_reward
            agent.save_model(save_path)

        # 每 10 轮打印日志
        if episode % 10 == 0:
            print(f"Episode {episode}: Total Reward = {total_reward:.2f}")

    pygame.quit()
    print("Training Completed!")


In [61]:
import os
import time
import torch
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm
import numpy as np

def train_agent(env, agent, episodes=1000, max_steps=1000, save_path="dqn_model.pth", log_dir="runs"): 
    # 初始化 TensorBoard 日志记录器
    run_id = time.strftime("%Y%m%d-%H%M%S")
    log_dir = os.path.join(log_dir, f"run_{run_id}")
    os.makedirs(log_dir, exist_ok=True)
    writer = SummaryWriter(log_dir)

    best_reward = -float('inf')  # 记录最佳奖励
    step_counter = 0  # 全局步数计数器
    all_rewards = []  # 保存所有回合奖励
    all_losses = []  # 保存所有回合平均损失
    all_epsilons = []  # 保存所有回合探索率

    # 使用 tqdm 进度条显示训练过程
    with tqdm(total=episodes, desc="训练进度", unit="episode") as pbar:
        for episode in range(episodes):
            env.reset_dynamic()  # 重置环境动态障碍物
            state = env.reset()  # 获取初始状态
            done = False
            total_reward = 0  # 当前回合总奖励
            episode_steps = 0  # 当前回合步数计数
            losses = []  # 当前回合的损失列表
            q_values = []  # 当前回合的 Q 值记录

            while not done:
                # 转换状态为 Tensor
                state_tensor = torch.tensor(state, dtype=torch.float32).to(agent.device)
                
                # 智能体选择动作
                action = agent.select_action(state_tensor)
                
                # 环境执行动作并返回新状态、奖励和是否结束
                next_state, reward, done = env.step(action)
                
                # 存储经验和训练模型
                agent.store_transition(state, action, reward, next_state, done)
                loss = agent.train()  # 返回当前训练的损失
                state = next_state
                total_reward += reward
                episode_steps += 1
                step_counter += 1

                # 记录损失和 Q 值分布
                if loss is not None:
                    losses.append(loss.item())
                with torch.no_grad():
                    q_values.append(agent.q_network(state_tensor.unsqueeze(0)).cpu().numpy().flatten())

                # 如果达到最大步数，则结束当前回合
                if episode_steps >= max_steps:
                    done = True

            # 每个回合结束后，衰减 epsilon（探索率）
            agent.decay_epsilon()

            # 记录奖励、epsilon 和损失到 TensorBoard
            writer.add_scalar("Reward/Episode", total_reward, episode)
            writer.add_scalar("Epsilon", agent.epsilon, episode)
            if losses:
                avg_loss = np.mean(losses)
                writer.add_scalar("Loss/Episode", avg_loss, episode)
                all_losses.append(avg_loss)
            all_rewards.append(total_reward)
            all_epsilons.append(agent.epsilon)

            # 记录 Q 值分布
            if q_values:
                q_values_flat = np.concatenate(q_values)
                writer.add_histogram("Q_values/Distribution", q_values_flat, episode)

            # 更新 tqdm 进度条信息
            pbar.set_postfix({"总奖励": total_reward, "Epsilon": round(agent.epsilon, 3)})
            pbar.update(1)

            # 如果当前回合的奖励是历史最高，则保存模型
            if total_reward > best_reward:
                best_reward = total_reward
                agent.save_model(save_path)

            # 每 10 个回合刷新 TensorBoard 数据
            if episode % 10 == 0:
                writer.flush()

        # 保存最终模型
        agent.save_model(save_path)

        # 关闭 TensorBoard 日志记录器
        writer.close()

    print(f"训练完成，模型已保存到 {save_path}")

    # 保存奖励、损失和 epsilon 数据以便后续绘图
    np.save(os.path.join(log_dir, "rewards.npy"), all_rewards)
    np.save(os.path.join(log_dir, "losses.npy"), all_losses)
    np.save(os.path.join(log_dir, "epsilons.npy"), all_epsilons)


In [62]:
def check_trained_model(env, agent, model_path, max_steps=1000):
    """
    交互式测试训练好的模型，动态添加障碍物来评估避障能力。

    :param env: 环境实例 (GridWorld)
    :param agent: 智能体实例
    :param model_path: 已训练好的模型文件路径
    :param max_steps: 每次测试的最大步数
    """
    # 加载训练好的模型
    agent.load_model(model_path)
    start_time = time.time()
    print(f"Loaded model from {model_path}")

    # 设置 epsilon 为 0（测试时只选择最优动作）
    agent.epsilon = 0

    # 初始化 Pygame
    pygame.init()
    grid_size = 10  # 每个格子的像素大小
    screen = pygame.display.set_mode((env.cols * grid_size, env.rows * grid_size))
    pygame.display.set_caption("Trained Model Test")
    clock = pygame.time.Clock()

    # 初始化字体
    pygame.font.init()
    font = pygame.font.SysFont("Arial", 24)

    # 测试
    state = env.reset()
    done = False
    total_reward = 0
    path = [env.agent_pos]  # 路径记录
    steps = 0

    while not done and steps < max_steps:
        # 渲染环境
        screen.fill((255, 255, 255))  # 清空屏幕，设置白色背景

        # 绘制栅格地图
        for i in range(env.rows):
            for j in range(env.cols):
                color = (255, 255, 255)  # 默认白色
                if env.grid_map[i, j] == 1:
                    color = (0, 0, 0)  # 黑色障碍物
                pygame.draw.rect(screen, color, (j * grid_size, i * grid_size, grid_size, grid_size))

        # 绘制目标
        pygame.draw.rect(
            screen,
            (0, 0, 255),
            (env.goal[1] * grid_size, env.goal[0] * grid_size, grid_size, grid_size)
        )  # 蓝色目标点

        # 绘制路径轨迹
        for pos in path:
            pygame.draw.rect(
                screen,
                (200, 200, 200),  # 浅灰色轨迹
                (int(pos[1] * grid_size), int(pos[0] * grid_size), grid_size, grid_size)  # 使用矩形表示路径
            )

        # 绘制智能体
        car_x = int(env.agent_pos[1] * grid_size + grid_size // 2)
        car_y = int(env.agent_pos[0] * grid_size + grid_size // 2)
        car_radius = grid_size // 4

        # 绘制小车的主体
        pygame.draw.circle(screen, (255, 0, 0), (car_x, car_y), car_radius)

        # 智能体选择动作
        action = agent.select_action(state)  # 测试时选择最优动作
        next_state, reward, done = env.step(action)

        pygame.display.flip()  # 更新显示

        # 更新状态和累计奖励
        state = next_state
        total_reward += reward
        path.append(env.agent_pos)  # 记录路径
        steps += 1

        # 检查是否按下退出事件或者添加障碍物
        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                pygame.quit()
                return
            elif event.type == pygame.MOUSEBUTTONDOWN:
                # 获取鼠标点击的坐标
                mouse_x, mouse_y = pygame.mouse.get_pos()
                # 转换为对应的栅格坐标
                grid_x = mouse_x // grid_size
                grid_y = mouse_y // grid_size
                # 在地图中添加障碍物
                if env.grid_map[grid_y, grid_x] == 0:  # 只在空白位置添加障碍物
                    env.grid_map[grid_y, grid_x] = 1
                    print(f"Obstacle added at ({grid_y}, {grid_x})")
        
        # 限制帧率
        clock.tick(0.7)
    end_time = time.time()
    pygame.quit()  # 测试完成后关闭窗口

    # 输出测试结果
    print(f"Test Completed. Total Reward: {total_reward}, Steps Taken: {steps}")
    print(f"仿真完成，耗时 {end_time - start_time:.2f} 秒")


In [63]:
def check_trained_model_num(env, agent, model_path, max_steps=1000, num_runs=100): 
    """
    交互式测试训练好的模型，动态添加障碍物来评估避障能力，并统计成功的次数。

    :param env: 环境实例 (GridWorld)
    :param agent: 智能体实例
    :param model_path: 已训练好的模型文件路径
    :param max_steps: 每次测试的最大步数
    :param num_runs: 测试的总次数
    """
    # 加载训练好的模型
    agent.load_model(model_path)
    print(f"Loaded model from {model_path}")

    # 设置 epsilon 为 0（测试时只选择最优动作）
    agent.epsilon = 0

    success_count = 0  # 记录成功次数

    # 测试多次
    for run in range(num_runs):
        state = env.reset()
        done = False
        steps = 0
        total_reward = 0
        path = [env.agent_pos]  # 路径记录

        while not done and steps < max_steps:
            # 智能体选择动作
            action = agent.select_action(state)  # 测试时选择最优动作
            next_state, reward, done = env.step(action)
            
            # 更新状态和累计奖励
            state = next_state
            total_reward += reward
            path.append(env.agent_pos)  # 记录路径
            steps += 1

            # 检查是否到达目标
            if env.agent_pos == env.goal:
                success_count += 1
                break  # 达到目标则结束当前测试

    # 输出成功次数
    print(f"Test Completed. Success Count: {success_count}/{num_runs}")

In [64]:
# 定义网格地图
grid_map = np.zeros((60, 60))  # 60x60 的空地图
grid_map[15, 15] = 1  # 设置障碍物
grid_map[14, 15] = 1
grid_map[15, 14] = 1
grid_map[7, 7] = 1
grid_map[17, 12] = 1
grid_map[18 , 16] = 1
grid_map[24 , 14] =1

# 边界障碍物
grid_map[:, 0] = 1
grid_map[:, -1] = 1
grid_map[0, :] = 1
grid_map[-1, :] = 1

# 实例化 GridWorld 环境
#env = GridWorld(grid_map)

In [65]:
grid_map = np.loadtxt("grid_map_final.txt", dtype=int)
#nodes = [(58, 5), (51, 12), (55, 23), (55, 30), (32, 47), (9, 50), (9, 64)]
nodes=[(51, 12), (55, 23), (55, 30), (32, 47), (9, 50), (9, 64)]
env = GridWorld(grid_map , start=(58, 5) , goal=(51, 12),goal_list=nodes)
# 实例化 RainbowDQN
agent = RainbowDQN(
    env=env,
    gamma=0.9,
    epsilon=1.0,
    epsilon_min=0.0001,
    epsilon_decay=0.995,
    learning_rate=0.005,
    batch_size=64,
    memory_size=200000
)
# 训练智能体
train_agent(env, agent, episodes=2000, max_steps=50000, save_path="dqn_model.pth", log_dir="runs")
#print(self.goal ,self.agent_pos , self.distance)

训练进度:   0%|          | 2/2000 [00:00<01:29, 22.39episode/s, 总奖励=-3, Epsilon=0.985]  

Model saved to dqn_model.pth
Model saved to dqn_model.pth


  state_tensor = torch.tensor(state, dtype=torch.float32).to(self.device)
训练进度:   0%|          | 5/2000 [00:00<03:40,  9.05episode/s, 总奖励=-5.98, Epsilon=0.975]


RuntimeError: The size of tensor a (9) must match the size of tensor b (64) at non-singleton dimension 1

In [None]:
grid_map = np.loadtxt("grid_map_final.txt", dtype=int)
nodes=[(51, 12), (55, 23), (55, 30), (32, 47), (9, 50), (9, 64)]
env = GridWorld(grid_map , start=(58, 5) , goal=(51, 12),goal_list=nodes)
#[(58, 5), (51, 12), (55, 23), (55, 30), (32, 47), (9, 50), (9, 64), (5, 69)]
# 实例化 RainbowDQN
agent = RainbowDQN(
    env=env,
    gamma=0.99,
    epsilon=1.0,
    epsilon_min=0.01,
    epsilon_decay=0.995,
    learning_rate=5e-3,
    batch_size=64,
    memory_size=20000
)
check_trained_model(
    env=env,
    agent=agent,
    model_path="dqn_model.pth",  # 替换为你的模型文件路径
    max_steps=1000
)

  checkpoint = torch.load(file_path)


Model loaded from dqn_model.pth
Loaded model from dqn_model.pth
Test Completed. Total Reward: 9.64344713487131, Steps Taken: 14
仿真完成，耗时 20.07 秒


In [None]:
# 加载环境和模型
grid_map = np.loadtxt("grid_map_final.txt", dtype=int)
env = GridWorld(grid_map , start=(58, 5), goal=(51, 12), goal_list=nodes)
agent = RainbowDQN(
    env=env,
    gamma=0.99,
    epsilon=1.0,
    epsilon_min=0.01,
    epsilon_decay=0.995,
    learning_rate=5e-3,
    batch_size=64,
    memory_size=20000
)

# 运行100次测试
check_trained_model_num(
    env=env,
    agent=agent,
    model_path="dqn_model30_10_f.pth",  # 替换为你的模型文件路径
    max_steps=1000,
    num_runs=1000  # 运行100次测试
)

Model loaded from dqn_model30_10_f.pth
Loaded model from dqn_model30_10_f.pth


  checkpoint = torch.load(file_path)


Test Completed. Success Count: 646/1000
