In [76]:
import numpy as np
import random
import os
import time
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm
from collections import deque
import pygame

In [None]:
class GridWorld:
    def __init__(self, grid_map, start, goal, max_steps=50000, goal_list=None, obs=30, dobs=10):
        self.grid_map = grid_map
        self.base_grid_map = grid_map.copy()
        self.rows, self.cols = grid_map.shape
        self.max_steps = max_steps
        self.goal_list = goal_list if goal_list else [] 
        self.history = deque(maxlen=4) 
        self.num_obstacles = obs
        self.num_dobs = dobs
        
        if start is not None and goal is not None:
            self.start = start
            self.goal = goal
            self.agent_pos = self.start
            self.distance = np.linalg.norm(np.array(self.start) - np.array(self.goal))
        else:
            self.reset_dynamic(start, goal)

    def reset_dynamic(self):
        """
        每次 reset 时动态生成障碍物，并使用传入的起点和目标，或者随机生成。
        """
        self.grid_map = self.base_grid_map.copy()
        self.dynamic_obstacles = []
        self.obstacle_history = {i: deque(maxlen=4) for i in range(self.num_dobs)}  # 每个障碍物的历史
        # 固定障碍物初始化
        valid_positions = [
            (i, j) for i in range(1, self.rows - 1) for j in range(1, self.cols - 1)
            if self.grid_map[i, j] == 0
        ]
        random_obstacles = random.sample(valid_positions, self.num_obstacles)
        for x, y in random_obstacles:
            self.grid_map[x, y] = 1
            
        for _ in range(self.num_dobs):
            x, y = random.choice(valid_positions)
            direction = random.choice(['up', 'down', 'left', 'right'])
            speed = random.randint(1, 2)
            self.dynamic_obstacles.append({
                'position': (x, y),
                'direction': direction,
                'speed': speed,
                'history': deque([None]*4, maxlen=4)
            })

        # 确保起点和目标点不在障碍物中
        self.valid_positions = [
            (i, j) for i in range(1, self.rows - 1) for j in range(1, self.cols - 1)
            if self.grid_map[i, j] == 0
        ]
        self.agent_pos = self.start
        self.steps = 0
        self.distance = np.linalg.norm(np.array(self.start) - np.array(self.goal))

    def reset(self):
        """
        重置环境到初始状态
        """
        self.reset_dynamic()
        self.goal_list_copy = self.goal_list.copy()
        self.current_goal_index = 0
        self.goal = self.goal_list[self.current_goal_index] if self.goal_list else None
        return self.get_state()

    def get_state(self, n_frames=4):  
        nearby_grid = np.ones((7, 7), dtype=int)  # 初始为1表示空白区域
        x_min, x_max = max(0, self.agent_pos[0] - 3), min(self.rows, self.agent_pos[0] + 4)
        y_min, y_max = max(0, self.agent_pos[1] - 3), min(self.cols, self.agent_pos[1] + 4)
        r_min, r_max = 3 - (self.agent_pos[0] - x_min), 3 + (x_max - self.agent_pos[0])
        c_min, c_max = 3 - (self.agent_pos[1] - y_min), 3 + (y_max - self.agent_pos[1])

        # 获取当前智能体周围的区域
        nearby_grid[r_min:r_max, c_min:c_max] = self.grid_map[x_min:x_max, y_min:y_max]
        # nearby_flat = nearby_grid.flatten()

        # 计算距离目标的距离和角度
        dx = self.goal[0] - self.agent_pos[0]
        dy = self.goal[1] - self.agent_pos[1]
        distance_to_goal = np.sqrt(dx**2 + dy**2)
        angle_to_goal = np.arctan2(dy, dx)

        # current_state = np.concatenate(([distance_to_goal, angle_to_goal], nearby_flat))

        # 添加动态障碍物的预测位置到状态
        for idx, obstacle in enumerate(self.dynamic_obstacles):  # 遍历所有障碍物
            self.obstacle_history[idx].append(obstacle['position'])
            if len(self.obstacle_history[idx]) >= 4:
                # 计算障碍物的速度（基于过去4帧）
                pos_diff = np.array(self.obstacle_history[idx][-1]) - np.array(self.obstacle_history[idx][-2])
                predicted_pos = np.array(obstacle['position']) + pos_diff  # 预测未来位置
            else:
                predicted_pos = np.array(obstacle['position'])  # 若历史帧数不足，使用当前位

            # 将动态障碍物预测位置映射到7x7网格范围内
            dx_obstacle = predicted_pos[0] - self.agent_pos[0]
            dy_obstacle = predicted_pos[1] - self.agent_pos[1]

            # 如果障碍物预测位置在检测范围内，则临时将该栅格设为3
            if abs(dx_obstacle) <= 3 and abs(dy_obstacle) <= 3:
                grid_x = int(dx_obstacle + 3)  # 偏移到0到6之间
                grid_y = int(dy_obstacle + 3)  # 偏移到0到6之间
                nearby_grid[grid_x, grid_y] = 3  # 将该栅格标记为动态障碍物的预测落点

        # 展平并将动态障碍物的影响加入状态
        nearby_flat_with_dynamic = nearby_grid.flatten()
        current_state = np.concatenate(([distance_to_goal, angle_to_goal], nearby_flat_with_dynamic))

        self.history.append(current_state)
        while len(self.history) < n_frames:
            self.history.appendleft(current_state)

        return np.array(self.history)  # 返回(n_frames, state_dim)
    
    def step(self, action):
        """
        执行动作并更新环境状态
        """
        state = self.get_state()
        nearby_flat = state[-1][2:]

        actions = [(-1, 0), (1, 0), (0, -1), (0, 1), (-1, -1), (-1, 1), (1, -1), (1, 1), (0 , 0)]
        delta = actions[action]
        next_pos = (self.agent_pos[0] + delta[0], self.agent_pos[1] + delta[1])

        # 检查是否越界或碰到障碍物（包括动态障碍物和固定障碍物）
        for obstacle in self.dynamic_obstacles:
            if self.agent_pos == obstacle['position']:
                reward = -5.0
                done = True
                return self.get_state(), reward, done

        if not (0 <= next_pos[0] < self.rows and 0 <= next_pos[1] < self.cols) or self.grid_map[next_pos] == 1:
            reward = -5.0
            done = True
            return self.get_state(), reward, done

        self.agent_pos = next_pos
        reward = -0.5
        done = False

        next_distance = np.sqrt((self.goal[0] - self.agent_pos[0])**2 + (self.goal[1] - self.agent_pos[1])**2)
        if(self.distance > next_distance):
            reward += 0.6
        else:
            reward -= 0.4
        self.distance = next_distance

        action_vector = np.array([delta[0], delta[1]])
        goal_vector = np.array([self.goal[0] - self.agent_pos[0], self.goal[1] - self.agent_pos[1]])
        goal_vector_norm = goal_vector / (np.linalg.norm(goal_vector) + 1e-5)
        
        if np.linalg.norm(goal_vector) > 0.1:
            alignment_reward = np.dot(action_vector, goal_vector_norm)
        else:
            alignment_reward = 0
        
        reward += alignment_reward * 0.4

        if self.distance < 0.5:
            reward += 10
            if self.current_goal_index + 1 < len(self.goal_list):
                self.current_goal_index += 1
                self.goal = self.goal_list[self.current_goal_index]  
            else:
                done = True

        # 更新动态障碍物的位置
        for obstacle in self.dynamic_obstacles:
            x, y = obstacle['position']
            direction = obstacle['direction']
            speed = obstacle['speed']
            
            if direction == 'up':
                new_pos = (x - speed, y)
            elif direction == 'down':
                new_pos = (x + speed, y)
            elif direction == 'left':
                new_pos = (x, y - speed)
            elif direction == 'right':
                new_pos = (x, y + speed)
            
            if 0 <= new_pos[0] < self.rows and 0 <= new_pos[1] < self.cols:
                if self.grid_map[new_pos[0], new_pos[1]] == 1:
                    if direction == 'up':
                        obstacle['direction'] = 'down'
                    elif direction == 'down':
                        obstacle['direction'] = 'up'
                    elif direction == 'left':
                        obstacle['direction'] = 'right'
                    elif direction == 'right':
                        obstacle['direction'] = 'left'
                else:
                    # 更新障碍物的位置
                    self.grid_map[x, y] = 0  # 清除旧位置
                    self.grid_map[new_pos[0], new_pos[1]] = 1  # 设置新位置
                    obstacle['position'] = new_pos

        # 计算最小障碍物距离
        min_distance_to_obstacle = float('inf')
        for i in range(self.rows):
            for j in range(self.cols):
                if self.grid_map[i, j] == 1:  # 障碍物
                    distance_to_obstacle = np.linalg.norm(np.array(self.agent_pos) - np.array((i, j)))
                    min_distance_to_obstacle = min(min_distance_to_obstacle, distance_to_obstacle)

        # 奖励：距离障碍物越近，惩罚越大
        if min_distance_to_obstacle == 1.0:
            reward -= 2.0
        elif min_distance_to_obstacle <= 2.0:
            reward -= 1.5
        elif min_distance_to_obstacle <= 3.0:
            reward -= 0.2
        else :
            reward -= 0.01
        
        # 检查智能体与预测落点的距离
        for idx, cell in enumerate(nearby_flat):
            if cell == 3:  # 栅格标记为3，表示动态障碍物的预测落点
                grid_x, grid_y = divmod(idx, 7)  # 还原栅格的坐标
                # 将局部坐标转换为全局坐标
                global_x = self.agent_pos[0] + (grid_x - 3)
                global_y = self.agent_pos[1] + (grid_y - 3)
                distance_to_prediction = np.linalg.norm(np.array(self.agent_pos) - np.array((global_x, global_y)))
                if distance_to_prediction < 2:  # 距离预测落点过近
                    reward -= 2.75  # 惩罚

        # 检查是否碰到动态障碍物
        for obstacle in self.dynamic_obstacles:
            if self.agent_pos == obstacle['position']:
                reward -= 5.0
                done = True
                return self.get_state(), reward, done

        self.steps += 1
        if self.steps >= self.max_steps:
            done = True
            reward -= 10

        return self.get_state(), reward, done

In [None]:
class DuelingDQN_with_LSTM(nn.Module):
    def __init__(self, input_dim, output_dim, hidden_dim=256, lstm_layers=1):
        super(DuelingDQN_with_LSTM, self).__init__()

        # 输入数据的前两项是距离和角度信息
        self.distance_angle_fc = nn.Linear(2, 64)  # 处理距离和角度的全连接层

        # 栅格部分：7x7栅格图像的卷积处理
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, stride=1, padding=1)  # 7x7x16
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=2, padding=1)  # 4x4x32
        
        # LSTM输入维度 = 32*4*4 + 64（卷积输出特征 + 距离/角度特征）
        self.lstm_input_dim = 32 * 4 * 4 + 64

        # LSTM层
        self.lstm = nn.LSTM(input_size=self.lstm_input_dim, hidden_size=hidden_dim, num_layers=lstm_layers, batch_first=True)

        # 状态价值（V(s)）分支
        self.value_fc = nn.Linear(hidden_dim, 128)
        self.value_output = nn.Linear(128, 1)

        # 优势（A(s, a)）分支
        self.advantage_fc = nn.Linear(hidden_dim, 128)
        self.advantage_output = nn.Linear(128, output_dim)

    def forward(self, x):
        batch_size, time_steps, state_dim = x.size()

        # 提取栅格部分：假设栅格信息位于第三维及以后的部分
        grid_states = x[:, :, 2:].view(batch_size * time_steps, 1, 7, 7)

        # 卷积处理栅格数据
        grid_features = F.relu(self.conv1(grid_states))
        grid_features = F.relu(self.conv2(grid_features))

        # 展平卷积层输出
        grid_features = grid_features.view(batch_size * time_steps, -1)

        # 提取距离和角度信息：前两项数据
        distance_angle = x[:, :, :2].view(batch_size * time_steps, 2)
        distance_angle = F.relu(self.distance_angle_fc(distance_angle))  # 经过全连接层

        # 将距离和角度信息与卷积输出拼接
        combined_features = torch.cat([grid_features, distance_angle], dim=1)

        # LSTM处理
        combined_features = combined_features.view(batch_size, time_steps, -1)
        lstm_out, (h_n, c_n) = self.lstm(combined_features)

        # 使用最后一个时间步的输出
        lstm_out_last = lstm_out[:, -1, :]

        value = F.relu(self.value_fc(lstm_out_last))
        value = self.value_output(value)

        advantage = F.relu(self.advantage_fc(lstm_out_last))
        advantage = self.advantage_output(advantage)

        q_values = value + (advantage - advantage.mean(dim=1, keepdim=True))

        return q_values

In [79]:
class RainbowDQN:
    def __init__(self, env, gamma=0.8, epsilon=1.0, epsilon_min=0.01, epsilon_decay=0.9, 
                learning_rate=4e-4, batch_size=64, memory_size=10000, device=None, alpha=0.6, beta=0.4,
                n_step=4, shared_replay_buffer=None):
        self.env = env
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.batch_size = batch_size
        self.alpha = alpha  # 优先级的重要性
        self.beta = beta    # 用于优先级采样的偏差修正
        self.n_step = n_step  # Multi-step 参数
        self.device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
        
        # 使用 Dueling DQN 网络
        self.q_network = DuelingDQN_with_LSTM(51, 9).to(self.device)  # 输入51是特征维度，输出9是动作维度
        self.target_network = DuelingDQN_with_LSTM(51, 9).to(self.device)
        self.optimizer = optim.Adam(self.q_network.parameters(), lr=learning_rate)

        # 使用共享缓冲区（如果提供），否则实例化私有缓冲区
        self.memory = shared_replay_buffer or deque(maxlen=memory_size)
        self.priority_sum = 0  # 初始化总优先级为 0
        self.update_target_network()
    
    def update_target_network(self):
        self.target_network.load_state_dict(self.q_network.state_dict())
    
    def select_action(self, state):
        if random.random() < self.epsilon:
            return random.randint(0, 8)
        
        state_tensor = torch.tensor(state, dtype=torch.float32).to(self.device)
        state_tensor = state_tensor.unsqueeze(0)  # (1, n_frames, state_dim) 转换为三维张量
        
        q_values = self.q_network(state_tensor)
        return torch.argmax(q_values).item()
    
    def store_transition(self, state, action, reward, next_state, done):
        # 计算多步TD误差
        td_error = 0.0  # 初始时，TD error 为 0

        if len(self.memory) > 0:
            state_tensor = torch.tensor(state, dtype=torch.float32).to(self.device).unsqueeze(0)  # (1, n_frames, state_dim)
            next_state_tensor = torch.tensor(next_state, dtype=torch.float32).to(self.device).unsqueeze(0)  # (1, n_frames, state_dim)
            
            # 计算 Q 值和 TD Error
            q_values = self.q_network(state_tensor)
            action_tensor = torch.tensor([action], dtype=torch.long).to(self.device)
            current_q_values = q_values.gather(1, action_tensor.view(-1, 1)).squeeze()
            next_q_values = self.target_network(next_state_tensor).max()
            td_error = abs(reward + self.gamma * next_q_values.item() - current_q_values.item())
        
        # 存储经验并附加优先级
        priority = (td_error + 1e-5) ** self.alpha  # 1e-5 防止优先级为零
        self.memory.append((state, action, reward, next_state, done, priority))
        self.priority_sum += priority

    def train(self):
        if len(self.memory) < self.batch_size:
            return

        # 使用均匀采样
        #indices = random.sample(range(len(self.memory)), self.batch_size)
        #batch = [self.memory[idx] for idx in indices]
        # 使用优先级采样
        probabilities = np.array([transition[5] for transition in self.memory])
        probabilities += 1e-5  # 防止某些优先级为零
        probabilities /= probabilities.sum()  # 归一化概率总和为1
        indices = np.random.choice(len(self.memory), self.batch_size, p=probabilities)
        batch = [self.memory[idx] for idx in indices]
        states, actions, rewards, next_states, dones, priorities = zip(*batch)
        
        # 转换为 Tensor
        states = torch.tensor(np.array(states), dtype=torch.float32).to(self.device)
        next_states = torch.tensor(np.array(next_states), dtype=torch.float32).to(self.device)

        if states.dim() == 2:
            states = states.unsqueeze(0)  # (batch_size, n_frames, state_dim)
        if next_states.dim() == 2:
            next_states = next_states.unsqueeze(0)  # (batch_size, n_frames, state_dim)

        actions = torch.tensor(actions, dtype=torch.long).to(self.device)
        rewards = torch.tensor(rewards, dtype=torch.float32).to(self.device)
        dones = torch.tensor(dones, dtype=torch.float32).to(self.device)

        # 计算多步奖励目标
        target_q_values = rewards
        for i in range(1, self.n_step):
            target_q_values += (self.gamma ** i) * rewards

        next_q_values = self.target_network(next_states).max(1)[0]
        target_q_values += (self.gamma ** self.n_step) * next_q_values * (1 - dones)

        # 计算损失
        current_q_values = self.q_network(states).gather(1, actions.view(-1, 1)).squeeze()
        loss = nn.MSELoss()(current_q_values, target_q_values)

        # 反向传播
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # 更新目标网络
        self.update_target_network()

    def update(self):
        self.update_target_network()

    def decay_epsilon(self):
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def save_model(self, file_path="dqn_model.pth"):
        torch.save({
            'q_network': self.q_network.state_dict(),
            'target_network': self.target_network.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'epsilon': self.epsilon
        }, file_path)
        print(f"Model saved to {file_path}")
    
    def load_model(self, file_path="dqn_model.pth"):
        checkpoint = torch.load(file_path)
        self.q_network.load_state_dict(checkpoint['q_network'])
        self.target_network.load_state_dict(checkpoint['target_network'])
        self.optimizer.load_state_dict(checkpoint['optimizer'])
        self.epsilon = checkpoint['epsilon']
        print(f"Model loaded from {file_path}")

In [None]:
def visualize_training(env, agent, episodes=1000, max_steps=1000, save_path="dqn_model.pth"):
    import pygame
    import numpy as np

    pygame.init()
    grid_size = 10  # 每个格子的像素大小
    screen = pygame.display.set_mode((env.cols * grid_size, env.rows * grid_size))
    pygame.display.set_caption("Training Visualization")
    clock = pygame.time.Clock()

    pygame.font.init()
    font = pygame.font.SysFont("Arial", 24)

    best_reward = -float('inf')

    for episode in range(episodes):
        env.reset_dynamic(num_obstacles=min(5 + episode // 100, 20))  # 动态调整障碍物数量
        state = env.reset()
        done = False
        total_reward = 0
        path = [env.agent_pos]  # 路径记录
        steps = 0

        while not done and steps < max_steps:
            # 渲染环境
            screen.fill((255, 255, 255))  # 清空屏幕，设置白色背景

            # 绘制栅格地图
            for i in range(env.rows):
                for j in range(env.cols):
                    color = (255, 255, 255)  # 默认白色
                    if env.grid_map[i, j] == 1:
                        color = (0, 0, 0)  # 黑色障碍物
                    pygame.draw.rect(screen, color, (j * grid_size, i * grid_size, grid_size, grid_size))

            # 绘制目标
            pygame.draw.rect(
                screen,
                (0, 0, 255),
                (env.goal[1] * grid_size, env.goal[0] * grid_size, grid_size, grid_size)
            )  # 蓝色目标点

            # 绘制路径轨迹
            for pos in path:
                pygame.draw.circle(
                    screen,
                    (200, 200, 200),  # 浅灰色轨迹
                    (int(pos[1] * grid_size + grid_size // 2), int(pos[0] * grid_size + grid_size // 2)),
                    5
                )

            # 绘制智能体
            car_x = int(env.agent_pos[1] * grid_size + grid_size // 2)
            car_y = int(env.agent_pos[0] * grid_size + grid_size // 2)
            car_radius = grid_size // 4
            pygame.draw.circle(screen, (255, 0, 0), (car_x, car_y), car_radius)

            # 智能体选择动作
            state_tensor = torch.tensor(state, dtype=torch.float32).to(agent.device)
            action = agent.select_action(state_tensor)
            next_state, reward, done = env.step(action)
            agent.store_transition(state, action, reward, next_state, done)
            agent.train()

            # 更新状态和累计奖励
            state = next_state
            total_reward += reward
            path.append(env.agent_pos)
            steps += 1

            # 显示单步奖励
            reward_text = font.render(f"Episode: {episode}", True, (0, 0, 0))
            screen.blit(reward_text, (10, 10))

            # 显示总奖励
            total_reward_text = font.render(f"Total Reward: {total_reward:.2f}", True, (0, 0, 0))
            screen.blit(total_reward_text, (10, 40))

            pygame.display.flip()  # 更新显示

            for event in pygame.event.get():
                if event.type == pygame.QUIT:
                    pygame.quit()
                    return
            clock.tick(60)

        # 记录最佳模型
        if total_reward > best_reward:
            best_reward = total_reward
            agent.save_model(save_path)

        # 每 10 轮打印日志
        if episode % 10 == 0:
            print(f"Episode {episode}: Total Reward = {total_reward:.2f}")

    pygame.quit()
    print("Training Completed!")


In [None]:
def train_agent(env, agent, episodes=1000, max_steps=1000, save_path="dqn_model.pth", log_dir="runs"): 
    run_id = time.strftime("%Y%m%d-%H%M%S")
    log_dir = os.path.join(log_dir, f"run_{run_id}")
    os.makedirs(log_dir, exist_ok=True)
    writer = SummaryWriter(log_dir)

    # 添加网络模型到 TensorBoard（仅记录一次）
    dummy_input = torch.rand(1, 10, 51).to(agent.device)
    writer.add_graph(agent.q_network, dummy_input)  # 将模型结构写入 TensorBoard

    best_reward = -float('inf')  # 记录最佳奖励
    step_counter = 0  # 全局步数计数器
    all_rewards = []  # 保存所有回合奖励
    all_losses = []  # 保存所有回合平均损失
    all_epsilons = []  # 保存所有回合探索率

    # 使用 tqdm 进度条显示训练过程
    with tqdm(total=episodes, desc="训练进度", unit="episode") as pbar:
        for episode in range(episodes):
            env.reset_dynamic()  # 重置环境动态障碍物
            state = env.reset()  # 获取初始状态
            done = False
            total_reward = 0  # 当前回合总奖励
            episode_steps = 0  # 当前回合步数计数
            losses = []  # 当前回合的损失列表
            q_values = []  # 当前回合的 Q 值记录

            # 在每个回合内训练多个目标点之间的路径
            goal_pairs = zip(env.goal_list, env.goal_list[1:])
            for start_goal, next_goal in goal_pairs:
                env.goal = start_goal  # 设置当前目标
                while not done:
                    # 转换状态为 Tensor
                    state_tensor = torch.tensor(state, dtype=torch.float32).to(agent.device)
                    
                    # 智能体选择动作
                    action = agent.select_action(state_tensor)
                    
                    # 环境执行动作并返回新状态、奖励和是否结束
                    next_state, reward, done = env.step(action)
                    
                    # 存储经验和训练模型
                    agent.store_transition(state, action, reward, next_state, done)
                    loss = agent.train()  # 返回当前训练的损失
                    state = next_state
                    total_reward += reward
                    episode_steps += 1
                    step_counter += 1

                    # 记录损失和 Q 值分布
                    if loss is not None:
                        losses.append(loss.item())
                    with torch.no_grad():
                        q_values.append(agent.q_network(state_tensor.unsqueeze(0)).cpu().numpy().flatten())

                    # 如果达到最大步数，则结束当前回合
                    if episode_steps >= max_steps:
                        done = True

            # 每个回合结束后，衰减 epsilon（探索率）
            agent.decay_epsilon()

            # 记录奖励、epsilon 和损失到 TensorBoard
            writer.add_scalar("Reward/Episode", total_reward, episode)
            writer.add_scalar("Epsilon", agent.epsilon, episode)
            if losses:
                avg_loss = np.mean(losses)
                writer.add_scalar("Loss/Episode", avg_loss, episode)
                all_losses.append(avg_loss)
            all_rewards.append(total_reward)
            all_epsilons.append(agent.epsilon)

            # 记录 Q 值分布
            if q_values:
                q_values_flat = np.concatenate(q_values)
                writer.add_histogram("Q_values/Distribution", q_values_flat, episode)

            # 更新 tqdm 进度条信息
            pbar.set_postfix({"总奖励": total_reward, "Epsilon": round(agent.epsilon, 3)})
            pbar.update(1)

            # 如果当前回合的奖励是历史最高，则保存模型
            if total_reward > best_reward:
                best_reward = total_reward
                agent.save_model(save_path)

            # 每 10 个回合刷新 TensorBoard 数据
            if episode % 10 == 0:
                writer.flush()

        # 保存最终模型
        agent.save_model(save_path)

        # 关闭 TensorBoard 日志记录器
        writer.close()

    print(f"训练完成，模型已保存到 {save_path}")

    # 保存奖励、损失和 epsilon 数据以便后续绘图
    np.save(os.path.join(log_dir, "rewards.npy"), all_rewards)
    np.save(os.path.join(log_dir, "losses.npy"), all_losses)
    np.save(os.path.join(log_dir, "epsilons.npy"), all_epsilons)


In [None]:
def check_trained_model_num(env, agent, model_path, max_steps=1000, num_runs=100, result_file="./test/test_results.txt"):
    # 加载训练好的模型
    agent.load_model(model_path)
    print(f"Loaded model from {model_path}")

    # 设置 epsilon 为 0（测试时只选择最优动作）
    agent.epsilon = 0

    success_count = 0  # 记录成功次数

    # 打开文件进行写入
    with open(result_file, 'w') as f:
        # 写入文件头
        f.write("Test Results for Trained Model\n")
        f.write(f"Model Path: {model_path}\n")
        f.write(f"Max Steps: {max_steps}, Total Runs: {num_runs}\n\n")
        f.write("Run\tSuccess\n")
        
        # 测试多次
        for run in range(num_runs):
            state = env.reset()
            done = False
            steps = 0
            total_reward = 0
            path = [env.agent_pos]  # 路径记录

            while not done and steps < max_steps:
                # 智能体选择动作
                action = agent.select_action(state)  # 测试时选择最优动作
                next_state, reward, done = env.step(action)

                # 更新状态和累计奖励
                state = next_state
                total_reward += reward
                path.append(env.agent_pos)  # 记录路径
                steps += 1

                # 检查是否到达目标
                if env.agent_pos == env.goal:
                    success_count += 1
                    # 写入测试成功的结果
                    f.write(f"{run+1}\tSuccess\n")
                    break  # 达到目标则结束当前测试
            else:
                # 如果没有成功，到达最大步数
                f.write(f"{run+1}\tFailed\n")
        
        # 输出成功次数
        print(f"Test Completed. Success Count: {success_count}/{num_runs}")
        # 将最终的成功率写入文件
        f.write(f"\nTotal Success Count: {success_count}/{num_runs}\n")
        f.write(f"Success Rate: {success_count / num_runs * 100:.2f}%\n")

In [83]:
def check_trained_model(env, agent, model_path, max_steps=1000): 
    # 加载训练好的模型
    agent.load_model(model_path)
    start_time = time.time()
    print(f"Loaded model from {model_path}")

    # 设置 epsilon 为 0（测试时只选择最优动作）
    agent.epsilon = 0

    # 初始化 Pygame
    pygame.init()
    grid_size = 10  # 每个格子的像素大小
    screen = pygame.display.set_mode((env.cols * grid_size, env.rows * grid_size))
    pygame.display.set_caption("Trained Model Test")
    clock = pygame.time.Clock()

    # 初始化字体
    pygame.font.init()
    font = pygame.font.SysFont("Arial", 24)

    # 测试
    state = env.reset()
    done = False
    total_reward = 0
    path = [env.agent_pos]  # 路径记录
    steps = 0
    
    # 标记哪些格子被点击并变为红色
    clicked_cells = set()

    # 动态障碍物路径记录
    for obstacle in env.dynamic_obstacles:
        obstacle['path'] = [tuple(obstacle['position'])]  # 初始化路径记录

    while not done and steps < max_steps:
        # 渲染环境
        screen.fill((255, 255, 255))  # 清空屏幕，设置白色背景
        dynamic_positions = set([tuple(obstacle['position']) for obstacle in env.dynamic_obstacles])

        # 绘制栅格地图
        for i in range(env.rows):
            for j in range(env.cols):
                color = (255, 255, 255)  # 默认白色
                if env.grid_map[i, j] == 1:
                    if (i, j) in clicked_cells:
                        color = (255, 0, 0)  # 点击的格子为红色
                    elif (i, j) in dynamic_positions:
                        color = (210, 180, 140)  # 动态障碍物为土黄色
                    else:
                        color = (0, 0, 0)  # 黑色障碍物
                pygame.draw.rect(screen, color, (j * grid_size, i * grid_size, grid_size, grid_size))

        # 绘制动态障碍物的运动轨迹
        for obstacle in env.dynamic_obstacles:
            for prev_pos in obstacle['path']:
                prev_x, prev_y = prev_pos
                pygame.draw.circle(screen, (210, 180, 140), (prev_y * grid_size + grid_size // 2, prev_x * grid_size + grid_size // 2), 2)

        # 绘制目标
        pygame.draw.rect(
            screen,
            (0, 0, 255),
            (env.goal[1] * grid_size, env.goal[0] * grid_size, grid_size, grid_size)
        )  # 蓝色目标点

        # 绘制小车的路径轨迹
        for pos in path:
            pygame.draw.circle(screen, (200, 200, 200), (int(pos[1] * grid_size + grid_size // 2), int(pos[0] * grid_size + grid_size // 2)), 2)

        # 绘制智能体
        car_x = int(env.agent_pos[1] * grid_size + grid_size // 2)
        car_y = int(env.agent_pos[0] * grid_size + grid_size // 2)
        car_radius = grid_size // 4

        # 绘制小车的主体
        pygame.draw.circle(screen, (255, 0, 0), (car_x, car_y), car_radius)

        # 智能体选择动作
        action = agent.select_action(state)  # 测试时选择最优动作
        next_state, reward, done = env.step(action)

        # 更新动态障碍物的路径
        for obstacle in env.dynamic_obstacles:
            obstacle['path'].append(tuple(obstacle['position']))

        pygame.display.flip()  # 更新显示

        # 更新状态和累计奖励
        state = next_state
        total_reward += reward
        path.append(env.agent_pos)  # 记录路径
        steps += 1

        # 检查是否按下退出事件或者添加障碍物
        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                pygame.quit()
                return
            elif event.type == pygame.MOUSEBUTTONDOWN:
                # 获取鼠标点击的坐标
                mouse_x, mouse_y = pygame.mouse.get_pos()
                # 转换为对应的栅格坐标
                grid_x = mouse_x // grid_size
                grid_y = mouse_y // grid_size
                # 在地图中添加障碍物
                if env.grid_map[grid_y, grid_x] == 0:  # 只在空白位置添加障碍物
                    env.grid_map[grid_y, grid_x] = 1
                    clicked_cells.add((grid_y, grid_x))
                    print(f"Obstacle added at ({grid_y}, {grid_x})")
        
        # 限制帧率
        clock.tick(1)

    end_time = time.time()
    pygame.quit()  # 测试完成后关闭窗口

    # 输出测试结果
    print(f"Test Completed. Total Reward: {total_reward}, Steps Taken: {steps}")
    print(f"仿真完成，耗时 {end_time - start_time:.2f} 秒")

In [84]:
files = "grid_map_final.txt"
grid_map = np.loadtxt(files, dtype=int)
obst = [[0,0],
        [30,0],
        [0,10],
        [0,30],
        [30,10],
        [30,20],
        [30,30]]
nodes = [(58, 5), (51, 12), (55, 23), (55, 30), (32, 47), (9, 50), (9, 64)]

In [85]:
for i in range(16):
    env = GridWorld(grid_map , start=nodes[0] , goal=nodes[-1],goal_list=nodes,obs=obst[i][0], dobs=obst[i][1])
    agent = RainbowDQN(
    env=env,
    gamma=0.9,
    epsilon=1.0,
    epsilon_min=0.0001,
    epsilon_decay=0.995,
    learning_rate=0.005,
    batch_size=64,
    memory_size=200000
    )
    train_agent(env, agent, episodes=1000, max_steps=50000, save_path="dqn_model.pth", log_dir="runs")
    env = GridWorld(grid_map , start=nodes[0] , goal=nodes[-1],goal_list=nodes,obs=obst[i][0], dobs=obst[i][1])
    agent = RainbowDQN(
        env=env,
        gamma=0.99,
        epsilon=1.0,
        epsilon_min=0.01,
        epsilon_decay=0.995,
        learning_rate=5e-3,
        batch_size=64,
        memory_size=20000
    )

    # 运行100次测试
    check_trained_model_num(
        env=env,
        agent=agent,
        model_path="dqn_model.pth",  # 替换为你的模型文件路径
        max_steps=1000,
        num_runs=1000,  # 运行1000次测试
        result_file = f'./test/test_results{i}.txt'
    )

  state_tensor = torch.tensor(state, dtype=torch.float32).to(self.device)


Model saved to dqn_model.pth
Model saved to dqn_model.pth
Model saved to dqn_model.pth


训练进度:   1%|▏         | 13/1000 [00:00<00:47, 20.94episode/s, 总奖励=-7.3, Epsilon=0.932] 

Model saved to dqn_model.pth


训练进度:   4%|▍         | 38/1000 [00:05<02:34,  6.24episode/s, 总奖励=-23.5, Epsilon=0.822]

Model saved to dqn_model.pth


训练进度:  12%|█▏        | 122/1000 [00:51<12:04,  1.21episode/s, 总奖励=-11, Epsilon=0.543]  

Model saved to dqn_model.pth


训练进度:  13%|█▎        | 134/1000 [01:14<28:23,  1.97s/episode, 总奖励=-5, Epsilon=0.508]    

Model saved to dqn_model.pth


训练进度:  15%|█▍        | 147/1000 [01:31<18:06,  1.27s/episode, 总奖励=-5, Epsilon=0.476]  

Model saved to dqn_model.pth


训练进度:  17%|█▋        | 169/1000 [02:13<27:49,  2.01s/episode, 总奖励=71.9, Epsilon=0.429]  

Model saved to dqn_model.pth


训练进度:  18%|█▊        | 176/1000 [02:22<14:46,  1.08s/episode, 总奖励=-7.3, Epsilon=0.414] 

Model saved to dqn_model.pth


训练进度:  18%|█▊        | 182/1000 [02:35<26:17,  1.93s/episode, 总奖励=82.7, Epsilon=0.402]

Model saved to dqn_model.pth


训练进度:  20%|██        | 201/1000 [03:20<33:39,  2.53s/episode, 总奖励=84, Epsilon=0.365]  

Model saved to dqn_model.pth


训练进度:  21%|██        | 207/1000 [03:31<21:35,  1.63s/episode, 总奖励=84.9, Epsilon=0.354]

Model saved to dqn_model.pth


训练进度:  27%|██▋       | 271/1000 [05:27<19:46,  1.63s/episode, 总奖励=87.5, Epsilon=0.257]  

Model saved to dqn_model.pth


训练进度:  28%|██▊       | 281/1000 [05:45<20:32,  1.71s/episode, 总奖励=90.6, Epsilon=0.245]

Model saved to dqn_model.pth


训练进度:  29%|██▉       | 292/1000 [06:06<22:02,  1.87s/episode, 总奖励=-5, Epsilon=0.23]    

Model saved to dqn_model.pth


训练进度:  33%|███▎      | 331/1000 [07:11<12:23,  1.11s/episode, 总奖励=-0.101, Epsilon=0.189]

Model saved to dqn_model.pth


训练进度:  35%|███▌      | 352/1000 [07:47<16:03,  1.49s/episode, 总奖励=98.2, Epsilon=0.171]  

Model saved to dqn_model.pth


训练进度:  41%|████      | 412/1000 [09:37<15:20,  1.57s/episode, 总奖励=98.5, Epsilon=0.127]

Model saved to dqn_model.pth


训练进度:  43%|████▎     | 429/1000 [10:08<15:08,  1.59s/episode, 总奖励=99, Epsilon=0.116]  

Model saved to dqn_model.pth


训练进度:  49%|████▊     | 487/1000 [11:59<15:19,  1.79s/episode, 总奖励=99.4, Epsilon=0.087]

Model saved to dqn_model.pth


训练进度:  49%|████▉     | 489/1000 [12:03<16:15,  1.91s/episode, 总奖励=101, Epsilon=0.086] 

Model saved to dqn_model.pth


训练进度:  50%|█████     | 504/1000 [12:33<13:00,  1.57s/episode, 总奖励=101, Epsilon=0.08]  

Model saved to dqn_model.pth


训练进度:  57%|█████▋    | 572/1000 [15:29<35:46,  5.02s/episode, 总奖励=102, Epsilon=0.057] 

Model saved to dqn_model.pth


训练进度:  66%|██████▌   | 656/1000 [22:39<29:29,  5.14s/episode, 总奖励=102, Epsilon=0.037] 

Model saved to dqn_model.pth


训练进度:  79%|███████▊  | 786/1000 [28:25<07:18,  2.05s/episode, 总奖励=102, Epsilon=0.019] 

Model saved to dqn_model.pth


训练进度:  85%|████████▌ | 850/1000 [30:35<04:41,  1.88s/episode, 总奖励=102, Epsilon=0.014] 

Model saved to dqn_model.pth


训练进度:  90%|████████▉ | 895/1000 [32:09<03:34,  2.04s/episode, 总奖励=102, Epsilon=0.011] 

Model saved to dqn_model.pth


训练进度:  91%|█████████ | 909/1000 [32:38<03:17,  2.17s/episode, 总奖励=102, Epsilon=0.01]  

Model saved to dqn_model.pth


训练进度: 100%|██████████| 1000/1000 [35:54<00:00,  2.15s/episode, 总奖励=102, Epsilon=0.007]
  checkpoint = torch.load(file_path)


Model saved to dqn_model.pth
训练完成，模型已保存到 dqn_model.pth
Model loaded from dqn_model.pth
Loaded model from dqn_model.pth
Test Completed. Success Count: 1000/1000


训练进度:   0%|          | 1/1000 [00:00<01:05, 15.36episode/s, 总奖励=0.8, Epsilon=0.99]

Model saved to dqn_model.pth
Model saved to dqn_model.pth


训练进度:   4%|▍         | 41/1000 [00:06<01:42,  9.39episode/s, 总奖励=-5, Epsilon=0.81]    

Model saved to dqn_model.pth


训练进度:  15%|█▍        | 148/1000 [01:05<09:45,  1.45episode/s, 总奖励=23.8, Epsilon=0.476] 

Model saved to dqn_model.pth


训练进度:  15%|█▌        | 150/1000 [01:09<16:43,  1.18s/episode, 总奖励=35.6, Epsilon=0.471] 

Model saved to dqn_model.pth


训练进度:  16%|█▋        | 164/1000 [01:24<13:01,  1.07episode/s, 总奖励=48.6, Epsilon=0.44]  

Model saved to dqn_model.pth


训练进度:  17%|█▋        | 168/1000 [01:30<16:53,  1.22s/episode, 总奖励=-5, Epsilon=0.429]   

Model saved to dqn_model.pth


训练进度:  20%|█▉        | 198/1000 [01:59<18:13,  1.36s/episode, 总奖励=69.6, Epsilon=0.371] 

Model saved to dqn_model.pth


训练进度:  23%|██▎       | 233/1000 [02:47<21:35,  1.69s/episode, 总奖励=75, Epsilon=0.311]    

Model saved to dqn_model.pth


训练进度:  25%|██▌       | 253/1000 [03:17<16:28,  1.32s/episode, 总奖励=75.3, Epsilon=0.281]

Model saved to dqn_model.pth


训练进度:  26%|██▌       | 256/1000 [03:24<23:03,  1.86s/episode, 总奖励=80, Epsilon=0.277]  

Model saved to dqn_model.pth


训练进度:  26%|██▌       | 257/1000 [03:26<23:55,  1.93s/episode, 总奖励=80.9, Epsilon=0.276]

Model saved to dqn_model.pth


训练进度:  27%|██▋       | 273/1000 [03:55<22:08,  1.83s/episode, 总奖励=83.1, Epsilon=0.255]  

Model saved to dqn_model.pth


训练进度:  29%|██▊       | 287/1000 [04:18<20:28,  1.72s/episode, 总奖励=-5, Epsilon=0.236]  

Model saved to dqn_model.pth


训练进度:  30%|███       | 302/1000 [04:32<16:15,  1.40s/episode, 总奖励=92, Epsilon=0.22]     

Model saved to dqn_model.pth


训练进度:  30%|███       | 303/1000 [04:34<17:33,  1.51s/episode, 总奖励=93.9, Epsilon=0.219]

Model saved to dqn_model.pth


训练进度:  39%|███▉      | 388/1000 [07:03<19:34,  1.92s/episode, 总奖励=99.2, Epsilon=0.143] 

Model saved to dqn_model.pth


训练进度:  56%|█████▌    | 557/1000 [12:22<15:15,  2.07s/episode, 总奖励=99.7, Epsilon=0.061] 

Model saved to dqn_model.pth


训练进度:  65%|██████▌   | 654/1000 [15:36<11:39,  2.02s/episode, 总奖励=101, Epsilon=0.038] 

Model saved to dqn_model.pth


训练进度:  71%|███████   | 706/1000 [17:23<10:02,  2.05s/episode, 总奖励=101, Epsilon=0.029]   

Model saved to dqn_model.pth


训练进度:  84%|████████▍ | 840/1000 [21:55<05:37,  2.11s/episode, 总奖励=101, Epsilon=0.015] 

Model saved to dqn_model.pth


训练进度: 100%|██████████| 1000/1000 [27:43<00:00,  1.66s/episode, 总奖励=88.7, Epsilon=0.007]


Model saved to dqn_model.pth
训练完成，模型已保存到 dqn_model.pth
Model loaded from dqn_model.pth
Loaded model from dqn_model.pth
Test Completed. Success Count: 940/1000


训练进度:   0%|          | 0/1000 [00:00<?, ?episode/s, 总奖励=-7.3, Epsilon=0.995]

Model saved to dqn_model.pth


训练进度:   0%|          | 5/1000 [00:00<00:44, 22.22episode/s, 总奖励=-5, Epsilon=0.97]   

Model saved to dqn_model.pth


训练进度:   4%|▍         | 38/1000 [00:04<01:19, 12.06episode/s, 总奖励=-12.9, Epsilon=0.822]

Model saved to dqn_model.pth


训练进度:  10%|▉         | 98/1000 [00:26<08:35,  1.75episode/s, 总奖励=-5, Epsilon=0.609]    

Model saved to dqn_model.pth


训练进度:  13%|█▎        | 126/1000 [00:46<20:15,  1.39s/episode, 总奖励=16.3, Epsilon=0.532]  

Model saved to dqn_model.pth


训练进度:  14%|█▎        | 136/1000 [00:53<12:39,  1.14episode/s, 总奖励=3.1, Epsilon=0.503]   

Model saved to dqn_model.pth


训练进度:  15%|█▌        | 151/1000 [01:07<13:35,  1.04episode/s, 总奖励=3.1, Epsilon=0.467]   

Model saved to dqn_model.pth


训练进度:  15%|█▌        | 154/1000 [01:08<10:57,  1.29episode/s, 总奖励=-5, Epsilon=0.46]   

Model saved to dqn_model.pth


训练进度:  17%|█▋        | 172/1000 [01:29<24:51,  1.80s/episode, 总奖励=62, Epsilon=0.422]   

Model saved to dqn_model.pth


训练进度:  19%|█▉        | 192/1000 [01:48<12:38,  1.07episode/s, 总奖励=-5, Epsilon=0.382]   

Model saved to dqn_model.pth


训练进度:  20%|██        | 201/1000 [02:00<18:47,  1.41s/episode, 总奖励=-4.43, Epsilon=0.363]

Model saved to dqn_model.pth


训练进度:  23%|██▎       | 226/1000 [02:28<18:23,  1.43s/episode, 总奖励=80.8, Epsilon=0.322] 

Model saved to dqn_model.pth


训练进度:  28%|██▊       | 281/1000 [03:42<16:49,  1.40s/episode, 总奖励=10.3, Epsilon=0.243]  

Model saved to dqn_model.pth


训练进度:  32%|███▏      | 316/1000 [04:34<20:42,  1.82s/episode, 总奖励=92.3, Epsilon=0.205] 

Model saved to dqn_model.pth


训练进度:  35%|███▍      | 349/1000 [05:26<20:49,  1.92s/episode, 总奖励=93, Epsilon=0.174]  

Model saved to dqn_model.pth


训练进度:  38%|███▊      | 375/1000 [06:06<14:16,  1.37s/episode, 总奖励=3.1, Epsilon=0.152] 

Model saved to dqn_model.pth


训练进度:  41%|████      | 409/1000 [07:01<16:43,  1.70s/episode, 总奖励=96, Epsilon=0.129]  

Model saved to dqn_model.pth


训练进度:  45%|████▍     | 446/1000 [08:04<16:01,  1.74s/episode, 总奖励=98.1, Epsilon=0.107] 

Model saved to dqn_model.pth


训练进度:  46%|████▌     | 459/1000 [08:25<13:54,  1.54s/episode, 总奖励=99.8, Epsilon=0.1]  

Model saved to dqn_model.pth


训练进度:  58%|█████▊    | 577/1000 [11:44<12:09,  1.72s/episode, 总奖励=100, Epsilon=0.055]  

Model saved to dqn_model.pth


训练进度:  59%|█████▉    | 590/1000 [12:07<12:18,  1.80s/episode, 总奖励=101, Epsilon=0.052] 

Model saved to dqn_model.pth


训练进度:  61%|██████    | 610/1000 [12:39<11:19,  1.74s/episode, 总奖励=101, Epsilon=0.047] 

Model saved to dqn_model.pth


训练进度:  72%|███████▏  | 717/1000 [15:54<08:52,  1.88s/episode, 总奖励=101, Epsilon=0.027] 

Model saved to dqn_model.pth


训练进度:  74%|███████▎  | 736/1000 [16:27<08:17,  1.89s/episode, 总奖励=102, Epsilon=0.025] 

Model saved to dqn_model.pth


训练进度: 100%|██████████| 1000/1000 [25:34<00:00,  1.53s/episode, 总奖励=69.7, Epsilon=0.007]


Model saved to dqn_model.pth
训练完成，模型已保存到 dqn_model.pth
Model loaded from dqn_model.pth
Loaded model from dqn_model.pth
Test Completed. Success Count: 908/1000


训练进度:   1%|          | 6/1000 [00:00<00:49, 20.16episode/s, 总奖励=-5, Epsilon=0.966]   

Model saved to dqn_model.pth
Model saved to dqn_model.pth


训练进度:   1%|          | 11/1000 [00:00<01:13, 13.39episode/s, 总奖励=3.1, Epsilon=0.946] 

Model saved to dqn_model.pth


训练进度:   6%|▌         | 59/1000 [00:05<01:07, 13.93episode/s, 总奖励=-5, Epsilon=0.74]     

Model saved to dqn_model.pth


训练进度:  25%|██▌       | 253/1000 [00:46<09:58,  1.25episode/s, 总奖励=4.52, Epsilon=0.281]  

Model saved to dqn_model.pth


训练进度:  26%|██▌       | 261/1000 [00:51<07:06,  1.73episode/s, 总奖励=10.9, Epsilon=0.27]  

Model saved to dqn_model.pth


训练进度:  29%|██▉       | 288/1000 [01:13<06:42,  1.77episode/s, 总奖励=1.8, Epsilon=0.236]   

Model saved to dqn_model.pth


训练进度:  32%|███▏      | 321/1000 [01:38<12:59,  1.15s/episode, 总奖励=13.1, Epsilon=0.2]     

Model saved to dqn_model.pth


训练进度:  32%|███▏      | 322/1000 [01:39<11:30,  1.02s/episode, 总奖励=13.6, Epsilon=0.199]

Model saved to dqn_model.pth


训练进度:  32%|███▎      | 325/1000 [01:41<08:21,  1.34episode/s, 总奖励=14.3, Epsilon=0.196]

Model saved to dqn_model.pth


训练进度:  33%|███▎      | 328/1000 [01:44<09:11,  1.22episode/s, 总奖励=19.8, Epsilon=0.193] 

Model saved to dqn_model.pth


训练进度:  36%|███▌      | 356/1000 [01:59<08:17,  1.29episode/s, 总奖励=28.6, Epsilon=0.168]  

Model saved to dqn_model.pth


训练进度:  38%|███▊      | 378/1000 [02:17<07:52,  1.32episode/s, 总奖励=35.3, Epsilon=0.15]  

Model saved to dqn_model.pth


训练进度:  41%|████      | 407/1000 [02:44<10:42,  1.08s/episode, 总奖励=35.6, Epsilon=0.13]  

Model saved to dqn_model.pth


训练进度:  41%|████      | 410/1000 [02:47<09:35,  1.02episode/s, 总奖励=53.9, Epsilon=0.128]   

Model saved to dqn_model.pth


训练进度:  43%|████▎     | 429/1000 [03:09<16:37,  1.75s/episode, 总奖励=72.9, Epsilon=0.116]   

Model saved to dqn_model.pth


训练进度:  47%|████▋     | 466/1000 [03:55<12:44,  1.43s/episode, 总奖励=73.2, Epsilon=0.097]  

Model saved to dqn_model.pth


训练进度:  47%|████▋     | 474/1000 [04:08<15:43,  1.79s/episode, 总奖励=78.3, Epsilon=0.093] 

Model saved to dqn_model.pth


训练进度:  49%|████▉     | 493/1000 [04:36<14:38,  1.73s/episode, 总奖励=94.6, Epsilon=0.084]   

Model saved to dqn_model.pth


训练进度:  57%|█████▋    | 573/1000 [06:24<08:21,  1.18s/episode, 总奖励=96.3, Epsilon=0.057]  

Model saved to dqn_model.pth


训练进度:  71%|███████   | 706/1000 [09:44<08:31,  1.74s/episode, 总奖励=96.8, Epsilon=0.029]   

Model saved to dqn_model.pth


训练进度:  73%|███████▎  | 733/1000 [10:27<08:36,  1.93s/episode, 总奖励=98.2, Epsilon=0.025]

Model saved to dqn_model.pth


训练进度:  78%|███████▊  | 781/1000 [11:44<05:38,  1.54s/episode, 总奖励=98.5, Epsilon=0.02] 

Model saved to dqn_model.pth


训练进度:  79%|███████▉  | 791/1000 [12:00<05:02,  1.45s/episode, 总奖励=100, Epsilon=0.019] 

Model saved to dqn_model.pth


训练进度:  91%|█████████ | 908/1000 [15:22<01:54,  1.25s/episode, 总奖励=102, Epsilon=0.011]  

Model saved to dqn_model.pth


训练进度: 100%|██████████| 1000/1000 [18:12<00:00,  1.09s/episode, 总奖励=92.4, Epsilon=0.007]


Model saved to dqn_model.pth
训练完成，模型已保存到 dqn_model.pth
Model loaded from dqn_model.pth
Loaded model from dqn_model.pth
Test Completed. Success Count: 688/1000


训练进度:   0%|          | 2/1000 [00:00<00:30, 33.21episode/s, 总奖励=3.1, Epsilon=0.985]

Model saved to dqn_model.pth
Model saved to dqn_model.pth


训练进度:  17%|█▋        | 169/1000 [01:18<03:59,  3.48episode/s, 总奖励=-5, Epsilon=0.427]    

Model saved to dqn_model.pth


训练进度:  18%|█▊        | 178/1000 [01:26<10:28,  1.31episode/s, 总奖励=15.7, Epsilon=0.41]  

Model saved to dqn_model.pth


训练进度:  18%|█▊        | 179/1000 [01:27<10:42,  1.28episode/s, 总奖励=24.5, Epsilon=0.408]

Model saved to dqn_model.pth


训练进度:  20%|█▉        | 196/1000 [01:42<10:12,  1.31episode/s, 总奖励=46.9, Epsilon=0.374]  

Model saved to dqn_model.pth


训练进度:  22%|██▏       | 218/1000 [02:06<10:04,  1.29episode/s, 总奖励=59, Epsilon=0.335]    

Model saved to dqn_model.pth


训练进度:  22%|██▏       | 220/1000 [02:10<17:16,  1.33s/episode, 总奖励=65.9, Epsilon=0.332] 

Model saved to dqn_model.pth


训练进度:  28%|██▊       | 277/1000 [03:21<16:49,  1.40s/episode, 总奖励=3.1, Epsilon=0.248]   

Model saved to dqn_model.pth


训练进度:  29%|██▉       | 291/1000 [03:40<17:01,  1.44s/episode, 总奖励=68.7, Epsilon=0.233]  

Model saved to dqn_model.pth


训练进度:  30%|███       | 302/1000 [03:58<18:53,  1.62s/episode, 总奖励=72.9, Epsilon=0.22] 

Model saved to dqn_model.pth


训练进度:  32%|███▏      | 315/1000 [04:18<20:10,  1.77s/episode, 总奖励=79, Epsilon=0.206]   

Model saved to dqn_model.pth


训练进度:  34%|███▎      | 336/1000 [04:46<12:09,  1.10s/episode, 总奖励=3.1, Epsilon=0.185]   

Model saved to dqn_model.pth


训练进度:  36%|███▋      | 364/1000 [05:33<20:12,  1.91s/episode, 总奖励=83.9, Epsilon=0.161] 

Model saved to dqn_model.pth


训练进度:  39%|███▉      | 388/1000 [06:11<18:37,  1.83s/episode, 总奖励=3.1, Epsilon=0.142] 

Model saved to dqn_model.pth


训练进度:  41%|████      | 412/1000 [06:44<16:27,  1.68s/episode, 总奖励=-5.88, Epsilon=0.126]

Model saved to dqn_model.pth


训练进度:  42%|████▏     | 421/1000 [06:54<13:32,  1.40s/episode, 总奖励=90.7, Epsilon=0.121] 

Model saved to dqn_model.pth


训练进度:  48%|████▊     | 480/1000 [08:25<10:27,  1.21s/episode, 总奖励=-5, Epsilon=0.09]      

Model saved to dqn_model.pth


训练进度:  51%|█████     | 506/1000 [09:08<15:45,  1.91s/episode, 总奖励=96.9, Epsilon=0.079]

Model saved to dqn_model.pth


训练进度:  58%|█████▊    | 581/1000 [11:14<12:25,  1.78s/episode, 总奖励=98.7, Epsilon=0.054]  

Model saved to dqn_model.pth


训练进度:  90%|█████████ | 902/1000 [20:24<02:30,  1.54s/episode, 总奖励=101, Epsilon=0.011]  

Model saved to dqn_model.pth


训练进度: 100%|██████████| 1000/1000 [23:49<00:00,  1.43s/episode, 总奖励=96.3, Epsilon=0.007]


Model saved to dqn_model.pth
训练完成，模型已保存到 dqn_model.pth
Model loaded from dqn_model.pth
Loaded model from dqn_model.pth
Test Completed. Success Count: 748/1000


训练进度:   0%|          | 0/1000 [00:00<?, ?episode/s, 总奖励=-10.3, Epsilon=0.995]

Model saved to dqn_model.pth


训练进度:   0%|          | 5/1000 [00:00<01:59,  8.30episode/s, 总奖励=-7.3, Epsilon=0.97]  

Model saved to dqn_model.pth
Model saved to dqn_model.pth


训练进度:   1%|          | 11/1000 [00:01<02:15,  7.29episode/s, 总奖励=-15.2, Epsilon=0.942]

Model saved to dqn_model.pth


训练进度:  15%|█▍        | 147/1000 [00:46<06:49,  2.08episode/s, 总奖励=9.53, Epsilon=0.479] 

Model saved to dqn_model.pth


训练进度:  19%|█▉        | 191/1000 [01:15<07:14,  1.86episode/s, 总奖励=-3.55, Epsilon=0.384] 

Model saved to dqn_model.pth


训练进度:  24%|██▍       | 244/1000 [02:08<12:01,  1.05episode/s, 总奖励=-5, Epsilon=0.293]    

Model saved to dqn_model.pth


训练进度:  28%|██▊       | 284/1000 [02:32<07:52,  1.52episode/s, 总奖励=35.8, Epsilon=0.241] 

Model saved to dqn_model.pth


训练进度:  30%|███       | 303/1000 [02:49<13:42,  1.18s/episode, 总奖励=46.7, Epsilon=0.219]   

Model saved to dqn_model.pth


训练进度:  31%|███       | 307/1000 [02:54<15:24,  1.33s/episode, 总奖励=-5, Epsilon=0.214]  

Model saved to dqn_model.pth


训练进度:  34%|███▎      | 335/1000 [03:30<16:45,  1.51s/episode, 总奖励=55.6, Epsilon=0.187] 

Model saved to dqn_model.pth


训练进度:  34%|███▍      | 340/1000 [03:35<12:10,  1.11s/episode, 总奖励=0.8, Epsilon=0.181] 

Model saved to dqn_model.pth


训练进度:  39%|███▉      | 392/1000 [04:31<12:24,  1.22s/episode, 总奖励=69, Epsilon=0.14]     

Model saved to dqn_model.pth


训练进度:  42%|████▏     | 416/1000 [05:03<15:34,  1.60s/episode, 总奖励=80.3, Epsilon=0.124] 

Model saved to dqn_model.pth


训练进度:  45%|████▍     | 446/1000 [05:32<07:58,  1.16episode/s, 总奖励=83.8, Epsilon=0.107]  

Model saved to dqn_model.pth


训练进度:  49%|████▉     | 491/1000 [06:22<12:26,  1.47s/episode, 总奖励=83.9, Epsilon=0.085] 

Model saved to dqn_model.pth


训练进度:  50%|█████     | 505/1000 [06:34<09:52,  1.20s/episode, 总奖励=87.7, Epsilon=0.08]  

Model saved to dqn_model.pth


训练进度:  57%|█████▋    | 574/1000 [08:12<12:19,  1.74s/episode, 总奖励=88.4, Epsilon=0.056] 

Model saved to dqn_model.pth


训练进度:  58%|█████▊    | 576/1000 [08:14<10:30,  1.49s/episode, 总奖励=92, Epsilon=0.056]  

Model saved to dqn_model.pth


训练进度:  64%|██████▎   | 635/1000 [09:40<11:24,  1.88s/episode, 总奖励=92.4, Epsilon=0.041]  

Model saved to dqn_model.pth


训练进度:  68%|██████▊   | 684/1000 [10:52<08:18,  1.58s/episode, 总奖励=95.4, Epsilon=0.032] 

Model saved to dqn_model.pth


训练进度:  72%|███████▏  | 722/1000 [11:58<08:06,  1.75s/episode, 总奖励=96, Epsilon=0.027]  

Model saved to dqn_model.pth


训练进度:  73%|███████▎  | 732/1000 [12:16<08:18,  1.86s/episode, 总奖励=97, Epsilon=0.025]  

Model saved to dqn_model.pth


训练进度: 100%|██████████| 1000/1000 [19:32<00:00,  1.17s/episode, 总奖励=17.1, Epsilon=0.007] 


Model saved to dqn_model.pth
训练完成，模型已保存到 dqn_model.pth
Model loaded from dqn_model.pth
Loaded model from dqn_model.pth
Test Completed. Success Count: 635/1000


训练进度:   0%|          | 2/1000 [00:00<01:27, 11.39episode/s, 总奖励=-7.3, Epsilon=0.985]

Model saved to dqn_model.pth
Model saved to dqn_model.pth


训练进度:   2%|▏         | 22/1000 [00:04<03:44,  4.36episode/s, 总奖励=-5, Epsilon=0.896]   

Model saved to dqn_model.pth


训练进度:  14%|█▎        | 137/1000 [01:08<07:30,  1.92episode/s, 总奖励=3.6, Epsilon=0.501]   

Model saved to dqn_model.pth


训练进度:  19%|█▊        | 187/1000 [02:33<12:00,  1.13episode/s, 总奖励=21, Epsilon=0.392]    

Model saved to dqn_model.pth


训练进度:  21%|██        | 209/1000 [03:04<09:34,  1.38episode/s, 总奖励=-0.101, Epsilon=0.351]

Model saved to dqn_model.pth


训练进度:  24%|██▍       | 240/1000 [04:01<33:07,  2.61s/episode, 总奖励=28.1, Epsilon=0.3]    

Model saved to dqn_model.pth


训练进度:  30%|███       | 301/1000 [06:30<21:25,  1.84s/episode, 总奖励=-5, Epsilon=0.22]     

Model saved to dqn_model.pth


训练进度:  31%|███       | 310/1000 [07:04<58:40,  5.10s/episode, 总奖励=46.1, Epsilon=0.211] 

Model saved to dqn_model.pth


训练进度:  31%|███       | 311/1000 [07:11<1:02:05,  5.41s/episode, 总奖励=63.4, Epsilon=0.21]

Model saved to dqn_model.pth


训练进度:  40%|███▉      | 399/1000 [11:29<47:15,  4.72s/episode, 总奖励=65.7, Epsilon=0.135]   

Model saved to dqn_model.pth


训练进度:  41%|████      | 409/1000 [11:56<22:58,  2.33s/episode, 总奖励=73.2, Epsilon=0.129]

Model saved to dqn_model.pth


训练进度:  46%|████▌     | 462/1000 [14:44<37:43,  4.21s/episode, 总奖励=82.1, Epsilon=0.099]   

Model saved to dqn_model.pth


训练进度:  55%|█████▍    | 546/1000 [19:06<31:45,  4.20s/episode, 总奖励=92.8, Epsilon=0.065] 

Model saved to dqn_model.pth


训练进度:  81%|████████  | 810/1000 [36:16<15:34,  4.92s/episode, 总奖励=93.1, Epsilon=0.017] 

Model saved to dqn_model.pth


训练进度:  95%|█████████▌| 950/1000 [46:08<03:56,  4.74s/episode, 总奖励=98.3, Epsilon=0.009] 

Model saved to dqn_model.pth


训练进度: 100%|██████████| 1000/1000 [49:17<00:00,  2.96s/episode, 总奖励=45.6, Epsilon=0.007]


Model saved to dqn_model.pth
训练完成，模型已保存到 dqn_model.pth
Model loaded from dqn_model.pth
Loaded model from dqn_model.pth
Test Completed. Success Count: 578/1000


IndexError: list index out of range

In [None]:
# # 加载环境和模型
# grid_map = np.loadtxt(files, dtype=int)
# env = GridWorld(grid_map , start=nodes[0], goal=nodes[-1], goal_list=nodes)
# agent = RainbowDQN(
#     env=env,
#     gamma=0.99,
#     epsilon=1.0,
#     epsilon_min=0.01,
#     epsilon_decay=0.995,
#     learning_rate=5e-3,
#     batch_size=64,
#     memory_size=20000
# )
# check_trained_model(
#     env=env,
#     agent=agent,
#     model_path="dqn_model.pth",  # 替换为你的模型文件路径
#     max_steps=1000
# )

  checkpoint = torch.load(file_path)


Model loaded from dqn_model.pth
Loaded model from dqn_model.pth
Obstacle added at (56, 97)
Obstacle added at (55, 77)
Obstacle added at (84, 81)
Obstacle added at (80, 41)
Obstacle added at (70, 15)
Obstacle added at (48, 44)
Obstacle added at (31, 22)
Obstacle added at (16, 39)
Obstacle added at (12, 68)
Obstacle added at (79, 30)
Obstacle added at (80, 30)
Obstacle added at (51, 77)
Obstacle added at (64, 80)
Obstacle added at (83, 39)
Obstacle added at (10, 46)
Obstacle added at (14, 84)
Test Completed. Total Reward: 196.6471298567925, Steps Taken: 288
仿真完成，耗时 288.23 秒
