diff --git a/.devcontainer/cpu/Dockerfile b/.devcontainer/cpu/Dockerfile index f44b0e6..731b4c8 100644 --- a/.devcontainer/cpu/Dockerfile +++ b/.devcontainer/cpu/Dockerfile @@ -1,23 +1,6 @@ -#FROM elainasuki/ros:ros2-humble-full-v3 -FROM elainasuki/ros:ros2-humble-full-0614 - -ARG USERNAME=Elaina - -ARG USER_GID=$USER_UID - -ARG GROUP_NAME=wheel -RUN apt-get update \ - && apt-get install -y sudo vim nautilus -USER ${USERNAME} -#安装前置依赖 +FROM ubuntu:22.04 +RUN apt-get update\ + && apt-get install -y git python3 python3-pip RUN pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu -# 安装yolo -RUN pip install imutils ultralytics openvino -#安装其他依赖 - -RUN pip install jupyter d2l==0.17.6 - -USER root -RUN echo "source /opt/ros/humble/setup.bash" >> /home/${USERNAME}/.bashrc -ENV PYTHONPATH="/home/Elaina/yolo:${PYTHONPATH}" +RUN pip install pandas openpyxl matplotlib \ No newline at end of file diff --git a/.devcontainer/cpu/build.bash b/.devcontainer/cpu/build.bash index 9416aa9..9d362a6 100755 --- a/.devcontainer/cpu/build.bash +++ b/.devcontainer/cpu/build.bash @@ -9,7 +9,7 @@ echo "脚本目录: $SCRIPT_DIR" echo "父目录: $PARENT_DIR" # 设置默认 tag -TAG="pytorch" +TAG="pytorch_cpu" # 从外部传入的 IMAGE_REPO(格式:ghcr.io/user/repo 或 docker.io/user/repo) IMAGE_REPO="elainasuki/other" diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index f137411..74ff08c 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -2,7 +2,7 @@ "name": "information-task-container", "dockerComposeFile": "docker-compose.yml", "service": "gpu-service", - "workspaceFolder": "/home/Elaina/pytorch", + "workspaceFolder": "/pytorch", "shutdownAction": "stopCompose", "customizations": { "vscode": { diff --git a/.devcontainer/docker-compose.yml b/.devcontainer/docker-compose.yml index 7345a13..9159444 100644 --- a/.devcontainer/docker-compose.yml +++ b/.devcontainer/docker-compose.yml @@ -1,10 +1,7 @@ version: '3' services: pytorch-service: - # build: - # context: .. - # dockerfile: .devcontainer/Dockerfile - image: elainasuki/other:pytorch + image: elainasuki/other:pytorch_cpu container_name: information-cpu-container environment: - DISPLAY=${DISPLAY} @@ -14,21 +11,13 @@ services: - TERM=xterm-256color volumes: - /tmp/.X11-unix:/tmp/.X11-unix - - ./..:/home/Elaina/pytorch - - /dev:/dev - network_mode: host - pid: "host" # 添加 pid 命名空间共享 - ipc: "host" # 添加 ipc 命名空间共享 - privileged: true + - ./..:/pytorch/ + entrypoint: [ '/bin/bash' ] stdin_open: true tty: true - user: "Elaina" # runtime: "nvidia" - working_dir: "/home/Elaina/pytorch" # 指定默认工作目录 + working_dir: "/pytorch" # 指定默认工作目录 gpu-service: - # build: - # context: .. - # dockerfile: .devcontainer/Dockerfile image: elainasuki/other:pytorch_gpu container_name: information-task-container environment: @@ -39,14 +28,9 @@ services: - TERM=xterm-256color volumes: - /tmp/.X11-unix:/tmp/.X11-unix - - ./..:/home/Elaina/pytorch - - /dev:/dev - network_mode: host - pid: "host" # 添加 pid 命名空间共享 - ipc: "host" # 添加 ipc 命名空间共享 - privileged: true - stdin_open: true - tty: true - user: "Elaina" + - ./..:/pytorch runtime: "nvidia" - working_dir: "/home/Elaina/pytorch" # 指定默认工作目录 + tty: true + stdin_open: true + entrypoint: [ '/bin/bash' ] + working_dir: "/pytorch" # 指定默认工作目录 diff --git a/.devcontainer/gpu/Dockerfile b/.devcontainer/gpu/Dockerfile index 9bfbd20..c8d96a8 100644 --- a/.devcontainer/gpu/Dockerfile +++ b/.devcontainer/gpu/Dockerfile @@ -1,44 +1,6 @@ -# 基础镜像:ROS 2 Humble(官方桌面版,基于Ubuntu 22.04) -FROM elainasuki/ros:ros2-humble-full-0614 +FROM pytorch/pytorch:2.9.0-cuda13.0-cudnn9-runtime -# 修复未定义变量 -ARG USERNAME=Elaina -ARG USER_UID=1000 -ARG USER_GID=$USER_UID +RUN pip3 install pandas openpyxl matplotlib -# 设置非交互模式+pip缓存目录(避免占用根目录空间) -ENV DEBIAN_FRONTEND=noninteractive -ENV PIP_CACHE_DIR=/tmp/pip-cache - -# 1. 精简基础依赖(仅保留必需项)+ 清理缓存 -RUN apt-get update && apt-get install -y --no-install-recommends \ - wget \ - ca-certificates \ - python3-pip \ - && apt-get clean \ - && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* - -# 2. 仅安装CUDA运行时(极致精简) -RUN wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb \ - && dpkg -i cuda-keyring_1.1-1_all.deb \ - && apt-get update \ - && apt-get install -y --no-install-recommends \ - cuda-runtime-12-1 \ - libcudnn8=8.9.2.26-1+cuda12.1 \ - && apt-get clean \ - && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \ - && rm -f cuda-keyring_1.1-1_all.deb - -# 3. 配置环境变量(修复未定义问题) -ENV PATH=/usr/local/cuda-12.1/bin:${PATH} -ENV LD_LIBRARY_PATH=/usr/local/cuda-12.1/lib64:${LD_LIBRARY_PATH:-/usr/local/lib} - -# 4. 优化PyTorch安装(节省空间) -RUN pip3 install --no-cache-dir --upgrade pip \ - && pip3 install --no-cache-dir --ignore-installed sympy \ - && pip3 install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121 \ - && rm -rf $PIP_CACHE_DIR /tmp/* /var/tmp/* -#安装pandas 和openpyxl -RUN pip3 install pandas openpyxl -# 5. 配置ROS 2环境 -RUN echo "source /opt/ros/humble/setup.bash" >> ~/.bashrc \ No newline at end of file +RUN apt-get update &&\ + apt-get install -y git \ No newline at end of file diff --git a/.gitignore b/.gitignore index e69de29..f34d590 100644 --- a/.gitignore +++ b/.gitignore @@ -0,0 +1,4 @@ +.~* +model/ +*.png +*.pyc \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..8aef7b1 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,3 @@ +{ + "python.analysis.typeCheckingMode": "standard" +} \ No newline at end of file diff --git a/data/.~task2.xlsx b/data/.~task2.xlsx deleted file mode 100644 index 65e0b3e..0000000 Binary files a/data/.~task2.xlsx and /dev/null differ diff --git a/src/infer.py b/src/infer.py new file mode 100644 index 0000000..2bc4aec --- /dev/null +++ b/src/infer.py @@ -0,0 +1,25 @@ +from visual import * +from model import * +from load_data import * +def infer(): + #初始化参数 + model_path='/pytorch/model/best_transformer.pth' + device=torch.device('cuda' if torch.cuda.is_available() else'cpu') + #加载模型 + model=MLPModel(input_size=8,hidden_size=24,output_size=2,device=device) + model.load_state_dict(torch.load(model_path, map_location=device, weights_only=True)['model_state_dict']) + #加载数据 + data_path='/pytorch/data/task2.xlsx' + batcher = DataBatcher(file_path=data_path, val_ratio=0.2, batch_size=16,device=device) + val_inputs, val_outputs = batcher.getValBatches() + #进行推理 + output=model(val_inputs) + #取第1batch的结果比较 + print("真实值:", val_outputs[0]) + print("预测值:", output[0]) + # 计算均方误差 + loss_fn=torch.nn.MSELoss() + loss=loss_fn(output,val_outputs) + print(f"验证集均方误差: {loss.item():.4f}") +if __name__ == "__main__": + infer() \ No newline at end of file diff --git a/src/load_data.py b/src/load_data.py index 674d557..5dc69d2 100644 --- a/src/load_data.py +++ b/src/load_data.py @@ -17,13 +17,13 @@ def LoadTask2Data(file_path)->tuple[torch.Tensor, torch.Tensor]: outputs_tensor = torch.tensor(outputs, dtype=torch.float32) return inputs_tensor, outputs_tensor class DataBatcher: - def __init__(self, file_path: str, val_ratio: float = 0.2, batch_size: int = 32, shuffle: bool = True): + def __init__(self, file_path: str, val_ratio: float = 0.2, batch_size: int = 32, shuffle: bool = True,device: torch.device = torch.device('cpu')): """ 数据批处理工具类,支持划分验证集和生成批次数据 """ self.batch_size = batch_size self.shuffle = shuffle - + self.device = device # 加载原始数据 self.inputs, self.outputs = LoadTask2Data(file_path) @@ -81,16 +81,16 @@ def _splitAndCreateBatches(self, val_ratio: float): def getTrainBatches(self): """获取训练集批次张量 (输入, 输出)""" - return self.train_inputs, self.train_outputs + return self.train_inputs.to(self.device), self.train_outputs.to(self.device) def getValBatches(self): """获取验证集批次张量 (输入, 输出)""" - return self.val_inputs, self.val_outputs + return self.val_inputs.to(self.device), self.val_outputs.to(self.device) if __name__ == "__main__": # 测试代码 - file_path = os.path.join("/home/Elaina/pytorch/data", "task2.xlsx") + file_path = os.path.join("/pytorch/data", "task2.xlsx") # 初始化批处理工具 batcher = DataBatcher(file_path, val_ratio=0.2, batch_size=16) diff --git a/src/model.py b/src/model.py index e69de29..baf186a 100644 --- a/src/model.py +++ b/src/model.py @@ -0,0 +1,19 @@ +import torch.nn as nn +import os,torch +class MLPModel(nn.Module): + """简单的多层感知机模型""" + + def __init__(self, input_size, hidden_size, output_size, device=torch.device('cpu')): + super().__init__() + self.linear1 = nn.Linear(input_size, hidden_size) + self.relu = nn.ReLU() + self.linear2 = nn.Linear(hidden_size, output_size) + self.device=device + self.to(device) # 关键:初始化时就把模型移到目标设备 + def forward(self, x:torch.Tensor)->torch.Tensor: + assert isinstance(x, torch.Tensor), "输入必须是torch.Tensor类型" + x = x.to(self.device) # 确保输入数据在正确的设备上 + x = self.linear1(x) + x = self.relu(x) + x = self.linear2(x) + return x \ No newline at end of file diff --git a/src/train.py b/src/train.py new file mode 100644 index 0000000..d5eee0b --- /dev/null +++ b/src/train.py @@ -0,0 +1,32 @@ +from visual import * +from model import * +from load_data import * +def train(): + device=torch.device('cuda' if torch.cuda.is_available() else'cpu') + model=MLPModel(input_size=8,hidden_size=24,output_size=2,device=device) + data_path='/pytorch/data/task2.xlsx' + model_path='/pytorch/model/' + batcher = DataBatcher(file_path=data_path, val_ratio=0.2, batch_size=16,device=device) + train_inputs, train_outputs = batcher.getTrainBatches() + val_inputs, val_outputs = batcher.getValBatches() + #平方损失 + loss_fn=torch.nn.MSELoss() + optimizer=torch.optim.Adam(model.parameters(),lr=0.001) + + visual=SaveAndVisual(model_dir=model_path, loss_img_path=model_path+'loss_curve.png') + num_epoch=600 + visual.loadModel(model,optimizer,device) + for epoch in range(num_epoch): + model.train() + inputs=train_inputs + targets=train_outputs + optimizer.zero_grad() + outputs=model(inputs) + loss=loss_fn(outputs,targets) + loss.backward() + optimizer.step() + visual.updateVisualization(epoch,loss.item()) + visual.finalizeVisualization() + +if __name__ == "__main__": + train() \ No newline at end of file diff --git a/src/visual.py b/src/visual.py index e69de29..3071260 100644 --- a/src/visual.py +++ b/src/visual.py @@ -0,0 +1,86 @@ +# visual.py +import matplotlib.pyplot as plt +import os,torch +class SaveAndVisual: + """模型管理类,负责模型保存和训练可视化""" + + def __init__(self, model_dir='models', loss_img_path='loss_curve.png'): + self.model_dir = model_dir + self.loss_img_path = loss_img_path + self.epoch_losses = [] # 存储每个epoch的损失 + self.epoch_indices = [] # 存储epoch索引 + self._init_visualization() + self._init_model_dir() + self.loop_count=0 + def _init_model_dir(self): + """初始化模型保存目录""" + os.makedirs(self.model_dir, exist_ok=True) + + def _init_visualization(self): + """初始化可视化环境""" + plt.ion() # 开启交互模式 + self.fig, self.ax = plt.subplots(figsize=(10, 6)) + self.ax.set_xlabel("Epoch") + self.ax.set_ylabel("Loss") + self.ax.set_title("Training Loss (per Epoch)") + self.line, = self.ax.plot([], [], label="Epoch Loss") + self.ax.legend() + self.min_loss=float('inf') + def loadModel(self, model:torch.nn.Module, optimizer, device): + """加载已保存的模型""" + model_path = os.path.join(self.model_dir, 'best_transformer.pth') + self.model=model + self.optimizer=optimizer + if os.path.exists(model_path): + checkpoint = torch.load(model_path, map_location=device, weights_only=True) + model.load_state_dict(checkpoint['model_state_dict']) + optimizer.load_state_dict(checkpoint['optimizer_state_dict']) + print(f"发现现有模型,其损失为: {checkpoint['loss']:.4f}") + self.min_loss=checkpoint['loss'] + return checkpoint['loss'] + return float('inf') + + def saveModel(self, model, optimizer, epoch, loss): + """保存模型检查点""" + model_path = os.path.join(self.model_dir, 'best_transformer.pth') + torch.save({ + 'epoch': epoch, + 'model_state_dict': model.state_dict(), + 'optimizer_state_dict': optimizer.state_dict(), + 'loss': loss, + }, model_path) + # print(f" 保存最佳模型(损失:{loss:.4f})") + + def updateVisualization(self, epoch, loss): + """更新训练损失可视化""" + self.epoch_losses.append(loss) + self.epoch_indices.append(epoch) + self.loop_count+=1 + if(self.loop_count%10==0): + self.loop_count=0 + print(f" 训练损失(第{epoch+1}轮):{loss:.4f}") + # 更新图像数据 + self.line.set_data(self.epoch_indices, self.epoch_losses) + self.ax.relim() # 重新计算坐标轴范围 + self.ax.autoscale_view() # 自动调整视图 + if(loss