# Loss 测试


In [2]:
import os
from sched import scheduler

import torch
from jupyter_server.services.contents.checkpoints import Checkpoints
from torch import nn
# Example of target with class indices
loss = nn.CrossEntropyLoss()
input = torch.randn(3, 5, requires_grad=True)
target = torch.empty(3, dtype=torch.long).random_(5)
print(input)
print(target)

output = loss(input, target)
print(output)
output.backward()
print(output)

tensor([[ 0.5084, -1.7636,  0.7366,  0.1848, -0.8711],
        [-0.3399, -0.1831, -0.3349,  0.9180,  1.1863],
        [-1.1433, -1.0251, -0.1639, -1.0709, -1.1389]], requires_grad=True)
tensor([3, 4, 3])
tensor(1.4268, grad_fn=<NllLossBackward0>)
tensor(1.4268, grad_fn=<NllLossBackward0>)


In [3]:
# Example of target with class probabilities
# If containing class probabilities, same shape as the input and each value should be between [0,1]
input = torch.randn(3, 5, requires_grad=True)
target = torch.randn(3, 5).softmax(dim=1)
print(input)
print(target)
output = loss(input, target)
output.backward()
print(output)

tensor([[-0.5021,  1.4945, -0.3166,  0.8346, -2.5062],
        [-0.1088, -2.0948,  0.6726, -0.6959,  2.0686],
        [-1.7934,  0.2097,  2.4396,  0.9922,  1.0973]], requires_grad=True)
tensor([[0.0412, 0.4861, 0.0452, 0.3562, 0.0712],
        [0.0729, 0.2705, 0.1232, 0.3744, 0.1590],
        [0.0355, 0.3614, 0.1140, 0.4536, 0.0355]])
tensor(2.0965, grad_fn=<DivBackward1>)


# 优化器使用重点：
```
optimizer.zero_grad()
loss.backward()
optimizer.step()
```


# 训练模板

导包

In [13]:
import torch

from src.model import AntBeeClassifier
from src.dataset import ClassDirectoryDataset
from torch.optim import Adam
from torch.utils.data import DataLoader,random_split
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch import autocast, GradScaler
from torch.amp.autocast_mode import is_autocast_available

import numpy as np
import os
from datetime import datetime

from tqdm import tqdm
from tqdm import trange

from torch.utils.tensorboard import SummaryWriter


参数配置

In [14]:
# ==== 训练配置 ====
current_time = datetime.now().strftime("%Y%m%d-%H%M%S")
experiment_name = f"exp-{current_time}"

checkpoints_dir = f"checkpoints/{experiment_name}"
os.makedirs(checkpoints_dir, exist_ok=True)


writer = SummaryWriter(f"runs/{experiment_name}")
# ==== 参数设定 ====
num_epochs = 100
batch_size = 32
num_workers = 2
learning_rate = 1e-3

patience = 30
best_val_loss = np.inf
epochs_without_improvement = 0 # 记录val_loss已经多少个epoch没有下降了

设备选择

In [15]:
# ==== 设备选择 ====
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

if is_autocast_available(str(device)):
    print("Autocast available")
else:
    print("Autocast not available")

Using device: cuda
Autocast available


数据

In [16]:
# ==== 创建dataset 和 dataloader ====
dataset = ClassDirectoryDataset("../data/hymenoptera_data/train", ["jpg"])
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_set, val_set = random_split(dataset, [train_size, val_size],
                                  generator=torch.Generator().manual_seed(42)
)


train_loader = DataLoader(train_set,
                          batch_size=batch_size,
                          num_workers=num_workers,
                          shuffle=True,
                          pin_memory=True  # 如果使用GPU，启用此选项加速数据传输
)
val_loader = DataLoader(val_set,
                        batch_size=batch_size,
                        num_workers=num_workers,
                        shuffle=True,
                        pin_memory=True
)

In [17]:
img0, label0 = dataset[5]
print("img0.shape:", img0.shape)
print("label0_id:", label0)
print("label0:", dataset.classes[label0])

img0.shape: torch.Size([3, 224, 224])
label0_id: 0
label0: ants


模型实例、损失函数、Optimizer

In [18]:
# ==== 模型实例 ====
model = AntBeeClassifier(dropout_rate=0.2).to(device) # 1. build a model
# ==== 损失函数 & 优化器 ====
criterion = torch.nn.CrossEntropyLoss() # 2. define the loss
optimizer = Adam(model.parameters(), lr=learning_rate , weight_decay=1e-4) # 3. do the optimize work
# 添加权重衰减防止过拟合
scheduler = ReduceLROnPlateau(optimizer)
scaler = GradScaler()  # 用于缩放 loss，防止下溢

Training + Validation loop

In [19]:
# ==== Training + Validation loop ====
for epoch in range(num_epochs):
    # --- Training ---
    model.train()
    train_loss = 0.0 # 1个 epoch 中 所有 batch 的损失之和
    correct, total = 0, 0

    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device, non_blocking=True), labels.to(device, non_blocking=True)
        optimizer.zero_grad()

        # Enables autocasting for the forward pass (model + loss)
        with autocast(device_type=str(device)):
            outputs = model(inputs)
            loss = criterion(outputs, labels) # 计算 交叉熵loss

        # Scales the loss, and calls backward() to create scaled gradients
        scaler.scale(loss).backward()

        # scaler.step() first unscales the gradients of the optimizer's assigned params.
        # If these gradients do not contain infs or NaNs, optimizer.step() is then called,
        # otherwise, optimizer.step() is skipped.
        scaler.step(optimizer)

        # Updates the scale for next iteration
        scaler.update()

        train_loss += loss.item()
        predicted = outputs.argmax(1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    avg_train_loss = train_loss / len(train_loader) # 把总损失除以 batch 数，就是平均每个 batch 的损失
    train_acc = correct / total

    # --- Validation ---
    model.eval()
    val_loss = 0.0
    correct, total = 0, 0

    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs, labels = inputs.to(device, non_blocking=True), labels.to(device, non_blocking=True)

            outputs = model(inputs)
            loss = criterion(outputs, labels)

            val_loss += loss.item()
            predicted = outputs.argmax(1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    avg_val_loss = val_loss / len(val_loader)
    val_acc = correct / total

    scheduler.step(avg_val_loss)

    # --- 模型保存 ---
    # 保存最新模型
    checkpoint = {
        "epoch": epoch,
        "model_state_dict": model.state_dict(),
        "optimizer_state_dict": optimizer.state_dict(),
        "train_loss": avg_train_loss,
        "val_loss": avg_val_loss,
        "train_acc": train_acc,
        "val_acc": val_acc,
    }
    torch.save(checkpoint, os.path.join(checkpoints_dir, "latest_model.pth"))

    # 保存最佳模型（基于验证损失）
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        torch.save(checkpoint, os.path.join(checkpoints_dir, "best_model.pth"))
        epochs_without_improvement = 0
        print(f"New best model saved with val_loss: {avg_val_loss:.4f}")
    else:
        epochs_without_improvement += 1

    # --- Log ---
    print(f"Epoch [{epoch+1}/{num_epochs}] "
          f"Train Loss: {avg_train_loss:.4f}, Train Acc: {train_acc:.4f} | "
          f"Val Loss: {avg_val_loss:.4f}, Val Acc: {val_acc:.4f}")
    writer.add_scalar('Loss/train', avg_train_loss, epoch)
    writer.add_scalar('Loss/val', avg_val_loss, epoch)
    writer.add_scalar('Accuracy/train', train_acc, epoch)
    writer.add_scalar('Accuracy/val', val_acc, epoch)

    # --- 早停检查 ---
    if epochs_without_improvement >= patience:
        print(f"Early stopping triggered after {epoch+1} epochs!")
        break

writer.close()

New best model saved with val_loss: 0.6898
Epoch [1/100] Train Loss: 0.6852, Train Acc: 0.5722 | Val Loss: 0.6898, Val Acc: 0.5510
Epoch [2/100] Train Loss: 0.6262, Train Acc: 0.7010 | Val Loss: 0.7057, Val Acc: 0.4694
Epoch [3/100] Train Loss: 0.6703, Train Acc: 0.6186 | Val Loss: 0.6960, Val Acc: 0.5510
New best model saved with val_loss: 0.6506
Epoch [4/100] Train Loss: 0.6133, Train Acc: 0.6340 | Val Loss: 0.6506, Val Acc: 0.6735
New best model saved with val_loss: 0.5663
Epoch [5/100] Train Loss: 0.6280, Train Acc: 0.6856 | Val Loss: 0.5663, Val Acc: 0.6939
Epoch [6/100] Train Loss: 0.5532, Train Acc: 0.7371 | Val Loss: 0.6902, Val Acc: 0.5510
Epoch [7/100] Train Loss: 0.5435, Train Acc: 0.7062 | Val Loss: 0.6354, Val Acc: 0.5918
Epoch [8/100] Train Loss: 0.5058, Train Acc: 0.7165 | Val Loss: 0.6916, Val Acc: 0.6327
Epoch [9/100] Train Loss: 0.4515, Train Acc: 0.7371 | Val Loss: 0.5805, Val Acc: 0.6939
Epoch [10/100] Train Loss: 0.4762, Train Acc: 0.7577 | Val Loss: 0.7563, Val Ac

In [20]:
# 训练完成后加载最佳模型
best_checkpoint = torch.load(os.path.join(checkpoints_dir, "best_model.pth"), weights_only=True)
best_model = AntBeeClassifier()
best_model.load_state_dict(best_checkpoint["model_state_dict"])

<All keys matched successfully>