# 训练

## 设置随机种子

In [106]:
import random
import torch


# -------------------------------
# 1. 设置随机种子，保证实验可复现
# -------------------------------
def set_seed(seed=42):
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)


set_seed(42)

# 创建保存中间结果的目录
import os

os.makedirs("intermediates", exist_ok=True)

## 参数配置

In [107]:
config = {
    "data_dir": "/data/Workspace/CIC-IoTDataset2023/bin-class",  # 数据集目录
    "num_classes": 2,  # 类别数
    "batch_size": 128,  # 批量大小（根据显存实际情况调整）
    "lr": 1e-4,  # 学习率
    "epochs": 100,  # 总训练轮数
    "model_name": "Conv2Former",
    "device": "cuda" if torch.cuda.is_available() else "cpu"
}

## 数据增强设置

In [108]:
from torchvision import transforms

train_transform = transforms.Compose([
    # transforms.RandomResizedCrop(64, scale=(0.8, 1.0)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(5),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5], std=[0.5])
])

valid_transform = transforms.Compose([
    transforms.Resize(64),
    transforms.CenterCrop(64),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5], std=[0.5])
])

## 数据集与 DataLoader

In [109]:
from MyDataset import CustomDataset
from torch.utils.data import DataLoader

train_dataset = CustomDataset(os.path.join(config["data_dir"], "train"), transform=train_transform)
valid_dataset = CustomDataset(os.path.join(config["data_dir"], "valid"), transform=valid_transform)

train_loader = DataLoader(train_dataset, batch_size=config["batch_size"], shuffle=True, num_workers=8, pin_memory=True)
valid_loader = DataLoader(valid_dataset, batch_size=config["batch_size"], shuffle=False)

## 模型定义

In [110]:
# 这里 dims 与 depths 可根据实验调节，本示例给出较轻量配置
from deepseek import Conv2Former

model = Conv2Former(dims=[96, 192, 384, 768], depths=[3, 3, 9, 3], num_classes=config["num_classes"])
model = model.to(config["device"])

## 损失函数、优化器与学习率调度器

In [111]:
from torch import nn, optim
from torch.cuda.amp import GradScaler

# 这里采用了类别权重，适用于类别不平衡场景
class_weights = torch.tensor([7.0, 1.0])
criterion = nn.CrossEntropyLoss(label_smoothing=0.1, weight=class_weights.to(config["device"]))

optimizer = optim.AdamW(model.parameters(), lr=config["lr"])
scheduler = optim.lr_scheduler.OneCycleLR(optimizer, epochs=config["epochs"], max_lr=1e-3,
                                          total_steps=config["epochs"] * len(train_loader),
                                          pct_start=0.1, anneal_strategy="cos")
# 如果需要可尝试 OneCycleLR 或 Warmup 策略

best_val_acc = 0.0
scaler = GradScaler()  # 用于混合精度训练

## 训练与验证函数

In [112]:
from tqdm.notebook import tqdm
from torch.cuda.amp import autocast

reports = []


def train(epoch_num: int):
    model.train()
    total_loss = 0.0
    train_bar = tqdm(train_loader, desc=f"Train {epoch_num:>3}/{config['epochs']}", leave=True, unit=" batch")
    for images, labels in train_bar:
        images = images.to(config["device"])
        labels = labels.to(config["device"])
        optimizer.zero_grad()
        with autocast():
            outputs = model(images)
            loss = criterion(outputs, labels)
        scaler.scale(loss).backward()
        # 梯度裁剪，防止梯度爆炸
        scaler.unscale_(optimizer)
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)
        scaler.step(optimizer)
        scaler.update()
        total_loss += loss.item() * images.size(0)
        train_bar.set_postfix(loss=loss.item())
    return total_loss / len(train_dataset)


def validate(epoch_num: int):
    model.eval()
    correct = 0
    total = 0
    all_predicts = []
    all_labels = []
    valid_bar = tqdm(valid_loader, desc=f"Valid {epoch_num:>3}/{config['epochs']}", leave=True, unit=" batch")
    with torch.no_grad():
        for images, labels in valid_bar:
            images = images.to(config["device"])
            labels = labels.to(config["device"])
            # outputs = model(images)
            # TTA: 原图 + 水平翻转
            outputs = model(images) + model(torch.flip(images, dims=[-1]))
            _, predicts = torch.max(outputs, 1)
            correct += (predicts == labels).sum().item()
            total += labels.size(0)
            all_predicts.extend(predicts.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

            valid_bar.set_postfix(batch_acc=(predicts == labels).float().mean().item())
        valid_acc = correct / total
        reports.append((all_labels, all_predicts))
    return valid_acc

## 训练循环

In [None]:
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter('runs/conv2former_train')
train_losses = []
valid_accs = []
best_model = None
patience_counter = 0
patience = 10

for epoch in range(config["epochs"]):
    train_loss = train(epoch_num=epoch + 1)
    train_losses.append(train_loss)
    writer.add_scalar('Loss/train', train_loss, epoch)

    valid_acc = validate(epoch_num=epoch + 1)
    valid_accs.append(valid_acc)
    writer.add_scalar('Accuracy/val', valid_acc, epoch)

    scheduler.step()

    # 保存最佳模型
    if valid_acc > best_val_acc:
        best_val_acc = valid_acc
        best_model = model

    print(f"Epoch: {epoch + 1}, Train Loss: {train_loss:.4f}, Val Acc: {valid_acc:.4f}")
    # 检查训练损失是否下降
    if epoch > 0 and train_loss >= train_losses[epoch - 1]:
        patience_counter += 1
    else:
        patience_counter = 0  # 训练损失下降时重置计数器
    if patience_counter >= patience:
        print(f"Stopped at epoch {epoch + 1} due to no improvement in loss.")
        break

writer.close()

Train   1/100:   0%|          | 0/657 [00:00<?, ? batch/s]

Valid   1/100:   0%|          | 0/147 [00:00<?, ? batch/s]

Epoch: 1, Train Loss: 0.0905, Val Acc: 0.9263


Train   2/100:   0%|          | 0/657 [00:00<?, ? batch/s]

Valid   2/100:   0%|          | 0/147 [00:00<?, ? batch/s]

Epoch: 2, Train Loss: 0.0709, Val Acc: 0.9582


Train   3/100:   0%|          | 0/657 [00:00<?, ? batch/s]

Valid   3/100:   0%|          | 0/147 [00:00<?, ? batch/s]

Epoch: 3, Train Loss: 0.0662, Val Acc: 0.9695


Train   4/100:   0%|          | 0/657 [00:00<?, ? batch/s]

## 保存最佳模型

In [None]:
from datetime import datetime

timestamp = datetime.now().strftime("%Y%m%d_%H_%M")
torch.save(best_model.state_dict(), f"intermediates/model_{timestamp}_{best_val_acc * 100:.2f}.pth")

## 保存数据

In [105]:
import pickle

training_data = {'train_losses': train_losses, 'valid_accs': valid_accs, 'reports': reports}
# 获取当前时间，格式为 'YYYYMMDD_HH_MM'
timestamp = datetime.now().strftime("%Y%m%d_%H_%M")
file_name = f"intermediates/training_data_{timestamp}.pkl"
latest_train_data = "intermediates/training_data_latest.pkl"
# 保存数据到文件
with open(file_name, "wb") as f, open(latest_train_data, "wb") as latest_file:
    pickle.dump(training_data, f)
    pickle.dump(training_data, latest_file)

NameError: name 'reports' is not defined

# 后续处理

## 加载并输出训练报告

In [None]:
from sklearn.metrics import classification_report
import pickle

# 加载保存的数据
with open(latest_train_data, "rb") as f:
    training_data = pickle.load(f)

loaded_reports = training_data['reports']

for all_labels, all_predicts in loaded_reports:
    print(classification_report(all_labels, all_predicts, zero_division=0, digits=4))

## 加载数据并绘图

In [104]:
import numpy as np
import pickle
import matplotlib.pyplot as plt


# 创建一个简单的平滑函数
def smooth(data, window_size=5):
    window = np.ones(int(window_size)) / float(window_size)
    return np.convolve(data, window, 'same')


# 加载保存的数据
with open(latest_train_data, "rb") as f:
    training_data = pickle.load(f)

train_losses = training_data['train_losses']
valid_accs = training_data['valid_accs']

# 应用平滑函数
smoothed_train_losses = smooth(train_losses)
smoothed_valid_accs = smooth(valid_accs)

# 绘制平滑后的曲线
plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.plot(smoothed_train_losses, label="Train Loss")
plt.title("Training Loss")
plt.xlabel("Epoch")
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(smoothed_valid_accs, label="Validation Accuracy")
plt.title("Validation Accuracy")
plt.xlabel("Epoch")
plt.legend()

plt.show()


NameError: name 'latest_train_data' is not defined

In [None]:
# train_losses
valid_accs