In [19]:
import os
import torch
import random
import pprint
import numpy as np
from pathlib import Path
from tqdm import tqdm
import pandas as pd
from collections import defaultdict
import sys

from torch import nn, optim
from torch.cuda import amp
from torch.utils.data import DataLoader

from dataloader import Dataset, AugmentTransform, BasicTransform
from model import YoloModel
from utils import (
    YoloLoss, Evaluator, ModelEMA,
    generate_random_color, set_lr, de_parallel,
    resume_state
)
from val import validate, result_analyis


SEED = 2025
random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x2105f8d5210>

In [None]:
class Args:
    exp = "debug170multi"  # 实验名称，会生成 experiment/debug/ 文件夹
    data = Path("data/voc.yaml")  # 数据配置文件路径
    model_type = "base"  # YOLOv3 模型类型
    img_size = 416  # 输入图像大小
    batch_size = 8  # 批大小
    num_epochs = 170  # 训练轮数
    warmup = 1  # warmup epoch 数
    base_lr = 0.001  # 初始学习率
    lr_decay = [200]  # 指定在哪些 epoch 衰减学习率
    momentum = 0.9  # SGD 动量
    weight_decay = 5e-4  # 权重衰减
    conf_thres = 0.01  # 置信度阈值（用于推理时过滤预测框）
    nms_thres = 0.6  # NMS 阈值
    img_interval = 10  # 每隔几轮保存一次图像 初始：5
    workers = 4  # dataloader 多线程
    multiscale = True  # 是否启用多尺度训练
    no_amp = False  # 是否禁用混合精度
    scratch = True  # 是否从头训练（不加载预训练权重）
    resume = True  # 是否从上次 checkpoint 恢复
    rank = 0  # 单卡训练设为 0
    world_size = 1  # 单卡训练设为 1

args = Args()

# 设置路径
args.exp_path = Path("experiment") / args.exp
args.weight_dir = args.exp_path / "weight"
args.img_log_dir = args.exp_path / "train-image"
args.load_path = args.weight_dir / "last.pt" if args.resume else None

# 创建输出目录
os.makedirs(args.weight_dir, exist_ok=True)
os.makedirs(args.img_log_dir, exist_ok=True)


In [None]:
def train_yolov3_notebook(args):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    loss_type = ["multipart", "obj", "noobj", "txty", "twth", "cls"]
    losses = defaultdict(float)
    

    # 加载训练和验证集
    train_dataset = Dataset(yaml_path=args.data, phase="train")
    val_dataset = Dataset(yaml_path=args.data, phase="val")

    # 记录 class 和 anchor 信息
    args.anchors = train_dataset.anchors
    args.class_list = train_dataset.class_list
    args.color_list = generate_random_color(len(args.class_list))
    args.mAP_filepath = val_dataset.mAP_filepath

    # 设置数据增强器
    args.train_size = 608 if args.multiscale else args.img_size
    train_dataset.load_transformer(AugmentTransform(input_size=args.train_size))
    train_loader = DataLoader(train_dataset, collate_fn=Dataset.collate_fn,
                              batch_size=args.batch_size, shuffle=True,
                              num_workers=args.workers, pin_memory=True)

    val_dataset.load_transformer(BasicTransform(input_size=args.img_size))
    val_loader = DataLoader(val_dataset, collate_fn=Dataset.collate_fn,
                            batch_size=args.batch_size, shuffle=False,
                            num_workers=args.workers, pin_memory=True)

    # 计算 warmup 阶段和梯度累计参数
    args.nw = max(round(args.warmup * len(train_loader)), 100)
    args.nominal_batch_size = 64
    args.grad_accumulate = max(round(args.nominal_batch_size / args.batch_size), 1)
    args.last_opt_step = -1

    # 初始化模型、损失函数、优化器等
    model = YoloModel(args.img_size, len(args.class_list), args.anchors,
                      args.model_type, pretrained=not args.scratch).to(device)
    model.set_grid_xy(input_size=args.train_size)

    criterion = YoloLoss(args.train_size, len(args.class_list), anchors=model.anchors)
    optimizer = optim.SGD(model.parameters(), lr=args.base_lr,
                          momentum=args.momentum, weight_decay=args.weight_decay)
    scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=args.lr_decay, gamma=0.1)
    scaler = amp.GradScaler(enabled=not args.no_amp)
    ema = ModelEMA(model)
    evaluator = Evaluator(annotation_file=args.mAP_filepath)
    

    # 恢复训练
    start_epoch = 1
    if args.resume:
        start_epoch = resume_state(args.load_path, args.rank, model, ema, optimizer, scheduler, scaler)

    # ---------- 开始训练循环 ----------
    for epoch in range(start_epoch, args.num_epochs + 1):
        model.train()
        loop = tqdm(train_loader, desc=f"[Train] Epoch {epoch}/{args.num_epochs}", leave=False)
        
        # epoch_loss = defaultdict(float)

        for i, minibatch in enumerate(loop):
            ni = i + len(train_loader) * (epoch - 1)

            # 动态调整学习率 & 梯度累计
            if ni <= args.nw:
                args.grad_accumulate = max(1, np.interp(ni, [0, args.nw],
                                                        [1, args.nominal_batch_size / args.batch_size]).round())
                set_lr(optimizer, args.base_lr * pow(ni / args.nw, 4))

            images, labels = minibatch[1].to(device, non_blocking=True), minibatch[2]

            # 多尺度训练（随机 resize）
            if args.multiscale:
                if ni % 10 == 0 and ni > 0:
                    args.train_size = random.randint(10, 19) * 32
                    model.module.set_grid_xy(input_size=args.train_size) if hasattr(model, "module") else model.set_grid_xy(input_size=args.train_size)
                    criterion.set_grid_xy(input_size=args.train_size)
                images = nn.functional.interpolate(images, size=args.train_size, mode="bilinear")

            # 混合精度前向 & 计算 loss
            with amp.autocast(enabled=not args.no_amp):
                predictions = model(images)
                loss = criterion(predictions, labels)

            # 反向传播（支持梯度累计）
            scaler.scale((loss[0] / args.grad_accumulate)).backward()

            # 执行优化器 step
            if ni - args.last_opt_step >= args.grad_accumulate:
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
                if ema is not None:
                    ema.update(model)
                args.last_opt_step = ni
                
            for loss_name, loss_value in zip(loss_type, loss):
                if not torch.isfinite(loss_value) and loss_name != "multipart":
                    print(f"############## {loss_name} Loss is Nan/Inf ! {loss_value} ##############")
                    sys.exit(0)
                else:
                    losses[loss_name] += loss_value.item()

        scheduler.step()
        print(f"Epoch {epoch} completed")

        # ---------- 验证 & 评估 ----------
        if epoch % 10 == 0:
            model.eval()
            mAP_dict, eval_text = validate(args=args, dataloader=val_loader, model=ema.module, evaluator=evaluator, epoch=epoch)
            if mAP_dict:
                print(eval_text)
        
        
        # ---------- 保存模型 ----------
        ckpt = {
            "running_epoch": epoch,
            "model_type": args.model_type,
            "class_list": args.class_list,
            "anchors": args.anchors,
            "model_state": de_parallel(model).state_dict(),
            "ema_state": ema.module.state_dict(),
            "optimizer_state": optimizer.state_dict(),
            "scheduler_state": scheduler.state_dict(),
            "scaler_state_dict": scaler.state_dict(),
        }
        torch.save(ckpt, args.weight_dir / "last.pt")

        # ---------- 可视化 & 分析 ----------
        if epoch % 10 == 0:
            if mAP_dict:
                result_analyis(args=args, mAP_dict=mAP_dict["all"])
                
    loss_str = f"[Train-Epoch:{epoch:03d}] "
    for loss_name in loss_type:
        losses[loss_name] /= len(loop)
        loss_str += f"{loss_name}: {losses[loss_name]:.4f}  "
    return loss_str
                

In [23]:
train_yolov3_notebook(args)

                                                                          

Epoch 161 completed


                                                                          

Epoch 162 completed


                                                                          

Epoch 163 completed


                                                                          

Epoch 164 completed


                                                                          

Epoch 165 completed


                                                                          

Epoch 166 completed


                                                                          

Epoch 167 completed


                                                                          

Epoch 168 completed


                                                                          

Epoch 169 completed


                                                                          

Epoch 170 completed

	 - Average Precision (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.055
	 - Average Precision (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.210
	 - Average Precision (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.008
	 - Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.000
	 - Average Precision (AP) @[ IoU=0.50      | area= small | maxDets=100 ] = 0.000
	 - Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.000
	 - Average Precision (AP) @[ IoU=0.50      | area=medium | maxDets=100 ] = 0.000
	 - Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.074
	 - Average Precision (AP) @[ IoU=0.50      | area= large | maxDets=100 ] = 0.282



NameError: name 'dataloader' is not defined