In [1]:
import torch
import gc
import sys
import subprocess
from pathlib import Path
import datetime
# Configure TensorBoard: import SummaryWriter, and if the import fails with
# ModuleNotFoundError, pip-install `tensorboard` with the running interpreter and retry.
try:
    from torch.utils.tensorboard import SummaryWriter
except ModuleNotFoundError:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "tensorboard"])  # install tensorboard
    from torch.utils.tensorboard import SummaryWriter

# Module-level state holding the active writer and its log directory.
# Both are None until setup_tensorboard() assigns them.
writer = None
TB_LOG_DIR = None

def setup_tensorboard(phase_name: str):
    """Create a fresh TensorBoard writer for one training phase.

    The log directory is ``runs/tensorboard/<phase_name>/<YYYYmmdd_HHMMSS>`` and is
    created if missing. The new writer and directory are stored in the module-level
    ``writer`` and ``TB_LOG_DIR`` variables.

    Args:
        phase_name: Label of the training phase; used as a sub-directory name.
    """
    global writer, TB_LOG_DIR

    # A writer can still be open if an earlier run aborted before its
    # on_train_end callback ran; close it so it is not leaked when replaced.
    previous = globals().get("writer")
    if previous is not None:
        try:
            previous.close()
        except Exception as exc:  # best effort: a stale writer must not block a new run
            print(f"Warning: could not close previous TensorBoard writer: {exc}")

    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    TB_LOG_DIR = Path(f"runs/tensorboard/{phase_name}/{timestamp}")
    TB_LOG_DIR.mkdir(parents=True, exist_ok=True)
    writer = SummaryWriter(log_dir=str(TB_LOG_DIR))
    print(f"TensorBoard for '{phase_name}' phase initialized. Logs will be saved to: {TB_LOG_DIR}")


def _safe_scalar_log(prefix: str, data: dict, step: int):
    """Write every numeric entry of ``data`` to TensorBoard as ``<prefix>/<key>``.

    Does nothing when ``data`` is not a dict or no writer is active. Entries that
    are not numbers (and cannot be unwrapped to one) are skipped silently.

    Args:
        prefix: Tag namespace placed before each key.
        data: Mapping of metric name to value.
        step: Global step passed to ``add_scalar``.
    """
    if not isinstance(data, dict) or writer is None:
        return
    for key, value in data.items():
        if not isinstance(value, (int, float)):
            # NumPy scalars and single-element tensors are not int/float
            # instances but expose .item() to obtain the Python number.
            unwrap = getattr(value, "item", None)
            if not callable(unwrap):
                continue
            try:
                value = unwrap()
            except (TypeError, ValueError, RuntimeError):
                # e.g. multi-element arrays/tensors cannot be converted to a scalar
                continue
            if not isinstance(value, (int, float)):
                continue
        writer.add_scalar(f"{prefix}/{key}", value, step)


def tb_on_fit_epoch_end(trainer):
    """Log metrics, losses, learning rate and GPU memory at the end of each epoch.

    Intended as an ``on_fit_epoch_end`` callback. Returns immediately when no
    writer is active or the trainer has no ``epoch`` attribute. Every logging
    step is best effort: a failure prints a warning and the callback continues.

    Args:
        trainer: Trainer object; the attributes ``epoch``, ``metrics``, ``loss``,
            ``tloss``, ``label_loss_items`` and ``optimizer`` are read if present.
    """
    if writer is None:
        return
    epoch = getattr(trainer, "epoch", None)
    if epoch is None:
        return

    # Training / validation metrics
    metrics = getattr(trainer, "metrics", None) or {}
    _safe_scalar_log("metrics", metrics, epoch)

    # Training losses. Some versions expose a dict on `trainer.loss`; otherwise
    # build a labelled dict from the running loss items when the trainer
    # provides `label_loss_items` and `tloss`.
    losses = getattr(trainer, "loss", None)
    if not isinstance(losses, dict):
        label_fn = getattr(trainer, "label_loss_items", None)
        running_items = getattr(trainer, "tloss", None)
        if callable(label_fn) and running_items is not None:
            try:
                losses = label_fn(running_items, prefix="train")
            except Exception as exc:
                print(f"Warning: TensorBoard could not label training losses: {exc}")
                losses = None
    if isinstance(losses, dict):
        _safe_scalar_log("loss", losses, epoch)

    # Learning rate of the first parameter group
    try:
        optimizer = getattr(trainer, "optimizer", None)
        if optimizer:
            lr = optimizer.param_groups[0].get("lr", None)
            if isinstance(lr, (int, float)):
                writer.add_scalar("opt/lr", lr, epoch)
    except Exception as exc:
        print(f"Warning: TensorBoard could not log learning rate: {exc}")

    # GPU memory currently allocated (if CUDA is available)
    try:
        if torch.cuda.is_available():
            mem = torch.cuda.memory_allocated() / (1024 ** 2)  # MB
            writer.add_scalar("gpu/memory_allocated_mb", mem, epoch)
    except Exception as exc:
        print(f"Warning: TensorBoard could not log GPU memory: {exc}")

    writer.flush()


def tb_on_train_end(trainer):
    """Close the active TensorBoard writer and release memory when training ends.

    Intended as an ``on_train_end`` callback. The global ``writer`` is reset to
    None and memory cleanup runs even if closing the writer raises; the error
    from ``close()`` still propagates to the caller.

    Args:
        trainer: Trainer object passed by the callback system (unused).
    """
    global writer
    try:
        if writer:
            try:
                writer.close()
                print(f"TensorBoard logs saved to: {TB_LOG_DIR}")
            finally:
                # Reset even on failure so a broken writer is never reused.
                writer = None
    finally:
        # Free memory after training
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

# Confirmation message printed once this cell has defined the helpers above.
print("TensorBoard callback functions defined and setup_tensorboard function is ready.")

TensorBoard callback functions defined and setup_tensorboard function is ready.


In [2]:
# train_with_dbl_corrected.py
from ultralytics.models.yolo.detect.train import DetectionTrainer # <<< MODIFICATION: 导入基础训练器
from dbl_loss import DBL_TSD # 确保您的 dbl_loss_tsd.py 在项目目录中
import torch

# === Step 1: 定义一个使用DBL损失的自定义训练器 ===
class DBLDetectionTrainer(DetectionTrainer):
    """DetectionTrainer subclass that carries a DBL_TSD loss next to the stock criterion.

    NOTE(review): ``get_loss`` relies on ``self.criterion`` and on being called by the
    training loop; neither is visible in this file — confirm that the installed
    Ultralytics version provides both, otherwise the DBL loss is never applied.
    """

    def __init__(self, *args, **kwargs):
        """Forward all arguments to DetectionTrainer, then build the DBL_TSD loss."""
        super().__init__(*args, **kwargs)
        dbl_settings = dict(cls_w=1.0, box_w=5.0, iou_w=2.0, scale_alpha=1.0, trunc_thresh=0.4)
        self.custom_loss = DBL_TSD(**dbl_settings)
        print("✅ DBLDetectionTrainer initialized with custom DBL_TSD loss function.")

    def get_loss(self, batch, preds):
        """Return ``(loss, loss_items)``, using DBL_TSD when assigned targets exist.

        The stock criterion is always evaluated first. If it does not expose
        ``assigned_targets``, its result is returned unchanged. Otherwise the
        DBL_TSD total loss is returned together with loss items whose first two
        entries are the DBL box and cls losses, followed by the stock items
        from index 2 onward.
        """
        base_loss, base_items = self.criterion(preds, batch)
        if not hasattr(self.criterion, 'assigned_targets'):
            return base_loss, base_items

        dbl_total, dbl_info = self.custom_loss(preds, self.criterion.assigned_targets)
        leading = [dbl_info[key].unsqueeze(0) for key in ('box_loss', 'cls_loss')]
        return dbl_total, torch.cat((*leading, base_items[2:]))

此处用于聚合所有模块进行训练

In [4]:
# === FINAL Monkey Patch (v4) ===
from ultralytics.nn import tasks as nn_tasks
from ultralytics.nn.modules import *
from copy import deepcopy
from torch import nn as nn
from modules.custom_modules import LCFANeck, ECA, CSAB_Offset
from ultralytics.utils import LOGGER

def patched_parse_model(d, ch, verbose=True):
    """
    Build the layer stack described by a model dict, with support for the custom modules.

    Replacement for ``ultralytics.nn.tasks.parse_model``. LCFANeck, ECA and CSAB_Offset
    are resolved from a local table; every other module name is looked up in this
    module's globals.

    Args:
        d: Model dict with "backbone" and "head" lists of ``[from, repeats, module, args]``
            rows and an "nc" entry (passed to Detect).
        ch: Number of input channels of the first layer.
        verbose: Accepted for signature compatibility; the layer table is always logged.

    Returns:
        Tuple ``(nn.Sequential of all layers, sorted list of layer indices to save)``.

    Raises:
        KeyError: If a row names a module that is neither custom nor found in globals.
    """
    # A real newline here; the previous "\\n" printed a literal backslash-n in the log.
    LOGGER.info(f"\n{'':>3}{'from':>20}{'n':>3}{'module':>25}{'arguments':>30}")

    custom_modules = {'LCFANeck': LCFANeck, 'ECA': ECA, 'CSAB_Offset': CSAB_Offset}

    ch = [ch]
    layers, save, c2 = [], [], ch[-1]

    for i, (f, n, m, args) in enumerate(d["backbone"] + d["head"]):
        if m in custom_modules:
            m_cls = custom_modules[m]
        else:
            try:
                m_cls = globals()[m]
            except KeyError:
                raise KeyError(
                    f"Unknown module '{m}' at layer {i}: not a custom module "
                    f"({', '.join(custom_modules)}) and not imported into this namespace"
                ) from None

        # Assemble constructor arguments from the input channel count(s).
        if isinstance(f, int):
            c1 = ch[f]
            if m == 'Detect':
                # Detect takes a sequence of input channels, so wrap the single value.
                module_args = [d['nc'], [c1]]
            else:
                module_args = [c1, *args]
        else:
            c1 = [ch[x] for x in f]
            if m == 'Concat':
                module_args = [c1]
            elif m == 'LCFANeck':
                module_args = args
            elif m == 'Detect':
                module_args = [d['nc'], c1]
            else:
                module_args = [c1, *args]

        m_ = m_cls(*module_args) if n == 1 else nn.Sequential(*(m_cls(*module_args) for _ in range(n)))

        # Readable type name taken from the "<class '...'>" repr of the class.
        t = str(m_cls)[8:-2].replace('__main__.', '')

        # Output channel count recorded for the layers that follow.
        if m == 'Concat':
            c2 = sum(c1)
        elif m == 'Detect':
            c2 = []
        elif m == 'LCFANeck':
            c2 = m_.c_out
        elif m == 'ECA':
            # ECA is treated as channel-preserving: output channels equal input channels.
            c2 = c1
        else:
            # Standard modules: prefer the module's own c2, else the first YAML argument.
            c2 = m_.c2 if hasattr(m_, 'c2') else args[0]

        m_.i, m_.f, m_.type, m_.np = i, f, t, sum(p.numel() for p in m_.parameters())
        LOGGER.info(f'{i:>3}{str(f):>20}{n:>3}{m_.type:>25}{str(args):>30}')
        save.extend(x % i for x in ([f] if isinstance(f, int) else f) if x != -1)
        layers.append(m_)

        # After the first layer, drop the image channel entry so ch[k] is layer k's output.
        if i == 0:
            ch = []
        ch.append(c2)

    return nn.Sequential(*layers), sorted(save)


# Apply the patch: rebind `parse_model` on the ultralytics.nn.tasks module
# to the patched implementation defined above.
nn_tasks.parse_model = patched_parse_model
print("✅ Final, correct monkey patch (v4) has been applied. Handles ECA correctly.")

✅ Final, correct monkey patch (v4) has been applied. Handles ECA correctly.


In [None]:
# master_experiment_runner_v2.py (FIXED)
from ultralytics import YOLO
import torch
import sys
import os
from ultralytics.nn import tasks
from modules.custom_modules import LCFANeck, ECA, CSAB_Offset

# Register custom modules with ultralytics by attaching them as attributes
# of the ultralytics.nn.tasks module.
tasks.LCFANeck = LCFANeck
tasks.ECA = ECA
# NOTE(review): registered under the name `CSAB`, while patched_parse_model's lookup
# table uses the key 'CSAB_Offset' — confirm which name the YAML files reference.
tasks.CSAB = CSAB_Offset


def run_experiment(model_yaml: str, experiment_name: str, use_dbl_loss: bool,
                   resume: bool = False, pretrained_weights: str = "yolo11n_final4.pt"):
    """
    Run one full experiment: build the model, load weights, train, and save the result.

    Args:
        model_yaml: Path to the model YAML used to build the network.
        experiment_name: Base name for the TensorBoard phase, the run directory
            and the final weights file (``weights/<experiment_name>_final.pt``).
        use_dbl_loss: If True, train with DBLDetectionTrainer; otherwise pass
            ``trainer=None`` to ``model.train``.
        resume: Passed to ``model.train(resume=...)``. Defaults to False so each
            experiment starts its own run; pass True explicitly to continue an
            interrupted one. (This was previously hard-coded to True as a
            temporary workaround, which affected every experiment.)
        pretrained_weights: Weights file handed to ``model.load`` before training.
    """
    # Make the custom modules directory importable for the duration of the run.
    custom_module_path = os.path.abspath('modules')
    added_to_path = custom_module_path not in sys.path
    if added_to_path:
        sys.path.insert(0, custom_module_path)

    try:
        print(f"\n{'='*25} STARTING EXPERIMENT: {experiment_name} {'='*25}")

        # --- Build model and transfer weights ---
        print(f"--- Building custom model from '{model_yaml}' ---")
        model = YOLO(model_yaml)

        # Report the file that is actually loaded.
        print(f"--- Transferring pretrained weights from '{pretrained_weights}' ---")
        model.load(pretrained_weights)

        # === Setup Trainer and Callbacks ===
        trainer_class = DBLDetectionTrainer if use_dbl_loss else None
        try:
            # setup_tensorboard sits inside the guard too: it is defined in the same
            # cell as the callbacks and is equally missing if that cell was not run.
            setup_tensorboard(experiment_name)
            model.add_callback("on_fit_epoch_end", tb_on_fit_epoch_end)
            model.add_callback("on_train_end", tb_on_train_end)
        except NameError:
            print("TensorBoard callbacks not found.")
        _device = 0 if torch.cuda.is_available() else 'cpu'

        # Both strategies share the same hyper-parameters; only the banner and
        # the run-name suffix differ.
        if use_dbl_loss:
            print("\n--- Applying STRATEGY B: Global Training for DBL Loss ---")
            exp_name_suffix = "dbl_global"
        else:
            print("\n--- Applying SIMPLIFIED STRATEGY: Unified Global Fine-tuning ---")
            exp_name_suffix = "unified_global"
        epochs = 150
        lr0 = 1e-2
        batch_size = 24  # Reduced to prevent OOM

        # Execute the training
        model.train(
            trainer=trainer_class,
            data="pest24.yaml",
            seed=42,
            epochs=epochs,
            batch=batch_size,
            workers=2,
            lr0=lr0,
            lrf=0.001,
            optimizer="AdamW",
            weight_decay=0.001,
            patience=20,
            name=f"{experiment_name}_{exp_name_suffix}",
            device=_device,
            cache=True,
            amp=True,  # Keep mixed precision enabled, it helps with memory
            resume=resume,
        )

        # === Final Step: Save final model ===
        final_weights_path = f"weights/{experiment_name}_final.pt"
        # Create the target directory first; saving fails if it does not exist.
        os.makedirs(os.path.dirname(final_weights_path), exist_ok=True)
        model.save(final_weights_path)
        print(f"\n{'='*25} EXPERIMENT {experiment_name} COMPLETE {'='*25}")
        print(f"Final model saved to {final_weights_path}")
    finally:
        # Restore sys.path even when training fails, and only undo our own insertion.
        if added_to_path and custom_module_path in sys.path:
            sys.path.remove(custom_module_path)

In [13]:
# --- Standalone validation runs ---
# Experiment 1: Fusion Neck only (architecture change -> strategy A)
run_experiment("yamls/yolo11n-fusion.yaml", "exp_fusion_only", use_dbl_loss=False)

--- Building custom model from 'yamls/yolo11n-fusion.yaml' ---
\n                   from  n                   module                     arguments
  0                  -1  1ultralytics.nn.modules.conv.Conv                    [16, 3, 2]
  1                  -1  1ultralytics.nn.modules.conv.Conv                    [32, 3, 2]
  2                  -1  1ultralytics.nn.modules.block.C3k2             [64, False, 0.25]
  3                  -1  1ultralytics.nn.modules.conv.Conv                    [64, 3, 2]
  4                  -1  1ultralytics.nn.modules.block.C3k2            [128, False, 0.25]
  5                  -1  1ultralytics.nn.modules.conv.Conv                   [128, 3, 2]
  6                  -1  1ultralytics.nn.modules.block.C3k2                   [128, True]
  7                  -1  1ultralytics.nn.modules.conv.Conv                   [256, 3, 2]
  8                  -1  1ultralytics.nn.modules.block.C3k2                   [256, True]
  9                  -1  1ultralytics.nn.modules

In [None]:
# Experiment 2: ECA + Fusion (architecture change -> strategy A)
run_experiment("yamls/yolo11n-eca-fusion.yaml", "exp_eca_fusion", use_dbl_loss=False)   

In [None]:
from ultralytics import YOLO
import os

# Step 1: name of the interrupted run. It follows the pattern used by
# run_experiment, f"{experiment_name}_{exp_name_suffix}"; for experiment 2 that is
# experiment_name="exp_eca_fusion" and exp_name_suffix="unified_global".
experiment_run_name = "exp_eca_fusion_unified_global"

# Step 2: full path to the run's most recent checkpoint file.
checkpoint_path = os.path.join(
    'ultralytics', 'runs', 'detect', experiment_run_name, 'weights', 'last.pt'
)

print(f"正在尝试从以下路径恢复: {checkpoint_path}")

# Step 3: resume only when the checkpoint exists; otherwise print troubleshooting hints.
if not os.path.exists(checkpoint_path):
    print(
        f"❌ 错误: 在 '{checkpoint_path}' 处未找到检查点文件。",
        "请检查以下几点：",
        "1. 原始训练是否已运行足够长的时间以保存第一个检查点？",
        f"2. 实验名称 '{experiment_run_name}' 是否正确？",
        "   (请检查您的 'runs/detect' 文件夹以获取确切的目录名称，它可能带有一个数字后缀，例如 '..._global2')",
        sep="\n",
    )
else:
    print("检查点文件已找到。正在恢复训练...")

    # Step 4: load the model from the last checkpoint
    # (per the original note, this restores structure, weights and optimizer state).
    model = YOLO(checkpoint_path)

    # Step 5: continue training from the checkpoint
    # (per the original note, Ultralytics picks up at the interrupted epoch
    # with the saved hyper-parameters).
    model.train(resume=True)

    print("\n✅ 恢复的训练已完成。")

In [None]:
# Experiment 3: CSAB + Fusion (architecture change -> strategy A)
run_experiment("yamls/yolo11n-csab-fusion.yaml", "exp_csab_fusion", use_dbl_loss=False)

In [None]:
# Experiment 4: DBL + Fusion (architecture + loss change -> strategy B)
# The loss change is more fundamental, so we choose the global strategy.
run_experiment("yamls/yolo11n-fusion.yaml", "exp_dbl_fusion", use_dbl_loss=True)

In [None]:
    
# Experiment 5: ECA + CSAB + Fusion (architecture change -> strategy A)
run_experiment("yamls/yolo11n-ultimate.yaml", "exp_ultimate_no_dbl", use_dbl_loss=False)

In [None]:
# Experiment 6: ECA + CSAB + Fusion + DBL (architecture + loss change -> strategy B)
# Again, the fundamental loss change dictates the strategy.
run_experiment("yamls/yolo11n-ultimate.yaml", "exp_ultimate_with_dbl", use_dbl_loss=True)