In [1]:
! pip install git+https://github.com/huggingface/transformers@main

Collecting git+https://github.com/huggingface/transformers@main
  Cloning https://github.com/huggingface/transformers (to revision main) to /tmp/pip-req-build-xc2w9pbl
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-xc2w9pbl
  Resolved https://github.com/huggingface/transformers to commit cf084f5b40e19b5a5f946cee75bead6d4247b071
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [2]:
import transformers
print(transformers.__version__)

4.57.0.dev0


In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Basic building blocks
class Conv(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, padding=None, groups=1):
        super().__init__()
        if padding is None:
            padding = kernel_size // 2
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding, groups=groups, bias=False)
        self.bn = nn.BatchNorm2d(out_channels)
        self.act = nn.SiLU()

    def forward(self, x):
        return self.act(self.bn(self.conv(x)))

class Bottleneck(nn.Module):
    def __init__(self, in_channels, out_channels, shortcut=True, expansion=0.5):
        super().__init__()
        hidden_channels = int(out_channels * expansion)
        self.conv1 = Conv(in_channels, hidden_channels, 1, 1)
        self.conv2 = Conv(hidden_channels, out_channels, 3, 1)
        self.add = shortcut and in_channels == out_channels

    def forward(self, x):
        return x + self.conv2(self.conv1(x)) if self.add else self.conv2(self.conv1(x))

class C3k2(nn.Module):
    def __init__(self, in_channels, out_channels, n=1, shortcut=True, expansion=0.5):
        super().__init__()
        mid_channels = out_channels // 2
        self.conv1 = Conv(in_channels, mid_channels, 1, 1)
        self.conv2 = Conv(in_channels, mid_channels, 1, 1)
        self.res_blocks = nn.Sequential(*[Bottleneck(mid_channels, mid_channels, shortcut, expansion) for _ in range(n)])
        self.conv3 = Conv(2 * mid_channels, out_channels, 1, 1)

    def forward(self, x):
        y1 = self.res_blocks(self.conv1(x))
        y2 = self.conv2(x)
        return self.conv3(torch.cat((y1, y2), dim=1))

class Detect(nn.Module):
    def __init__(self, nc=2, ch=(256, 256, 256), stride=(8, 16, 32), reg_max=0):
        super().__init__()
        self.nc = nc
        self.nl = len(ch)
        self.no = (reg_max * 4 if reg_max > 0 else 4) + 1 + nc
        self.stride = stride
        self.reg_max = reg_max

        self.cv2 = nn.ModuleList()
        self.cv3 = nn.ModuleList()
        for in_channels in ch:
            self.cv2.append(Conv(in_channels, in_channels, 3, 1))
            self.cv3.append(nn.Conv2d(in_channels, self.no, 1, 1))
    def forward(self, x):
        z = []
        for i in range(self.nl):
            xi = self.cv2[i](x[i])   # [B, C, H, W]
            xi = self.cv3[i](xi)     # [B, no, H, W]
            xi = xi.permute(0, 2, 3, 1).contiguous()  # [B, H, W, no]
            z.append(xi)
        return z







In [4]:


from transformers import DINOv3ConvNextModel

class DINOBackbone(nn.Module):
    def __init__(self, model_name="facebook/dinov3-convnext-base-pretrain-lvd1689m"):
        super().__init__()
        self.model = DINOv3ConvNextModel.from_pretrained(
            model_name, 
            output_hidden_states=True  
        )

    def forward(self, x):
        pixel_values = x["pixel_values"] 

        outputs = self.model(pixel_values)  
        hidden_states = outputs.hidden_states 
        features = [hidden_states[i] for i in [2, 3, 4]]  

        return features




        
class YoloDetectionHeadPath1(nn.Module):
    def __init__(self, nc=2, backbone_channels=[384, 768, 1024], neck_out_dim=256):
        super().__init__()
        C3_ch, C4_ch, C5_ch = backbone_channels  # Backbone output channels: 384, 768, 1024
        
        # Channel reduction to standardize dimensions
        self.reduce_p3 = Conv(C3_ch, neck_out_dim, 1, 1)  # 384 -> 256
        self.reduce_p4 = Conv(C4_ch, neck_out_dim, 1, 1)  # 768 -> 256
        self.reduce_p5 = Conv(C5_ch, neck_out_dim, 1, 1)  # 1024 -> 256
        
        # --- Upward Path ---
        self.up_p5 = nn.Upsample(scale_factor=2, mode="nearest")
        self.c3_p4 = C3k2(neck_out_dim * 2, neck_out_dim, shortcut=False)  # 512 -> 256

        self.up_p4 = nn.Upsample(scale_factor=2, mode="nearest")
        self.c3_p3 = C3k2(neck_out_dim * 2, neck_out_dim, shortcut=False)  # 512 -> 256

        # --- Downward Path ---
        self.down_p3 = Conv(neck_out_dim, neck_out_dim, 3, 2)  # 256 -> 256
        self.c3_p4d = C3k2(neck_out_dim * 2, neck_out_dim, shortcut=False)  # 512 -> 256

        self.down_p4 = Conv(neck_out_dim, neck_out_dim, 3, 2)  # 256 -> 256
        self.c3_p5d = C3k2(neck_out_dim * 2, neck_out_dim, shortcut=True)  # 512 -> 256

        # --- Detect Layer ---
        self.detect = Detect(nc=nc, ch=[neck_out_dim, neck_out_dim, neck_out_dim])

    def forward(self, p3, p4, p5):

        p3 = self.reduce_p3(p3)
        p4 = self.reduce_p4(p4)
        p5 = self.reduce_p5(p5)
        
        # --- Upward path ---
        p5_up = self.up_p5(p5)
        p4_cat = torch.cat([p5_up, p4], dim=1)
        p4_out = self.c3_p4(p4_cat)

        p4_up = self.up_p4(p4_out)
        p3_cat = torch.cat([p4_up, p3], dim=1)
        p3_out = self.c3_p3(p3_cat)

        # --- Downward path ---
        p3_down = self.down_p3(p3_out)
        p4_cat2 = torch.cat([p3_down, p4_out], dim=1)
        p4_out2 = self.c3_p4d(p4_cat2)

        p4_down = self.down_p4(p4_out2)
        p5_cat2 = torch.cat([p4_down, p5], dim=1)
        p5_out2 = self.c3_p5d(p5_cat2)

        return self.detect([p3_out, p4_out2, p5_out2])



class Path1Model(nn.Module):
    def __init__(self, num_classes=2):
        super().__init__()
        self.backbone = DINOBackbone()
        self.head = YoloDetectionHeadPath1(
            nc=num_classes,
            backbone_channels=[256, 512, 1024],
            neck_out_dim=256
        )
        self.num_classes = num_classes
        self.args = {
            "box": 7.5,   
            "cls": 0.5,
            "dfl": 1.5,
        }


    def forward(self, x):
        backbone_features = self.backbone(x)
        p3, p4, p5 = backbone_features
        
        detect_output = self.head(p3, p4, p5)  
        return detect_output  # tuple: (bbox_preds, class_logits)




2025-09-11 20:09:56.870277: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1757621397.046356     117 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1757621397.100453     117 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [5]:
import kagglehub
import os

# 🔹 Download the dataset
path = kagglehub.dataset_download("pengbo00/home-fire-dataset")

print("Dataset downloaded to:", path)
print("Files inside:", os.listdir(path))



Dataset downloaded to: /kaggle/input/home-fire-dataset
Files inside: ['val', 'test', 'train']


In [6]:
from torch.utils.data import Dataset
from PIL import Image

class FireDataset(Dataset):
    def __init__(self, root_dir, processor ,img_size = 640):
        self.root_dir = root_dir
        self.processor = processor
        self.img_size = img_size
        
        self.image_dir = os.path.join(root_dir, "images")
        self.label_dir = os.path.join(root_dir, "labels")

        self.image_files = sorted(os.listdir(self.image_dir))

    def __len__(self):
        return len(self.image_files)

    def __getitem__(self,idx):
        img_file  = self.image_files[idx]
        img_path = os.path.join(self.image_dir,img_file)
        image = Image.open(img_path).convert("RGB")

        inputs = self.processor(image)
        pixel_values = inputs["pixel_values"].squeeze(0) 
        label_file = os.path.splitext(img_file)[0] + ".txt"
        label_path = os.path.join(self.label_dir, label_file)

        boxes = []
        classes = []

        if os.path.exists(label_path):
            with open(label_path, "r") as f:
                for line in f.readlines():
                    cls, x, y, w, h = map(float, line.strip().split())
                    classes.append(int(cls))
                    boxes.append([x, y, w, h])

        target = {
            "bboxes": torch.tensor(boxes, dtype=torch.float32), 
            "cls": torch.tensor(classes, dtype=torch.long)    ,
            "gt_groups": torch.tensor([len(boxes)], dtype=torch.long)
        }

        return pixel_values, target

In [7]:
def collate_fn(batch):
    images = [item[0] for item in batch]
    targets = [item[1] for item in batch]

    batch_pixel_values = torch.stack(images)

    bboxes = []
    classes = []
    gt_group_sizes = []  # number of boxes per image

    for t in targets:
        bboxes.append(t["bboxes"])
        classes.append(t["cls"])
        gt_group_sizes.append(len(t["cls"]))

    return batch_pixel_values, {
        "bboxes": torch.cat(bboxes, dim=0) if bboxes else torch.empty((0, 4)),
        "cls": torch.cat(classes, dim=0) if classes else torch.empty((0,), dtype=torch.long),
        "gt_groups": torch.tensor(gt_group_sizes, dtype=torch.long)  # [bs] e.g. [3, 5]
    }


In [9]:
from torchvision import transforms
from PIL import Image
import torch
from transformers import AutoImageProcessor

class Processor:
    def __init__(self, image_size=224):
        # Base transform (like DINOv3 training)
        self.preprocessor = AutoImageProcessor.from_pretrained("facebook/dinov3-convnext-base-pretrain-lvd1689m")
        self.extra_transforms = []

    def add_transform(self, transform_fn):
        self.extra_transforms.append(transform_fn)

    def __call__(self, image):
        for t in self.extra_transforms:
            image = t(image)

        return self.preprocessor(images=image, return_tensors="pt")



In [10]:
from torch.utils.data import DataLoader

processor = Processor()

train_dataset = FireDataset("/kaggle/input/home-fire-dataset/train", processor)
val_dataset   = FireDataset("/kaggle/input/home-fire-dataset/val", processor)

train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)
val_loader   = DataLoader(val_dataset, batch_size=4, shuffle=False, collate_fn=collate_fn)


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

preprocessor_config.json:   0%|          | 0.00/585 [00:00<?, ?B/s]

In [11]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class ContrastiveHeadWrapper(nn.Module):
    def __init__(self, model, proj_dim=256):
        super().__init__()
        self.model = model
        concat_dim = 256+ 512+ 1024

        self.proj = nn.Sequential(
            nn.Linear(concat_dim, 512),  # P3+P4+P5 pooled concat
            nn.ReLU(),
            nn.Linear(512, proj_dim)
        )
        
    def forward(self, x):
        # Backbone features
        backbone_features = self.model.backbone(x)
        # Head features (before detect)
        p3, p4, p5 = backbone_features

        # Global average pooling
        pooled = []
        for fm in [p3, p4, p5]:
            pooled.append(F.adaptive_avg_pool2d(fm, (1, 1)).flatten(1))
        pooled = torch.cat(pooled, dim=1)  # [B, 256*3]

        return self.proj(pooled)  # [B, proj_dim]


In [19]:
class NTXentLoss(nn.Module):
    def __init__(self, temperature=0.2):
        super().__init__()
        self.temperature = temperature

    def forward(self, z1, z2):
        z1 = F.normalize(z1, dim=-1)
        z2 = F.normalize(z2, dim=-1)

        logits = torch.mm(z1, z2.t()) / self.temperature
        labels = torch.arange(z1.size(0), device=z1.device)
        return F.cross_entropy(logits, labels)


In [19]:
device = "cuda" if torch.cuda.is_available() else "cpu"

base_model = Path1Model(num_classes=2).to(device)

# Freeze backbone if you only want to train head
for param in base_model.backbone.parameters():
    param.requires_grad = False  
    
contrastive_model = ContrastiveHeadWrapper(base_model, proj_dim=256).to(device)



optimizer = torch.optim.AdamW(contrastive_model.parameters(), lr=1e-4, weight_decay=1e-4)
criterion = NTXentLoss(temperature=0.2)

epochs = 20
for epoch in range(epochs):
    contrastive_model.train()
    total_loss = 0

    for batch in train_loader:
        images, _ = batch  # FireDataset gives (image, target)
        # Two random augmentations of the same batch
        v1 = torch.stack([img for img in images]).to(device)
        v2 = torch.stack([img for img in images]).to(device)  # here you should apply different augmentations!

        z1 = contrastive_model({"pixel_values": v1})
        z2 = contrastive_model({"pixel_values": v2})

        loss = criterion(z1, z2)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_loader):.4f}")
# Save only the projection head
torch.save(contrastive_model.proj.state_dict(), "contrastive_head.pth")
print("Projection head weights saved!")



Epoch 1/20, Loss: 0.0949
Epoch 2/20, Loss: 0.0526
Epoch 3/20, Loss: 0.0491
Epoch 4/20, Loss: 0.0469
Epoch 5/20, Loss: 0.0440
Epoch 6/20, Loss: 0.0422
Epoch 7/20, Loss: 0.0391
Epoch 8/20, Loss: 0.0398
Epoch 9/20, Loss: 0.0396
Epoch 10/20, Loss: 0.0421
Epoch 11/20, Loss: 0.0374
Epoch 12/20, Loss: 0.0388
Epoch 13/20, Loss: 0.0372
Epoch 14/20, Loss: 0.0376
Epoch 15/20, Loss: 0.0366
Epoch 16/20, Loss: 0.0389
Epoch 17/20, Loss: 0.0328
Epoch 18/20, Loss: 0.0365
Epoch 19/20, Loss: 0.0346
Epoch 20/20, Loss: 0.0350
Projection head weights saved!


In [11]:
import torch
import torch.nn as nn
import torch.nn.functional as F


class DetectionLoss(nn.Module):
    def __init__(self, num_classes, box_weight=10.0, obj_weight=1.0, cls_weight=1.0):
        super().__init__()
        self.num_classes = num_classes
        self.box_weight = box_weight
        self.obj_weight = obj_weight
        self.cls_weight = cls_weight

    def forward(self, preds, targets):
        """
        preds: list of feature map heads [B, C, H, W] with C = 4 + 1 + nc
        targets: list of dicts, length = B
            - boxes: [N, 4] (xywh normalized)
            - labels: [N] class indices
        """
        device = preds[0].device
        B = preds[0].size(0)
        

        flattened = []
        for p in preds:  
            B, H, W, no = p.shape
            p = p.view(B, H * W, no)   
            flattened.append(p)
        
        preds = torch.cat(flattened, dim=1)  
             


        pred_boxes = preds[..., :4]     
        pred_obj   = preds[..., 4]      
        pred_cls   = preds[..., 5:]       

        num_preds = pred_boxes.size(1)

        # init targets
        obj_target = torch.zeros_like(pred_obj)           
        cls_target = torch.zeros_like(pred_cls)           
        box_target = torch.zeros_like(pred_boxes)          
        box_mask   = torch.zeros_like(pred_obj, dtype=torch.bool)

        # loop over batch
        for b, t in enumerate(targets):
            boxes = t["boxes"].to(device)
            labels = t["labels"].to(device)

            if boxes.numel() == 0:
                continue

            # naive assignment: each GT → random prediction slot
            for gt_box, gt_label in zip(boxes, labels):
                idx = torch.randint(0, num_preds, (1,)).item()
                box_target[b, idx] = gt_box
                obj_target[b, idx] = 1.0
                cls_target[b, idx, gt_label] = 1.0
                box_mask[b, idx] = True

        # losses
        loss_box = (
            F.smooth_l1_loss(pred_boxes[box_mask], box_target[box_mask])
            if box_mask.any()
            else torch.tensor(0., device=device)
        )
        loss_obj = F.binary_cross_entropy_with_logits(pred_obj, obj_target)
        loss_cls = F.binary_cross_entropy_with_logits(pred_cls, cls_target)

        total_loss = (
            self.box_weight * loss_box +
            self.obj_weight * loss_obj +
            self.cls_weight * loss_cls
        )

        return total_loss, {
            "box_loss": loss_box.item(),
            "obj_loss": loss_obj.item(),
            "cls_loss": loss_cls.item()
        }


In [12]:
import os
import torch
import torch.nn as nn

device = "cuda" if torch.cuda.is_available() else "cpu"

save_dir = "./checkpoints"
os.makedirs(save_dir, exist_ok=True)

model = Path1Model(num_classes=2).to(device)

# ✅ Freeze backbone if needed
for p in model.backbone.parameters():
    p.requires_grad = False

optimizer = torch.optim.AdamW(
    filter(lambda p: p.requires_grad, model.parameters()),
    lr=1e-3,
    weight_decay=1e-4
)

criterion = DetectionLoss(num_classes=2).to(device)

best_val_loss = float("inf")
train_box_loss = 0.0
train_obj_loss = 0.0
train_cls_loss = 0.0

for epoch in range(50):
    model.train()
    train_loss = 0.0

    for images, targets in train_loader:   # targets is a dict from collate_fn
        images = images.to(device)

        # ✅ Split back into list of dicts for each image
        batch_targets = []
        start = 0
        
        for count in targets["gt_groups"]:
            count = count.item()
            boxes = targets["bboxes"][start:start+count].to(device)
            labels = targets["cls"][start:start+count].to(device)
            start += count

            batch_targets.append({"boxes": boxes, "labels": labels})

        # -------------------- Forward pass --------------------
        preds = model({"pixel_values": images})  # list of detection heads

        # -------------------- Compute loss --------------------
        loss, loss_items = criterion(preds, batch_targets)

        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=10.0)
        optimizer.step()

        train_loss += loss.item()
        train_box_loss += loss_items["box_loss"]
        train_obj_loss += loss_items["obj_loss"]
        train_cls_loss += loss_items["cls_loss"]

    train_loss /= len(train_loader)
    train_box_loss /= len(train_loader)
    train_obj_loss /= len(train_loader)
    train_cls_loss /= len(train_loader)

    # -------------------- Validation --------------------
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for images, targets in val_loader:
            images = images.to(device)

            batch_targets = []
            start = 0
            for count in targets["gt_groups"]:
                count = count.item()
                boxes = targets["bboxes"][start:start+count].to(device)
                labels = targets["cls"][start:start+count].to(device)
                start += count
                batch_targets.append({"boxes": boxes, "labels": labels})


            preds = model({"pixel_values": images})
            loss, _ = criterion(preds, batch_targets)
            val_loss += loss.item()

    val_loss /= len(val_loader)

    # -------------------- Logging --------------------
    print(
        f"Epoch {epoch+1:03d} | "
        f"Train Loss={train_loss:.6f} (box={train_box_loss:.4f}, obj={train_obj_loss:.4f}, cls={train_cls_loss:.4f}) | "
        f"Val Loss={val_loss:.6f} "
    )

    # -------------------- Save best checkpoint --------------------
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), os.path.join(save_dir, "best_model.pth"))
        print(f" ✅ Saved new best model (val_loss={val_loss:.4f})")


config.json:   0%|          | 0.00/449 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/350M [00:00<?, ?B/s]

Epoch 001 | Train Loss=0.578660 (box=0.0482, obj=0.0496, cls=0.0475) | Val Loss=0.404102 
 ✅ Saved new best model (val_loss=0.4041)
Epoch 002 | Train Loss=0.370496 (box=0.0354, obj=0.0109, cls=0.0064) | Val Loss=0.201220 
 ✅ Saved new best model (val_loss=0.2012)
Epoch 003 | Train Loss=0.300115 (box=0.0284, obj=0.0105, cls=0.0059) | Val Loss=0.841364 
Epoch 004 | Train Loss=0.346310 (box=0.0330, obj=0.0105, cls=0.0058) | Val Loss=0.425857 
Epoch 005 | Train Loss=0.343504 (box=0.0328, obj=0.0103, cls=0.0057) | Val Loss=0.350187 
Epoch 006 | Train Loss=0.275559 (box=0.0261, obj=0.0098, cls=0.0055) | Val Loss=0.337532 
Epoch 007 | Train Loss=0.247345 (box=0.0232, obj=0.0099, cls=0.0055) | Val Loss=0.295899 
Epoch 008 | Train Loss=0.208416 (box=0.0194, obj=0.0097, cls=0.0053) | Val Loss=0.171640 
 ✅ Saved new best model (val_loss=0.1716)
Epoch 009 | Train Loss=0.181610 (box=0.0167, obj=0.0095, cls=0.0052) | Val Loss=0.165997 
 ✅ Saved new best model (val_loss=0.1660)
Epoch 010 | Train Loss

In [13]:
model.eval()


Path1Model(
  (backbone): DINOBackbone(
    (model): DINOv3ConvNextModel(
      (stages): ModuleList(
        (0): DINOv3ConvNextStage(
          (downsample_layers): ModuleList(
            (0): Conv2d(3, 128, kernel_size=(4, 4), stride=(4, 4))
            (1): DINOv3ConvNextLayerNorm((128,), eps=1e-06, elementwise_affine=True)
          )
          (layers): ModuleList(
            (0-2): 3 x DINOv3ConvNextLayer(
              (depthwise_conv): Conv2d(128, 128, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3), groups=128)
              (layer_norm): DINOv3ConvNextLayerNorm((128,), eps=1e-06, elementwise_affine=True)
              (pointwise_conv1): Linear(in_features=128, out_features=512, bias=True)
              (activation_fn): GELUActivation()
              (pointwise_conv2): Linear(in_features=512, out_features=128, bias=True)
              (drop_path): Identity()
            )
          )
        )
        (1): DINOv3ConvNextStage(
          (downsample_layers): ModuleList(
  

In [22]:
test_dataset  = FireDataset("/kaggle/input/home-fire-dataset/test", processor)

test_loader = DataLoader(  
    test_dataset,
    batch_size=4,
    shuffle=False,
    collate_fn=collate_fn
)

In [29]:
def collate_fn_raw(batch):
    images, targets = zip(*batch)   # keep as lists
    return list(images), list(targets)


In [17]:
# -------------------- TEST --------------------
model.eval()
test_loss = 0.0
test_box_loss = 0.0
test_obj_loss = 0.0
test_cls_loss = 0.0

all_preds = []
all_targets = []

with torch.no_grad():
    for images, targets in test_loader:
        images = images.to(device)

        # ✅ Same target splitting logic as training/val
        batch_targets = []
        start = 0
        for count in targets["gt_groups"]:
            count = count.item()
            boxes = targets["bboxes"][start:start+count].to(device)
            labels = targets["cls"][start:start+count].to(device)
            start += count
            batch_targets.append({"boxes": boxes, "labels": labels})

        # Forward + loss
        preds = model({"pixel_values": images})
        loss, loss_items = criterion(preds, batch_targets)

        test_loss += loss.item()
        test_box_loss += loss_items["box_loss"]
        test_obj_loss += loss_items["obj_loss"]
        test_cls_loss += loss_items["cls_loss"]

        # save preds + targets for metrics
        all_preds.extend(preds)
        all_targets.extend(batch_targets)

# ✅ Average over test set
test_loss /= len(test_loader)
test_box_loss /= len(test_loader)
test_obj_loss /= len(test_loader)
test_cls_loss /= len(test_loader)

print(
    f"[TEST] Avg loss={test_loss:.4f} "
    f"(box={test_box_loss:.4f}, obj={test_obj_loss:.4f}, cls={test_cls_loss:.4f})"
)


[TEST] Avg loss=0.1115 (box=0.0097, obj=0.0094, cls=0.0052)
