# YOLO 기본 구조 이해
- Backbone: CNN으로 이미지 특징 추출 
- Head: 각 feature map pixel마다 "box/class" 예측

## Backbone : 특징 추출

In [None]:
import torch
import torch.nn as nn

class YOLOBackbone(nn.Module):
    """Convolutional feature extractor of the simplified YOLOv1 model.

    The stack shrinks the spatial resolution by a factor of 64 (two stride-2
    convolutions and four 2x2 max-pools) and ends with 1024 channels, so a
    448x448 RGB image becomes a [batch, 1024, 7, 7] feature map.
    """

    # Layer plan in execution order. "P" is a 2x2 max-pool with stride 2; a
    # tuple is (in_channels, out_channels, kernel_size, stride) for a
    # convolution, which is always followed by a ReLU.
    _PLAN = (
        (3, 64, 7, 2),          # 448 -> 224
        "P",                    # 224 -> 112
        (64, 192, 3, 1),
        "P",                    # 112 -> 56
        (192, 128, 1, 1),
        (128, 256, 3, 1),
        (256, 256, 1, 1),
        (256, 512, 3, 1),
        "P",                    # 56 -> 28
        (512, 1024, 3, 1),
        "P",                    # 28 -> 14
        (1024, 1024, 3, 1),
        (1024, 1024, 3, 2),     # 14 -> 7
    )

    def __init__(self):
        super().__init__()
        stack = []
        for entry in self._PLAN:
            if entry == "P":
                stack.append(nn.MaxPool2d(2, 2))
                continue
            c_in, c_out, kernel, stride = entry
            # kernel // 2 gives padding 3 for 7x7, 1 for 3x3 and 0 for 1x1.
            stack.append(
                nn.Conv2d(c_in, c_out, kernel_size=kernel, stride=stride, padding=kernel // 2)
            )
            stack.append(nn.ReLU())
        self.features = nn.Sequential(*stack)

    def forward(self, x):
        """Run ``x`` through the convolutional stack and return the feature map."""
        return self.features(x)  # [batch, 1024, 7, 7] for a 448x448 input

In [None]:
# Backbone usage example: push one random 448x448 RGB image through the backbone.
input_image = torch.randn(1, 3, 448, 448)

backbone = YOLOBackbone()
feature_map = backbone(input_image)  # reused by the head example cell below
print(feature_map.shape)  # torch.Size([1, 1024, 7, 7])


## Head : 예측 모듈

In [None]:
class YOLOHead(nn.Module):
    """Fully connected prediction head that maps backbone features to a grid.

    Args:
        S: grid size (the output has S x S cells).
        B: number of candidate boxes predicted per cell (5 values each).
        C: number of classes.

    The first linear layer takes 1024 * S * S inputs, i.e. a flattened
    [batch, 1024, S, S] feature map.
    """

    def __init__(self, S=7, B=2, C=20):
        super().__init__()
        self.S, self.B, self.C = S, B, C
        flat_features = 1024 * S * S
        values_per_cell = B * 5 + C
        self.fc = nn.Sequential(
            nn.Flatten(),
            nn.Linear(flat_features, 4096),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(4096, S * S * values_per_cell),
        )

    def forward(self, x):
        """Return predictions reshaped to [batch, S, S, B*5 + C]."""
        grid = self.S
        flat = self.fc(x)
        return flat.view(-1, grid, grid, self.B * 5 + self.C)  # e.g. [batch, 7, 7, 30]


In [None]:
# Head usage example: feed the backbone feature map from the previous cell into the head.
head = YOLOHead(S=7, B=2, C=20)
pred_tensor = head(feature_map)
print(pred_tensor.shape)  # torch.Size([1, 7, 7, 30])


## Post-processing

In [None]:
import numpy as np

def yolo_postprocess(pred_tensor, S=7, B=2, C=20, conf_thresh=0.2, nms_thresh=0.5, img_size=448):
    """Decode one image's YOLOv1 grid prediction into pixel boxes and apply NMS.

    Args:
        pred_tensor: prediction for a single image holding exactly
            S*S*(B*5+C) values, e.g. shape [S, S, B*5+C] or [1, S, S, B*5+C].
            May be a torch tensor or anything ``np.asarray`` accepts.
        S: grid size.
        B: number of boxes per cell; each box is (x, y, w, h, conf).
        C: number of classes (stored after the B boxes in each cell).
        conf_thresh: a box is kept only if conf * best class score exceeds this.
        nms_thresh: IoU threshold passed to ``nms``.
        img_size: side length in pixels used to scale the normalized boxes.

    Returns:
        ndarray with one row [x1, y1, x2, y2, score, class] per kept box, or
        an empty list when no box exceeds ``conf_thresh``.

    Raises:
        ValueError: if ``pred_tensor`` does not hold S*S*(B*5+C) values.
    """
    if hasattr(pred_tensor, "detach"):
        pred_tensor = pred_tensor.detach().cpu().numpy()
    # reshape instead of squeeze(): squeeze would also drop a genuine unit
    # dimension (e.g. S == 1), and reshape rejects inputs whose size does not
    # match S/B/C instead of silently mis-indexing them.
    grid = np.asarray(pred_tensor).reshape(S, S, B * 5 + C)

    boxes = []
    for i in range(S):
        for j in range(S):
            cell = grid[i, j]
            class_probs = cell[B * 5:]
            best_class = np.argmax(class_probs)
            best_class_score = class_probs[best_class]
            for b in range(B):
                x, y, w, h, conf = cell[b * 5:(b + 1) * 5]
                score = conf * best_class_score
                # Reject first, so non-finite geometry on a discarded box can
                # never reach int() and raise.
                if not score > conf_thresh:
                    continue
                # (x, y) are offsets inside cell (row i, column j); (w, h) are
                # fractions of the whole image.
                cx = (j + x) / S * img_size
                cy = (i + y) / S * img_size
                bw = w * img_size
                bh = h * img_size
                boxes.append([
                    int(cx - bw / 2),
                    int(cy - bh / 2),
                    int(cx + bw / 2),
                    int(cy + bh / 2),
                    score,
                    best_class,
                ])
    if not boxes:
        return []
    boxes = np.array(boxes)
    keep = nms(boxes, nms_thresh)
    return boxes[keep]

def nms(boxes, iou_thresh):
    """Greedy non-maximum suppression over rows [x1, y1, x2, y2, score, ...].

    Repeatedly keeps the highest-scoring remaining box and discards every
    other remaining box whose IoU with it is not below ``iou_thresh``.
    Suppression ignores the class column.

    Returns:
        List of row indices into ``boxes`` in descending score order; an
        empty list for empty input.
    """
    if len(boxes) == 0:
        return []
    order = boxes[:, 4].argsort()[::-1]  # best score first
    selected = []
    while order.size:
        top, rest = order[0], order[1:]
        selected.append(top)
        overlap = compute_iou(boxes[top], boxes[rest])
        order = rest[overlap < iou_thresh]
    return selected

def compute_iou(box, boxes):
    """Return the IoU of ``box`` against every row of ``boxes``.

    Both use corner layout: the first four entries are x1, y1, x2, y2.
    A 1e-6 term in the denominator avoids division by zero. Empty ``boxes``
    yields an empty array.
    """
    if len(boxes) == 0:
        return np.array([])
    # Corners of the overlap rectangle.
    left = np.maximum(box[0], boxes[:, 0])
    top = np.maximum(box[1], boxes[:, 1])
    right = np.minimum(box[2], boxes[:, 2])
    bottom = np.minimum(box[3], boxes[:, 3])
    # Negative extents mean no overlap, so clamp them to zero.
    overlap_w = np.maximum(right - left, 0)
    overlap_h = np.maximum(bottom - top, 0)
    inter = overlap_w * overlap_h
    ref_area = (box[2] - box[0]) * (box[3] - box[1])
    other_areas = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
    union = ref_area + other_areas - inter
    return inter / (union + 1e-6)


In [None]:
# 1) Input: one random 448x448 RGB image
input_image = torch.randn(1, 3, 448, 448)

# 2) Backbone
backbone = YOLOBackbone()
feature_map = backbone(input_image)

# 3) Head (B=4 boxes per cell here, so each cell holds 4*5 + 20 = 40 values)
head = YOLOHead(S=7, B=4, C=20)
pred_tensor = head(feature_map)

# 4) Post-processing of the first (only) image; S/B/C must match the head above
boxes = yolo_postprocess(pred_tensor[0], S=7, B=4, C=20, conf_thresh=0.2, nms_thresh=0.5, img_size=448)
print("최종 바운딩 박스:", boxes)


# 모델 학습

## Dataset 설정

In [None]:

import os
import torch
import numpy as np
from tqdm import tqdm

# Object category names used for the VOC labels; list position is the class index.
VOC_CLASSES = [
    "aeroplane",
    "bicycle",
    "bird",
    "boat",
    "bottle",
    "bus",
    "car",
    "cat",
    "chair",
    "cow",
    "diningtable",
    "dog",
    "horse",
    "motorbike",
    "person",
    "pottedplant",
    "sheep",
    "sofa",
    "train",
    "tvmonitor",
]
# Reverse lookup: class name -> class index.
CLASS2IDX = dict(zip(VOC_CLASSES, range(len(VOC_CLASSES))))

def voc_to_yolo_target(objects, img_w, img_h, S=7, B=2, C=20):
    target = np.zeros((S, S, B * 5 + C), dtype=np.float32)
    for obj in objects:
        cls_idx = CLASS2IDX[obj['name']]
        xmin, ymin, xmax, ymax = obj['bbox']
        # x, y, w, h 생성 (norm 값)
        cx = (xmin + xmax) / 2 / img_w
        cy = (ymin + ymax) / 2 / img_h
        bw = (xmax - xmin) / img_w
        bh = (ymax - ymin) / img_h
        # grid 좌표로 변환
        i = int(cy * S)
        j = int(cx * S)
        x_cell = cx * S - j
        y_cell = cy * S - i
        for b in range(B):
            if target[i, j, b*5+4] == 0:
                target[i, j, b*5:b*5+5] = [x_cell, y_cell, bw, bh, 1.0]
                break
        target[i, j, B*5 + cls_idx] = 1.0
    return target   # shape: (S, S, B*5+C)

def get_voc_objects(target):
    """Extract objects and image size from a VOC annotation dict.

    Args:
        target: dict with an 'annotation' entry containing 'size'
            ('width', 'height') and 'object' (one dict or a list of dicts,
            each with 'name' and 'bndbox').

    Returns:
        (objects, img_w, img_h), where ``objects`` is a list of
        {"name": str, "bbox": [xmin, ymin, xmax, ymax]} with integer
        coordinates.
    """
    annotation = target['annotation']
    dims = annotation['size']
    img_w = int(dims['width'])
    img_h = int(dims['height'])

    raw = annotation['object']
    # A single object is stored as a bare dict; normalize it to a list.
    entries = raw if isinstance(raw, list) else [raw]

    corner_keys = ('xmin', 'ymin', 'xmax', 'ymax')
    objects = [
        {
            "name": entry['name'],
            "bbox": [int(entry['bndbox'][key]) for key in corner_keys],
        }
        for entry in entries
    ]
    return objects, img_w, img_h

def preprocess_and_save_voc_yolo(root, save_dir, year='2012', image_set='train', S=7, B=2, C=20, img_size=448):
    """Convert a VOC detection split into cached tensors and YOLO grid labels.

    For every sample, the image is resized to (img_size, img_size), converted
    to a tensor and saved as ``<save_dir>/images/<idx>.pt``. The matching
    grid target from ``voc_to_yolo_target`` is saved as
    ``<save_dir>/labels/<idx>.npy``. The dataset is opened with
    ``download=True``.

    Args:
        root: dataset root passed to ``VOCDetection``.
        save_dir: output directory (created if missing).
        year, image_set: VOC split selection passed to ``VOCDetection``.
        S, B, C: grid size, boxes per cell, number of classes for the labels.
        img_size: side length of the saved square images.
    """
    # Imported locally on purpose: elsewhere in the notebook these names are
    # imported only in a later cell, and `import tqdm` rebinds the global
    # `tqdm` from the function to the module, which would break the call below.
    import torchvision.transforms as T
    from torchvision.datasets import VOCDetection
    from tqdm import tqdm as progress_bar

    os.makedirs(save_dir, exist_ok=True)
    dataset = VOCDetection(root=root, year=year, image_set=image_set, download=True)
    transform = T.Compose([
        T.Resize((img_size, img_size)),
        T.ToTensor()
    ])
    img_save_dir = os.path.join(save_dir, 'images')
    label_save_dir = os.path.join(save_dir, 'labels')
    os.makedirs(img_save_dir, exist_ok=True)
    os.makedirs(label_save_dir, exist_ok=True)

    for idx in progress_bar(range(len(dataset))):
        img, target = dataset[idx]
        img_t = transform(img)  # torch.Tensor (3, img_size, img_size)
        # Labels are computed from the original image size; they are
        # normalized, so the resize above does not affect them.
        objects, img_w, img_h = get_voc_objects(target)
        yolo_target = voc_to_yolo_target(objects, img_w, img_h, S, B, C)
        # Save (image: .pt, label: .npy) under the same zero-padded index.
        torch.save(img_t, os.path.join(img_save_dir, f"{idx:06d}.pt"))
        np.save(os.path.join(label_save_dir, f"{idx:06d}.npy"), yolo_target)


In [None]:
import os 
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision.datasets import VOCDetection
import torchvision.transforms as T
import numpy as np

class CachedVOCYOLODataset(Dataset):
    """Dataset over a cache directory of pre-processed samples.

    Expects ``<cache_dir>/images/<stem>.pt`` (loaded with ``torch.load``) and
    ``<cache_dir>/labels/<stem>.npy`` (loaded with ``np.load``). Samples are
    ordered by the sorted image file stems. S, B and C are only stored on the
    instance.
    """

    def __init__(self, cache_dir, S=7, B=2, C=20):
        self.S, self.B, self.C = S, B, C
        self.img_dir = os.path.join(cache_dir, 'images')
        self.label_dir = os.path.join(cache_dir, 'labels')
        stems = (
            name.replace('.pt', '')
            for name in os.listdir(self.img_dir)
            if name.endswith('.pt')
        )
        self.indices = sorted(stems)

    def __len__(self):
        return len(self.indices)

    def __getitem__(self, idx):
        """Return (image, label tensor) for the idx-th cached sample."""
        stem = self.indices[idx]
        image_path = os.path.join(self.img_dir, f"{stem}.pt")
        label_path = os.path.join(self.label_dir, f"{stem}.npy")
        img = torch.load(image_path)
        label = torch.from_numpy(np.load(label_path))
        return img, label


In [None]:

# Build the cached dataset and a shuffled loader; `loader` is reused by the training cell.
cached_dataset = CachedVOCYOLODataset('/media/otter/hard_otter/VOC', S=7, B=2, C=20)
loader = DataLoader(cached_dataset, batch_size=1, shuffle=True, num_workers=2)

# Smoke test: print the shapes of the first batch only.
for imgs, targets in loader:
    print("이미지 배치 shape:", imgs.shape)
    print("라벨 배치 shape:", targets.shape)
    break


In [None]:
class YOLOv1Simple(nn.Module):
    """Simplified YOLOv1: ``YOLOBackbone`` followed by ``YOLOHead``.

    Args:
        S: grid size passed to the head.
        B: boxes per cell passed to the head.
        C: number of classes passed to the head.
    """

    def __init__(self, S=7, B=2, C=20):
        super().__init__()
        self.backbone = YOLOBackbone()
        self.head = YOLOHead(S=S, B=B, C=C)

    def forward(self, x):
        """Return the head's prediction for the backbone features of ``x``."""
        features = self.backbone(x)
        return self.head(features)

def yolo_loss(pred, target, S=7, B=2, C=20, lambda_coord=5, lambda_noobj=0.5):
    """Sum-of-squares YOLOv1-style loss over a batch of grid predictions.

    Args:
        pred, target: tensors of shape [batch, S, S, B*5 + C]. Each of the B
            box slots holds (x, y, w, h, conf), followed by C class values.
        S: grid size (unused; the shape is taken from the tensors).
        B: number of box slots per cell.
        C: number of classes.
        lambda_coord: weight of the localization term.
        lambda_noobj: weight of the confidence term for empty slots.

    Returns:
        Scalar tensor: localization + confidence + class loss, summed (not
        averaged) over the batch.

    A slot counts as "object" when its target confidence is > 0 and as
    "no object" when it is exactly 0. Predicted box slot b is always compared
    with target slot b. The class term is applied to cells whose first target
    slot holds an object.
    """
    eps = 1e-6
    # View the first B*5 values of every cell as [..., B, 5].
    pred_boxes = pred[..., :B * 5].reshape(*pred.shape[:-1], B, 5)
    true_boxes = target[..., :B * 5].reshape(*target.shape[:-1], B, 5)

    true_conf = true_boxes[..., 4]
    obj = (true_conf > 0).to(pred.dtype)       # [batch, S, S, B]
    noobj = (true_conf == 0).to(pred.dtype)

    # ========== (1) Localization loss (x, y, w, h), object slots only ==========
    xy_err = ((pred_boxes[..., 0:2] - true_boxes[..., 0:2]) ** 2).sum(dim=-1)
    # eps is added AFTER abs() so the sqrt argument is strictly positive.
    # With abs(pred + eps) it can be exactly 0, where the sqrt gradient is
    # infinite and becomes NaN even when multiplied by a zero mask.
    pred_wh = torch.sqrt(torch.abs(pred_boxes[..., 2:4]) + eps)
    true_wh = torch.sqrt(true_boxes[..., 2:4] + eps)
    wh_err = ((pred_wh - true_wh) ** 2).sum(dim=-1)
    loc_loss = lambda_coord * torch.sum(obj * (xy_err + wh_err))

    # ========== (2) Confidence loss ==========
    conf_err = (pred_boxes[..., 4] - true_conf) ** 2
    conf_loss = torch.sum(obj * conf_err) + lambda_noobj * torch.sum(noobj * conf_err)

    # ========== (3) Class loss ==========
    # Class values are stored at [B*5 : B*5 + C] of every cell.
    cls_err = ((target[..., B * 5:B * 5 + C] - pred[..., B * 5:B * 5 + C]) ** 2).sum(dim=-1)
    class_loss = torch.sum(obj[..., 0] * cls_err)

    # ========== (4) Total ==========
    return loc_loss + conf_loss + class_loss

In [None]:
import cv2
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
# Select the non-interactive Agg backend so figures are rendered off-screen
# (plot_loss_curve reads the rendered pixels from the canvas).
matplotlib.use('Agg')


# Average loss per epoch; appended to by the training loop.
loss_history = []

def plot_loss_curve(loss_history):
    """Render the loss history with matplotlib and show it in an OpenCV window.

    Args:
        loss_history: sequence of loss values, plotted in order.

    The figure is rendered off-screen, converted to a BGR image and displayed
    in the window named 'loss_curve'. ``cv2.waitKey(1)`` lets the window
    refresh without blocking.
    """
    fig = plt.figure(figsize=(6, 4))
    try:
        ax = fig.add_subplot(111)
        ax.plot(loss_history, label='Avg Loss')
        ax.set_xlabel("Iteration")
        ax.set_ylabel("Loss")
        ax.set_title("Loss Curve")
        ax.legend()
        fig.canvas.draw()
        saveimg = np.asarray(fig.canvas.buffer_rgba())
        saveimg = saveimg[..., :3]  # drop the alpha channel
        # cvtColor returns a new array, so the image outlives the figure.
        saveimg = cv2.cvtColor(saveimg, cv2.COLOR_RGB2BGR)
    finally:
        # A new figure is created on every call; without closing it,
        # figures accumulate for the whole training run.
        plt.close(fig)
    cv2.imshow('loss_curve', saveimg)
    cv2.waitKey(1)

def show_yolo_cv2(img, pred_tensor, class_names, conf_thresh=0.2, nms_thresh=0.5, img_size=448,
                  winname="YOLO Detection", S=7, B=2, C=20):
    """Draw decoded YOLO detections on an image and show it in an OpenCV window.

    Args:
        img: RGB image, either a [3, H, W] torch tensor or an [H, W, 3]
            array. Values with max <= 1.0 are scaled by 255.
        pred_tensor: grid prediction for this image, decoded by
            ``yolo_postprocess``.
        class_names: sequence mapping class index to a display name.
        conf_thresh, nms_thresh, img_size: forwarded to ``yolo_postprocess``.
        winname: OpenCV window name.
        S, B, C: grid size, boxes per cell and class count of ``pred_tensor``.
            The defaults match the previously hard-coded values.
    """
    # Extract boxes via post-processing.
    boxes = yolo_postprocess(pred_tensor, S=S, B=B, C=C, conf_thresh=conf_thresh,
                             nms_thresh=nms_thresh, img_size=img_size)

    # Tensor -> numpy (channels last); detach so tensors that still require
    # grad can be converted.
    if isinstance(img, torch.Tensor):
        img = img.detach().permute(1, 2, 0).cpu().numpy()
    # Undo [0, 1] scaling if present.
    if img.max() <= 1.0:
        img = (img * 255).astype(np.uint8)
    else:
        img = img.astype(np.uint8)

    # RGB -> BGR (OpenCV uses BGR).
    img_bgr = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)

    # Draw each bounding box with its label.
    for box in boxes:
        x1, y1, x2, y2, score, cls_idx = box
        x1, y1, x2, y2 = map(int, [x1, y1, x2, y2])
        color = (0, 0, 255)
        cv2.rectangle(img_bgr, (x1, y1), (x2, y2), color, 2)
        label = f"{class_names[int(cls_idx)]}: {score:.2f}"
        cv2.putText(img_bgr, label, (x1, max(0, y1-10)), cv2.FONT_HERSHEY_SIMPLEX, 0.6, color, 2)

    # Show the image; waitKey(1) refreshes the window without blocking.
    cv2.imshow(winname, img_bgr)
    cv2.waitKey(1)


In [None]:
import tqdm 
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR


# Build the YOLOv1 model on the GPU and its SGD optimizer.
model = YOLOv1Simple(S=7, B=2, C=20).to('cuda')
optimizer = optim.SGD(
    model.parameters(), 
    lr=1e-3,            # (recommended starting value: 1e-3 ~ 1e-2; can be lowered later)
    momentum=0.9,       # 0.9~0.95 is typically used
    weight_decay=5e-4   # default from the YOLOv1 paper; set to 0 if not needed
)
scheduler = StepLR(optimizer, step_size=5, gamma=0.1)  # multiply lr by 0.1 every 5 epochs (step_size=5)


# Train for 30 epochs; one running-average loss value is recorded per epoch.
loss_history = []
for epoch in range(30):
    model.train()
    running_loss = 0.0
    total_samples = 0    
    pbar = tqdm.tqdm(loader)
    for i, (imgs, targets) in enumerate(pbar):
        imgs, targets = imgs.to('cuda'), targets.to('cuda')
        optimizer.zero_grad()
        preds = model(imgs)
        loss = yolo_loss(preds, targets)
        loss.backward()
        optimizer.step()
        # yolo_loss is a sum over the batch; it is weighted by the batch size here.
        running_loss += loss.item() * imgs.size(0)
        total_samples += imgs.size(0)

        # Running average loss of the current epoch, shown on the progress bar.
        avg_loss = running_loss / total_samples if total_samples > 0 else 0.0
        pbar.desc = f'Epoch {epoch}, Avg Loss: {avg_loss:.4f}'
        # Every 30 iterations, visualize the detections for the first image of the batch.
        if i%30==0:
            img_vis = imgs[0].detach().cpu()
            pred_vis = preds[0].detach().cpu()
            show_yolo_cv2(img_vis, pred_vis, VOC_CLASSES, winname="YOLO Detection")
    loss_history.append(avg_loss)
    plot_loss_curve(loss_history)
    scheduler.step()


In [None]:
import matplotlib.pyplot as plt
# Object category names used for the VOC labels; list position is the class index.
VOC_CLASSES = [
    "aeroplane", "bicycle", "bird", "boat",
    "bottle", "bus", "car", "cat",
    "chair", "cow", "diningtable", "dog",
    "horse", "motorbike", "person", "pottedplant",
    "sheep", "sofa", "train", "tvmonitor",
]

def plot_boxes_on_image(image, boxes, class_names=VOC_CLASSES):
    """Plot an image with labelled bounding boxes using matplotlib.

    Args:
        image: [3, H, W] torch tensor or [H, W, 3] array. Values with
            max <= 1.5 are treated as [0, 1] and scaled by 255.
        boxes: iterable of rows [x1, y1, x2, y2, score, class_index].
        class_names: sequence mapping class index to a display name.
    """
    # `patches` is not imported anywhere at module level; without this local
    # import the Rectangle call below raises NameError.
    from matplotlib import patches

    if isinstance(image, torch.Tensor):
        image = image.permute(1, 2, 0).cpu().numpy()  # [H,W,C]
    fig, ax = plt.subplots(1, figsize=(8, 8))
    ax.imshow(image.astype(np.uint8) if image.max() > 1.5 else (image * 255).astype(np.uint8))
    print(boxes)
    for box in boxes:
        x1, y1, x2, y2, score, cls_idx = box
        print(box)
        rect = patches.Rectangle(
            (x1, y1), x2 - x1, y2 - y1,
            linewidth=2, edgecolor='red', facecolor='none'
        )
        ax.add_patch(rect)
        ax.text(x1, y1 - 3, f"{class_names[int(cls_idx)]}: {score:.2f}",
                color='yellow', fontsize=10, weight='bold',
                bbox=dict(facecolor='red', alpha=0.5, edgecolor='none'))
    plt.axis('off')
    plt.show()


In [None]:

# Example: take one batch from the DataLoader and plot predictions for its first image.
# Switch to eval mode first: after training the model is still in train mode,
# so the head's Dropout would randomly perturb the predictions.
model.eval()
for img, _ in loader:
    img0 = img[0].cuda()  # [3, 448, 448]
    with torch.no_grad():
        pred_tensor = model(img0.unsqueeze(0))[0]  # [7,7,30]
    boxes = yolo_postprocess(pred_tensor, S=7, B=2, C=20, conf_thresh=0.2, nms_thresh=0.5, img_size=448)
    plot_boxes_on_image(img0, boxes, VOC_CLASSES)
    break


# YOLOv5 Demo

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# 1. Focus layer (YOLOv5's initial feature-extraction stem)
class Focus(nn.Module):
    """Space-to-depth stem followed by a 1x1 convolution.

    The input is split into four interleaved sub-grids (every second pixel
    along height and width), which are stacked along the channel axis. This
    halves height and width and quadruples the channels before the 1x1
    convolution maps them to ``out_channels``.
    """

    def __init__(self, in_channels=3, out_channels=32):
        super().__init__()
        self.conv = nn.Conv2d(in_channels * 4, out_channels, kernel_size=1, stride=1, padding=0)

    def forward(self, x):
        # (row, column) start offsets of the four sub-grids, in stacking
        # order: top-left, top-right, bottom-left, bottom-right.
        offsets = ((0, 0), (0, 1), (1, 0), (1, 1))
        stacked = torch.cat([x[..., r::2, c::2] for r, c in offsets], dim=1)
        return self.conv(stacked)

# 2. Basic convolution block
class ConvBlock(nn.Module):
    """Bias-free Conv2d -> BatchNorm2d -> LeakyReLU(0.1).

    Args:
        in_channels, out_channels: convolution channel counts.
        k, s, p: kernel size, stride and padding of the convolution.
    """

    def __init__(self, in_channels, out_channels, k=3, s=1, p=1):
        super().__init__()
        self.conv = nn.Conv2d(
            in_channels, out_channels, kernel_size=k, stride=s, padding=p, bias=False
        )
        self.bn = nn.BatchNorm2d(out_channels)
        self.act = nn.LeakyReLU(negative_slope=0.1, inplace=True)

    def forward(self, x):
        features = self.conv(x)
        normalized = self.bn(features)
        return self.act(normalized)

# 3. CSPBlock (simplified)
class CSPBlock(nn.Module):
    """Simplified cross-stage-partial block.

    The input is split in two along the channel axis. The first half passes
    through ``n`` ConvBlocks, the second half is passed through unchanged, and
    the concatenation is fused by a 1x1 ConvBlock into ``out_channels``.
    """

    def __init__(self, in_channels, out_channels, n=1):
        super().__init__()
        half = in_channels // 2
        branch = [ConvBlock(half, half) for _ in range(n)]
        self.part1 = nn.Sequential(*branch)
        self.part2 = nn.Identity()
        self.conv_cat = ConvBlock(in_channels, out_channels, k=1, s=1, p=0)

    def forward(self, x):
        processed, shortcut = torch.chunk(x, 2, dim=1)  # channel split
        processed = self.part1(processed)
        shortcut = self.part2(shortcut)
        merged = torch.cat([processed, shortcut], dim=1)
        return self.conv_cat(merged)

# 4. SPP module (Spatial Pyramid Pooling)
class SPP(nn.Module):
    """Spatial pyramid pooling with 5/9/13 max-pool windows.

    A 1x1 ConvBlock halves the channels. The result is concatenated with its
    three stride-1 max-pooled versions (4 * in_channels // 2 channels in
    total), and a second 1x1 ConvBlock maps that to ``out_channels``.
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.conv1 = ConvBlock(in_channels, in_channels // 2, k=1, s=1, p=0)
        pools = []
        for window in (5, 9, 13):
            # padding = window // 2 keeps the spatial size unchanged at stride 1
            pools.append(nn.MaxPool2d(kernel_size=window, stride=1, padding=window // 2))
        self.poolings = nn.ModuleList(pools)
        self.conv2 = ConvBlock(in_channels * 2, out_channels, k=1, s=1, p=0)

    def forward(self, x):
        reduced = self.conv1(x)
        branches = [reduced]
        branches.extend(pool(reduced) for pool in self.poolings)
        return self.conv2(torch.cat(branches, dim=1))

# 5. Neck (simplified FPN + PANet)
class Neck(nn.Module):
    """Top-down feature fusion over three backbone levels.

    Args:
        c3, c4, c5: channel counts used to size the layers. ``conv_c5`` maps
            c5 -> c4 and ``conv_c4`` maps 2*c4 -> c3.

    ``downsample`` is created but not used in ``forward``.
    """

    def __init__(self, c3, c4, c5):
        super().__init__()
        # Channel adjustment and up-/down-sampling layers.
        self.upsample = nn.Upsample(scale_factor=2, mode='nearest')
        self.conv_c5 = ConvBlock(c5, c4, k=1, s=1, p=0)
        self.conv_c4 = ConvBlock(c4 * 2, c3, k=1, s=1, p=0)
        self.downsample = nn.Conv2d(c3, c4, kernel_size=3, stride=2, padding=1)

    def forward(self, x3, x4, x5):
        """Return (p3, p4, p5).

        p5 is x5 reduced to c4 channels and upsampled 2x. p4 fuses p5 with x4
        and has c3 channels. p3 is p4 upsampled 2x and concatenated with x3.
        """
        top = self.upsample(self.conv_c5(x5))            # c4 channels, 2x the size of x5
        mid = self.conv_c4(torch.cat([top, x4], dim=1))  # 2*c4 -> c3 channels
        fine = torch.cat([self.upsample(mid), x3], dim=1)  # c3 + channels of x3
        return fine, mid, top


# class SimpleYOLOHead(nn.Module):
#     def __init__(self, in_channels, anchors, S, num_classes):
#         super().__init__()
#         self.anchors = anchors
#         self.S = S
#         self.num_classes = num_classes
#         self.conv = nn.Conv2d(in_channels, len(anchors) * (5 + num_classes), 1)
#     def forward(self, x):
#         B = x.size(0)
#         out = self.conv(x)  # (B, A*(5+C), S, S)
#         out = out.view(B, len(self.anchors), 5 + self.num_classes, self.S, self.S)
#         out = out.permute(0, 1, 3, 4, 2)  # (B, A, S, S, 5+C)
#         return out

# 6. Detection head (per-scale prediction, simplified)
class DetectionHead(nn.Module):
    """1x1 convolution that predicts (5 + num_classes) values per anchor and cell.

    Args:
        in_channels: channels of the incoming feature map.
        anchors: sequence of anchors; only its length is used in ``forward``.
        S: nominal grid size. Stored for reference; ``forward`` uses the
            actual spatial size of its input, so other sizes also work.
        num_classes: number of classes.
    """

    def __init__(self, in_channels, anchors, S, num_classes):
        super().__init__()
        self.anchors = anchors
        self.S = S
        self.num_classes = num_classes
        self.conv = nn.Conv2d(in_channels, len(anchors) * (5 + num_classes), 1)

    def forward(self, x):
        """Return predictions of shape (batch, anchors, H, W, 5 + num_classes)."""
        out = self.conv(x)
        # Use the real feature-map size rather than the fixed self.S, so the
        # head does not fail when the input resolution differs.
        batch, _, height, width = out.shape
        out = out.view(batch, len(self.anchors), 5 + self.num_classes, height, width)
        return out.permute(0, 1, 3, 4, 2)  # (B, A, H, W, 5+C)


# 7. Full YOLOv5 demo network (wiring the parts together)
class YOLOv5Demo(nn.Module):
    """Simplified YOLOv5-style detector: Focus stem, CSP/SPP backbone, neck, three heads.

    Args:
        num_classes: number of classes predicted by each detection head.

    The heads are declared with fixed grid sizes (64, 32, 32) and channel
    counts (256, 128, 256) matching the neck outputs p3, p4, p5.
    """

    def __init__(self, num_classes=80):
        super().__init__()
        self.focus = Focus(3, 32)
        self.conv1 = ConvBlock(32, 64, s=2)
        self.csp1 = CSPBlock(64, 128, n=1)
        self.conv2 = ConvBlock(128, 256, s=2)
        self.csp2 = CSPBlock(256, 256, n=1)
        self.conv3 = ConvBlock(256, 512, s=2)
        self.spp = SPP(512, 512)
        self.neck = Neck(128, 256, 512)
        self.anchors = torch.tensor([[0.08, 0.08], [0.15, 0.15], [0.30, 0.30]])  # (w, h)
        # One head per neck output: (attribute name, input channels, grid size S).
        head_specs = (
            ("head_p3", 256, 64),
            ("head_p4", 128, 32),
            ("head_p5", 256, 32),
        )
        for attr, channels, grid in head_specs:
            setattr(self, attr, DetectionHead(channels, self.anchors, S=grid, num_classes=num_classes))

    def forward(self, x):
        """Return the three head outputs (out3, out4, out5) for image batch ``x``."""
        stem = self.conv1(self.focus(x))
        x3 = self.csp1(stem)
        x4 = self.csp2(self.conv2(x3))
        x5 = self.spp(self.conv3(x4))
        p3, p4, p5 = self.neck(x3, x4, x5)
        out3 = self.head_p3(p3)
        out4 = self.head_p4(p4)
        out5 = self.head_p5(p5)
        return out3, out4, out5

# 8. Test run
if __name__ == '__main__':
    # Example input: (batch=16, 3, 256, 256)
    x = torch.randn(16, 3, 256, 256)
    model = YOLOv5Demo(num_classes=20)  # example: the 20 VOC classes
    outs = model(x)
    print("Output shapes:", [o.shape for o in outs])


In [None]:
import torch
import numpy as np

def voc2yolo(target, img_size, class_to_idx=None):
    """Convert a VOC annotation dict to YOLO rows [class_id, x_center, y_center, w, h].

    Args:
        target: VOC annotation dict; ``target['annotation']['object']`` is a
            single object dict or a list of them, each with 'name' and
            'bndbox' (xmin, ymin, xmax, ymax in pixels).
        img_size: (W, H) of the image in pixels.
        class_to_idx: mapping from class name to index. When omitted, the
            standard 20 VOC classes are used.

    Returns:
        float32 ndarray of shape (num_objects, 5). Centre and size are
        divided by the image size. Shape is (0, 5) when there are no objects.

    Raises:
        KeyError: if an object's class name is not in the mapping.
    """
    if class_to_idx is None:
        # Built once per call instead of inside the per-object loop.
        default_classes = ('aeroplane', 'bicycle', 'bird', 'boat', 'bottle',
                           'bus', 'car', 'cat', 'chair', 'cow', 'diningtable',
                           'dog', 'horse', 'motorbike', 'person', 'pottedplant',
                           'sheep', 'sofa', 'train', 'tvmonitor')
        class_to_idx = {name: i for i, name in enumerate(default_classes)}

    w_img, h_img = img_size
    objects = target['annotation']['object']
    if not isinstance(objects, list):
        objects = [objects]

    boxes = []
    for obj in objects:
        cls_id = class_to_idx[obj['name']]
        # VOC boxes are top-left / bottom-right pixel corners.
        bndbox = obj['bndbox']
        xmin, ymin, xmax, ymax = (float(bndbox[key]) for key in ('xmin', 'ymin', 'xmax', 'ymax'))
        # YOLO format: centre x/y and width/height, all normalized.
        boxes.append([
            cls_id,
            (xmin + xmax) / 2 / w_img,
            (ymin + ymax) / 2 / h_img,
            (xmax - xmin) / w_img,
            (ymax - ymin) / h_img,
        ])
    # reshape keeps the column count at 5 even when there are no objects.
    return np.array(boxes, dtype=np.float32).reshape(-1, 5)



In [None]:

from torchvision.datasets import VOCDetection
from torchvision.transforms import ToTensor

# Load the VOC 2012 trainval split and convert the first sample's annotation.
voc = VOCDetection(root='/media/otter/hard_otter/dataset', year='2012', image_set='trainval')
img, target = voc[0]       # img: PIL.Image, target: dict

img_tensor = ToTensor()(img)   # (3, H, W)
w_img, h_img = img.size

yolo_targets = voc2yolo(target, (w_img, h_img))
print("YOLO targets:\n", yolo_targets)
# yolo_targets: [ [class_id, x_center, y_center, w, h], ... ]


In [None]:

from torch.utils.data import Dataset

class VOCDetectionYOLO(Dataset):
    """Wrap ``VOCDetection`` so each sample yields YOLO-format labels.

    Args:
        root, year, image_set: forwarded to ``VOCDetection``.
        transforms: optional callable applied to the image only.
    """

    def __init__(self, root, year='2012', image_set='trainval', transforms=None):
        self.dataset = VOCDetection(root=root, year=year, image_set=image_set)
        self.transforms = transforms

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return (image, labels); labels is a tensor built from ``voc2yolo`` rows."""
        image, annotation = self.dataset[idx]
        # Labels are computed from the original image size, before any transform.
        width, height = image.size
        labels = voc2yolo(annotation, (width, height))
        if self.transforms:
            image = self.transforms(image)
        return image, torch.from_numpy(labels)


In [None]:

def yolo_label_assignment(gt_targets, anchors, S, num_classes, device):
    """Build an anchor-based target tensor for one image at one scale.

    Args:
        gt_targets: iterable of rows [class, x_center, y_center, w, h] with
            coordinates normalized to 0~1 (tensor rows or plain sequences).
        anchors: (A, 2) anchor sizes (w, h), normalized like the boxes.
        S: output feature-map size (S x S).
        num_classes: number of classes.
        device: torch device for the returned tensor.

    Returns:
        Tensor of shape (A, S, S, 5 + num_classes). For each ground-truth box,
        the anchor with the highest shape IoU (both boxes centred at the
        origin) is selected at the grid cell containing the box centre. That
        cell stores [x offset in cell, y offset in cell, w, h], objectness 1
        and a one-hot class.
    """
    output = torch.zeros((len(anchors), S, S, 5 + num_classes), device=device)
    # CPU copy of the anchors: the matching below is scalar math, and this
    # avoids mixing devices between anchors and ground-truth values.
    anchor_wh = torch.as_tensor(anchors, dtype=torch.float32).detach().cpu()
    anchor_area = anchor_wh[:, 0] * anchor_wh[:, 1]

    for gt in gt_targets:
        cls, xc, yc, w, h = (float(v) for v in gt)

        # (1) IoU of the GT box against every anchor, both centred at (0, 0).
        inter = anchor_wh[:, 0].clamp(max=w) * anchor_wh[:, 1].clamp(max=h)
        union = w * h + anchor_area - inter
        best_anchor = int(torch.argmax(inter / (union + 1e-6)))

        # (2) Grid cell containing the GT centre. Clamp so a centre exactly on
        # the right/bottom edge (1.0) maps to the last cell instead of
        # indexing out of range.
        grid_x = min(max(int(xc * S), 0), S - 1)
        grid_y = min(max(int(yc * S), 0), S - 1)

        # (3) Write box (offset within the cell, normalized size), objectness
        # and class into the selected anchor/cell.
        output[best_anchor, grid_y, grid_x, 0:4] = torch.tensor(
            [xc * S - grid_x, yc * S - grid_y, w, h], device=device
        )
        output[best_anchor, grid_y, grid_x, 4] = 1.0
        output[best_anchor, grid_y, grid_x, 5 + int(cls)] = 1.0

    return output


def yolo_simple_loss(pred, target):
    """Simple detection loss: box MSE + objectness BCE + class BCE.

    Args:
        pred, target: tensors of shape (B, A, S, S, 5 + C) with layout
            [x, y, w, h, objectness, classes...]. ``pred`` holds raw logits
            for the objectness and class entries.

    Returns:
        Scalar tensor. The box term is the mean squared error over cells
        whose target objectness equals 1 (zero when there are none). The
        objectness and class terms use ``BCEWithLogitsLoss`` over all cells.
    """
    obj_mask = target[..., 4] == 1  # cells that contain an object
    # Box term (x, y, w, h). mean() over an empty selection is NaN, which
    # would poison the whole loss whenever a batch has no object.
    if obj_mask.any():
        box_loss = ((pred[..., 0:4][obj_mask] - target[..., 0:4][obj_mask]) ** 2).mean()
    else:
        box_loss = pred.new_zeros(())
    bce = nn.BCEWithLogitsLoss()
    # Objectness: BCE over every cell.
    obj_loss = bce(pred[..., 4], target[..., 4])
    # Classes: BCE, only when class channels exist.
    if target.shape[-1] > 5:
        cls_loss = bce(pred[..., 5:], target[..., 5:])
    else:
        cls_loss = 0.0
    return box_loss + obj_loss + cls_loss

S = 13  # feature-map size
anchors = torch.tensor([[0.08, 0.08], [0.15, 0.15], [0.30, 0.30]]).cuda()  # anchor (w, h) relative to the image, normalized
num_classes = 20
in_channels = 32


In [None]:
import tqdm 
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
from torchvision.transforms import ToTensor
# YOLOv5Demo's heads are declared for fixed 64x64 / 32x32 grids, which
# corresponds to 256x256 inputs, and Focus needs even image sides. Raw VOC
# images have varying sizes, so resize them. The labels are normalized to the
# image size and therefore stay valid.
from torchvision.transforms import Compose, Resize

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

voc_yolo = VOCDetectionYOLO(
    root='/media/otter/hard_otter/dataset',
    transforms=Compose([Resize((256, 256)), ToTensor()]),
)
loader = DataLoader(voc_yolo, batch_size=1, shuffle=True)
img, target = voc_yolo[0]
print(img.shape)      # torch.Size([3, 256, 256])
print(target.shape)   # (num_objs, 5)

model = YOLOv5Demo(20).to(device)
optimizer = optim.SGD(
    model.parameters(),
    lr=1e-3,            # (recommended starting value: 1e-3 ~ 1e-2; can be lowered later)
    momentum=0.9,       # 0.9~0.95 is typically used
    weight_decay=5e-4   # default from the YOLOv1 paper; set to 0 if not needed
)
scheduler = StepLR(optimizer, step_size=5, gamma=0.1)  # multiply lr by 0.1 every 5 epochs


loss_history = []
for epoch in range(30):
    model.train()
    running_loss = 0.0
    total_samples = 0
    avg_loss = 0.0
    pbar = tqdm.tqdm(loader)
    for i, (imgs, target) in enumerate(pbar):
        imgs = imgs.to(device)
        # `target` is [batch, num_objs, 5] rows of (class, x_center, y_center, w, h).
        # Build one target tensor per image for each output scale; the grid
        # sizes must match the heads (p3: 64, p4: 32, p5: 32).
        batch_targets_p3 = torch.stack([
            yolo_label_assignment(gt, anchors, S=64, num_classes=num_classes, device=device)
            for gt in target
        ])
        batch_targets_p4 = torch.stack([
            yolo_label_assignment(gt, anchors, S=32, num_classes=num_classes, device=device)
            for gt in target
        ])
        batch_targets_p5 = torch.stack([
            yolo_label_assignment(gt, anchors, S=32, num_classes=num_classes, device=device)
            for gt in target
        ])
        # Forward
        out3, out4, out5 = model(imgs)
        # Loss, one term per scale
        loss3 = yolo_simple_loss(out3, batch_targets_p3)
        loss4 = yolo_simple_loss(out4, batch_targets_p4)
        loss5 = yolo_simple_loss(out5, batch_targets_p5)
        loss = loss3 + loss4 + loss5
        # Backpropagate the loss tensor (running_loss is a plain float).
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * imgs.size(0)
        total_samples += imgs.size(0)
        # Running average loss of the current epoch.
        avg_loss = running_loss / total_samples if total_samples > 0 else 0.0
        pbar.desc = f'Epoch {epoch}, Avg Loss: {avg_loss:.4f}'
        # NOTE: the per-iteration show_yolo_cv2 preview was removed here. It
        # read an undefined `preds` and decodes the YOLOv1 grid layout, which
        # does not match the (B, A, S, S, 5+C) outputs of this model.
    loss_history.append(avg_loss)
    plot_loss_curve(loss_history)
    scheduler.step()
