## Preparação do Dataset

In [2]:
import os
import random
from PIL import Image
import shutil

# Define dataset root
original_dataset_path = '/kaggle/input/pascal-voc-2012-dataset/VOC2012_train_val/'
yolo_dataset_path = 'yolo_dataset'

In [3]:
yolo_dirs = [
    os.path.join(yolo_dataset_path, 'images', 'train'),
    os.path.join(yolo_dataset_path, 'images', 'val'),
    os.path.join(yolo_dataset_path, 'labels', 'train'),
    os.path.join(yolo_dataset_path, 'labels', 'val')
]

for yolo_dir in yolo_dirs:
    os.makedirs(yolo_dir, exist_ok=True)

jpeg_images_dir = os.path.join(original_dataset_path, 'VOC2012_train_val', 'JPEGImages')
annotations_dir = os.path.join(original_dataset_path, 'VOC2012_train_val', 'Annotations')
if not os.path.exists(jpeg_images_dir) or not os.path.exists(annotations_dir):
    raise FileNotFoundError(f"The directory {jpeg_images_dir} or {annotations_dir} does not exist. Please verify the dataset path.")
image_filenames = os.listdir(jpeg_images_dir)
image_ids = [os.path.splitext(filename)[0] for filename in image_filenames if filename.endswith('.jpg')]

random.seed(42)
random.shuffle(image_ids)
split_index = int(0.8 * len(image_ids)) #Spliting the dataset 80% for training, 20% for validation
train_ids = image_ids[:split_index] #taking the first 80% pictures
val_ids = image_ids[split_index:]
print(len(train_ids))
print(len(val_ids))

13700
3425


In [4]:
import xml.etree.ElementTree as ET 
#this fucntion converts PASCAL_VOC annotations to YOLO format
def create_yolo_annotation(xml_file_path, yolo_label_path, label_dict):
    tree = ET.parse(xml_file_path)
    root = tree.getroot()
    annotations = [] #list that will store the converted YOLO annotations.

    img_width = int(root.find('size/width').text)
    img_height = int(root.find('size/height').text)

    for obj in root.findall('object'):
        label = obj.find('name').text
        if label not in label_dict:
            continue
        label_idx = label_dict[label]
        bndbox = obj.find('bndbox')
        xmin = float(bndbox.find('xmin').text)
        ymin = float(bndbox.find('ymin').text)
        xmax = float(bndbox.find('xmax').text)
        ymax = float(bndbox.find('ymax').text)

        # this is YOLOv8 annotation format: label x_center y_center width height (normalized)
        x_center = ((xmin + xmax) / 2) / img_width
        y_center = ((ymin + ymax) / 2) / img_height
        width = (xmax - xmin) / img_width
        height = (ymax - ymin) / img_height

        annotations.append(f"{label_idx} {x_center:.6f} {y_center:.6f} {width:.6f} {height:.6f}")

    #annotations to the label file
    with open(yolo_label_path, 'w') as f:
        f.write("\n".join(annotations))

label_dict = {
    'aeroplane': 0, 'bicycle': 1, 'bird': 2, 'boat': 3, 'bottle': 4,
    'bus': 5, 'car': 6, 'cat': 7, 'chair': 8, 'cow': 9,
    'diningtable': 10, 'dog': 11, 'horse': 12, 'motorbike': 13, 'person': 14,
    'pottedplant': 15, 'sheep': 16, 'sofa': 17, 'train': 18, 'tvmonitor': 19
}

for image_set, ids in [('train', train_ids), ('val', val_ids)]:
    for img_id in ids:
        img_src_path = os.path.join(jpeg_images_dir, f'{img_id}.jpg')
        label_dst_path = os.path.join(yolo_dataset_path, 'labels', image_set, f'{img_id}.txt')

        # Create the YOLO annotation file
        xml_file_path = os.path.join(annotations_dir, f'{img_id}.xml')
        if not os.path.exists(xml_file_path):
            print(f"Warning: Annotation {xml_file_path} not found, skipping.")
            continue
        create_yolo_annotation(xml_file_path, label_dst_path, label_dict)

        # Copy the image to the new YOLO dataset structure
        img_dst_path = os.path.join(yolo_dataset_path, 'images', image_set, f'{img_id}.jpg')
        shutil.copy(img_src_path, img_dst_path)

In [5]:
# 1. Criar o arquivo de nomes das classes (voc.names)
names_file_path = os.path.join(yolo_dataset_path, 'voc.names')
with open(names_file_path, 'w') as f:
    for label in label_dict.keys():
        f.write(f"{label}\n")
print(f"Arquivo 'voc.names' criado em: {names_file_path}")


# 2. Criar os arquivos de lista de imagens (train.txt e val.txt) com caminhos absolutos
train_txt_path = os.path.join(yolo_dataset_path, 'train.txt')
val_txt_path = os.path.join(yolo_dataset_path, 'val.txt')

# Para o conjunto de treino
with open(train_txt_path, 'w') as f:
    for img_id in train_ids:
        img_path = os.path.abspath(os.path.join(yolo_dataset_path, 'images', 'train', f'{img_id}.jpg'))
        f.write(f"{img_path}\n")
print(f"Arquivo 'train.txt' criado em: {train_txt_path}")

# Para o conjunto de validação
with open(val_txt_path, 'w') as f:
    for img_id in val_ids:
        img_path = os.path.abspath(os.path.join(yolo_dataset_path, 'images', 'val', f'{img_id}.jpg'))
        f.write(f"{img_path}\n")
print(f"Arquivo 'val.txt' criado em: {val_txt_path}")


# 3. Criar o arquivo de configuração de dados (voc.data)
data_config_path = os.path.join(yolo_dataset_path, 'voc.data')
num_classes = len(label_dict)

with open(data_config_path, 'w') as f:
    f.write(f"classes = {num_classes}\n")
    f.write(f"train = {os.path.abspath(train_txt_path)}\n")
    f.write(f"valid = {os.path.abspath(val_txt_path)}\n")
    f.write(f"names = {os.path.abspath(names_file_path)}\n")
    f.write("backup = backup/") # Pasta para salvar os pesos durante o treino
print(f"Arquivo 'voc.data' criado em: {data_config_path}")

# Cria a pasta de backup, se não existir
os.makedirs(os.path.join(yolo_dataset_path, 'backup'), exist_ok=True)

Arquivo 'voc.names' criado em: yolo_dataset/voc.names
Arquivo 'train.txt' criado em: yolo_dataset/train.txt
Arquivo 'val.txt' criado em: yolo_dataset/val.txt
Arquivo 'voc.data' criado em: yolo_dataset/voc.data


## Configuração e Utilitários

In [7]:
# Instalações e Importações
# =======================================

import torch
import torch.nn as nn
import os

In [8]:
# Configuração Principal do Projeto
# =============================================

# -- Dispositivo --
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# -- Caminhos do Dataset --
DATASET_PATH = "yolo_dataset" 
TRAIN_CSV_PATH = os.path.join(DATASET_PATH, "train.txt")
VAL_CSV_PATH = os.path.join(DATASET_PATH, "val.txt")

# -- Hiperparâmetros de Treinamento --
LEARNING_RATE = 2e-5
BATCH_SIZE = 8
NUM_WORKERS = 4 # Kaggle geralmente oferece 4 cores, então 4 é um bom valor
NUM_EPOCHS = 100 # Número de épocas para treinar
PIN_MEMORY = True # Otimização para carregar dados mais rápido para a GPU
LOAD_MODEL = False # Se True, carrega um modelo pré-treinado
SAVE_MODEL = True # Se True, salva o modelo durante o treinamento

# -- Constantes do Modelo e Dataset --
IMAGE_SIZE = 416 # Tamanho padrão de entrada para YOLOv3
NUM_CLASSES = 20 # 20 classes para o PASCAL VOC
CONF_THRESHOLD = 0.6 # Limite de confiança para uma predição ser considerada
MAP_IOU_THRESH = 0.5 # Limite de IoU para cálculo do mAP (True Positive)
NMS_IOU_THRESH = 0.45 # Limite de IoU para o Non-Maximum Suppression

# -- Âncoras (Anchors) da YOLOv3 --
# Estas âncoras são pré-calculadas e são padrão para YOLOv3.
# Estão agrupadas por escala de predição.
# Formato: [(largura_ancora_1, altura_ancora_1), (largura_ancora_2, altura_ancora_2), ...]
ANCHORS = [
    [(0.28, 0.22), (0.38, 0.48), (0.9, 0.78)],      # Para a grade de predição GRANDE (detecta objetos pequenos)
    [(0.07, 0.15), (0.15, 0.11), (0.14, 0.29)],     # Para a grade de predição MÉDIA (detecta objetos médios)
    [(0.02, 0.03), (0.04, 0.07), (0.08, 0.06)],     # Para a grade de predição PEQUENA (detecta objetos grandes)
]

# -- Checkpoints para salvar/carregar o modelo --
CHECKPOINT_FILE = "yolov3_voc.pth.tar"

In [37]:
# Função Utilitária - Intersection over Union (IoU)
# =============================================================
def iou(box1, box2, is_pred=True):
    """
    Calcula a Interseção sobre União (IoU) entre duas caixas delimitadoras ou apenas (w,h).
    """
    if is_pred:
        # box1 e box2 no formato (x, y, w, h)
        b1_x1 = box1[..., 0:1] - box1[..., 2:3] / 2
        b1_y1 = box1[..., 1:2] - box1[..., 3:4] / 2
        b1_x2 = box1[..., 0:1] + box1[..., 2:3] / 2
        b1_y2 = box1[..., 1:2] + box1[..., 3:4] / 2

        b2_x1 = box2[..., 0:1] - box2[..., 2:3] / 2
        b2_y1 = box2[..., 1:2] - box2[..., 3:4] / 2
        b2_x2 = box2[..., 0:1] + box2[..., 2:3] / 2
        b2_y2 = box2[..., 1:2] + box2[..., 3:4] / 2
    else:
        # box1 = (w, h), box2 = lista de âncoras (w, h)
        b1_w, b1_h = box1[0], box1[1]
        b2_w, b2_h = box2[:, 0], box2[:, 1]

        intersection = torch.min(b1_w, b2_w) * torch.min(b1_h, b2_h)
        union = (b1_w * b1_h) + (b2_w * b2_h) - intersection + 1e-6
        return intersection / union

    inter_x1 = torch.max(b1_x1, b2_x1)
    inter_y1 = torch.max(b1_y1, b2_y1)
    inter_x2 = torch.min(b1_x2, b2_x2)
    inter_y2 = torch.min(b1_y2, b2_y2)

    intersection_area = torch.clamp(inter_x2 - inter_x1, min=0) * torch.clamp(inter_y2 - inter_y1, min=0)
    box1_area = abs((b1_x2 - b1_x1) * (b1_y2 - b1_y1))
    box2_area = abs((b2_x2 - b2_x1) * (b2_y2 - b2_y1))
    union_area = box1_area + box2_area - intersection_area + 1e-6

    return intersection_area / union_area


In [11]:
# Função Utilitária - Non-Maximum Suppression (NMS)
# ==============================================================

def nms(bboxes, iou_threshold, threshold):
    """
    Executa Non-Maximum Suppression para filtrar caixas delimitadoras.

    Parâmetros:
        bboxes (list): Lista de predições. Cada predição é uma lista no formato
                       [class_id, confidence_score, x1, y1, x2, y2].
        iou_threshold (float): Limite de IoU para suprimir caixas.
        threshold (float): Limite de confiança para considerar uma caixa.

    Retorna:
        list: A lista de caixas após aplicar NMS.
    """
    
    # 1. Filtra as caixas com score de confiança abaixo do threshold
    bboxes = [box for box in bboxes if box[1] > threshold]

    # 2. Ordena as caixas pela confiança, da maior para a menor
    bboxes = sorted(bboxes, key=lambda x: x[1], reverse=True)
    
    bboxes_after_nms = []

    while bboxes:
        # 3. Pega a caixa com a maior confiança
        chosen_box = bboxes.pop(0)
        bboxes_after_nms.append(chosen_box)

        # 4. Compara a caixa escolhida com todas as outras restantes
        bboxes = [
            box
            for box in bboxes
            # Mantém apenas as caixas que são de outra classe
            if box[0] != chosen_box[0]
            # Ou que têm IoU abaixo do threshold com a caixa escolhida
            or iou(
                torch.tensor(chosen_box[2:]),
                torch.tensor(box[2:]),
                is_pred=False
            ) < iou_threshold
        ]

    return bboxes_after_nms

## Preparação e Carregamento dos Dados

In [13]:
# Instalação Adicional e Importações
# ==========================================================

!pip install -q albumentations==1.3.1 opencv-python-headless

import numpy as np
import pandas as pd
from PIL import Image, ImageFile
from torch.utils.data import Dataset, DataLoader
import albumentations as A
from albumentations.pytorch import ToTensorV2

ImageFile.LOAD_TRUNCATED_IMAGES = True

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m125.7/125.7 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25h

In [46]:
class YOLOv3Dataset(Dataset):
    def __init__(
        self,
        txt_path,
        anchors,
        image_size=416,
        S=[13, 26, 52], 
        C=20,
        transform=None,
    ):
        with open(txt_path, "r") as f:
            self.annotations = [line.strip() for line in f.readlines() if line.strip()]

        self.image_size = image_size
        self.transform = transform
        self.S = S
        self.C = C
        self.anchors = torch.tensor(anchors[0] + anchors[1] + anchors[2])
        self.num_anchors = self.anchors.shape[0]
        self.num_anchors_per_scale = self.num_anchors // 3
        self.ignore_iou_thresh = 0.5

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, index):
        image_path = self.annotations[index]
        label_path = image_path.replace("images", "labels").replace(".jpg", ".txt")
    
        # Carrega a imagem
        image = np.array(Image.open(image_path).convert("RGB"))
    
        # Inicializa os targets
        targets = [torch.zeros((self.num_anchors_per_scale, S, S, 6)) for S in self.S]
        bboxes = np.loadtxt(fname=label_path, delimiter=" ", ndmin=2)
        bboxes = np.roll(bboxes, shift=-1, axis=1)  # move class para o fim
    
        # Aplica data augmentation
        if self.transform:
            if len(bboxes) == 0:
                bboxes_for_transform = np.empty((0, 5))
            else:
                bboxes_for_transform = bboxes
    
            augmentations = self.transform(image=image, bboxes=bboxes_for_transform)
            image = augmentations["image"]
            bboxes = augmentations["bboxes"]
    
        for box in bboxes:
            x, y, w, h, class_label = box

            iou_anchors = iou(torch.tensor([w, h]), self.anchors, is_pred=False)
    
            anchor_indices = iou_anchors.argsort(descending=True)
            best_anchor_idx = anchor_indices[0]
    
            scale_idx = (best_anchor_idx // self.num_anchors_per_scale).item()
            anchor_on_scale = (best_anchor_idx % self.num_anchors_per_scale).item()
    
            S = self.S[scale_idx]
            i, j = int(S * y), int(S * x)
    
            target_scale = targets[scale_idx]
            if target_scale[anchor_on_scale, i, j, 0] == 0:
                target_scale[anchor_on_scale, i, j, 0] = 1
                x_cell, y_cell = S * x - j, S * y - i
                w_cell, h_cell = w * S, h * S
                box_coordinates = torch.tensor([x_cell, y_cell, w_cell, h_cell])
                target_scale[anchor_on_scale, i, j, 1:5] = box_coordinates
                target_scale[anchor_on_scale, i, j, 5] = int(class_label)
    
            for anchor_idx in anchor_indices[1:]:
                if iou_anchors[anchor_idx] > self.ignore_iou_thresh:
                    scale_idx_ignore = (anchor_idx // self.num_anchors_per_scale).item()
                    anchor_on_scale_ignore = (anchor_idx % self.num_anchors_per_scale).item()
    
                    S_ignore = self.S[scale_idx_ignore]
                    i_ignore, j_ignore = int(S_ignore * y), int(S_ignore * x)
    
                    if targets[scale_idx_ignore][anchor_on_scale_ignore, i_ignore, j_ignore, 0] == 0:
                        targets[scale_idx_ignore][anchor_on_scale_ignore, i_ignore, j_ignore, 0] = -1
    
        return image, tuple(targets)

## Arquitetura da YOLO

In [54]:
class CNNBlock(nn.Module):
    def __init__(self, in_channels, out_channels, bn_act=True, **kwargs):
        super().__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, bias=not bn_act, **kwargs)
        self.bn = nn.BatchNorm2d(out_channels)
        self.leaky = nn.LeakyReLU(0.1)
        self.use_bn_act = bn_act

    def forward(self, x):
        if self.use_bn_act:
            return self.leaky(self.bn(self.conv(x)))
        else:
            return self.conv(x)

In [55]:
class ResidualBlock(nn.Module):
    def __init__(self, channels, use_residual=True, num_repeats=1):
        super().__init__()
        self.layers = nn.ModuleList()
        for _ in range(num_repeats):
            self.layers += [
                nn.Sequential(
                    CNNBlock(channels, channels // 2, kernel_size=1),
                    CNNBlock(channels // 2, channels, kernel_size=3, padding=1),
                )
            ]
        self.use_residual = use_residual
        self.num_repeats = num_repeats

    def forward(self, x):
        for layer in self.layers:
            if self.use_residual:
                x = x + layer(x)
            else:
                x = layer(x)
        return x


In [57]:
class ScalePrediction(nn.Module):
    def __init__(self, in_channels, num_classes):
        super().__init__()
        self.pred = nn.Sequential(
            CNNBlock(in_channels, 2 * in_channels, kernel_size=3, padding=1),
            CNNBlock(
                2 * in_channels, (num_classes + 5) * 3, bn_act=False, kernel_size=1
            ),
        )
        self.num_classes = num_classes

    def forward(self, x):
        # Transforma a saída do formato (N, C, H, W) para o formato YOLO
        # N -> Batch Size
        # C -> 3 * (5 + num_classes)
        # H, W -> Dimensões da grade (S)
        # Saída desejada: (N, 3, S, S, 5 + num_classes)
        return (
            self.pred(x)
            .reshape(x.shape[0], 3, self.num_classes + 5, x.shape[2], x.shape[3])
            .permute(0, 1, 3, 4, 2)
        )


In [59]:
# Configuração da arquitetura YOLOv3
# Tupla: (out_channels, kernel_size, stride)
# Lista: ["B", num_repeats] -> Bloco Residual
# "S": Scale Prediction
# "U": Upsampling
config = [
    (32, 3, 1),
    (64, 3, 2),
    ["B", 1],
    (128, 3, 2),
    ["B", 2],
    (256, 3, 2),
    ["B", 8],
    (512, 3, 2),
    ["B", 8],
    (1024, 3, 2),
    ["B", 4],  # Fim da Darknet-53
    (512, 1, 1),
    (1024, 3, 1),
    "S", # Predição em Escala 1 (grade 13x13)
    (256, 1, 1),
    "U",
    (256, 1, 1),
    (512, 3, 1),
    "S", # Predição em Escala 2 (grade 26x26)
    (128, 1, 1),
    "U",
    (128, 1, 1),
    (256, 3, 1),
    "S", # Predição em Escala 3 (grade 52x52)
]


class YOLOv3(nn.Module):
    def __init__(self, in_channels=3, num_classes=20):
        super().__init__()
        self.num_classes = num_classes
        self.in_channels = in_channels
        self.layers = self._create_conv_layers()

    def forward(self, x):
        outputs = []
        route_connections = []
        for layer in self.layers:
            if isinstance(layer, ScalePrediction):
                outputs.append(layer(x))
                continue

            x = layer(x)

            if isinstance(layer, ResidualBlock) and layer.num_repeats == 8:
                route_connections.append(x)
            
            elif isinstance(layer, nn.Upsample):
                x = torch.cat([x, route_connections.pop()], dim=1)
        
        return outputs

    def _create_conv_layers(self):
        layers = nn.ModuleList()
        in_channels = self.in_channels

        for module in config:
            if isinstance(module, tuple):
                out_channels, kernel_size, stride = module
                layers.append(
                    CNNBlock(
                        in_channels,
                        out_channels,
                        kernel_size=kernel_size,
                        stride=stride,
                        padding=1 if kernel_size == 3 else 0,
                    )
                )
                in_channels = out_channels

            elif isinstance(module, list):
                num_repeats = module[1]
                layers.append(ResidualBlock(in_channels, num_repeats=num_repeats))

            elif isinstance(module, str):
                if module == "S":
                    layers += [
                        ResidualBlock(in_channels, use_residual=False, num_repeats=1),
                        CNNBlock(in_channels, in_channels // 2, kernel_size=1),
                        ScalePrediction(in_channels // 2, num_classes=self.num_classes),
                    ]
                    in_channels = in_channels // 2

                elif module == "U":
                    layers.append(nn.Upsample(scale_factor=2))
                    in_channels = in_channels * 3 # Concatenação com a route connection
        
        return layers


## Loss

In [105]:
class YOLOLoss(nn.Module):
    def __init__(self):
        super().__init__()
        self.mse = nn.MSELoss()
        self.bce = nn.BCEWithLogitsLoss()
        self.cross_entropy = nn.CrossEntropyLoss()
        self.sigmoid = nn.Sigmoid()

        # Usando os lambdas que você tinha no seu código
        self.lambda_class = 1
        self.lambda_noobj = 1
        self.lambda_obj = 1
        self.lambda_box = 1

    def forward(self, predictions, targets, anchors):
        total_loss = 0

        for i in range(3):
            prediction = predictions[i]
            target = targets[i]
            anchors_scale = anchors[i]

            obj = target[..., 0] == 1
            noobj = target[..., 0] == 0

            # --- 1. Perda de "Não Objeto" (No Object Loss) ---
            no_obj_loss = self.bce(
                (prediction[..., 0:1][noobj]), (target[..., 0:1][noobj])
            )

            # --- 2. Perda de Objeto (Object Loss) ---
            anchors_scale_reshaped = anchors_scale.reshape(1, 3, 1, 1, 2)
            box_preds = torch.cat([self.sigmoid(prediction[..., 1:3]), torch.exp(prediction[..., 3:5]) * anchors_scale_reshaped], dim=-1)
            ious = iou(box_preds[obj], target[..., 1:5][obj], is_pred=True).detach()
            object_loss = self.bce(
                (prediction[..., 0:1][obj]), (ious * target[..., 0:1][obj])
            )

            # --- 3. Perda da Caixa Delimitadora ---
            t_xy = self.sigmoid(target[..., 1:3])
            t_wh = torch.log(
                (1e-16 + target[..., 3:5]) / anchors_scale_reshaped
            )
            box_loss_target = torch.cat((t_xy, t_wh), dim=-1)

            # Calcula a perda MSE apenas nas posições onde há objetos.
            box_loss = self.mse(prediction[..., 1:5][obj], box_loss_target[obj])


            # --- 4. Perda de Classificação (Class Loss) ---
            class_loss = self.cross_entropy(
                (prediction[..., 5:][obj]), (target[..., 5][obj].long()),
            )

            # Evita adicionar perdas NaN se não houver objetos na amostra
            # (object_loss, box_loss, class_loss podem ser NaN se 'obj' estiver vazio)
            if torch.isnan(object_loss): object_loss = 0
            if torch.isnan(box_loss): box_loss = 0
            if torch.isnan(class_loss): class_loss = 0
                
            total_loss += (
                self.lambda_box * box_loss
                + self.lambda_obj * object_loss
                + self.lambda_noobj * no_obj_loss
                + self.lambda_class * class_loss
            )

        return total_loss


## Treino

In [108]:
# Funções Utilitárias (Checkpoints, Conversão de Saída e mAP)
# =======================================================================
from collections import Counter
from tqdm import tqdm
import matplotlib.pyplot as plt

def save_checkpoint(model, optimizer, filename="my_checkpoint.pth.tar"):
    print("=> Salvando checkpoint")
    checkpoint = {
        "state_dict": model.state_dict(),
        "optimizer": optimizer.state_dict(),
    }
    torch.save(checkpoint, filename)

def load_checkpoint(checkpoint_file, model, optimizer, lr):
    print("=> Carregando checkpoint")
    checkpoint = torch.load(checkpoint_file, map_location=DEVICE)
    model.load_state_dict(checkpoint["state_dict"])
    optimizer.load_state_dict(checkpoint["optimizer"])
    for param_group in optimizer.param_groups:
        param_group["lr"] = lr

def convert_cells_to_bboxes(predictions, anchors, S, is_predictions=True):
    # predictions: (N, 3, S, S, 5+C)
    # anchors: (3, 2)
    # S: grid_size
    batch_size = predictions.shape[0]
    num_anchors = len(anchors)
    box_predictions = predictions[..., 1:5]

    if is_predictions:
        anchors = anchors.reshape(1, len(anchors), 1, 1, 2)
        box_predictions[..., 0:2] = torch.sigmoid(box_predictions[..., 0:2])
        box_predictions[..., 2:] = torch.exp(box_predictions[..., 2:]) * anchors

    cell_indices = (
        torch.arange(S)
        .repeat(predictions.shape[0], 3, S, 1)
        .unsqueeze(-1)
        .to(predictions.device)
    )
    x = 1 / S * (box_predictions[..., 0:1] + cell_indices)
    y = 1 / S * (box_predictions[..., 1:2] + cell_indices.permute(0, 1, 3, 2, 4))
    w_h = 1 / S * box_predictions[..., 2:4]
    
    converted_bboxes = torch.cat((x, y, w_h), dim=-1).reshape(batch_size, num_anchors * S * S, 4)
    return converted_bboxes.tolist()

def get_evaluation_bboxes(loader, model, iou_threshold, anchors, threshold, device):
    model.eval()
    train_idx = 0
    all_pred_boxes = []
    all_true_boxes = []

    for batch_idx, (x, labels) in enumerate(tqdm(loader, desc="Calculando predições para mAP")):
        x = x.to(device)
        with torch.no_grad():
            with torch.amp.autocast(device_type=device, dtype=torch.float16):
                predictions = model(x)

        batch_size = x.shape[0]
        bboxes = [[] for _ in range(batch_size)]
        
        for i in range(3): # 3 scales
            S = predictions[i].shape[2]
            anchor = torch.tensor([*anchors[i]]).to(device) * S
            boxes_scale_i = convert_cells_to_bboxes(predictions[i], anchor, S=S, is_predictions=True)
            for idx, (box) in enumerate(boxes_scale_i):
                bboxes[idx] += box

        # Processar ground truths
        true_bboxes = convert_cells_to_bboxes(labels[0], anchors[0], S=13, is_predictions=False)
        for idx in range(batch_size):
            all_pred_boxes.append(bboxes[idx])
            all_true_boxes.append(true_bboxes[idx])
            
    model.train()
    return all_pred_boxes, all_true_boxes

def mean_average_precision(pred_boxes, true_boxes, iou_threshold=0.5, box_format="midpoint", num_classes=20):
    # pred_boxes (list): [[train_idx, class_pred, prob_score, x, y, w, h], ...]
    average_precisions = []
    epsilon = 1e-6

    for c in range(num_classes):
        detections = [det for det in pred_boxes if det[1] == c]
        ground_truths = [gt for gt in true_boxes if gt[1] == c]

        amount_bboxes = Counter([gt[0] for gt in ground_truths])
        for key, val in amount_bboxes.items():
            amount_bboxes[key] = torch.zeros(val)

        detections.sort(key=lambda x: x[2], reverse=True)
        TP = torch.zeros((len(detections)))
        FP = torch.zeros((len(detections)))
        total_true_bboxes = len(ground_truths)
        
        if total_true_bboxes == 0:
            continue

        for detection_idx, detection in enumerate(detections):
            ground_truth_img = [bbox for bbox in ground_truths if bbox[0] == detection[0]]
            best_iou = 0
            best_gt_idx = -1

            for idx, gt in enumerate(ground_truth_img):
                iou_val = iou(torch.tensor(detection[3:]), torch.tensor(gt[3:]), is_pred=True)
                if iou_val > best_iou:
                    best_iou = iou_val
                    best_gt_idx = idx

            if best_iou > iou_threshold:
                if amount_bboxes[detection[0]][best_gt_idx] == 0:
                    TP[detection_idx] = 1
                    amount_bboxes[detection[0]][best_gt_idx] = 1
                else:
                    FP[detection_idx] = 1
            else:
                FP[detection_idx] = 1

        TP_cumsum = torch.cumsum(TP, dim=0)
        FP_cumsum = torch.cumsum(FP, dim=0)
        recalls = TP_cumsum / (total_true_bboxes + epsilon)
        precisions = TP_cumsum / (TP_cumsum + FP_cumsum + epsilon)
        precisions = torch.cat((torch.tensor([1]), precisions))
        recalls = torch.cat((torch.tensor([0]), recalls))
        average_precisions.append(torch.trapz(precisions, recalls))

    return sum(average_precisions) / len(average_precisions)


In [109]:
# A Função de Treinamento (train_fn)
# ===============================================

def train_fn(train_loader, model, optimizer, loss_fn, scaler, scaled_anchors):
    loop = tqdm(train_loader, leave=True)
    losses = []

    for batch_idx, (x, y) in enumerate(loop):
        x = x.to(DEVICE)
        y0, y1, y2 = (
            y[0].to(DEVICE),
            y[1].to(DEVICE),
            y[2].to(DEVICE),
        )

        with torch.amp.autocast(device_type=DEVICE, dtype=torch.float16):
            out = model(x)
            loss = loss_fn(out, (y0, y1, y2), scaled_anchors)

        losses.append(loss.item())
        optimizer.zero_grad()
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        # Atualiza a barra de progresso
        mean_loss = sum(losses) / len(losses)
        loop.set_postfix(loss=mean_loss)
        
    return mean_loss


In [113]:
model = YOLOv3(num_classes=NUM_CLASSES).to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=1e-4)
loss_fn = YOLOLoss()
scaler = torch.amp.GradScaler()

# --- CONFIGURAÇÃO DOS DATALOADERS ---
train_transform = A.Compose(
    [
        A.LongestMaxSize(max_size=IMAGE_SIZE),
        A.PadIfNeeded(min_height=IMAGE_SIZE, min_width=IMAGE_SIZE, border_mode=0),
        A.ColorJitter(brightness=0.5, contrast=0.5, saturation=0.5, hue=0.5, p=0.5),
        A.HorizontalFlip(p=0.5),
        A.Normalize(mean=[0, 0, 0], std=[1, 1, 1], max_pixel_value=255,),
        ToTensorV2(),
    ],
    bbox_params=A.BboxParams(format="yolo", min_visibility=0.4, label_fields=[]),
)
val_transform = A.Compose(
    [
        A.LongestMaxSize(max_size=IMAGE_SIZE),
        A.PadIfNeeded(min_height=IMAGE_SIZE, min_width=IMAGE_SIZE, border_mode=0),
        A.Normalize(mean=[0, 0, 0], std=[1, 1, 1], max_pixel_value=255,),
        ToTensorV2(),
    ],
    bbox_params=A.BboxParams(format="yolo", min_visibility=0.4, label_fields=[]),
)
BATCH_SIZE = 16
train_dataset = YOLOv3Dataset(txt_path=TRAIN_CSV_PATH, anchors=ANCHORS, transform=train_transform)
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS, pin_memory=PIN_MEMORY)

val_dataset = YOLOv3Dataset(txt_path=VAL_CSV_PATH, anchors=ANCHORS, transform=val_transform)
val_loader = DataLoader(dataset=val_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Prepara as âncoras para a função de perda
scaled_anchors = (
    torch.tensor(ANCHORS) * torch.tensor([IMAGE_SIZE, IMAGE_SIZE]).view(1, 1, 2)
).to(DEVICE)
scaled_anchors = scaled_anchors.view(3,3,2)

# Lista para armazenar a perda de cada época para o gráfico
epoch_losses = []

for epoch in range(NUM_EPOCHS):
    print(f"\n--- Época {epoch+1} / {NUM_EPOCHS} ---")
    
    mean_loss = train_fn(train_loader, model, optimizer, loss_fn, scaler, scaled_anchors)
    epoch_losses.append(mean_loss)
    
    # Salva o checkpoint ao final de cada época
    if SAVE_MODEL:
        save_checkpoint(model, optimizer, filename=CHECKPOINT_FILE)

print("\nTreinamento concluído!")


--- Época 1 / 100 ---


  3%|▎         | 27/857 [00:11<05:41,  2.43it/s, loss=25.1]


KeyboardInterrupt: 

In [None]:
plt.figure(figsize=(10, 6))
plt.plot(range(1, 20 + 1), epoch_losses, marker='o', linestyle='-')
plt.title('Curva de Perda Durante o Treinamento', fontsize=16)
plt.xlabel('Época', fontsize=12)
plt.ylabel('Perda Média (Mean Loss)', fontsize=12)
plt.grid(True)
plt.xticks(range(1, 20 + 1))
plt.tight_layout()
plt.show()

In [None]:
# Avaliação Final - Cálculo do mAP
# ============================================

model.to(DEVICE)

print("=> Carregando modelo treinado para avaliação final...")
load_checkpoint("/kaggle/output/yolov3_voc.pth.tar", model, optimizer, lr=LEARNING_RATE)

print("\n==> Calculando mAP final no conjunto de validação...")

pred_boxes, true_boxes = get_evaluation_bboxes(
    val_loader, model, iou_threshold=NMS_IOU_THRESH, anchors=ANCHORS, 
    threshold=CONF_THRESHOLD, device=DEVICE
)
map_val = mean_average_precision(
    pred_boxes, true_boxes, iou_threshold=MAP_IOU_THRESH, num_classes=NUM_CLASSES
)

print(f"\n" + "="*30)
print(f"  RESULTADO FINAL DO TREINAMENTO")
print(f"  mAP no conjunto de validação: {map_val:.4f}")
print("="*30)