In [1]:
import os
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.optim import AdamW
from torch.optim.lr_scheduler import OneCycleLR
from ultralytics import YOLO
from tqdm import tqdm
from DataAugmentation import DatasetAugmentation
from torch.utils.data import DataLoader
import torchvision.transforms.v2 as v2
import matplotlib.pyplot as plt
from torchvision.tv_tensors import BoundingBoxFormat, BoundingBoxes
from ultralytics.utils.loss import v8DetectionLoss
from torchvision.transforms.v2 import ConvertBoundingBoxFormat
from torch.optim.lr_scheduler import LinearLR, CosineAnnealingLR, SequentialLR


In [2]:
# Script to debug the dataset structure 
# Evaluation dataset



IMG_SIZE = 640

resize_transform = v2.Compose([
    v2.Resize((IMG_SIZE, IMG_SIZE)),
    v2.SanitizeBoundingBoxes(),
    v2.ToImage(),
])

val_dataset = DatasetAugmentation(
    training_path="augmented_data/val",
    split_images=False,
    perform_transformations=True
)
val_dataset.transforms = resize_transform



In [3]:

# Analyse some statistics of the dataset 
# 1. Number of images
num_images = len(val_dataset)
print(f"Number of images in the validation dataset: {type(num_images)}")
print("Dimnension of the images in the validation dataset:", type(val_dataset[0][0])) # RGB images of 640x640 --> [3, 640, 640]
# se faccio  type(val_dataset[0][0]) ottengo class 'torchvision.tv_tensors._image.Image
# 2. Number of classes
print(f"Number of classes in the validation dataset: {val_dataset[0][1].data}") # Se gli passo val_dataset[0][1] ottengo questo tipo: 
# BoundingBoxes([[362.1333, 360.4000,  58.6667,  87.2000]], format=BoundingBoxFormat.CXCYWH, canvas_size=(640, 640)), mentre se gli passo questo: 
# val_dataset[0][1].data ottengo tensor([[362.1333, 360.4000,  58.6667,  87.2000]])

# 3. Number of bounding boxes
print(f"Number of bounding boxes in the validation dataset: {val_dataset[0][2]}") # 4 bounding boxes in the image


# Print all the bounding boxes in the validation set 
# for i in range(len(val_dataset)):
#     print(f"Image {i}:")
#     print(f"boxes = {val_dataset[i][1]}\n")


# Il codice si spacca all'immagine 51
# print the bounding boxes from image 51, 63, 94, 101, 102, 105
for i in range(1, len(val_dataset)):
    print(f"Image {i}:")
    print(f"boxes = {val_dataset[i][1]}\n")


# ora mi viene un dubbio, che tipo di immagini ci sono nel nostro dataset di validation?

# Display image 51

# Read all the images in "augmented_data/val/images" and store them in a vector, then print the image 51



# Train dataset 

Number of images in the validation dataset: <class 'int'>
Dimnension of the images in the validation dataset: <class 'torchvision.tv_tensors._image.Image'>
Number of classes in the validation dataset: tensor([[362.1333, 360.4000,  58.6667,  87.2000]])
Number of bounding boxes in the validation dataset: tensor([3])
Image 1:
boxes = BoundingBoxes([[356.5333, 360.8000,  51.7333,  59.2000]], format=BoundingBoxFormat.CXCYWH, canvas_size=(640, 640))

Image 2:
boxes = BoundingBoxes([[181.5861, 219.8304,  49.5723,  65.1536],
               [514.1365, 274.9248,  49.7835,  61.2096],
               [441.1200,  82.9248,  37.0848,  74.0096],
               [360.3563, 333.7264,  47.8592,  62.6528],
               [193.1968, 439.6192,  42.2165,  70.0640],
               [439.1691, 512.8960,  41.7163,  73.6864]], format=BoundingBoxFormat.CXCYWH, canvas_size=(640, 640))

Image 3:
boxes = BoundingBoxes([[ 75.4667, 152.4000,  57.0667,  87.2000],
               [514.1334, 424.4000,  55.4667,  80.8000],
  

KeyboardInterrupt: 

In [4]:
import os
from PIL import Image

# Path to your images folder
image_folder = '/Users/emanuelerimoldi/Desktop/Trial/augmented_data/val/images'

# List of image filenames sorted alphabetically
image_files = sorted([file for file in os.listdir(image_folder) if file.lower().endswith(('png', 'jpg', 'jpeg'))])

# Load images into a list
images = [Image.open(os.path.join(image_folder, file)) for file in image_files]

# Check if there are at least 51 images
if len(images) >= 52:
    print(f"51st image filename: {image_files[51]}")
    images[50].show()  # this will display the image
else:
    print("There are fewer than 51 images.")


51st image filename: Choco_000052.png


In [5]:
def collate_fn(batch):
    images, boxes_list, labels_list = zip(*batch) # Unzip the batch, * is used to unpack the list of tuples
    images = torch.stack(images, dim=0) 
    targets = []
    for batch_idx, (bbs, labels) in enumerate(zip(boxes_list, labels_list)):
        # bbs.data is (num_objs, 4) in CXCYWH absolute pixels
        coords = bbs.data  # torch.Tensor
        H, W = bbs.canvas_size
        # normalize [cx,cy,w,h]
        coords = coords / torch.tensor([W, H, W, H], dtype=torch.float32, device=coords.device)
        cls   = labels.to(torch.float32).view(-1,1)
        idxs  = torch.full((coords.size(0),1), batch_idx, dtype=torch.float32, device=coords.device)
        targets.append(torch.cat([idxs, cls, coords], dim=1))
    targets = torch.cat(targets, dim=0)
    return images, {
        "batch_idx": targets[:,0].long(),
        "cls":       targets[:,1],
        "bboxes":    targets[:,2:]
    }

In [6]:

# Parameters and settings
MODEL_PATH = "yolo12.yaml"
OUT_DIR = "output_dir"
device = "cpu"
print(f"Using device: {device}")
IMG_SIZE = 640
BATCH_SIZE = 6
N_EPOCHS = 30

os.makedirs(OUT_DIR, exist_ok=True)

resize_transform = v2.Compose([
    v2.Resize((IMG_SIZE, IMG_SIZE)),
    v2.SanitizeBoundingBoxes(),
    v2.ToImage(),
])

print("Loading datasets...")
train_dataset = DatasetAugmentation(
    training_path="augmented_data/train",
    split_images=False,
    perform_transformations=True
)
train_dataset.transforms = resize_transform


# Print dataset information
print(f"Train dataset size: {len(train_dataset)}")
print(f"Values per training sample: {len(train_dataset[10])}")
#Print the 2nd and 3rd elements of the first sample
print(f"Sample bounding boxes: {train_dataset[10][1]}")
print(f"Sample labels: {train_dataset[10][2]}")


val_dataset = DatasetAugmentation(
    training_path="augmented_data/val",
    split_images=False,
    perform_transformations=True
)
val_dataset.transforms = resize_transform

# Print dataset information
print(f"Validation dataset size: {len(val_dataset)}")
print(f"Values per validation sample: {len(val_dataset[10])}")
# Print the 2nd and 3rd elements of the first sample
print(f"Sample bounding boxes: {val_dataset[10][1]}")
print(f"Sample labels: {val_dataset[10][2]}")


Using device: cpu
Loading datasets...
Train dataset size: 556
Values per training sample: 3
Sample bounding boxes: BoundingBoxes([[204.9067, 264.6832,  48.2133,  64.7264],
               [382.2400, 163.6448,  49.4581,  59.4496],
               [528.0181, 242.9744,  47.3248,  58.1088],
               [487.0048, 384.3392,  38.5419,  76.8400],
               [364.7648, 338.7920,  45.7952,  65.7456],
               [356.6752, 537.5936,  36.3733,  74.5472],
               [194.6134, 475.2176,  50.3819,  51.6608]], format=BoundingBoxFormat.CXCYWH, canvas_size=(640, 640))
Sample labels: tensor([5, 6, 5, 4, 6, 4, 4])
Validation dataset size: 147
Values per validation sample: 3
Sample bounding boxes: BoundingBoxes([[291.2245, 205.3696,  56.9845,  86.9888],
               [284.6859, 335.8320,  59.1627,  89.3328],
               [377.9979, 273.2032,  60.3328,  89.9744],
               [358.2325, 401.3488,  53.0795,  78.9296]], format=BoundingBoxFormat.CXCYWH, canvas_size=(640, 640))
Sample labels

In [7]:
print("Creating dataloaders...")
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=0,  # use 0 for notebooks; increase if your environment supports it
    pin_memory=True,
    collate_fn=collate_fn,
    persistent_workers=False
)






val_loader = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=0,
    pin_memory=True,
    collate_fn=collate_fn,
    persistent_workers=False
)


Creating dataloaders...


In [8]:
print(f"Number of batches in train_loader: {len(train_loader)}")

# Prendi un batch di esempio
for batch_idx, batch in enumerate(train_loader):
    print(f"\nBatch {batch_idx}:")
    
    # Se il batch è una tupla (es. immagini e target)
    if isinstance(batch, (list, tuple)):
        for i, elem in enumerate(batch):
            # Se l'elemento è un tensore, stampa forma e tipo
            if isinstance(elem, torch.Tensor):
                print(f"  Element {i} - Tensor shape: {elem.shape}, dtype: {elem.dtype}")
            # Se è un dizionario o altro, stampa tipo e dimensioni se possibile
            elif isinstance(elem, dict):
                print(f"  Element {i} - Dict with keys: {list(elem.keys())}")
                for k, v in elem.items():
                    if isinstance(v, torch.Tensor):
                        print(f"    Key '{k}': shape {v.shape}, dtype {v.dtype}")
            else:
                print(f"  Element {i} - Type: {type(elem)}")
    else:
        # Se batch non è tupla/list, prova a stampare tipo e shape
        if isinstance(batch, torch.Tensor):
            print(f"Batch is a tensor with shape {batch.shape}, dtype {batch.dtype}")
        else:
            print(f"Batch type: {type(batch)}")

    # Dopo il primo batch esci dal ciclo (se vuoi vedere solo il primo)
    break


Number of batches in train_loader: 93

Batch 0:
  Element 0 - Tensor shape: torch.Size([6, 3, 640, 640]), dtype: torch.uint8
  Element 1 - Dict with keys: ['batch_idx', 'cls', 'bboxes']
    Key 'batch_idx': shape torch.Size([40]), dtype torch.int64
    Key 'cls': shape torch.Size([40]), dtype torch.float32
    Key 'bboxes': shape torch.Size([40, 4]), dtype torch.float32


In [9]:

print("Initializing model...")
model_yolo = YOLO(MODEL_PATH)
model = model_yolo.model
num_params_all = sum(p.numel() for p in model.parameters())
print(f"Total number of parameters in the model: {num_params_all}")
model = model.to(device)


Initializing model...
Total number of parameters in the model: 2570583


In [10]:

print(model)


DetectionModel(
  (model): Sequential(
    (0): Conv(
      (conv): Conv2d(3, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (bn): BatchNorm2d(16, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
      (act): SiLU(inplace=True)
    )
    (1): Conv(
      (conv): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (bn): BatchNorm2d(32, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
      (act): SiLU(inplace=True)
    )
    (2): C3k2(
      (cv1): Conv(
        (conv): Conv2d(32, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn): BatchNorm2d(32, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (act): SiLU(inplace=True)
      )
      (cv2): Conv(
        (conv): Conv2d(48, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn): BatchNorm2d(64, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (act): SiLU(inplace=True)
      )
   

In [11]:

print("Initializing optimizer and scheduler...")
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3, momentum=0.9, weight_decay=5e-4, nesterov=True)
total_steps = N_EPOCHS * len(train_loader)
warmup_steps = int(0.3 * total_steps)
sched1 = LinearLR(optimizer, start_factor=1e-3, total_iters=warmup_steps)
sched2 = CosineAnnealingLR(optimizer, T_max=total_steps - warmup_steps, eta_min=1e-6)
scheduler = SequentialLR(optimizer, [sched1, sched2], [warmup_steps])


Initializing optimizer and scheduler...


In [12]:
class FixedDict(dict):
    def __init__(self, d):
        self.__dict__.update(d)

In [13]:
compute_loss = v8DetectionLoss(model)
compute_loss.hyp = FixedDict(compute_loss.hyp)


loss_history_train = []
loss_history_val = []
best_val_loss = float("inf")
best_epoch = 0


In [16]:
print(f"Starting training using device: {device} ...")
for epoch in range(1, 3):
    print(f"Epoch {epoch}/{N_EPOCHS}")
    model.train()
    train_loss = 0.0

    for batch_idx, (imgs, targets) in enumerate(train_loader):
        imgs = imgs.to(device, non_blocking=True).float() / 255.0
        targets = {k: v.to(device, non_blocking=True) for k, v in targets.items()}

        # Stampa il tipo e la forma di ciascun target
        #for k, v in targets.items():
           # print(f"Type of targets['{k}']: {type(v)}")

        optimizer.zero_grad()
        preds = model(imgs)

        loss, _ = compute_loss(preds, targets)
        loss = loss.sum() / imgs.shape[0]

        print(f"Batch {batch_idx} loss before backward: {loss.item()}")

        loss.backward()
        optimizer.step()
        scheduler.step()

        train_loss += loss.item()

    # Valutazione su validation set solo se non è stato fatto break nel training
    model.eval()
    val_loss = 0.0

    with torch.no_grad():
        for batch_idx, (imgs, targets) in enumerate(val_loader):
            imgs = imgs.to(device, non_blocking=True).float() / 255.0
            targets = {k: v.to(device, non_blocking=True) for k, v in targets.items()}

            preds = model(imgs)

            loss, _ = compute_loss(preds, targets)
            loss = loss.sum() / imgs.shape[0]

            val_loss += loss.item()

    avg_train_loss = train_loss / len(train_loader)
    avg_val_loss = val_loss / len(val_loader)
    loss_history_train.append(avg_train_loss)
    loss_history_val.append(avg_val_loss)

    print(f"Epoch {epoch} - Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        best_epoch = epoch
        best_model_state = model.state_dict()
        print(f"New best model at epoch {epoch} with val loss {best_val_loss:.4f}")


Starting training using device: cpu ...
Epoch 1/30
Type of targets['batch_idx']: <class 'torch.Tensor'>
Shape of targets['batch_idx']: torch.Size([23]), dtype: torch.int64
Type of targets['cls']: <class 'torch.Tensor'>
Shape of targets['cls']: torch.Size([23]), dtype: torch.float32
Type of targets['bboxes']: <class 'torch.Tensor'>
Shape of targets['bboxes']: torch.Size([23, 4]), dtype: torch.float32
Batch 0 loss before backward: 7.445131778717041
Type of targets['batch_idx']: <class 'torch.Tensor'>
Shape of targets['batch_idx']: torch.Size([36]), dtype: torch.int64
Type of targets['cls']: <class 'torch.Tensor'>
Shape of targets['cls']: torch.Size([36]), dtype: torch.float32
Type of targets['bboxes']: <class 'torch.Tensor'>
Shape of targets['bboxes']: torch.Size([36, 4]), dtype: torch.float32
Batch 1 loss before backward: 5.38627290725708


KeyboardInterrupt: 

In [None]:

print("Training complete. Saving best model...")
torch.save(best_model_state, os.path.join(OUT_DIR, "best.pt"))
torch.save(model.state_dict(), os.path.join(OUT_DIR, "last.pt"))
torch.save({
    "train_loss": loss_history_train,
    "val_loss": loss_history_val,
    "best_epoch": best_epoch,
}, os.path.join(OUT_DIR, "plot_data.pt"))

print("Plotting loss curves...")
epochs = range(1, N_EPOCHS + 1)
plt.figure(figsize=(10, 5))
plt.plot(epochs, loss_history_train, label="Train Loss")
plt.plot(epochs, loss_history_val, label="Val Loss")
plt.axvline(best_epoch, linestyle="--", color="red", label=f"Best Epoch: {best_epoch}")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Training and Validation Loss")
plt.legend()
plt.show()


Training complete. Saving best model...


NameError: name 'best_model_state' is not defined