In [None]:
%%capture
%load_ext autoreload
%autoreload 2
import sys
from pathlib import Path

import torch
from hydra import compose, initialize
from yolo import Config, PostProcess, create_converter, create_model, draw_bboxes
from yolo.utils.model_utils import get_device


# Make the parent of the current working directory importable.
project_root = Path().resolve().parent
sys.path.append(str(project_root))

# NOTE: this cell starts with %%capture, so the printed path is captured, not displayed.
print(project_root)

CONFIG_PATH = "../YOLO/yolo/config"
CONFIG_NAME = "config"
CLASS_NUM = 80

# Input size used for the resize augmentation and for the converter below.
image_size = (640, 640)

# Compose the Hydra config once; `cfg` is read by most of the following cells.
with initialize(config_path=CONFIG_PATH, version_base=None, job_name="notebook_job"):
    cfg: Config = compose(config_name=CONFIG_NAME)

device, _ = get_device(cfg.device)
model = create_model(
    cfg.model,
    class_num=CLASS_NUM,
    # weight_path=<path to a local checkpoint, e.g. weights/v9-c.pt>,
)
model = model.to(device)

# Later cells call `converter(...)` and `post_proccess(...)`; define them here so the
# notebook survives Restart & Run All instead of relying on leftover kernel state.
# NOTE(review): argument order follows the upstream YOLO example notebook --
# confirm against the installed `yolo` version.
converter = create_converter(cfg.model.name, model, cfg.model.anchor, image_size, device)

# NOTE(review): assumes the NMS settings live under `cfg.task.nms` for
# inference/validation tasks and under `cfg.task.validation.nms` for the train task -- confirm.
nms_cfg = cfg.task.nms if "nms" in cfg.task else cfg.task.validation.nms
post_proccess = PostProcess(converter, nms_cfg)


    

In [None]:
# NOTE(review): `get_config_for_notebook` and `get_transform_fn` are not imported in any
# visible cell -- presumably project helpers; confirm where they come from.
my_config = get_config_for_notebook("/Users/simone.bonato/Desktop/ecolution/IFC_DL/src/ifc_dl/conf")

# Augmentation spec: resize every image to `image_size`, then a horizontal flip with p=0.5.
# NOTE(review): interpolation=3 is presumably an OpenCV interpolation flag -- confirm.
augs = dict(
    resize=dict(
        params=dict(height=image_size[0], width=image_size[1], interpolation=3),
        all_datasets=True,
    ),
    horizontal_flip=dict(params=dict(p=0.5)),
)

transforms, val_transform = get_transform_fn(augs)

In [None]:
# Build the "train" partition of the mock COCO dataset and a datamodule that
# shares the same training transforms.
dataset = MockCOCODataset(partition="train", transforms=transforms)
datamodule = MockCocoDataModule(transforms=transforms, val_transform=val_transform)

# Take the sample at index 1 as an (input, target) pair.
x, y = dataset[1]

# Visual sanity check of the sample and its annotations.
plot_instance_segmentation_data(x, y)

In [None]:
# Add a leading batch dimension and move the input to the model's device:
# the model was moved to `device`, so a CPU input would fail on a non-CPU device.
model_output = model(x[None].to(device))

# Show the keys of the returned mapping.
model_output.keys()

In [None]:
# Prepare and set up the datamodule before asking it for dataloaders.
datamodule.prepare_data()
datamodule.setup()

# Dataloaders for the training and validation splits.
train_dl = datamodule.train_dataloader()
val_dl = datamodule.val_dataloader()

In [None]:
# Pull one batch from the training dataloader (this rebinds `x` and `y`).
x, y = next(iter(train_dl))

# The batch inputs arrive as a sequence of tensors; stack them into one tensor.
x = torch.stack(x)

# Move the batch to the model's device for the forward pass; `x` itself stays where it was.
model_output = model(x.to(device))

In [None]:
# Run the converter on the raw "Main" outputs of the model.
c = converter(model_output["Main"])

# Shape of the first element returned by the converter.
c[0].shape

But why is the shape like this? (It was `[10, 8400, 80]` when I wrote this, and the images had a shape of 640x640.)

Because we get one prediction for every possible anchor box.
In the anchor config we had strides of `[8, 16, 32]`; if you divide the original image into square cells of these sizes, the total number of cells is 8400.

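$$(640 / 8)^2 + (640 / 16)^2 + (640 / 32)^2 = 80^2 + 40^2 + 20^2 = 6400 + 1600 + 400 = 8400$$

The first term counts the anchors with stride 8, the second those with stride 16, and the third those with stride 32.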

### Notes on the shapes of the output

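There are 3 tuples, one for each resolution (60x80, 30x40, 15x20, which corresponds to a 480x640 input; for the 640x640 input used in this notebook the feature maps are 80x80, 40x40, 20x20).
Within each tuple we have predictions for: class, objects and bounding boxes.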

Taking the first tuple as an example:

**Class predictions**:
- 80 is the number of object classes
- 60x80 is the feature map size 

**Object predictions**:
- 1: batch size
- 16: `reg_max = 16` → number of bins
- 4: the 4 box coordinates: [left, top, right, bottom]
- 80, 80: grid size (spatial locations)

**Bounding box predictions**:
- 4 channels for bounding boxes
- 60x80 is the feature map size 

In [None]:
# Print the shape of every tensor in the "Main" and "AUX" outputs,
# with a blank line after each inner group.
for header, head_key in (("Main output shapes:", "Main"), ("\nAUX output shapes:", "AUX")):
    print(header)
    for group in model_output[head_key]:
        for tensor in group:
            print(tensor.shape)
        print()

In [None]:
# Run the post-processing step on the full model output.
post_processed = post_proccess(model_output)

# Print the shape of each returned element (presumably one per image -- TODO confirm).
for pred in post_processed:
    print(pred.shape)

In [None]:
# Draw the post-processed predictions on the inputs, labelling boxes with the dataset's class list.
draw_bboxes(x, post_processed, idx2label=cfg.dataset.class_list)

### Notes on the loss function, the **DUAL LOSS**

It is made of the following losses:
- BCELoss (Binary Cross-Entropy Loss)
- DFLoss (Distribution Focal Loss)
- BoxLoss

#### What should the loss inputs look like?

- `aux_predicts`: the converted outputs of the model for the `"AUX"` key. The model outputs raw vectors, and the converter is used to turn them into normal boxes.
- `main_predicts`: same as `aux_predicts`, but for the `"Main"` key.
- `targets`: the ground-truth class and bounding-box information as a tensor of size `[batch x n_targets x 5]`. Since we must consider all the images in the batch, and they might have different numbers of elements, the "non-elements" must be labeled with `-1` to keep the shape consistent. Hence `n_targets` should be the maximum number of elements in any image of the batch (we will have to write a custom `collate_fn` for this in the dataloader).

```
aux_predicts = converter(model_output["AUX"])
main_predicts = converter(model_output["Main"])

loss_val = loss(
    aux_predicts,
    main_predicts,
    targets=yolo_style_loss,
)
```

In [None]:
# Loss factory from the yolo package.
from yolo.tools.loss_functions import create_loss_function

# Build the loss from the composed config and the converter.
loss = create_loss_function(cfg, converter)

# Display the loss object.
loss

In [None]:
pred_scale_idx = 0

# Batch size is read from the first tensor of the selected prediction scale.
batch_size = model_output["Main"][pred_scale_idx][0].shape[0]

# One class column (all zeros, i.e. class 0) per ground-truth box, sized from the
# boxes themselves rather than a hard-coded count.
# NOTE(review): this indexes `y` as a single-sample target dict; the batch cell
# rebinds `y`, so confirm `y["boxes"]` still resolves when run top to bottom.
class_vec = torch.zeros(y["boxes"].shape[0], 1)

# Targets laid out as [batch, n_targets, 5] with rows [class, 4 box coords]; the same
# sample's targets are repeated for every image in the batch and moved to the
# device the model (and therefore its predictions) lives on.
yolo_style_loss = (
    torch.cat([class_vec, y["boxes"]], dim=1)[None]
    .repeat(batch_size, 1, 1)
    .to(device)
)

# Convert both heads' raw outputs before passing them to the loss.
aux_predicts = converter(model_output["AUX"])
main_predicts = converter(model_output["Main"])

loss_val = loss(
    aux_predicts,
    main_predicts,
    targets=yolo_style_loss,
)

loss_val

### Notes on the dataloader

Our dataloader should return:
- the `images`, as always normalised and with shape `BATCH_SIZE, 3, H, W`
- the `batch_targets`, which is a tensor of shape `BATCH_SIZE, max_annotations_in_batch_image, 5`. The 5 values are the object class (`-1` for filler values) and the 4 bbox coordinates (probably `x_min, y_min, x_max, y_max`). Hence `[class, x_min, y_min, x_max, y_max]`


Extra:
- `rev_tensor` is a tensor that their augmentation class can use to reverse the augmentation. We probably do not need this one.

In [None]:
# Dataloader factory from the yolo package.
from yolo.tools.data_loader import create_dataloader

# Build a dataloader from the task's data config, the dataset config and `cfg.task.task`,
# then pull a single batch to inspect what it returns.
dl = create_dataloader(cfg.task.data, cfg.dataset, cfg.task.task)
batch = next(iter(dl))