In [None]:
cd drive/MyDrive/Hands-on-CV-Project2

/content/drive/MyDrive/Hands-on-CV-Project2


In [None]:
# Mount Google Drive into the Colab VM at /content/drive so the project folder
# and dataset stored on Drive are reachable through the local filesystem.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip -q install fiftyone
!pip -q install wandb


In [None]:
# Authenticate with Weights & Biases. The return value of `wandb.login()` is the
# cell's last expression, so it is displayed as the cell output.
import wandb

wandb.login()


[34m[1mwandb[0m: Currently logged in as: [33mconstantin-auga[0m ([33mConscht-Sht[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

1. Imports and Data Loading

In [None]:
import pandas as pd
import torch
from pathlib import Path

from src.models import LateFusionClassifier, IntermediateFusionClassifier
from src.training import train_task3

from src.datasets import AssessmentPairs, train_val_split, AssessmentTorchDataset, stratified_subsample, class_counts
from torch.utils.data import DataLoader
# Location of the assessment dataset on the mounted Google Drive.
ROOT = Path("/content/drive/MyDrive/Hands-on-CV-Project2/assessment")

# Load all sample pairs, then draw a 10% subsample with `stratified_subsample`.
pairs = AssessmentPairs(ROOT).load_pairs()
pairs_10 = stratified_subsample(pairs, frac=0.10, seed=42)

# Hold out 20% of the subsample for validation.
train_pairs, val_pairs = train_val_split(pairs_10, val_ratio=0.2, seed=42)

train_ds, val_ds = (
    AssessmentTorchDataset(split) for split in (train_pairs, val_pairs)
)

# Settings shared by both loaders; only the training loader shuffles.
loader_kwargs = dict(batch_size=64, num_workers=2, pin_memory=True)
train_loader = DataLoader(train_ds, shuffle=True, **loader_kwargs)
val_loader = DataLoader(val_ds, shuffle=False, **loader_kwargs)

# Report the size and per-class counts at every stage of the pipeline.
for label, subset in [
    ("ALL:", pairs),
    ("10%:", pairs_10),
    ("train:", train_pairs),
    ("val  :", val_pairs),
]:
    print(label, len(subset), class_counts(subset))



ALL: 19998 {'cubes': 9999, 'spheres': 9999}
10%: 2000 {'cubes': 1000, 'spheres': 1000}
train: 1600 {'cubes': 810, 'spheres': 790}
val  : 400 {'cubes': 190, 'spheres': 210}


2. Prepare Training

In [None]:
# Pull a single training batch and read the channel counts off the data instead
# of hard-coding them; index 1 of each entry's shape is taken as the channel count
# (presumably a channels-first layout — confirm against AssessmentTorchDataset).
batch = next(iter(train_loader))
rgb_in_ch, lidar_in_ch = (
    batch[modality].shape[1] for modality in ("rgb", "lidar")
)
print("rgb_in_ch:", rgb_in_ch, "lidar_in_ch:", lidar_in_ch)


rgb_in_ch: 4 lidar_in_ch: 1


3. Run all models

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"
EPOCHS = 40
LR = 1e-3
EMB = 200
SEED = 42  # same seed value as the subsample / train-val split

# Metrics every run must report; a missing key raises KeyError.
REQUIRED_METRICS = ["val_loss_final", "params", "sec_per_epoch_avg", "gpu_mem_mb_peak"]


def summarize_run(name, out):
    """Build one row of the comparison table from a training result.

    Args:
        name: Label stored in the "model" column.
        out: Mapping returned by ``train_task3``. It must contain every key in
            ``REQUIRED_METRICS``; "val_f1_final" is copied only when present.

    Returns:
        dict with "model", the required metrics and, if available, "val_f1_final".

    Raises:
        KeyError: if a required metric is missing from ``out``.
    """
    row = {"model": name, **{key: out[key] for key in REQUIRED_METRICS}}
    if "val_f1_final" in out:
        row["val_f1_final"] = out["val_f1_final"]
    return row


# Constructor arguments shared by every architecture.
# Classification task -> embedding normalization OFF for all models.
common_kwargs = dict(
    rgb_in_ch=rgb_in_ch,
    lidar_in_ch=lidar_in_ch,
    emb_size=EMB,
    num_classes=2,
    normalize_embeddings=False,
)

# (label, model class, model-specific kwargs) for each experiment, in run order:
# late fusion first, then the three intermediate-fusion variants.
experiments = [("late_fusion", LateFusionClassifier, {})]
experiments += [
    (f"intermediate_{fusion}", IntermediateFusionClassifier, {"fusion": fusion})
    for fusion in ["concat", "add", "hadamard"]
]

runs = []
for name, model_cls, extra_kwargs in experiments:
    # Re-seed before building each model so every run is reproducible and the
    # variants are compared under the same random state.
    torch.manual_seed(SEED)
    m = model_cls(**extra_kwargs, **common_kwargs)
    out = train_task3(m, train_loader, val_loader, device=device, epochs=EPOCHS, lr=LR)
    runs.append(summarize_run(name, out))

# Best (lowest final validation loss) model first.
df = pd.DataFrame(runs).sort_values("val_loss_final")
df


epoch 01 | train 0.6994 | val 0.6869 | f1 0.316 | 160.78s | 528 MB
epoch 02 | train 0.5844 | val 0.5291 | f1 0.723 | 11.11s | 528 MB
epoch 03 | train 0.5243 | val 0.4958 | f1 0.751 | 12.47s | 528 MB
epoch 04 | train 0.4877 | val 0.4786 | f1 0.785 | 12.39s | 528 MB
epoch 05 | train 0.4057 | val 0.4282 | f1 0.797 | 11.91s | 528 MB
epoch 06 | train 0.3906 | val 0.4402 | f1 0.802 | 11.65s | 528 MB
epoch 07 | train 0.3546 | val 0.4151 | f1 0.811 | 11.59s | 528 MB
epoch 08 | train 0.2600 | val 0.3628 | f1 0.860 | 11.83s | 528 MB
epoch 09 | train 0.2204 | val 0.2394 | f1 0.914 | 11.68s | 528 MB
epoch 10 | train 0.1605 | val 0.2728 | f1 0.887 | 11.66s | 528 MB
epoch 11 | train 0.1364 | val 0.1899 | f1 0.925 | 11.57s | 528 MB
epoch 12 | train 0.1094 | val 0.1655 | f1 0.932 | 11.45s | 528 MB
epoch 13 | train 0.0765 | val 0.0924 | f1 0.970 | 11.51s | 528 MB
epoch 14 | train 0.0716 | val 0.6833 | f1 0.849 | 11.11s | 528 MB
epoch 15 | train 0.2769 | val 0.2033 | f1 0.906 | 11.50s | 528 MB
epoch 16 

Unnamed: 0,model,val_loss_final,params,sec_per_epoch_avg,gpu_mem_mb_peak,val_f1_final
2,intermediate_add,4e-06,699236,11.440498,486.099121,1.0
0,late_fusion,2.6e-05,1305536,15.286475,527.566895,1.0
1,intermediate_concat,2.7e-05,879236,11.48406,667.48877,1.0
3,intermediate_hadamard,0.032418,699236,11.353085,498.85498,0.997493


4. Export Results

In [None]:
# Emit the comparison table as Markdown text, omitting the DataFrame index.
results_markdown = df.to_markdown(index=False)
print(results_markdown)


| model                 |   val_loss_final |   params |   sec_per_epoch_avg |   gpu_mem_mb_peak |   val_f1_final |
|:----------------------|-----------------:|---------:|--------------------:|------------------:|---------------:|
| intermediate_add      |      4.47546e-06 |   699236 |             11.4405 |           486.099 |       1        |
| late_fusion           |      2.55917e-05 |  1305536 |             15.2865 |           527.567 |       1        |
| intermediate_concat   |      2.71426e-05 |   879236 |             11.4841 |           667.489 |       1        |
| intermediate_hadamard |      0.0324183   |   699236 |             11.3531 |           498.855 |       0.997493 |


We can see that intermediate_add, intermediate_concat, and late_fusion all perform essentially perfectly on the validation set. Intermediate_hadamard is slightly worse, with a final validation loss of about 0.032 and an F1 score of 99.75%.

It is expected that add and concat outperform the hadamard fusion. If we look at the data, the LiDAR dataset displays quite clean spheres and cubes, while the RGB dataset is relatively noisy. With Hadamard fusion, the noisy RGB features are multiplied with the cleaner LiDAR features, which can lead to a loss of information. This is not the case for concat or add. During late fusion, this is also less of an issue, since the fusion happens in deeper layers after each modality has already learned more robust features.

We can also see that intermediate_add and intermediate_hadamard are the smallest models (699,236 parameters each) and have the lowest GPU memory peaks (about 486 MB and 499 MB, respectively). Concat is slightly larger (879,236 parameters) due to the doubled feature channels at the fusion point, and late_fusion is noticeably larger overall (1,305,536 parameters) because it maintains two separate encoders throughout the network.

Concat had the highest GPU memory peak, which can be attributed to the larger feature maps created by concatenation.

Finally, it is worth mentioning that concat was the fastest model to converge to a perfect score, reaching a validation loss of approximately 0 and an F1 score of 1 after around 20 epochs. Late fusion and intermediate_add required more training, converging after approximately 30 and 36 epochs, respectively. It makes sense that concat is the easiest variant to optimize, since this fusion strategy keeps the less noisy LiDAR features clearly separated from the noisier RGB ones.