In [8]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [9]:
cd drive/MyDrive/Hands-on-CV-Project2

[Errno 2] No such file or directory: 'drive/MyDrive/Hands-on-CV-Project2'
/content/drive/MyDrive/Hands-on-CV-Project2


In [10]:
!pip -q install fiftyone
!pip -q install wandb


In [11]:
import wandb

wandb.login()




False

1. Imports

In [12]:
import pandas as pd
import torch
from pathlib import Path

from src.models import LateFusionClassifier, IntermediateFusionClassifier
from src.training import train_task3

from src.datasets import AssessmentPairs, train_val_split, AssessmentTorchDataset, stratified_subsample, class_counts
from torch.utils.data import DataLoader
ROOT = Path("/content/drive/MyDrive/Hands-on-CV-Project2/assessment")

pairs = AssessmentPairs(ROOT).load_pairs()
pairs_10 = stratified_subsample(pairs, frac=0.10, seed=42)

train_pairs, val_pairs = train_val_split(pairs_10, val_ratio=0.2, seed=42)

train_ds = AssessmentTorchDataset(train_pairs)
val_ds   = AssessmentTorchDataset(val_pairs)

train_loader = DataLoader(train_ds, batch_size=64, shuffle=True, num_workers=2, pin_memory=True)
val_loader   = DataLoader(val_ds, batch_size=64, shuffle=False, num_workers=2, pin_memory=True)

print("ALL:", len(pairs), class_counts(pairs))
print("10%:", len(pairs_10), class_counts(pairs_10))
print("train:", len(train_pairs), class_counts(train_pairs))
print("val  :", len(val_pairs), class_counts(val_pairs))



ALL: 19998 {'cubes': 9999, 'spheres': 9999}
10%: 2000 {'cubes': 1000, 'spheres': 1000}
train: 1600 {'cubes': 810, 'spheres': 790}
val  : 400 {'cubes': 190, 'spheres': 210}


2. Prepare Training

In [13]:
batch = next(iter(train_loader))
rgb_in_ch = batch["rgb"].shape[1]
lidar_in_ch = batch["lidar"].shape[1]
print("rgb_in_ch:", rgb_in_ch, "lidar_in_ch:", lidar_in_ch)


rgb_in_ch: 4 lidar_in_ch: 1


3. Run all models

In [14]:
import wandb

device = "cuda" if torch.cuda.is_available() else "cpu"
EPOCHS = 40
LR = 1e-3
EMB = 200

runs = []

project = "cilp-extended-assessment"

# Late fusion
run = wandb.init(
    project=project,
    name="task3_late_fusion",
    config={
        "task": 3,
        "model": "late_fusion",
        "fusion": "late",
        "epochs": EPOCHS,
        "lr": LR,
        "emb": EMB,
        "batch_size": train_loader.batch_size,
        "subset_frac": 0.10,
        "rgb_in_ch": rgb_in_ch,
        "lidar_in_ch": lidar_in_ch,
        "optimizer": "AdamW",
    },
    reinit=True,
)

m = LateFusionClassifier(
    rgb_in_ch=rgb_in_ch,
    lidar_in_ch=lidar_in_ch,
    emb_size=EMB,
    num_classes=2,
    normalize_embeddings=False,
)

out = train_task3(m, train_loader, val_loader, device=device, epochs=EPOCHS, lr=LR, wandb_run=run, ckpt_path="/content/drive/MyDrive/Hands-on-CV-Project2/fusion")
run.finish()

runs.append({"model": "late_fusion", **{k: out[k] for k in ["val_loss_final","val_f1_final","params","sec_per_epoch_avg","gpu_mem_mb_peak"]}})

# Intermediate variants
for fusion in ["concat", "add", "hadamard"]:
    run = wandb.init(
        project=project,
        name=f"task3_intermediate_{fusion}",
        config={
            "task": 3,
            "model": f"intermediate_{fusion}",
            "fusion": "intermediate",
            "fusion_op": fusion,
            "epochs": EPOCHS,
            "lr": LR,
            "emb": EMB,
            "batch_size": train_loader.batch_size,
            "subset_frac": 0.10,
            "rgb_in_ch": rgb_in_ch,
            "lidar_in_ch": lidar_in_ch,
            "optimizer": "AdamW",
        },
        reinit=True,
    )

    m = IntermediateFusionClassifier(
        fusion=fusion,
        rgb_in_ch=rgb_in_ch,
        lidar_in_ch=lidar_in_ch,
        emb_size=EMB,
        num_classes=2,
        normalize_embeddings=False,
    )

    out = train_task3(m, train_loader, val_loader, device=device, epochs=EPOCHS, lr=LR, wandb_run=run, ckpt_path="/content/drive/MyDrive/Hands-on-CV-Project2/fusion")
    run.finish()

    runs.append({"model": f"intermediate_{fusion}", **{k: out[k] for k in ["val_loss_final","val_f1_final","params","sec_per_epoch_avg","gpu_mem_mb_peak"]}})
    print("wandb run:", wandb.run.name, wandb.run.id)



epoch 01 | train 0.7153 | val 0.6959 | f1 0.322 | 11.52s | 528 MB
epoch 02 | train 0.6924 | val 0.6823 | f1 0.322 | 11.76s | 528 MB
epoch 03 | train 0.5427 | val 0.5167 | f1 0.713 | 11.31s | 528 MB
epoch 04 | train 0.4922 | val 0.4710 | f1 0.779 | 11.73s | 528 MB
epoch 05 | train 0.4283 | val 0.4166 | f1 0.798 | 11.81s | 528 MB
epoch 06 | train 0.3297 | val 0.3545 | f1 0.850 | 12.14s | 528 MB
epoch 07 | train 0.2886 | val 0.3165 | f1 0.874 | 12.03s | 528 MB
epoch 08 | train 0.2027 | val 0.2870 | f1 0.884 | 11.60s | 528 MB
epoch 09 | train 0.1980 | val 0.2904 | f1 0.871 | 11.15s | 528 MB
epoch 10 | train 0.1314 | val 0.1244 | f1 0.960 | 11.31s | 528 MB
epoch 11 | train 0.1454 | val 0.1651 | f1 0.942 | 11.76s | 528 MB
epoch 12 | train 0.0701 | val 0.0915 | f1 0.955 | 11.41s | 528 MB
epoch 13 | train 0.0481 | val 0.1128 | f1 0.960 | 11.60s | 528 MB
epoch 14 | train 0.0377 | val 0.0289 | f1 0.992 | 10.76s | 528 MB
epoch 15 | train 0.0331 | val 0.0235 | f1 0.995 | 11.64s | 528 MB
epoch 16 |

0,1
epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
perf/gpu_mem_mb_peak,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
perf/sec_per_epoch,▅▆▄▆▆█▇▅▃▄▆▅▅▁▆▇▆▂▂▄▅▅▂▅▇▅▇▆▁▄▆▃▂▁▄▄▄▂▂▄
train/loss,██▆▆▅▄▄▃▃▂▂▂▁▁▁▁▁▁▁▄▂▁▁▁▁▁▁▁▁▁▁▃▂▁▁▁▁▁▁▁
val/f1_macro,▁▁▅▆▆▆▇▇▇█▇████████▇███████████▇████████
val/loss,██▆▆▅▅▄▄▄▂▃▂▂▁▁▁▁▁▂▃▂▁▁▁▁▁▁▁▁▁▁▂▂▁▁▁▂▁▁▁

0,1
epoch,40
gpu_mem_mb_peak,527.76172
params,1305536
perf/gpu_mem_mb_peak,527.76172
perf/sec_per_epoch,11.34554
sec_per_epoch_avg,11.4015
train/loss,0.00162
val/f1_macro,0.99498
val/loss,0.01033
val_f1_final,0.99498


epoch 01 | train 0.7155 | val 0.6960 | f1 0.322 | 11.20s | 667 MB
epoch 02 | train 0.6834 | val 0.6575 | f1 0.368 | 12.45s | 667 MB
epoch 03 | train 0.5890 | val 0.6534 | f1 0.322 | 11.56s | 667 MB
epoch 04 | train 0.5381 | val 0.5315 | f1 0.709 | 11.49s | 667 MB
epoch 05 | train 0.5076 | val 0.5133 | f1 0.736 | 11.30s | 667 MB
epoch 06 | train 0.4852 | val 0.5781 | f1 0.677 | 10.99s | 667 MB
epoch 07 | train 0.4821 | val 0.4826 | f1 0.760 | 11.45s | 667 MB
epoch 08 | train 0.4506 | val 0.4706 | f1 0.770 | 11.62s | 667 MB
epoch 09 | train 0.4250 | val 0.4473 | f1 0.773 | 11.82s | 667 MB
epoch 10 | train 0.3538 | val 0.3741 | f1 0.841 | 11.26s | 667 MB
epoch 11 | train 0.3534 | val 0.3720 | f1 0.848 | 11.00s | 667 MB
epoch 12 | train 0.3045 | val 0.3368 | f1 0.856 | 11.59s | 667 MB
epoch 13 | train 0.3031 | val 0.3103 | f1 0.880 | 11.63s | 667 MB
epoch 14 | train 0.2366 | val 0.2594 | f1 0.906 | 11.47s | 667 MB
epoch 15 | train 0.1805 | val 0.2915 | f1 0.874 | 11.40s | 667 MB
epoch 16 |

0,1
epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
perf/gpu_mem_mb_peak,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
perf/sec_per_epoch,▃█▅▄▄▃▄▅▆▄▃▅▅▄▄▂▃▅▄▃▂▅▆▅▅▃▃▄▄▅▃▂▃▄▃▁▄▃▃▂
train/loss,██▇▆▆▆▆▅▅▄▄▄▄▃▃▄▃▄▄▄▂▂▂▁▁▁▁▃▁▁▁▁▁▁▁▁▁▁▁▁
val/f1_macro,▁▁▁▅▅▅▆▆▆▆▆▇▇▇▇▇▄▇▅▇▇█████▆█████▇███████
val/loss,▇▇▇▆▆▆▅▅▅▄▄▄▄▃▄▄█▄▅▃▂▂▁▁▁▁█▂▁▁▁▁▃▁▁▁▁▁▁▁

0,1
epoch,40
gpu_mem_mb_peak,667.48877
params,879236
perf/gpu_mem_mb_peak,667.48877
perf/sec_per_epoch,10.79211
sec_per_epoch_avg,11.31003
train/loss,6e-05
val/f1_macro,0.99749
val/loss,0.00765
val_f1_final,0.99749


AttributeError: 'NoneType' object has no attribute 'name'

4. Export Results

In [None]:
print(df.to_markdown(index=False))


We can see that intermediate_add, intermediate_concat, and late_fusion basically all perform perfectly on the dataset. Intermediate_hadamard is slightly worse, with a loss of around 3% and an F1 score of 99.97%.

It is expected that add and concat outperform the hadamard fusion. If we look at the data, the LiDAR dataset displays quite clean spheres and cubes, while the RGB dataset is relatively noisy. With Hadamard fusion, the noisy RGB features are multiplied with the cleaner LiDAR features, which can lead to a loss of information. This is not the case for concat or add. During late fusion, this is also less of an issue, since the fusion happens in deeper layers after each modality has already learned more robust features.

We can also see that intermediate_add and intermediate_hadamard are noticeably smaller than the other fusion models and therefore use less GPU memory. Concat is slightly larger due to the doubled feature channels at the fusion point, and late_fusion is noticeably larger overall because it maintains two separate encoders throughout the network.

Concat had the highest GPU memory peak, which can be attributed to the larger feature maps created by concatenation.

Finally, it is worth mentioning that concat was the fastest model to converge to a perfect score, reaching a validation loss of 0 and an F1 score of 1 after around 20 epochs. Late fusion and intermediate_add required more training, converging after approximately 30 and 36 epochs, respectively. Solving the concat problem being the easiest makes sense, since this fusion strategy clearly separates the less noisy LiDAR features from the noisier RGB ones.