In [1]:
!pip -q install fiftyone
!pip -q install wandb


In [2]:
import wandb

# Authenticate this runtime with Weights & Biases before any run is created.
# NOTE(review): the saved cell output shows the W&B account/team name —
# clear the output before sharing the notebook if that should stay private.
wandb.login()


[34m[1mwandb[0m: Currently logged in as: [33mconstantin-auga[0m ([33mConscht-Sht[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

1. Imports

In [3]:
cd drive/MyDrive/Hands-on-CV-Project2

/content/drive/MyDrive/Hands-on-CV-Project2


In [4]:
import sys
from pathlib import Path

import pandas as pd
from torch.utils.data import DataLoader

# The notebook is expected to run from the project root, i.e. the directory
# that contains `src/` (the preceding cell changes into it).
project_root = Path.cwd()

# Make `src` importable. Guarded so that re-running the cell does not keep
# appending the same entry to sys.path.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

print("CWD:", Path.cwd())
print("Using project root:", project_root)
print("src exists:", (project_root / "src").exists())

# Project-local imports must come after the sys.path setup above.
from src.datasets import (
    AssessmentPairs,
    AssessmentTorchDataset,
    class_counts,
    stratified_subsample,
    train_val_split,
)
from src.models import LateFusionClassifier, IntermediateFusionClassifier
from src.training import train_task3

# Experiment knobs collected in one place instead of being repeated inline.
SEED = 42           # used for both the subsample and the train/val split
SUBSET_FRAC = 0.10  # fraction of the full dataset used for the experiments
VAL_RATIO = 0.2
BATCH_SIZE = 64
NUM_WORKERS = 2

# Dataset directory, derived from the project root rather than hard-coded.
ROOT = project_root / "assessment"
if not ROOT.is_dir():
    raise FileNotFoundError(
        f"Dataset directory not found: {ROOT} (is the notebook running from the project root?)"
    )

pairs = AssessmentPairs(ROOT).load_pairs()
pairs_10 = stratified_subsample(pairs, frac=SUBSET_FRAC, seed=SEED)

train_pairs, val_pairs = train_val_split(pairs_10, val_ratio=VAL_RATIO, seed=SEED)

train_ds = AssessmentTorchDataset(train_pairs)
val_ds   = AssessmentTorchDataset(val_pairs)

# Only the training loader shuffles; validation order stays fixed.
loader_kwargs = dict(batch_size=BATCH_SIZE, num_workers=NUM_WORKERS, pin_memory=True)
train_loader = DataLoader(train_ds, shuffle=True, **loader_kwargs)
val_loader   = DataLoader(val_ds, shuffle=False, **loader_kwargs)

# Sanity check: sizes and class balance at every stage of the pipeline.
print("ALL:", len(pairs), class_counts(pairs))
print("10%:", len(pairs_10), class_counts(pairs_10))
print("train:", len(train_pairs), class_counts(train_pairs))
print("val  :", len(val_pairs), class_counts(val_pairs))



CWD: /content/drive/MyDrive/Hands-on-CV-Project2
Using project root: /content/drive/MyDrive/Hands-on-CV-Project2
src exists: True
ALL: 19998 {'cubes': 9999, 'spheres': 9999}
10%: 2000 {'cubes': 1000, 'spheres': 1000}
train: 1600 {'cubes': 810, 'spheres': 790}
val  : 400 {'cubes': 190, 'spheres': 210}


In [5]:
# Mount Google Drive into the Colab runtime at /content/drive.
# NOTE(review): earlier cells already read from /content/drive, so on a fresh
# runtime the drive has to be mounted before they run; here the call only
# reports that the drive is already mounted.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


2. Prepare Training

In [6]:
# Pull one training batch and read the size of axis 1 for each modality;
# these values are passed to the models as input channel counts.
# (presumably tensors are laid out channels-first — confirm in the dataset class)
batch = next(iter(train_loader))
rgb_in_ch, lidar_in_ch = (
    batch[modality].shape[1] for modality in ("rgb", "lidar")
)
print("rgb_in_ch:", rgb_in_ch, "lidar_in_ch:", lidar_in_ch)


rgb_in_ch: 4 lidar_in_ch: 1


3. Run all models

In [7]:
import os
import sys

import torch

# Environment sanity check: which torch installation is in use and where
# the kernel is currently working.
for label, value in (("torch from:", torch.__file__), ("cwd:", os.getcwd())):
    print(label, value)


torch from: /usr/local/lib/python3.12/dist-packages/torch/__init__.py
cwd: /content/drive/MyDrive/Hands-on-CV-Project2


In [8]:
from pathlib import Path

import torch
import wandb

device = "cuda" if torch.cuda.is_available() else "cpu"
EPOCHS = 40
LR = 1e-3
EMB = 200
TRAIN_SEED = 42  # re-applied before every run so all variants start comparably

# Metrics copied from the training result into the summary table.
METRIC_KEYS = ["val_loss_final", "val_f1_final", "params", "sec_per_epoch_avg", "gpu_mem_mb_peak"]

# All checkpoints go into one directory relative to the project root, so they
# end up inside the project folder and follow one naming scheme.
# NOTE(review): previously late fusion wrote to "checkpoints/fusion" and the
# intermediate variants to "/checkpoints/task3_<op>.pt" (filesystem root);
# update any code that loads checkpoints from those old locations.
CKPT_DIR = Path("checkpoints")
CKPT_DIR.mkdir(parents=True, exist_ok=True)

# Reset on every execution so re-running the cell does not duplicate rows.
runs = []

project = "cilp-extended-assessment"


def _build_model(fusion_op):
    """Create a fresh, untrained classifier.

    `fusion_op=None` selects the late-fusion model; any other value is passed
    to `IntermediateFusionClassifier` as its `fusion` argument.
    """
    shared_kwargs = dict(
        rgb_in_ch=rgb_in_ch,
        lidar_in_ch=lidar_in_ch,
        emb_size=EMB,
        num_classes=2,
        normalize_embeddings=False,
    )
    if fusion_op is None:
        return LateFusionClassifier(**shared_kwargs)
    return IntermediateFusionClassifier(fusion=fusion_op, **shared_kwargs)


# `None` stands for late fusion; the strings are the intermediate fusion ops.
for fusion in [None, "concat", "add", "hadamard"]:
    is_late = fusion is None
    model_name = "late_fusion" if is_late else f"intermediate_{fusion}"

    config = {
        "task": 3,
        "model": model_name,
        "fusion": "late" if is_late else "intermediate",
        "epochs": EPOCHS,
        "lr": LR,
        "emb": EMB,
        "batch_size": train_loader.batch_size,
        "subset_frac": 0.10,
        "rgb_in_ch": rgb_in_ch,
        "lidar_in_ch": lidar_in_ch,
        "optimizer": "AdamW",
    }
    if not is_late:
        config["fusion_op"] = fusion

    ckpt_path = CKPT_DIR / f"task3_{'late_fusion' if is_late else fusion}.pt"

    # The context manager finishes the W&B run even if training raises, so a
    # failed run is never left open to swallow the next run's logs.
    with wandb.init(
        project=project,
        name=f"task3_{model_name}",
        config=config,
        reinit=True,
    ) as run:
        # Seed right before model creation so weight init is repeatable.
        torch.manual_seed(TRAIN_SEED)
        m = _build_model(fusion)

        out = train_task3(
            m,
            train_loader,
            val_loader,
            device=device,
            epochs=EPOCHS,
            lr=LR,
            wandb_run=run,
            ckpt_path=str(ckpt_path),
        )

        print("wandb run:", run.name, run.id)

    runs.append({"model": model_name, **{k: out[k] for k in METRIC_KEYS}})





epoch 01 | train 0.6960 | val 0.6713 | f1 0.304 | 744.64s | 528 MB
epoch 02 | train 0.5465 | val 0.5015 | f1 0.743 | 11.39s | 528 MB
epoch 03 | train 0.4583 | val 0.5193 | f1 0.716 | 11.67s | 528 MB
epoch 04 | train 0.3800 | val 0.3533 | f1 0.853 | 11.47s | 528 MB
epoch 05 | train 0.2625 | val 0.3395 | f1 0.852 | 11.60s | 528 MB
epoch 06 | train 0.2021 | val 0.2033 | f1 0.937 | 11.16s | 528 MB
epoch 07 | train 0.1304 | val 0.2171 | f1 0.929 | 11.50s | 528 MB
epoch 08 | train 0.2537 | val 0.2231 | f1 0.934 | 11.28s | 528 MB
epoch 09 | train 0.1428 | val 0.1841 | f1 0.934 | 11.29s | 528 MB
epoch 10 | train 0.1037 | val 0.0997 | f1 0.970 | 11.62s | 528 MB
epoch 11 | train 0.1468 | val 0.1624 | f1 0.939 | 11.35s | 528 MB
epoch 12 | train 0.0783 | val 0.0633 | f1 0.970 | 10.86s | 528 MB
epoch 13 | train 0.0563 | val 0.0348 | f1 0.997 | 11.51s | 528 MB
epoch 14 | train 0.0303 | val 0.1782 | f1 0.945 | 11.70s | 528 MB
epoch 15 | train 0.0990 | val 0.1133 | f1 0.950 | 11.23s | 528 MB
epoch 16 

0,1
epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
perf/gpu_mem_mb_peak,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
perf/sec_per_epoch,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/loss,█▆▆▅▄▃▂▄▂▂▂▂▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val/f1_macro,▁▅▅▇▇▇▇▇▇█▇██▇▇█████████████████████████
val/loss,█▆▆▅▅▃▃▃▃▂▃▂▁▃▂▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,40
gpu_mem_mb_peak,527.56689
params,1305536
perf/gpu_mem_mb_peak,527.56689
perf/sec_per_epoch,11.10782
sec_per_epoch_avg,29.53435
train/loss,1e-05
val/f1_macro,0.99498
val/loss,0.01671
val_f1_final,0.99498


epoch 01 | train 0.6990 | val 0.6933 | f1 0.322 | 11.19s | 667 MB
epoch 02 | train 0.6932 | val 0.6929 | f1 0.322 | 10.75s | 667 MB
epoch 03 | train 0.6325 | val 0.6120 | f1 0.611 | 10.69s | 667 MB
epoch 04 | train 0.4950 | val 0.5218 | f1 0.718 | 11.27s | 667 MB
epoch 05 | train 0.4426 | val 0.4407 | f1 0.801 | 11.34s | 667 MB
epoch 06 | train 0.3920 | val 0.3916 | f1 0.826 | 11.12s | 667 MB
epoch 07 | train 0.3033 | val 0.6555 | f1 0.567 | 10.77s | 667 MB
epoch 08 | train 0.2830 | val 0.2960 | f1 0.882 | 10.65s | 667 MB
epoch 09 | train 0.2143 | val 0.2812 | f1 0.898 | 11.04s | 667 MB
epoch 10 | train 0.1881 | val 0.2311 | f1 0.914 | 11.00s | 667 MB
epoch 11 | train 0.1559 | val 0.2041 | f1 0.932 | 10.97s | 667 MB
epoch 12 | train 0.1179 | val 0.3030 | f1 0.900 | 10.40s | 667 MB
epoch 13 | train 0.1508 | val 0.1856 | f1 0.930 | 10.70s | 667 MB
epoch 14 | train 0.0955 | val 0.1133 | f1 0.962 | 11.06s | 667 MB
epoch 15 | train 0.1403 | val 0.2238 | f1 0.921 | 10.93s | 667 MB
epoch 16 |

0,1
epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
perf/gpu_mem_mb_peak,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
perf/sec_per_epoch,▇▄▄██▇▄▃▆▆▆▂▄▆▅▃▃▆▅▅▂▆▆▅▂▃▆▇▄▁▆▅▅▂▅▅▅▃▂▅
train/loss,██▇▆▅▅▄▄▃▃▃▂▃▂▂▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val/f1_macro,▁▁▄▅▆▆▄▇▇▇▇▇▇█▇▇▇███████████████████████
val/loss,██▇▆▅▅█▄▄▃▃▄▃▂▃▃▃▂▂▂▂▂▂▁▁▂▂▁▁▁▁▃▂▁▁▁▁▁▁▁

0,1
epoch,40
gpu_mem_mb_peak,667.48877
params,879236
perf/gpu_mem_mb_peak,667.48877
perf/sec_per_epoch,10.93446
sec_per_epoch_avg,10.82491
train/loss,0.00025
val/f1_macro,0.99749
val/loss,0.00391
val_f1_final,0.99749


epoch 01 | train 0.6973 | val 0.6921 | f1 0.344 | 10.86s | 486 MB
epoch 02 | train 0.6938 | val 0.6927 | f1 0.322 | 10.93s | 486 MB
epoch 03 | train 0.6688 | val 0.6231 | f1 0.593 | 10.91s | 486 MB
epoch 04 | train 0.5470 | val 0.5197 | f1 0.723 | 10.34s | 486 MB
epoch 05 | train 0.4791 | val 0.4989 | f1 0.736 | 10.71s | 486 MB
epoch 06 | train 0.4221 | val 0.4235 | f1 0.811 | 10.92s | 486 MB
epoch 07 | train 0.4375 | val 0.4534 | f1 0.781 | 10.69s | 486 MB
epoch 08 | train 0.3721 | val 0.4115 | f1 0.818 | 10.68s | 486 MB
epoch 09 | train 0.3283 | val 0.3581 | f1 0.831 | 10.50s | 486 MB
epoch 10 | train 0.2436 | val 0.2412 | f1 0.889 | 10.92s | 486 MB
epoch 11 | train 0.2254 | val 0.2220 | f1 0.895 | 11.15s | 486 MB
epoch 12 | train 0.1557 | val 0.1585 | f1 0.940 | 10.66s | 486 MB
epoch 13 | train 0.1034 | val 0.1075 | f1 0.972 | 10.29s | 486 MB
epoch 14 | train 0.0598 | val 0.0562 | f1 0.982 | 11.03s | 486 MB
epoch 15 | train 0.0419 | val 0.0815 | f1 0.967 | 10.87s | 486 MB
epoch 16 |

0,1
epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
perf/gpu_mem_mb_peak,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
perf/sec_per_epoch,▆▆▆▂▅▆▅▅▃▆█▄▂▇▆▆▃▄▆▇▄▆▆▆▇▁▆▇▅▂▅▅▅▄▃▆▇▄▂▇
train/loss,███▆▆▅▅▅▄▃▃▃▂▂▁▂▁▁▁▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val/f1_macro,▁▁▄▅▅▆▆▆▆▇▇▇████████████████████████████
val/loss,██▇▆▆▅▆▅▅▃▃▃▂▂▂▂▂▁▁▂▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,40
gpu_mem_mb_peak,486.09912
params,699236
perf/gpu_mem_mb_peak,486.09912
perf/sec_per_epoch,11.05701
sec_per_epoch_avg,10.73646
train/loss,8e-05
val/f1_macro,0.99749
val/loss,0.0028
val_f1_final,0.99749


epoch 01 | train 0.6882 | val 0.6703 | f1 0.329 | 10.60s | 499 MB
epoch 02 | train 0.5725 | val 0.5398 | f1 0.716 | 11.04s | 499 MB
epoch 03 | train 0.4991 | val 0.4745 | f1 0.761 | 10.87s | 499 MB
epoch 04 | train 0.3968 | val 0.3641 | f1 0.828 | 10.60s | 499 MB
epoch 05 | train 0.3088 | val 0.3263 | f1 0.858 | 10.53s | 499 MB
epoch 06 | train 0.2259 | val 0.1740 | f1 0.911 | 10.83s | 499 MB
epoch 07 | train 0.0974 | val 0.0760 | f1 0.972 | 10.94s | 499 MB
epoch 08 | train 0.0937 | val 0.0854 | f1 0.970 | 11.06s | 499 MB
epoch 09 | train 0.0343 | val 0.1032 | f1 0.967 | 10.12s | 499 MB
epoch 10 | train 0.0468 | val 0.0207 | f1 0.997 | 10.94s | 499 MB
epoch 11 | train 0.0320 | val 0.1923 | f1 0.944 | 11.17s | 499 MB
epoch 12 | train 0.0283 | val 0.0204 | f1 0.995 | 10.81s | 499 MB
epoch 13 | train 0.0090 | val 0.0469 | f1 0.972 | 10.42s | 499 MB
epoch 14 | train 0.1134 | val 0.1224 | f1 0.965 | 10.55s | 499 MB
epoch 15 | train 0.0423 | val 0.0143 | f1 0.997 | 10.73s | 499 MB
epoch 16 |

0,1
epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
perf/gpu_mem_mb_peak,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
perf/sec_per_epoch,▂▃▃▂▂▃▃▃▁▃▃▂▂▂▂▃▂█▃▂▃▂▂▃▃▃▂▅▃▃▂▂▂▃▃▁▂▃▃▂
train/loss,█▇▆▅▄▃▂▂▁▁▁▁▁▂▁▁▁▁▁▁▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val/f1_macro,▁▅▆▆▇▇████▇█████████████████████████████
val/loss,█▇▆▅▄▃▂▂▂▁▃▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,40
gpu_mem_mb_peak,498.85498
params,699236
perf/gpu_mem_mb_peak,498.85498
perf/sec_per_epoch,10.55533
sec_per_epoch_avg,10.86506
train/loss,1e-05
val/f1_macro,0.99749
val/loss,0.00727
val_f1_final,0.99749


In [9]:

import pandas as pd

# Collect the per-model results into one frame (one row per trained model)
# and print it as a Markdown table.
df = pd.DataFrame(runs)
summary_table = df.to_markdown(index=False)
print(summary_table)


| model                 |   val_loss_final |   val_f1_final |   params |   sec_per_epoch_avg |   gpu_mem_mb_peak |
|:----------------------|-----------------:|---------------:|---------:|--------------------:|------------------:|
| late_fusion           |       0.0167079  |       0.994985 |  1305536 |             29.5343 |           527.567 |
| intermediate_concat   |       0.0039106  |       0.997493 |   879236 |             10.8249 |           667.489 |
| intermediate_add      |       0.00279778 |       0.997493 |   699236 |             10.7365 |           486.099 |
| intermediate_hadamard |       0.00727478 |       0.997493 |   699236 |             10.8651 |           498.855 |


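We can see that all four models perform almost perfectly on the validation set. The three intermediate-fusion variants reach the same macro F1 score of 99.75%; among them, intermediate_hadamard has the highest final validation loss (about 0.007, compared with about 0.003 for add and 0.004 for concat). Late fusion is marginally behind, with an F1 score of 99.50% and a validation loss of about 0.017.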

It is expected that add and concat outperform the Hadamard fusion, although here the difference only shows up in the validation loss. If we look at the data, the LiDAR dataset displays quite clean spheres and cubes, while the RGB dataset is relatively noisy. With Hadamard fusion, the noisy RGB features are multiplied element-wise with the cleaner LiDAR features, which can lead to a loss of information. This is not the case for concat or add. During late fusion, this is also less of an issue, since the fusion happens in deeper layers after each modality has already learned more robust features.

We can also see that intermediate_add and intermediate_hadamard are noticeably smaller than the other fusion models and therefore use less GPU memory. Concat is slightly larger due to the doubled feature channels at the fusion point, and late_fusion is noticeably larger overall because it maintains two separate encoders throughout the network.

Concat had the highest GPU memory peak, which can be attributed to the larger feature maps created by concatenation.

Finally, it is worth mentioning that concat was the fastest model to converge to a near-perfect score, reaching a validation loss close to 0 and an F1 score close to 1 after around 20 epochs. Late fusion and intermediate_add required more training, converging after approximately 30 and 36 epochs, respectively. It makes sense that the concat variant is the easiest to optimize, since concatenation keeps the less noisy LiDAR features clearly separated from the noisier RGB ones.