In [1]:
!pip -q install fiftyone
!pip -q install wandb


In [2]:
# Weights & Biases is used for experiment tracking by the training cell further down.
import wandb

# Log in for this session; the cell output reports which account is active.
wandb.login()


[34m[1mwandb[0m: Currently logged in as: [33mconstantin-auga[0m ([33mConscht-Sht[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

1. Imports

In [None]:
import sys
from pathlib import Path

import pandas as pd
from torch.utils.data import DataLoader

# The notebook is expected to run from <project>/notebooks/, so the project
# root (the folder that contains `src/`) is one level above the working dir.
project_root = Path.cwd().parent
# Make the local `src` package importable. Guarded so that re-running the cell
# does not keep prepending duplicate entries to sys.path.
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

print("CWD:", Path.cwd())
print("Using project root:", project_root)
print("src exists:", (project_root / "src").exists())

# Project-local imports must come after the sys.path tweak above.
from src.models import LateFusionClassifier, IntermediateFusionClassifier
from src.training import train_task3
from src.datasets import AssessmentPairs, train_val_split, AssessmentTorchDataset, stratified_subsample, class_counts

# --- Configuration -----------------------------------------------------------
# Dataset folder, resolved relative to the project root instead of a
# machine-specific absolute drive path so the notebook runs on other checkouts.
ROOT = project_root / "assessment"
SUBSET_FRAC = 0.10   # fraction of all pairs kept by the stratified subsample
VAL_RATIO = 0.2      # share of the subsample held out for validation
SPLIT_SEED = 42      # seed shared by subsampling and the train/val split
BATCH_SIZE = 64
NUM_WORKERS = 2

# Fail early with a clear message if the dataset is not where we expect it.
if not ROOT.is_dir():
    raise FileNotFoundError(f"Dataset folder not found: {ROOT}")

# --- Data --------------------------------------------------------------------
pairs = AssessmentPairs(ROOT).load_pairs()
pairs_10 = stratified_subsample(pairs, frac=SUBSET_FRAC, seed=SPLIT_SEED)

train_pairs, val_pairs = train_val_split(pairs_10, val_ratio=VAL_RATIO, seed=SPLIT_SEED)

train_ds = AssessmentTorchDataset(train_pairs)
val_ds   = AssessmentTorchDataset(val_pairs)

# Only the training loader shuffles; validation order stays fixed.
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS, pin_memory=True)
val_loader   = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS, pin_memory=True)

# Sanity check: sizes and per-class counts at every stage of the pipeline.
print("ALL:", len(pairs), class_counts(pairs))
print("10%:", len(pairs_10), class_counts(pairs_10))
print("train:", len(train_pairs), class_counts(train_pairs))
print("val  :", len(val_pairs), class_counts(val_pairs))



CWD: g:\My Drive\Hands-on-CV-Project2\notebooks
Using project root: g:\My Drive\Hands-on-CV-Project2
src exists: True
ALL: 19998 {'cubes': 9999, 'spheres': 9999}
10%: 2000 {'cubes': 1000, 'spheres': 1000}
train: 1600 {'cubes': 810, 'spheres': 790}
val  : 400 {'cubes': 190, 'spheres': 210}


2. Prepare Training

In [4]:
# Pull one training batch and read dimension 1 of each modality's tensor; these
# values are passed to the models as `rgb_in_ch` / `lidar_in_ch`.
# (Presumably dimension 1 is the channel axis of an N x C x H x W batch - confirm
# against AssessmentTorchDataset.)
batch = next(iter(train_loader))
rgb_in_ch, lidar_in_ch = (batch[modality].shape[1] for modality in ("rgb", "lidar"))
print(f"rgb_in_ch: {rgb_in_ch} lidar_in_ch: {lidar_in_ch}")


rgb_in_ch: 4 lidar_in_ch: 1


3. Run all models

In [5]:
import os
import sys

import torch

# Quick environment check: which torch installation is active and where the
# kernel is currently running from.
print(f"torch from: {torch.__file__}")
print(f"cwd: {os.getcwd()}")


torch from: c:\Users\const\anaconda3\Lib\site-packages\torch\__init__.py
cwd: g:\My Drive\Hands-on-CV-Project2\notebooks


In [7]:
from pathlib import Path

import wandb

device = "cuda" if torch.cuda.is_available() else "cpu"
EPOCHS = 20
LR = 1e-3
EMB = 200
SEED = 42  # re-applied before every model is built so each run starts from the same RNG state

# Entries of train_task3's result dict that are copied into the comparison table.
SUMMARY_KEYS = ["val_loss_final", "val_f1_final", "params", "sec_per_epoch_avg", "gpu_mem_mb_peak"]

# All checkpoints go to one relative folder with one naming scheme. Previously the
# late-fusion run used "checkpoints/fusion" while the intermediate runs used the
# absolute "/checkpoints/task3_<op>.pt", which points at the filesystem/drive root.
# NOTE(review): assumes train_task3 treats ckpt_path as a file path - confirm.
CKPT_DIR = Path("checkpoints")
CKPT_DIR.mkdir(parents=True, exist_ok=True)

runs = []

project = "cilp-extended-assessment"


def run_task3_experiment(model_name, model, fusion_config):
    """Train one fusion model under its own W&B run and return train_task3's result.

    Args:
        model_name: Identifier used for the W&B run name (``task3_<model_name>``),
            the ``model`` config entry and the checkpoint file name.
        model: The already constructed classifier to train.
        fusion_config: Extra config entries describing the fusion strategy,
            merged into the shared W&B config.

    Returns:
        The dict returned by ``train_task3``.
    """
    config = {
        "task": 3,
        "model": model_name,
        **fusion_config,
        "epochs": EPOCHS,
        "lr": LR,
        "emb": EMB,
        "batch_size": train_loader.batch_size,
        "subset_frac": 0.10,  # keep in sync with the frac passed to stratified_subsample
        "rgb_in_ch": rgb_in_ch,
        "lidar_in_ch": lidar_in_ch,
        "optimizer": "AdamW",
        "seed": SEED,
    }
    # Using the run as a context manager guarantees it is finished even when
    # training is interrupted or raises, instead of leaving a dangling run.
    with wandb.init(project=project, name=f"task3_{model_name}", config=config, reinit=True) as run:
        out = train_task3(
            model,
            train_loader,
            val_loader,
            device=device,
            epochs=EPOCHS,
            lr=LR,
            wandb_run=run,
            ckpt_path=str(CKPT_DIR / f"task3_{model_name}.pt"),
        )
        print("wandb run:", run.name, run.id)
    return out


def summary_row(model_name, out):
    """Build one comparison-table row from a train_task3 result dict."""
    return {"model": model_name, **{k: out[k] for k in SUMMARY_KEYS}}


# Late fusion
torch.manual_seed(SEED)
m = LateFusionClassifier(
    rgb_in_ch=rgb_in_ch,
    lidar_in_ch=lidar_in_ch,
    emb_size=EMB,
    num_classes=2,
    normalize_embeddings=False,
)
out = run_task3_experiment("late_fusion", m, {"fusion": "late"})
runs.append(summary_row("late_fusion", out))

# Intermediate variants
for fusion in ["concat", "add", "hadamard"]:
    torch.manual_seed(SEED)
    m = IntermediateFusionClassifier(
        fusion=fusion,
        rgb_in_ch=rgb_in_ch,
        lidar_in_ch=lidar_in_ch,
        emb_size=EMB,
        num_classes=2,
        normalize_embeddings=False,
    )
    out = run_task3_experiment(
        f"intermediate_{fusion}",
        m,
        {"fusion": "intermediate", "fusion_op": fusion},
    )
    runs.append(summary_row(f"intermediate_{fusion}", out))



epoch 01 | train 0.6969 | val 0.6933 | f1 0.322 | 100.51s | 0 MB
epoch 02 | train 0.6012 | val 0.5545 | f1 0.685 | 94.68s | 0 MB
epoch 03 | train 0.4857 | val 0.5873 | f1 0.702 | 99.42s | 0 MB
epoch 04 | train 0.4556 | val 0.4452 | f1 0.777 | 97.78s | 0 MB
epoch 05 | train 0.3392 | val 0.4262 | f1 0.821 | 96.01s | 0 MB
epoch 06 | train 0.3674 | val 0.3495 | f1 0.853 | 114.50s | 0 MB
epoch 07 | train 0.2286 | val 0.2458 | f1 0.909 | 107.23s | 0 MB
epoch 08 | train 0.1739 | val 0.2601 | f1 0.906 | 107.12s | 0 MB
epoch 09 | train 0.1514 | val 0.1648 | f1 0.950 | 107.88s | 0 MB


KeyboardInterrupt: 

In [None]:

import pandas as pd

# One row per trained model, built from the summaries collected in `runs`.
df = pd.DataFrame(data=runs)

# Render as a Markdown table so it can be pasted straight into a report.
table_md = df.to_markdown(index=False)
print(table_md)


We can see that intermediate_add, intermediate_concat, and late_fusion all perform essentially perfectly on the dataset. intermediate_hadamard is slightly worse, with a final validation loss of around 0.03 and an F1 score of 99.97%.

It is expected that add and concat outperform Hadamard fusion. If we look at the data, the LiDAR dataset displays quite clean spheres and cubes, while the RGB dataset is relatively noisy. With Hadamard fusion, the noisy RGB features are multiplied element-wise with the cleaner LiDAR features, which can lead to a loss of information. This is not the case for concat or add. With late fusion, this is also less of an issue, since the fusion happens in deeper layers after each modality has already learned more robust features.

We can also see that intermediate_add and intermediate_hadamard are noticeably smaller than the other fusion models and therefore use less GPU memory. Concat is slightly larger due to the doubled feature channels at the fusion point, and late_fusion is noticeably larger overall because it maintains two separate encoders throughout the network.

Concat had the highest GPU memory peak, which can be attributed to the larger feature maps created by concatenation.

Finally, it is worth mentioning that concat was the fastest model to converge to a perfect score, reaching a validation loss of 0 and an F1 score of 1 after around 20 epochs. Late fusion and intermediate_add required more training, converging after approximately 30 and 36 epochs, respectively. It makes sense that the concat variant was the easiest to optimize, since this fusion strategy keeps the less noisy LiDAR features clearly separated from the noisier RGB ones.