"""
README
======

Food-Waste Prediction – CLIP Backbone
------------------------------------

A **single-file pipeline** that predicts how many grams of each ingredient will be
left over on a cafeteria tray.  Images and ingredient lists are encoded with a
*pre-trained* OpenAI CLIP ViT-B/32 model; a small MLP head is trained to regress
per-ingredient waste.

Contents
~~~~~~~~
1.  **Imports & Utils** – sets seeds and disables tokenizer parallelism.
2.  **Data & Ingredient Index** – downloads the `Voxel51/food-waste-dataset`
    from Hugging Face, builds `ing2idx`/`idx2ing`.
3.  **CLIP Objects** – loads `CLIPProcessor` + `CLIPModel`.
4.  **Dataset & Collate** – a `torch.utils.data.Dataset` that returns
    *(PIL image, ingredient sentence, target tensor)* and a `collate_fn` that
    tokenises & pads the batch with `CLIPProcessor`.
5.  **Imbalance Weights** – computes inverse-log frequency weights so rare
    ingredients matter in the loss.
6.  **Model** – `CLIPFoodWastePredictor` (frozen CLIP → 2×512-D → MLP →
    ReLU-clamped grams).
7.  **Training & Evaluation** – weighted ingredient-level MSE + a small
    plate-level L1 term; reports MAE, RMSE, Spearman ρ, R².
8.  **Run Evaluation** – prints metrics on the held-out *test* split.


In [2]:
!pip install fiftyone



In [None]:
# -*- coding: utf-8 -*-
"""
Food-Waste Prediction – CLIP backbone
====================================
Full replacement for clip_food_waste_hackathon_hpi.py
"""

# ---------- 1. Imports & utils ----------
import os, random, numpy as np, torch, torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy.stats import spearmanr
import fiftyone as fo
from fiftyone.utils.huggingface import load_from_hub

# Turn off parallelism inside the tokenizers library.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
SEED = 42


def set_seed(seed=SEED):
    """Seed Python's ``random``, NumPy and PyTorch (CPU, plus CUDA if available)."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)


set_seed()


# ---------- 2. Data & ingredient index ----------
dataset = load_from_hub("Voxel51/food-waste-dataset", overwrite=True)

# Vocabulary: every distinct ingredient name across all samples, sorted so
# that the name -> index mapping is reproducible between runs.
all_ing = sorted(set().union(*(s["ingredient_name"] for s in dataset)))
ing2idx = dict(zip(all_ing, range(len(all_ing))))
idx2ing = dict(enumerate(all_ing))
NUM_ING  = len(all_ing)

# ---------- 3. CLIP objects ----------
# Processor (tokenizer + image preprocessing) and model share one checkpoint.
clip_ckpt = "openai/clip-vit-base-patch32"
clip_processor = CLIPProcessor.from_pretrained(clip_ckpt)
clip_model = CLIPModel.from_pretrained(clip_ckpt)


# ---------- 4. Dataset & collate ----------
class FoodWasteCLIPDataset(Dataset):
    """
    Map-style dataset over a FiftyOne view.

    Each item is a tuple:
        image  : PIL.Image in RGB         (no transforms – CLIPProcessor handles it)
        text   : str   = comma-separated ingredient list
        target : FloatTensor [NUM_ING], the sample's ``return_quantity`` values
                 written at the indices given by ``ing2idx``
    """
    def __init__(self, view):
        self.view = view
        # Cache the sample IDs once so integer indices can be mapped to samples.
        self.ids  = view.values("id")

    def __len__(self):
        return len(self.ids)

    def __getitem__(self, idx):
        sample = self.view[self.ids[idx]]
        # convert() returns a new, fully loaded image, so the source file can
        # be closed immediately instead of staying open until garbage collection.
        with Image.open(sample.filepath) as src:
            img = src.convert("RGB")
        names = sample["ingredient_name"]
        text = ", ".join(names)  # single sentence

        tgt = torch.zeros(NUM_ING, dtype=torch.float32)
        for ing, amt in zip(names, sample["return_quantity"]):
            if amt is not None:  # missing quantities leave the slot at 0
                tgt[ing2idx[ing]] = amt
        return img, text, tgt


def collate_fn(batch):
    """Turn a list of ``(image, text, target)`` items into model inputs.

    Returns:
        (inputs, targets): ``inputs`` is a plain dict of the tensors that
        ``clip_processor`` produces for the batch's texts and images;
        ``targets`` is the item targets stacked along a new first dimension.
    """
    imgs, texts, tgts = zip(*batch)
    inputs = clip_processor(text=list(texts),
                            images=list(imgs),
                            return_tensors="pt",
                            padding=True,
                            # CLIP's text encoder has a fixed context length
                            # (77 tokens for this checkpoint). Without
                            # truncation, a long ingredient list overflows the
                            # position embeddings and the forward pass fails.
                            truncation=True)
    return dict(inputs.items()), torch.stack(tgts)


train_view = dataset.match({"split": "train"})
test_view  = dataset.match({"split": "test"})
train_ds   = FoodWasteCLIPDataset(train_view)
test_ds    = FoodWasteCLIPDataset(test_view)
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True,  collate_fn=collate_fn)
test_loader  = DataLoader(test_ds,  batch_size=32, shuffle=False, collate_fn=collate_fn)


# ---------- 5. Imbalance weights ----------
# Per ingredient, count the training samples with a positive return quantity.
# The labels are read straight from the view: iterating `train_ds` would open
# and decode every training image only to inspect its target vector.
freq = torch.zeros(NUM_ING, dtype=torch.float32)
for names, amounts in zip(train_view.values("ingredient_name"),
                          train_view.values("return_quantity")):
    # Same rule as the dataset's target vector: the last non-None amount per
    # ingredient wins, and a sample counts at most once per ingredient.
    final_amt = {ing: amt for ing, amt in zip(names, amounts) if amt is not None}
    for ing, amt in final_amt.items():
        if amt > 0:
            freq[ing2idx[ing]] += 1.0

# Inverse-log frequency; the +2 keeps the logarithm positive when freq == 0.
weight_vec = 1.0 / torch.log(freq + 2.0)
# Normalise to mean 1 so the weights do not change the overall loss scale.
weight_vec = (weight_vec / weight_vec.mean()).to(torch.float32)


# ---------- 6. Model ----------
class CLIPFoodWastePredictor(nn.Module):
    """Frozen CLIP encoders followed by a trainable MLP regression head.

    Args:
        clip_model: object exposing ``config.projection_dim``,
            ``get_image_features`` and ``get_text_features`` (a
            ``transformers.CLIPModel`` in this script). Its parameters are
            frozen in place.
        num_ingredients: width of the output vector.
    """
    def __init__(self, clip_model, num_ingredients):
        super().__init__()
        self.clip = clip_model
        for p in self.clip.parameters():
            p.requires_grad = False
        self.clip.eval()

        dim = self.clip.config.projection_dim  # 512 for ViT-B/32
        self.head = nn.Sequential(
            nn.Linear(dim * 2, 512),
            nn.ReLU(inplace=True),
            nn.Dropout(0.2),
            nn.Linear(512, num_ingredients)
        )

    def train(self, mode: bool = True):
        """Switch the head between train/eval; the frozen backbone stays in eval.

        Without this override ``model.train()`` would also put CLIP into
        training mode, so any dropout inside it would perturb the features
        that the head is trained on.
        """
        super().train(mode)
        self.clip.eval()
        return self

    def forward(self, inputs):
        """Map a dict with ``pixel_values``, ``input_ids`` and
        ``attention_mask`` to non-negative per-ingredient predictions."""
        # move dict tensors to same device as head
        dev = next(self.head.parameters()).device
        inputs = {k: v.to(dev) for k, v in inputs.items()}

        # The backbone is frozen, so no graph is needed for its activations.
        with torch.no_grad():
            img_emb = self.clip.get_image_features(pixel_values=inputs["pixel_values"])
            txt_emb = self.clip.get_text_features(
                input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"]
            )
        x = torch.cat([img_emb, txt_emb], dim=1)
        return F.relu(self.head(x))  # keep grams ≥ 0


# ---------- 7. Training & evaluation ----------
# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
weight_vec = weight_vec.to(device)
model = CLIPFoodWastePredictor(clip_model, NUM_ING).to(device)
# Only the MLP head's parameters are handed to the optimizer.
optimizer = torch.optim.AdamW(params=model.head.parameters(), lr=1e-3, weight_decay=1e-4)
# Unreduced, so the per-ingredient weights can be applied before averaging.
criterion = nn.MSELoss(reduction="none")

def jitter(targets, eps=1.0):
    """Add uniform noise from ``[-eps, eps)`` to the strictly positive targets.

    Entries that are not > 0 receive no noise; the result is clamped at 0.
    """
    noise = torch.empty_like(targets).uniform_(-eps, eps)
    is_positive = targets.gt(0).float()
    return (targets + is_positive * noise).clamp(min=0.0)

EPOCHS, λ_tot = 30, 1e-3
print("Starting training…")
for epoch in range(EPOCHS):
    model.train()
    running = 0.0
    for inputs, targets in train_loader:
        # Noisy copy of the labels, already on the training device.
        targets = jitter(targets.to(device))

        optimizer.zero_grad(set_to_none=True)
        preds = model(inputs)

        # Per-ingredient squared error, scaled by the imbalance weights.
        loss_ing = (criterion(preds, targets) * weight_vec).mean()
        # Plate-level consistency: L1 between predicted and true row totals.
        loss_tot = F.l1_loss(preds.sum(dim=1), targets.sum(dim=1))

        loss = loss_ing + λ_tot * loss_tot
        loss.backward()
        optimizer.step()

        # Weight the batch mean by batch size so the epoch figure is per sample.
        running += loss.item() * targets.size(0)

    epoch_loss = running / len(train_ds)
    print(f"Epoch {epoch+1:02d}/{EPOCHS}   loss={epoch_loss:.4f}")

print("Training finished.")


@torch.no_grad()
def predict(loader):
    """Run the global ``model`` in eval mode over every batch of ``loader``.

    Returns ``(y_true, y_pred, true_totals, pred_totals)`` as NumPy arrays,
    where the totals are the sums of each row (axis 1).
    """
    model.eval()
    truth_chunks, pred_chunks = [], []
    for batch_inputs, batch_targets in loader:
        batch_preds = model(batch_inputs)
        truth_chunks.append(batch_targets.numpy())
        pred_chunks.append(batch_preds.cpu().numpy())
    y_true = np.concatenate(truth_chunks)
    y_pred = np.concatenate(pred_chunks)
    return y_true, y_pred, y_true.sum(axis=1), y_pred.sum(axis=1)


def evaluate(y_true, y_pred, true_tot, pred_tot):
    """Print regression metrics and return the ingredient-level ones.

    Totals (``true_tot`` vs ``pred_tot``): MAE and Spearman ρ, printed only.
    Full matrices (``y_true`` vs ``y_pred``): MSE, RMSE, MAE and R², printed
    and returned as a dict with keys ``MSE``, ``RMSE``, ``MAE``, ``R2``.
    """
    # Agreement on the per-sample totals.
    total_abs_err = np.abs(pred_tot - true_tot)
    mae_tot = total_abs_err.mean()
    rho, _pvalue = spearmanr(true_tot, pred_tot)

    # Errors over the full target matrix.
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    print(f"\nTotal-waste MAE = {mae_tot:.3f}   Spearman ρ = {rho:.3f}")
    print(f"MSE  = {mse:.4f}\nRMSE = {rmse:.4f}\nMAE  = {mae:.4f}\nR²   = {r2:.4f}")
    return {"MSE": mse, "RMSE": rmse, "MAE": mae, "R2": r2}


# ---------- 8. Run evaluation ----------
# Predict on the test loader, then print the metrics and keep the returned dict.
y_t, y_p, tot_t, tot_p = predict(test_loader)
metrics = evaluate(y_t, y_p, tot_t, tot_p)


| Metric                  | Baseline      | After CLIP + weights | Δ                            |
| ----------------------- | ------------- | -------------------- | ---------------------------- |
| **Total-waste MAE (g)** | 35.45         | **34.84**            | ↓ 0.61 g                     |
| **Spearman ρ**          | –0.26         | **0.85**             | **↑ 1.11**                   |
| **MAE (per-ingr.)**     | 3.46          | **2.97**             | ↓ 14 %                       |
| **MSE / RMSE**          | 128.9 / 11.35 | **107.2 / 10.35**    | ↓ 17 % (MSE) / ↓ 9 % (RMSE)  |
| **R²**                  | **0.41**      | 0.26                 | ↓ 0.15 (worse than baseline) |


In [None]:
# -*- coding: utf-8 -*-
"""
Food-Waste Prediction – CLIP backbone
====================================
Full replacement for clip_food_waste_hackathon_hpi.py
"""

# ---------- 1. Imports & utils ----------
import os, random, numpy as np, torch, torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy.stats import spearmanr
import fiftyone as fo
from fiftyone.utils.huggingface import load_from_hub

# Turn off parallelism inside the tokenizers library.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
SEED = 42


def set_seed(seed=SEED):
    """Seed Python's ``random``, NumPy and PyTorch (CPU, plus CUDA if available)."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)


set_seed()


# ---------- 2. Data & ingredient index ----------
dataset = load_from_hub("FoodWasteProjectBIBI/food_waste_dataset_with_added_samples", overwrite=True)

# Vocabulary: every distinct ingredient name across all samples, sorted so
# that the name -> index mapping is reproducible between runs.
all_ing = sorted(set().union(*(s["ingredient_name"] for s in dataset)))
ing2idx = dict(zip(all_ing, range(len(all_ing))))
idx2ing = dict(enumerate(all_ing))
NUM_ING  = len(all_ing)

# ---------- 3. CLIP objects ----------
# Processor (tokenizer + image preprocessing) and model share one checkpoint.
clip_ckpt = "openai/clip-vit-base-patch32"
clip_processor = CLIPProcessor.from_pretrained(clip_ckpt)
clip_model = CLIPModel.from_pretrained(clip_ckpt)


# ---------- 4. Dataset & collate ----------
class FoodWasteCLIPDataset(Dataset):
    """
    Map-style dataset over a FiftyOne view.

    Each item is a tuple:
        image  : PIL.Image in RGB         (no transforms – CLIPProcessor handles it)
        text   : str   = comma-separated ingredient list
        target : FloatTensor [NUM_ING], the sample's ``return_quantity`` values
                 written at the indices given by ``ing2idx``
    """
    def __init__(self, view):
        self.view = view
        # Cache the sample IDs once so integer indices can be mapped to samples.
        self.ids  = view.values("id")

    def __len__(self):
        return len(self.ids)

    def __getitem__(self, idx):
        sample = self.view[self.ids[idx]]
        # convert() returns a new, fully loaded image, so the source file can
        # be closed immediately instead of staying open until garbage collection.
        with Image.open(sample.filepath) as src:
            img = src.convert("RGB")
        names = sample["ingredient_name"]
        text = ", ".join(names)  # single sentence

        tgt = torch.zeros(NUM_ING, dtype=torch.float32)
        for ing, amt in zip(names, sample["return_quantity"]):
            if amt is not None:  # missing quantities leave the slot at 0
                tgt[ing2idx[ing]] = amt
        return img, text, tgt


def collate_fn(batch):
    """Turn a list of ``(image, text, target)`` items into model inputs.

    Returns:
        (inputs, targets): ``inputs`` is a plain dict of the tensors that
        ``clip_processor`` produces for the batch's texts and images;
        ``targets`` is the item targets stacked along a new first dimension.
    """
    imgs, texts, tgts = zip(*batch)
    inputs = clip_processor(text=list(texts),
                            images=list(imgs),
                            return_tensors="pt",
                            padding=True,
                            # CLIP's text encoder has a fixed context length
                            # (77 tokens for this checkpoint). Without
                            # truncation, a long ingredient list overflows the
                            # position embeddings and the forward pass fails.
                            truncation=True)
    return dict(inputs.items()), torch.stack(tgts)


train_view = dataset.match({"split": "train"})
test_view  = dataset.match({"split": "test"})
train_ds   = FoodWasteCLIPDataset(train_view)
test_ds    = FoodWasteCLIPDataset(test_view)
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True,  collate_fn=collate_fn)
test_loader  = DataLoader(test_ds,  batch_size=32, shuffle=False, collate_fn=collate_fn)


# ---------- 5. Imbalance weights ----------
# Per ingredient, count the training samples with a positive return quantity.
# The labels are read straight from the view: iterating `train_ds` would open
# and decode every training image only to inspect its target vector.
freq = torch.zeros(NUM_ING, dtype=torch.float32)
for names, amounts in zip(train_view.values("ingredient_name"),
                          train_view.values("return_quantity")):
    # Same rule as the dataset's target vector: the last non-None amount per
    # ingredient wins, and a sample counts at most once per ingredient.
    final_amt = {ing: amt for ing, amt in zip(names, amounts) if amt is not None}
    for ing, amt in final_amt.items():
        if amt > 0:
            freq[ing2idx[ing]] += 1.0

# Inverse-log frequency; the +2 keeps the logarithm positive when freq == 0.
weight_vec = 1.0 / torch.log(freq + 2.0)
# Normalise to mean 1 so the weights do not change the overall loss scale.
weight_vec = (weight_vec / weight_vec.mean()).to(torch.float32)


# ---------- 6. Model ----------
class CLIPFoodWastePredictor(nn.Module):
    """Frozen CLIP encoders followed by a trainable MLP regression head.

    Args:
        clip_model: object exposing ``config.projection_dim``,
            ``get_image_features`` and ``get_text_features`` (a
            ``transformers.CLIPModel`` in this script). Its parameters are
            frozen in place.
        num_ingredients: width of the output vector.
    """
    def __init__(self, clip_model, num_ingredients):
        super().__init__()
        self.clip = clip_model
        for p in self.clip.parameters():
            p.requires_grad = False
        self.clip.eval()

        dim = self.clip.config.projection_dim  # 512 for ViT-B/32
        self.head = nn.Sequential(
            nn.Linear(dim * 2, 512),
            nn.ReLU(inplace=True),
            nn.Dropout(0.2),
            nn.Linear(512, num_ingredients)
        )

    def train(self, mode: bool = True):
        """Switch the head between train/eval; the frozen backbone stays in eval.

        Without this override ``model.train()`` would also put CLIP into
        training mode, so any dropout inside it would perturb the features
        that the head is trained on.
        """
        super().train(mode)
        self.clip.eval()
        return self

    def forward(self, inputs):
        """Map a dict with ``pixel_values``, ``input_ids`` and
        ``attention_mask`` to non-negative per-ingredient predictions."""
        # move dict tensors to same device as head
        dev = next(self.head.parameters()).device
        inputs = {k: v.to(dev) for k, v in inputs.items()}

        # The backbone is frozen, so no graph is needed for its activations.
        with torch.no_grad():
            img_emb = self.clip.get_image_features(pixel_values=inputs["pixel_values"])
            txt_emb = self.clip.get_text_features(
                input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"]
            )
        x = torch.cat([img_emb, txt_emb], dim=1)
        return F.relu(self.head(x))  # keep grams ≥ 0


# ---------- 7. Training & evaluation ----------
# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
weight_vec = weight_vec.to(device)
model = CLIPFoodWastePredictor(clip_model, NUM_ING).to(device)
# Only the MLP head's parameters are handed to the optimizer.
optimizer = torch.optim.AdamW(params=model.head.parameters(), lr=1e-3, weight_decay=1e-4)
# Unreduced, so the per-ingredient weights can be applied before averaging.
criterion = nn.MSELoss(reduction="none")

def jitter(targets, eps=1.0):
    """Add uniform noise from ``[-eps, eps)`` to the strictly positive targets.

    Entries that are not > 0 receive no noise; the result is clamped at 0.
    """
    noise = torch.empty_like(targets).uniform_(-eps, eps)
    is_positive = targets.gt(0).float()
    return (targets + is_positive * noise).clamp(min=0.0)

EPOCHS, λ_tot = 30, 1e-3
print("Starting training…")
for epoch in range(EPOCHS):
    model.train()
    running = 0.0
    for inputs, targets in train_loader:
        # Noisy copy of the labels, already on the training device.
        targets = jitter(targets.to(device))

        optimizer.zero_grad(set_to_none=True)
        preds = model(inputs)

        # Per-ingredient squared error, scaled by the imbalance weights.
        loss_ing = (criterion(preds, targets) * weight_vec).mean()
        # Plate-level consistency: L1 between predicted and true row totals.
        loss_tot = F.l1_loss(preds.sum(dim=1), targets.sum(dim=1))

        loss = loss_ing + λ_tot * loss_tot
        loss.backward()
        optimizer.step()

        # Weight the batch mean by batch size so the epoch figure is per sample.
        running += loss.item() * targets.size(0)

    epoch_loss = running / len(train_ds)
    print(f"Epoch {epoch+1:02d}/{EPOCHS}   loss={epoch_loss:.4f}")

print("Training finished.")


@torch.no_grad()
def predict(loader):
    """Run the global ``model`` in eval mode over every batch of ``loader``.

    Returns ``(y_true, y_pred, true_totals, pred_totals)`` as NumPy arrays,
    where the totals are the sums of each row (axis 1).
    """
    model.eval()
    truth_chunks, pred_chunks = [], []
    for batch_inputs, batch_targets in loader:
        batch_preds = model(batch_inputs)
        truth_chunks.append(batch_targets.numpy())
        pred_chunks.append(batch_preds.cpu().numpy())
    y_true = np.concatenate(truth_chunks)
    y_pred = np.concatenate(pred_chunks)
    return y_true, y_pred, y_true.sum(axis=1), y_pred.sum(axis=1)


def evaluate(y_true, y_pred, true_tot, pred_tot):
    """Print regression metrics and return the ingredient-level ones.

    Totals (``true_tot`` vs ``pred_tot``): MAE and Spearman ρ, printed only.
    Full matrices (``y_true`` vs ``y_pred``): MSE, RMSE, MAE and R², printed
    and returned as a dict with keys ``MSE``, ``RMSE``, ``MAE``, ``R2``.
    """
    # Agreement on the per-sample totals.
    total_abs_err = np.abs(pred_tot - true_tot)
    mae_tot = total_abs_err.mean()
    rho, _pvalue = spearmanr(true_tot, pred_tot)

    # Errors over the full target matrix.
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    print(f"\nTotal-waste MAE = {mae_tot:.3f}   Spearman ρ = {rho:.3f}")
    print(f"MSE  = {mse:.4f}\nRMSE = {rmse:.4f}\nMAE  = {mae:.4f}\nR²   = {r2:.4f}")
    return {"MSE": mse, "RMSE": rmse, "MAE": mae, "R2": r2}


# ---------- 8. Run evaluation ----------
# Predict on the test loader, then print the metrics and keep the returned dict.
y_t, y_p, tot_t, tot_p = predict(test_loader)
metrics = evaluate(y_t, y_p, tot_t, tot_p)


HTTP Error 429 thrown while requesting HEAD https://huggingface.co/datasets/FoodWasteProjectBIBI/food_waste_dataset_with_added_samples/resolve/main/fiftyone.yml
Retrying in 1s [Retry 1/5].
HTTP Error 429 thrown while requesting HEAD https://huggingface.co/datasets/FoodWasteProjectBIBI/food_waste_dataset_with_added_samples/resolve/main/fiftyone.yml
Retrying in 2s [Retry 2/5].
HTTP Error 429 thrown while requesting HEAD https://huggingface.co/datasets/FoodWasteProjectBIBI/food_waste_dataset_with_added_samples/resolve/main/fiftyone.yml
Retrying in 4s [Retry 3/5].
HTTP Error 429 thrown while requesting HEAD https://huggingface.co/datasets/FoodWasteProjectBIBI/food_waste_dataset_with_added_samples/resolve/426b0ebf5639e5f71ccb1c8f3a4406270da1126c/data/data_5/train_000196-2.jpg
Retrying in 8s [Retry 5/5].
HTTP Error 429 thrown while requesting HEAD https://huggingface.co/datasets/FoodWasteProjectBIBI/food_waste_dataset_with_added_samples/resolve/main/fiftyone.yml
Retrying in 8s [Retry 4/5].
H

Downloading config file fiftyone.yml from FoodWasteProjectBIBI/food_waste_dataset_with_added_samples


INFO:fiftyone.utils.huggingface:Downloading config file fiftyone.yml from FoodWasteProjectBIBI/food_waste_dataset_with_added_samples


Loading dataset


INFO:fiftyone.utils.huggingface:Loading dataset


Importing samples...


INFO:fiftyone.utils.data.importers:Importing samples...


 100% |█████████████████| 691/691 [129.9ms elapsed, 0s remaining, 5.3K samples/s]     


INFO:eta.core.utils: 100% |█████████████████| 691/691 [129.9ms elapsed, 0s remaining, 5.3K samples/s]     


Downloading 128 media files...


INFO:fiftyone.utils.huggingface:Downloading 128 media files...
100%|██████████| 2/2 [00:16<00:00,  8.15s/it]


Starting training…
Epoch 01/30   loss=22.6957
Epoch 02/30   loss=20.6854
Epoch 03/30   loss=18.6727
Epoch 04/30   loss=16.4250
Epoch 05/30   loss=14.4053
Epoch 06/30   loss=12.6945
Epoch 07/30   loss=11.2009
Epoch 08/30   loss=10.0348
Epoch 09/30   loss=8.9387
Epoch 10/30   loss=8.0485
Epoch 11/30   loss=7.2311
Epoch 12/30   loss=6.6807


# Update:

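The model now builds four fixed views of every image (identity, horizontal flip, vertical flip, 90-degree rotation) and averages the CLIP image embeddings across them. The same averaging is applied during training and at inference, so the number of training examples is unchanged; the aim is an image embedding that is less sensitive to how the tray is oriented, which is intended to improve model robustness.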

In [3]:
"""
Food-Waste Prediction – CLIP backbone
====================================
Full replacement for clip_food_waste_hackathon_hpi.py
"""

# ---------- 1. Imports & utils ----------
import os, random, numpy as np, torch, torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy.stats import spearmanr
import fiftyone as fo
from fiftyone.utils.huggingface import load_from_hub

# Turn off parallelism inside the tokenizers library.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
SEED = 42


def set_seed(seed=SEED):
    """Seed Python's ``random``, NumPy and PyTorch (CPU, plus CUDA if available)."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)


set_seed()


# ---------- 2. Data & ingredient index ----------
dataset = load_from_hub("FoodWasteProjectBIBI/food_waste_dataset_with_added_samples", overwrite=True)

# Vocabulary: every distinct ingredient name across all samples, sorted so
# that the name -> index mapping is reproducible between runs.
all_ing = sorted(set().union(*(s["ingredient_name"] for s in dataset)))
ing2idx = dict(zip(all_ing, range(len(all_ing))))
idx2ing = dict(enumerate(all_ing))
NUM_ING  = len(all_ing)

# ---------- 3. CLIP objects ----------
# Processor (tokenizer + image preprocessing) and model share one checkpoint.
clip_ckpt = "openai/clip-vit-base-patch32"
clip_processor = CLIPProcessor.from_pretrained(clip_ckpt)
clip_model = CLIPModel.from_pretrained(clip_ckpt)


# ---------- 4. Dataset & collate ----------
class FoodWasteCLIPDataset(Dataset):
    """
    Map-style dataset over a FiftyOne view.

    Each item is a tuple:
        image  : PIL.Image in RGB         (no transforms – CLIPProcessor handles it)
        text   : str   = comma-separated ingredient list
        target : FloatTensor [NUM_ING], the sample's ``return_quantity`` values
                 written at the indices given by ``ing2idx``
    """
    def __init__(self, view):
        self.view = view
        # Cache the sample IDs once so integer indices can be mapped to samples.
        self.ids  = view.values("id")

    def __len__(self):
        return len(self.ids)

    def __getitem__(self, idx):
        sample = self.view[self.ids[idx]]
        # convert() returns a new, fully loaded image, so the source file can
        # be closed immediately instead of staying open until garbage collection.
        with Image.open(sample.filepath) as src:
            img = src.convert("RGB")
        names = sample["ingredient_name"]
        text = ", ".join(names)  # single sentence

        tgt = torch.zeros(NUM_ING, dtype=torch.float32)
        for ing, amt in zip(names, sample["return_quantity"]):
            if amt is not None:  # missing quantities leave the slot at 0
                tgt[ing2idx[ing]] = amt
        return img, text, tgt


def collate_fn(batch):
    """Turn a list of ``(image, text, target)`` items into model inputs.

    Returns:
        (inputs, targets): ``inputs`` is a plain dict of the tensors that
        ``clip_processor`` produces for the batch's texts and images;
        ``targets`` is the item targets stacked along a new first dimension.
    """
    imgs, texts, tgts = zip(*batch)
    inputs = clip_processor(text=list(texts),
                            images=list(imgs),
                            return_tensors="pt",
                            padding=True,
                            # CLIP's text encoder has a fixed context length
                            # (77 tokens for this checkpoint). Without
                            # truncation, a long ingredient list overflows the
                            # position embeddings and the forward pass fails.
                            truncation=True)
    return dict(inputs.items()), torch.stack(tgts)


train_view = dataset.match({"split": "train"})
test_view  = dataset.match({"split": "test"})
train_ds   = FoodWasteCLIPDataset(train_view)
test_ds    = FoodWasteCLIPDataset(test_view)
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True,  collate_fn=collate_fn)
test_loader  = DataLoader(test_ds,  batch_size=32, shuffle=False, collate_fn=collate_fn)


# ---------- 5. Imbalance weights ----------
# Per ingredient, count the training samples with a positive return quantity.
# The labels are read straight from the view: iterating `train_ds` would open
# and decode every training image only to inspect its target vector.
freq = torch.zeros(NUM_ING, dtype=torch.float32)
for names, amounts in zip(train_view.values("ingredient_name"),
                          train_view.values("return_quantity")):
    # Same rule as the dataset's target vector: the last non-None amount per
    # ingredient wins, and a sample counts at most once per ingredient.
    final_amt = {ing: amt for ing, amt in zip(names, amounts) if amt is not None}
    for ing, amt in final_amt.items():
        if amt > 0:
            freq[ing2idx[ing]] += 1.0

# Inverse-log frequency; the +2 keeps the logarithm positive when freq == 0.
weight_vec = 1.0 / torch.log(freq + 2.0)
# Normalise to mean 1 so the weights do not change the overall loss scale.
weight_vec = (weight_vec / weight_vec.mean()).to(torch.float32)


# ---------- 6. Model  (with built-in image augmentations) ----------
class CLIPFoodWastePredictor(nn.Module):
    """Frozen CLIP encoders + MLP head, with view-averaged image embeddings.

    The image embedding is the mean of the CLIP embeddings of ``n_augs``
    fixed views of the input (identity, horizontal flip, vertical flip,
    90° rotation, taken in that order). The same views are used in both
    training and evaluation.

    Args:
        clip_model: object exposing ``config.projection_dim``,
            ``get_image_features`` and ``get_text_features`` (a
            ``transformers.CLIPModel`` in this script). Its parameters are
            frozen in place.
        num_ingredients: width of the output vector.
        n_augs: number of views to average, 1 (no augmentation) to 4.

    Raises:
        ValueError: if ``n_augs`` is outside 1..4.
    """
    _NUM_VIEWS = 4  # number of views implemented by _apply_aug

    def __init__(self, clip_model, num_ingredients, n_augs: int = 4):
        super().__init__()
        # Fail at construction time rather than in the first forward pass.
        if not 1 <= n_augs <= self._NUM_VIEWS:
            raise ValueError("n_augs must be between 1 and 4")

        self.clip = clip_model.eval()          # frozen; train() below keeps it in eval
        for p in self.clip.parameters():
            p.requires_grad = False

        self.n_augs = n_augs                   # 1 = no augmentation
        dim = self.clip.config.projection_dim  # 512 for ViT-B/32
        self.head = nn.Sequential(
            nn.Linear(dim * 2, 512),
            nn.ReLU(inplace=True),
            nn.Dropout(0.2),
            nn.Linear(512, num_ingredients)
        )

    def train(self, mode: bool = True):
        """Switch the head between train/eval; the frozen backbone stays in eval.

        ``nn.Module.train`` recurses into every child, so without this
        override ``model.train()`` would undo the ``.eval()`` call made in
        ``__init__`` and put CLIP back into training mode.
        """
        super().train(mode)
        self.clip.eval()
        return self

    # ----- helper: one of the fixed views of a tensor batch -----
    @staticmethod
    def _apply_aug(x, aug_id):
        """Return view ``aug_id`` of ``x``; the last two dims are treated as (H, W)."""
        if aug_id == 0:  # identity
            return x
        elif aug_id == 1:  # horizontal flip
            return torch.flip(x, dims=[-1])
        elif aug_id == 2:  # vertical flip
            return torch.flip(x, dims=[-2])
        elif aug_id == 3:  # 90° rotation
            return torch.rot90(x, k=1, dims=(-2, -1))
        else:
            raise ValueError("aug_id must be 0–3")

    def forward(self, inputs):
        """Map a dict with ``pixel_values``, ``input_ids`` and
        ``attention_mask`` to non-negative per-ingredient predictions."""
        dev = next(self.head.parameters()).device
        inputs = {k: v.to(dev) for k, v in inputs.items()}   # move to the head's device

        # The backbone is frozen, so no graph is needed for its activations.
        with torch.no_grad():
            # ---- 1. average CLIP image embeddings across the views ----
            pixel_values = inputs["pixel_values"]
            img_embs = [
                self.clip.get_image_features(
                    pixel_values=self._apply_aug(pixel_values, aug_id)
                )
                for aug_id in range(self.n_augs)
            ]
            img_emb = torch.stack(img_embs, dim=0).mean(0)    # (B, projection_dim)

            txt_emb = self.clip.get_text_features(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"]
            )                                                 # (B, projection_dim)

        fused = torch.cat([img_emb, txt_emb], dim=1)          # (B, 2 * projection_dim)
        return F.relu(self.head(fused))                       # (B, num_ingredients), ≥ 0


# ---------- 7. Training & evaluation (unchanged, just re-run) ----------
# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
weight_vec = weight_vec.to(device)
# Average the image embedding over all four views.
model = CLIPFoodWastePredictor(clip_model, NUM_ING, n_augs=4).to(device)
# Only the MLP head's parameters are handed to the optimizer.
optimizer = torch.optim.AdamW(params=model.head.parameters(), lr=1e-3, weight_decay=1e-4)
# Unreduced, so the per-ingredient weights can be applied before averaging.
criterion = nn.MSELoss(reduction="none")

def jitter(targets, eps=1.0):
    """Add uniform noise from ``[-eps, eps)`` to the strictly positive targets.

    Entries that are not > 0 receive no noise; the result is clamped at 0.
    """
    noise = torch.empty_like(targets).uniform_(-eps, eps)
    is_positive = targets.gt(0).float()
    return (targets + is_positive * noise).clamp(min=0.0)

EPOCHS, λ_tot = 30, 1e-3
print("Starting training…")
for epoch in range(EPOCHS):
    model.train()
    run = 0.0
    for inp, tgt in train_loader:
        # Noisy copy of the labels, already on the training device.
        tgt = jitter(tgt.to(device))

        optimizer.zero_grad(set_to_none=True)
        pred = model(inp)

        # ingredient-level weighted MSE
        sq_err = criterion(pred, tgt)
        loss_ing = (sq_err * weight_vec).mean()

        # plate-level consistency between predicted and true row totals
        pred_total = pred.sum(dim=1)
        true_total = tgt.sum(dim=1)
        loss_tot = F.l1_loss(pred_total, true_total)

        loss = loss_ing + λ_tot * loss_tot
        loss.backward()
        optimizer.step()

        # Weight the batch mean by batch size so the epoch figure is per sample.
        batch_size = tgt.size(0)
        run += loss.item() * batch_size

    print(f"Epoch {epoch+1:02d}/{EPOCHS} | loss = {run/len(train_ds):.4f}")

print("Training finished.")


@torch.no_grad()
def predict(loader):
    """Run the global ``model`` in eval mode over every batch of ``loader``.

    Returns ``(y_true, y_pred, true_totals, pred_totals)`` as NumPy arrays,
    where the totals are the sums of each row (axis 1).
    """
    model.eval()
    truth_chunks, pred_chunks = [], []
    for batch_inputs, batch_targets in loader:
        truth_chunks.append(batch_targets.numpy())
        batch_preds = model(batch_inputs)
        pred_chunks.append(batch_preds.cpu().numpy())
    y_true = np.concatenate(truth_chunks)
    y_pred = np.concatenate(pred_chunks)
    return y_true, y_pred, y_true.sum(axis=1), y_pred.sum(axis=1)


def evaluate(y_true, y_pred, tot_t, tot_p):
    """Print regression metrics and return the ingredient-level ones.

    Totals (``tot_t`` vs ``tot_p``): MAE and Spearman ρ, printed only.
    Full matrices (``y_true`` vs ``y_pred``): MSE, RMSE, MAE and R², printed
    and returned as a dict with keys ``MSE``, ``RMSE``, ``MAE``, ``R2``.
    """
    # Agreement on the per-sample totals.
    total_abs_err = np.abs(tot_p - tot_t)
    mae_tot = total_abs_err.mean()
    rho, _pvalue = spearmanr(tot_t, tot_p)

    # Errors over the full target matrix.
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    print(f"\nTotal-waste MAE = {mae_tot:.3f}   Spearman ρ = {rho:.3f}")
    print(f"MSE  = {mse:.4f}\nRMSE = {rmse:.4f}\nMAE  = {mae:.4f}\nR²   = {r2:.4f}")
    return {"MSE": mse, "RMSE": rmse, "MAE": mae, "R2": r2}


# ---------- 8. Evaluate ----------
# Predict on the test loader and print the metrics; the returned dict is discarded.
y_t, y_p, tot_t, tot_p = predict(test_loader)
_ = evaluate(y_t, y_p, tot_t, tot_p)


Downloading config file fiftyone.yml from FoodWasteProjectBIBI/food_waste_dataset_with_added_samples


INFO:fiftyone.utils.huggingface:Downloading config file fiftyone.yml from FoodWasteProjectBIBI/food_waste_dataset_with_added_samples


Loading dataset


INFO:fiftyone.utils.huggingface:Loading dataset


Importing samples...


INFO:fiftyone.utils.data.importers:Importing samples...


 100% |█████████████████| 691/691 [101.0ms elapsed, 0s remaining, 6.8K samples/s]     


INFO:eta.core.utils: 100% |█████████████████| 691/691 [101.0ms elapsed, 0s remaining, 6.8K samples/s]     


Downloading 330 media files...


INFO:fiftyone.utils.huggingface:Downloading 330 media files...
100%|██████████| 4/4 [00:19<00:00,  4.78s/it]
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

Starting training…
Epoch 01/30 | loss = 22.7315
Epoch 02/30 | loss = 20.7673
Epoch 03/30 | loss = 18.6595
Epoch 04/30 | loss = 16.5580
Epoch 05/30 | loss = 14.6198
Epoch 06/30 | loss = 12.9512
Epoch 07/30 | loss = 11.3951
Epoch 08/30 | loss = 10.1382
Epoch 09/30 | loss = 9.1062
Epoch 10/30 | loss = 8.2579
Epoch 11/30 | loss = 7.4195
Epoch 12/30 | loss = 6.8064
Epoch 13/30 | loss = 6.3231
Epoch 14/30 | loss = 5.9275
Epoch 15/30 | loss = 5.5333
Epoch 16/30 | loss = 5.0770
Epoch 17/30 | loss = 4.7815
Epoch 18/30 | loss = 4.4385
Epoch 19/30 | loss = 4.2958
Epoch 20/30 | loss = 4.1391
Epoch 21/30 | loss = 3.9300
Epoch 22/30 | loss = 3.8043
Epoch 23/30 | loss = 3.7275
Epoch 24/30 | loss = 3.5828
Epoch 25/30 | loss = 3.4223
Epoch 26/30 | loss = 3.3730
Epoch 27/30 | loss = 3.3167
Epoch 28/30 | loss = 3.2412
Epoch 29/30 | loss = 3.2106
Epoch 30/30 | loss = 3.1575
Training finished.

Total-waste MAE = 69.549   Spearman ρ = 0.690
MSE  = 54.1623
RMSE = 7.3595
MAE  = 1.1662
R²   = 0.6827
