In [1]:
from datetime import datetime
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm
from PIL import Image
from typing import Tuple, Optional
from torchvision import models

import os
import sys
import json
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import pandas as pd
import torchvision.transforms as T
import torchvision.transforms.functional as TF
import random

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Set random seed for reproducibility
torch.manual_seed(789)
np.random.seed(789)
random.seed(789)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(789)

Using device: cuda


# 1. Model and Trainer

## 1.1 Model Definition

In [2]:
import torch.nn as nn
import torch
import torch.nn.functional as F
import torchvision.models as models

class InceptionV3Encoder(nn.Module):
    """InceptionV3 encoder as used in the original Nutrition5k paper"""
    
    def __init__(self, pretrained: bool = False, in_channels: int = 3):
        super().__init__()
        
        # Load InceptionV3 model
        inception = models.inception_v3(pretrained=pretrained, aux_logits=False)
        
        # The output of InceptionV3 features is 2048 channels
        self.out_channels = 2048
        
        # Modify first conv if we have different input channels (e.g., 1 for depth)
        if in_channels != 3:
            self.Conv2d_1a_3x3 = nn.Conv2d(
                in_channels, 32, kernel_size=3, stride=2, bias=False
            )
        else:
            self.Conv2d_1a_3x3 = inception.Conv2d_1a_3x3
        
        # Copy all other layers from InceptionV3
        # First block
        self.Conv2d_2a_3x3 = inception.Conv2d_2a_3x3
        self.Conv2d_2b_3x3 = inception.Conv2d_2b_3x3
        self.maxpool1 = inception.maxpool1
        
        # Second block
        self.Conv2d_3b_1x1 = inception.Conv2d_3b_1x1
        self.Conv2d_4a_3x3 = inception.Conv2d_4a_3x3
        self.maxpool2 = inception.maxpool2
        
        # Inception blocks
        self.Mixed_5b = inception.Mixed_5b
        self.Mixed_5c = inception.Mixed_5c
        self.Mixed_5d = inception.Mixed_5d
        self.Mixed_6a = inception.Mixed_6a
        self.Mixed_6b = inception.Mixed_6b
        self.Mixed_6c = inception.Mixed_6c
        self.Mixed_6d = inception.Mixed_6d
        self.Mixed_6e = inception.Mixed_6e
        self.Mixed_7a = inception.Mixed_7a
        self.Mixed_7b = inception.Mixed_7b
        self.Mixed_7c = inception.Mixed_7c
    
    def forward(self, x):
        """
        Args:
            x: Input tensor (B, C, H, W)
        Returns:
            Feature map (B, 2048, H/32, W/32)
        """
        # First block
        x = self.Conv2d_1a_3x3(x)
        x = self.Conv2d_2a_3x3(x)
        x = self.Conv2d_2b_3x3(x)
        x = self.maxpool1(x)
        
        # Second block
        x = self.Conv2d_3b_1x1(x)
        x = self.Conv2d_4a_3x3(x)
        x = self.maxpool2(x)
        
        # Inception blocks
        x = self.Mixed_5b(x)
        x = self.Mixed_5c(x)
        x = self.Mixed_5d(x)
        x = self.Mixed_6a(x)
        x = self.Mixed_6b(x)
        x = self.Mixed_6c(x)
        x = self.Mixed_6d(x)
        x = self.Mixed_6e(x)
        x = self.Mixed_7a(x)
        x = self.Mixed_7b(x)
        x = self.Mixed_7c(x)
        
        return x

# Early Fusion Module (RGB + Depth fused at input level)
class EarlyFusion(nn.Module):
    """
    Early Fusion: Combine RGB and Depth channels at the input level
    before processing through the network
    """
    
    def __init__(self, pretrained: bool = False, fusion_channels: int = 2048, dropout_rate: float = 0.4):
        super().__init__()
        
        # Create a single encoder with 4 input channels (3 RGB + 1 Depth)
        self.encoder = InceptionV3Encoder(pretrained=pretrained, in_channels=4)
        
        # Regression head for calorie prediction
        self.regression_head = RegressionHead(
            in_channels=self.encoder.out_channels,
            dropout_rate=dropout_rate
        )
    
    def forward(self, rgb, depth):
        """
        Args:
            rgb: RGB images (B, 3, H, W)
            depth: Depth images (B, 1, H, W)
        
        Returns:
            Predicted calories (B, 1)
        """
        # Concatenate RGB and depth along channel dimension
        x = torch.cat([rgb, depth], dim=1)  # (B, 4, H, W)
        
        # Process through the encoder
        features = self.encoder(x)
        
        # Predict calories
        calories = self.regression_head(features)
        
        return calories

# Late Fusion Module (RGB + Depth processed separately and fused at regression level)
class LateFusion(nn.Module):
    """
    Late Fusion: Process RGB and Depth streams independently, then fuse at the regression head level
    """
    
    def __init__(self, pretrained: bool = False, fusion_channels: int = 2048, dropout_rate: float = 0.4):
        super().__init__()
        
        # RGB and Depth encoders
        self.rgb_encoder = InceptionV3Encoder(pretrained=pretrained, in_channels=3)
        self.depth_encoder = InceptionV3Encoder(pretrained=pretrained, in_channels=1)
        
        # Global average pooling
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        
        # Fusion at the feature vector level
        in_features = self.rgb_encoder.out_channels + self.depth_encoder.out_channels
        
        # Fully connected layers for regression
        self.regression_layers = nn.Sequential(
            nn.Flatten(),
            nn.Linear(in_features, 512),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout_rate),
            nn.Linear(512, 256),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout_rate),
            nn.Linear(256, 1)
        )
    
    def forward(self, rgb, depth):
        """
        Args:
            rgb: RGB images (B, 3, H, W)
            depth: Depth images (B, 1, H, W)
        
        Returns:
            Predicted calories (B, 1)
        """
        # Extract features from both streams
        rgb_features = self.rgb_encoder(rgb)    # (B, 2048, H/32, W/32)
        depth_features = self.depth_encoder(depth)  # (B, 2048, H/32, W/32)
        
        # Apply global average pooling
        rgb_features = self.avgpool(rgb_features)    # (B, 2048, 1, 1)
        depth_features = self.avgpool(depth_features)  # (B, 2048, 1, 1)
        
        # Concatenate feature vectors
        fused = torch.cat([rgb_features, depth_features], dim=1)  # (B, 4096, 1, 1)
        
        # Predict calories
        calories = self.regression_layers(fused)
        
        return calories

class RegressionHead(nn.Module):
    def __init__(self, in_channels: int = 2048, dropout_rate: float = 0.4):
        super().__init__()
        
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        
        self.fc_layers = nn.Sequential(
            nn.Flatten(),
            nn.Linear(in_channels, 512),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout_rate),
            nn.Linear(512, 256),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout_rate),
            nn.Linear(256, 1)
        )
    
    def forward(self, x):
        x = self.avgpool(x)  # (B, C, 1, 1)
        x = self.fc_layers(x)  # (B, 1)
        return x


class VolumeEstimator(nn.Module):
    """
    Food volume estimation from overhead depth images following the Nutrition5k paper.
    
    Given:
    - Distance between camera and capture plane: 35.9 cm
    - Per-pixel surface area at this distance: 5.957 × 10^-3 cm²
    
    The volume is calculated by:
    1. Computing per-pixel volume (depth × surface_area)
    2. Summing over all food pixels (using binary threshold segmentation)
    """
    
    def __init__(self, 
                 camera_distance: float = 35.9,  # cm
                 pixel_surface_area: float = 5.957e-3,  # cm²
                 depth_threshold: float = 0.1):  # Threshold for simple segmentation
        super().__init__()
        
        self.camera_distance = camera_distance
        self.pixel_surface_area = pixel_surface_area
        self.depth_threshold = depth_threshold
    
    def forward(self, depth_images):
        """
        Args:
            depth_images: Depth images (B, 1, H, W), normalized to [0, 1] range
        
        Returns:
            volume_estimates: Volume in cm³ for each image (B, 1)
        """
        # Simple threshold-based segmentation for foreground/background
        segmentation_mask = (depth_images > self.depth_threshold).float()
        
        # Convert normalized depth back to actual depth values
        # Assuming depth is normalized to [0, 1] and represents distance from camera
        # For simplicity, we assume the depth represents actual distance in cm scaled to [0, 1]
        depth_cm = depth_images * self.camera_distance
        
        # Calculate per-pixel volume: depth × surface_area
        per_pixel_volume = depth_cm * self.pixel_surface_area  # (B, 1, H, W)
        
        # Apply segmentation mask to consider only food pixels
        masked_volume = per_pixel_volume * segmentation_mask
        
        # Sum over all pixels to get total volume
        volume_estimates = masked_volume.sum(dim=[2, 3])  # (B, 1)
        
        return volume_estimates


class RegressionHeadWithVolume(nn.Module):
    """
    Regression head that concatenates volume estimate to InceptionV3 features.
    
    According to the paper: "concatenating the volume estimation value to the output 
    of the InceptionV3 backbone, before the following two fully connected layers"
    with FC layers of 64 and 1 dimension.
    """
    
    def __init__(self, in_channels: int = 2048, dropout_rate: float = 0.4):
        super().__init__()
        
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        
        # Two FC layers as described in the paper (2048+1 -> 64 -> 1)
        self.fc_layers = nn.Sequential(
            nn.Flatten(),
            nn.Linear(in_channels + 1, 64),  # +1 for volume
            nn.ReLU(inplace=True),
            nn.Dropout(dropout_rate),
            nn.Linear(64, 1)
        )
    
    def forward(self, features, volume):
        """
        Args:
            features: Feature maps from backbone (B, 2048, H, W)
            volume: Volume estimates (B, 1)
        
        Returns:
            Predicted calories (B, 1)
        """
        # Global average pooling
        x = self.avgpool(features)  # (B, 2048, 1, 1)
        x = torch.flatten(x, 1)  # (B, 2048)
        
        # Concatenate volume estimate
        x = torch.cat([x, volume], dim=1)  # (B, 2049)
        
        # Predict calories
        x = self.fc_layers(x)  # (B, 1)
        
        return x

class Nutrition5kModel(nn.Module):
    """
    Implementation of the dual-stream architecture used in the original Nutrition5k paper
    Uses InceptionV3 as the backbone and middle fusion
    """
    
    def __init__(
        self,
        fusion: str = 'middle',
        fusion_channels: int = 2048,
        dropout_rate: float = 0.4,
        pretrained: bool = False,
        use_volume: bool = False
    ):
        super().__init__()
        
        self.use_volume = use_volume
        
        if fusion == 'early':
            self.model = EarlyFusion(
                pretrained=pretrained,
                fusion_channels=fusion_channels,
                dropout_rate=dropout_rate
            )
        elif fusion == 'late':
            self.model = LateFusion(
                pretrained=pretrained,
                fusion_channels=fusion_channels,
                dropout_rate=dropout_rate
            )
        elif fusion == 'image_only':
            # Image-only variant: only RGB is used
            self.rgb_encoder = InceptionV3Encoder(pretrained=pretrained, in_channels=3)
            
            # Volume estimator (if enabled)
            if use_volume:
                self.volume_estimator = VolumeEstimator()
                self.regression_head = RegressionHeadWithVolume(
                    in_channels=self.rgb_encoder.out_channels,
                    dropout_rate=dropout_rate
                )
            else:
                self.regression_head = RegressionHead(
                    in_channels=self.rgb_encoder.out_channels,
                    dropout_rate=dropout_rate
                )
        elif fusion == 'image_volume':
            # Image+Volume variant: RGB encoder + volume as additional signal
            self.rgb_encoder = InceptionV3Encoder(pretrained=pretrained, in_channels=3)
            self.volume_estimator = VolumeEstimator()
            self.regression_head = RegressionHeadWithVolume(
                in_channels=self.rgb_encoder.out_channels,
                dropout_rate=dropout_rate
            )
            self.use_volume = True  # Always use volume for this variant
        else:  # middle fusion
            # RGB and Depth encoders using InceptionV3
            self.rgb_encoder = InceptionV3Encoder(pretrained=pretrained, in_channels=3)
            self.depth_encoder = InceptionV3Encoder(pretrained=pretrained, in_channels=1)
            
            # Create middle fusion module
            from_channels = self.rgb_encoder.out_channels + self.depth_encoder.out_channels
            self.fusion_conv = nn.Sequential(
                nn.Conv2d(from_channels, fusion_channels, kernel_size=1, bias=False),
                nn.BatchNorm2d(fusion_channels),
                nn.ReLU(inplace=True)
            )
            
            # Volume estimator (if enabled)
            if use_volume:
                self.volume_estimator = VolumeEstimator()
                self.regression_head = RegressionHeadWithVolume(
                    in_channels=fusion_channels,
                    dropout_rate=dropout_rate
                )
            else:
                self.regression_head = RegressionHead(
                    in_channels=fusion_channels,
                    dropout_rate=dropout_rate
                )
    
    def forward(self, rgb, depth):
        """
        Args:
            rgb: RGB images (B, 3, H, W)
            depth: Depth images (B, 1, H, W)
        
        Returns:
            calorie_pred: Predicted calories (B, 1)
        """
        if hasattr(self, 'model'):
            return self.model(rgb, depth)
        
        # Calculate volume estimate if enabled
        volume = None
        if self.use_volume and hasattr(self, 'volume_estimator'):
            volume = self.volume_estimator(depth)  # (B, 1)
        
        # Image-only or Image+Volume variant
        if hasattr(self, 'rgb_encoder') and not hasattr(self, 'depth_encoder'):
            rgb_features = self.rgb_encoder(rgb)  # (B, 2048, H/32, W/32)
            
            if volume is not None:
                calorie_pred = self.regression_head(rgb_features, volume)
            else:
                calorie_pred = self.regression_head(rgb_features)
            
            return calorie_pred
        
        # Extract features from both streams
        rgb_features = self.rgb_encoder(rgb)      # (B, 2048, H/32, W/32)
        depth_features = self.depth_encoder(depth)  # (B, 2048, H/32, W/32)
        
        # Middle fusion - concatenate and apply 1x1 conv
        fused = torch.cat([rgb_features, depth_features], dim=1)  # (B, 4096, H/32, W/32)
        fused = self.fusion_conv(fused)  # (B, 2048, H/32, W/32)
        
        # Predict calories (with or without volume)
        if volume is not None:
            calorie_pred = self.regression_head(fused, volume)
        else:
            calorie_pred = self.regression_head(fused)
        
        return calorie_pred
    
    def get_num_parameters(self):
        """Get total number of trainable parameters"""
        return sum(p.numel() for p in self.parameters() if p.requires_grad)

# Factory function to build Nutrition5k models with different fusion types
def build_nutrition5k_model(fusion='middle', pretrained=False, dropout_rate=0.4, fusion_channels=2048, 
                           use_volume=False, **kwargs):
    """
    Factory function to build models using the Nutrition5k paper architecture (InceptionV3 backbone)
    
    Args:
        fusion: Fusion type ('early', 'middle', 'late', 'image_only', or 'image_volume')
        pretrained: Whether to use pretrained weights for InceptionV3
        dropout_rate: Dropout rate for regression head
        fusion_channels: Number of channels after fusion
        use_volume: Whether to use volume estimation as additional signal (uses simple threshold-based segmentation)
    
    Returns:
        Nutrition5k model with specified configuration
    """
    return Nutrition5kModel(
        fusion=fusion,
        fusion_channels=fusion_channels,
        dropout_rate=dropout_rate,
        pretrained=pretrained,
        use_volume=use_volume
    )

## 1.2 Trainer Definition

In [3]:
import math

def get_warmup_cosine_scheduler(optimizer, warmup_steps, total_steps, min_lr_ratio=0.0):
    def lr_lambda(current_step):
        if current_step < warmup_steps:
            return float(current_step) / float(max(1, warmup_steps))
        else:
            progress = float(current_step - warmup_steps) / float(max(1, total_steps - warmup_steps))
            return min_lr_ratio + (1.0 - min_lr_ratio) * 0.5 * (1.0 + math.cos(math.pi * progress))
    
    return optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)


class EarlyStopping:
    """Early stopping to stop training when validation loss stops improving"""
    
    def __init__(self, patience: int = 10, min_delta: float = 0.0, mode: str = 'min'):
        """
        Args:
            patience: Number of epochs with no improvement after which training will be stopped
            min_delta: Minimum change to qualify as an improvement
            mode: 'min' or 'max' - whether lower or higher metric is better
        """
        self.patience = patience
        self.min_delta = min_delta
        self.mode = mode
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.best_epoch = 0
        
    def __call__(self, score, epoch):
        if self.best_score is None:
            self.best_score = score
            self.best_epoch = epoch
            return False
        
        if self.mode == 'min':
            improved = score < (self.best_score - self.min_delta)
        else:
            improved = score > (self.best_score + self.min_delta)
        
        if improved:
            self.best_score = score
            self.best_epoch = epoch
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
                
        return self.early_stop


class Trainer:
    """Training manager for calorie prediction"""
    
    def __init__(
        self,
        model,
        train_loader,
        val_loader,
        criterion,
        optimizer,
        scheduler,
        device,
        output_dir,
        early_stopping_patience=15,
        scheduler_step_on_batch=False
    ):
        self.model = model
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.criterion = criterion
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.device = device
        self.output_dir = output_dir
        self.scheduler_step_on_batch = scheduler_step_on_batch
        
        # Early stopping
        self.early_stopping = EarlyStopping(
            patience=early_stopping_patience,
            min_delta=0.1,
            mode='min'
        )
        
        # Tensorboard
        self.writer = SummaryWriter(log_dir=os.path.join(output_dir, 'tensorboard'))
        
        # Tracking
        self.best_val_loss = float('inf')
        self.train_losses = []
        self.val_losses = []
        self.best_metrics = {}
    
    def train_epoch(self):
        """Train for one epoch"""
        self.model.train()
        total_loss = 0.0
        num_batches = 0
        
        pbar = tqdm(self.train_loader, desc="Training")
        for batch_idx, batch in enumerate(pbar):
            # Move to device
            rgb = batch['rgb'].to(self.device)
            depth = batch['depth'].to(self.device)
            calories = batch['calorie'].to(self.device)
            
            # Forward pass
            self.optimizer.zero_grad()
            calorie_pred = self.model(rgb, depth)
            
            # Compute loss (MSE for calorie prediction)
            loss = self.criterion(calorie_pred.squeeze(), calories)
            
            # Backward pass
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)
            self.optimizer.step()
            
            # Update learning rate (if step_on_batch)
            if self.scheduler_step_on_batch and self.scheduler:
                self.scheduler.step()
            
            # Track metrics
            total_loss += loss.item()
            num_batches += 1
            
            # Update progress bar
            pbar.set_postfix({'Loss': f'{loss.item():.4f}'})
        
        return total_loss / num_batches
    
    def validate_epoch(self):
        """Validate for one epoch"""
        self.model.eval()
        total_loss = 0.0
        all_predictions = []
        all_targets = []
        
        with torch.no_grad():
            for batch in tqdm(self.val_loader, desc="Validation"):
                # Move to device
                rgb = batch['rgb'].to(self.device)
                depth = batch['depth'].to(self.device)
                calories = batch['calorie'].to(self.device)
                
                # Forward pass
                calorie_pred = self.model(rgb, depth)
                
                # Compute loss
                loss = self.criterion(calorie_pred.squeeze(), calories)
                total_loss += loss.item()
                
                # Store predictions and targets for metrics
                all_predictions.extend(calorie_pred.squeeze().cpu().numpy())
                all_targets.extend(calories.cpu().numpy())
        
        # Calculate metrics
        avg_loss = total_loss / len(self.val_loader)
        predictions = np.array(all_predictions)
        targets = np.array(all_targets)
        
        mae = np.mean(np.abs(predictions - targets))
        
        return avg_loss, mae
    
    def train(self, num_epochs):
        """Full training loop"""
        print(f"Starting training for {num_epochs} epochs...")
        
        for epoch in range(num_epochs):
            print(f"\nEpoch {epoch+1}/{num_epochs}")
            
            # Train
            train_loss = self.train_epoch()
            
            # Validate
            val_loss, mae = self.validate_epoch()
            
            # Update learning rate (if not step_on_batch)
            if not self.scheduler_step_on_batch and self.scheduler:
                self.scheduler.step(val_loss)
            
            # Log metrics
            self.writer.add_scalar('Loss/Train', train_loss, epoch)
            self.writer.add_scalar('Loss/Val', val_loss, epoch)
            self.writer.add_scalar('MAE', mae, epoch)
            
            # Save best model
            if val_loss < self.best_val_loss:
                self.best_val_loss = val_loss
                self.best_metrics = {
                    'epoch': epoch + 1,
                    'val_loss': val_loss,
                    'mae': mae,
                }
                
                # Save model checkpoint
                torch.save({
                    'epoch': epoch + 1,
                    'model_state_dict': self.model.state_dict(),
                    'optimizer_state_dict': self.optimizer.state_dict(),
                    'val_loss': val_loss,
                    'mae': mae,
                }, os.path.join(self.output_dir, 'best_model.pth'))
            
            # Print epoch results
            print(f"Train Loss: {train_loss:.4f}")
            print(f"Val Loss: {val_loss:.4f}")
            print(f"MAE: {mae:.2f}")
            
            # Early stopping
            if self.early_stopping(val_loss, epoch):
                print(f"Early stopping triggered after {epoch+1} epochs")
                print(f"Best epoch: {self.early_stopping.best_epoch+1}")
                break
        
        self.writer.close()
        print(f"\nTraining completed!")
        print(f"Best validation loss: {self.best_val_loss:.4f}")


# 2. Dataset

## 2.1 Dataset Definition

In [None]:
# Dataset Implementation
class Nutrition5KDataset(Dataset):
    """
    Dataset class for Nutrition5K with multi-modal inputs (RGB + Depth)
    """
    
    def __init__(
        self,
        csv_path: str,
        data_root: str,
        split: str = 'train',
        augment: bool = True,
        img_size: int = 224,
    ):
        self.data_root = data_root
        self.split = split
        self.augment = augment
        self.img_size = img_size
        
        # Load CSV
        self.df = pd.read_csv(csv_path)
        if 'Value' in self.df.columns and 'calories' not in self.df.columns:
            self.df = self.df.rename(columns={'Value': 'calories'})
        if 'calories' not in self.df.columns:
            raise ValueError("CSV file must contain a 'calories' column or a 'Value' column that can be renamed")
        self.df = self.df[self.df['calories'] < 3000].reset_index(drop=True)
                
        self.color_dir = os.path.join(data_root, 'color')
        self.depth_raw_dir = os.path.join(data_root, 'depth_raw')
        
        self.valid_indices = self._validate_dataset()
        print(f"Loaded {len(self.valid_indices)} valid samples out of {len(self.df)}")
        
        # Color normalization (ImageNet stats as baseline)
        self.color_normalize = T.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225]
        )
        
    def _validate_dataset(self):
        """This method ensure that the code don't break when there are corrupted images."""
        valid_indices = []
        
        for idx in range(len(self.df)):
            dish_id = self.df.iloc[idx]['ID']
            
            rgb_path = os.path.join(self.color_dir, dish_id, 'rgb.png')
            depth_path = os.path.join(self.depth_raw_dir, dish_id, 'depth_raw.png')
            
            # Check if files exist
            if not os.path.exists(rgb_path):
                continue
            if not os.path.exists(depth_path):
                continue
            
            # Try to load images to check for corruption
            try:
                with Image.open(rgb_path) as img:
                    img.verify()
                with Image.open(depth_path) as img:
                    img.verify()
                valid_indices.append(idx)
            except Exception as e:
                continue
                
        return valid_indices
    
    def __len__(self):
        return len(self.valid_indices)
    
    def _load_image_safe(self, path: str, mode: str = 'RGB') -> Optional[Image.Image]:
        """Safely load an image with error handling"""
        try:
            with Image.open(path) as img:
                return img.convert(mode).copy()
        except Exception as e:
            return None
    
    def _apply_augmentation(self, rgb_img, depth_img):
        """Apply geometric augmentation only (no color changes)"""
        if not self.augment:
            return rgb_img, depth_img
        
        # Convert to tensors first
        rgb_tensor = TF.to_tensor(rgb_img)
        depth_tensor = TF.to_tensor(depth_img)
        
        # Random horizontal flip
        if random.random() > 0.5:
            rgb_tensor = TF.hflip(rgb_tensor)
            depth_tensor = TF.hflip(depth_tensor)
        
        # Random rotation (±15 degrees)
        if random.random() > 0.5:
            angle = random.uniform(-15, 15)
            rgb_tensor = TF.rotate(rgb_tensor, angle)
            depth_tensor = TF.rotate(depth_tensor, angle)
        
        # Convert back to PIL
        rgb_img = TF.to_pil_image(rgb_tensor)
        depth_img = TF.to_pil_image(depth_tensor)
        
        return rgb_img, depth_img
    
    def _resize_and_center_crop(self, img, target_size: int = 256):
        """
        Resize and center crop image to target_size x target_size
        Matches the preprocessing in the Nutrition5k paper
        
        Args:
            img: PIL Image
            target_size: Target size (default 256x256 as per paper)
        
        Returns:
            Cropped PIL Image
        """
        # Get original dimensions
        width, height = img.size
        
        # Resize so the shorter side is target_size
        if width < height:
            new_width = target_size
            new_height = int(target_size * height / width)
        else:
            new_height = target_size
            new_width = int(target_size * width / height)
        
        img = img.resize((new_width, new_height), Image.LANCZOS)
        
        # Center crop to target_size x target_size
        left = (new_width - target_size) // 2
        top = (new_height - target_size) // 2
        right = left + target_size
        bottom = top + target_size
        
        img = img.crop((left, top, right, bottom))
        
        return img
    
    def __getitem__(self, idx):
        """Get a single sample"""
        actual_idx = self.valid_indices[idx]
        row = self.df.iloc[actual_idx]
        
        dish_id = row['ID']
        calorie = float(row['calories'])
        
        # Load images
        rgb_path = os.path.join(self.color_dir, dish_id, 'rgb.png')
        depth_path = os.path.join(self.depth_raw_dir, dish_id, 'depth_raw.png')
        
        rgb_img = self._load_image_safe(rgb_path, 'RGB')
        depth_img = self._load_image_safe(depth_path, 'L')  # Grayscale for depth
        
        # Fallback: return a black image
        if rgb_img is None or depth_img is None:
            rgb_img = Image.new('RGB', (self.img_size, self.img_size), (0, 0, 0))
            depth_img = Image.new('L', (self.img_size, self.img_size), 0)
        
        # Apply augmentation
        rgb_img, depth_img = self._apply_augmentation(rgb_img, depth_img)
        
        # Resize and center crop to match paper preprocessing (256x256)
        rgb_img = self._resize_and_center_crop(rgb_img, target_size=self.img_size)
        depth_img = self._resize_and_center_crop(depth_img, target_size=self.img_size)
        
        # Convert to tensors
        rgb_tensor = TF.to_tensor(rgb_img)  # (3, H, W)
        depth_tensor = TF.to_tensor(depth_img)  # (1, H, W)
        
        # Normalize RGB
        rgb_tensor = self.color_normalize(rgb_tensor)
        
        # Normalize depth (0-1 range, assuming depth is already in reasonable range)
        depth_tensor = depth_tensor / 255.0
        
        return {
            'dish_id': dish_id,
            'rgb': rgb_tensor,
            'depth': depth_tensor,
            'calorie': torch.tensor(calorie, dtype=torch.float32)
        }


def create_train_val_split(csv_path: str, val_ratio: float = 0.15, random_seed: int = 42):
    """
    Create train/validation split CSV files
    """
    # Read original CSV
    df = pd.read_csv(csv_path)    
    
    # Shuffle with fixed seed
    df_shuffled = df.sample(frac=1, random_state=random_seed).reset_index(drop=True)
    
    # Split
    val_size = int(len(df_shuffled) * val_ratio)
    train_df = df_shuffled[val_size:]
    val_df = df_shuffled[:val_size]
    
    # Save temporary CSV files
    base_dir = os.path.dirname(csv_path)
    train_csv = os.path.join(base_dir, 'train_split.csv')
    val_csv = os.path.join(base_dir, 'val_split.csv')
    
    train_df.to_csv(train_csv, index=False)
    val_df.to_csv(val_csv, index=False)
    
    return train_csv, val_csv

## 2.2 Dataset Loading

In [5]:
# Configuration - Update these paths to match your setup
DATA_ROOT = './Nutrition5K/Nutrition5K/train'  # Path to training data directory
CSV_PATH = './Nutrition5K/Nutrition5K/nutrition5k_train.csv'  # Path to training CSV
OUTPUT_DIR = './experiments'  # Directory to save experiment results

# Global training hyperparameters (learning rate and weight decay set per experiment)
BATCH_SIZE = 32
NUM_EPOCHS = 40
VAL_RATIO = 0.15
IMG_SIZE = 256
NUM_WORKERS = 4

print("Configuration:")
print(f"  Data root: {DATA_ROOT}")
print(f"  CSV path: {CSV_PATH}")
print(f"  Output directory: {OUTPUT_DIR}")
print(f"  Batch size: {BATCH_SIZE}")
print(f"  Number of epochs: {NUM_EPOCHS}")
print(f"  Image size: {IMG_SIZE}")
print(f"  Workers: {NUM_WORKERS}")

# Create output directory
os.makedirs(OUTPUT_DIR, exist_ok=True)


Configuration:
  Data root: ./Nutrition5K/Nutrition5K/train
  CSV path: ./Nutrition5K/Nutrition5K/nutrition5k_train.csv
  Output directory: ./experiments
  Batch size: 32
  Number of epochs: 40
  Image size: 256
  Workers: 4


### Dataset Loading

In [6]:
# Create train/validation split
print("Creating train/validation split...")
train_csv, val_csv = create_train_val_split(
    CSV_PATH,
    val_ratio=VAL_RATIO,
    random_seed=42
)

print(f"Train CSV: {train_csv}")
print(f"Validation CSV: {val_csv}")

# Load a sample to check data
sample_dataset = Nutrition5KDataset(
    csv_path=train_csv,
    data_root=DATA_ROOT,
    split='train',
    augment=False,  # No augmentation for checking
    img_size=IMG_SIZE,
)

print(f"\nDataset loaded successfully!")
print(f"Training samples: {len(sample_dataset)}")
print(f"RGB shape: {sample_dataset[0]['rgb'].shape}")
print(f"Depth shape: {sample_dataset[0]['depth'].shape}")


Creating train/validation split...
Train CSV: ./Nutrition5K/Nutrition5K/train_split.csv
Validation CSV: ./Nutrition5K/Nutrition5K/val_split.csv
Loaded 2804 valid samples out of 2805

Dataset loaded successfully!
Training samples: 2804
RGB shape: torch.Size([3, 256, 256])
Depth shape: torch.Size([1, 256, 256])


# 3. Experiments
We'll conduct experiments to compare different fusion strategies using the InceptionV3 architecture.

**Architecture**: InceptionV3
- **RGB encoder**: InceptionV3 (in_channels=3)
- **Depth encoder**: InceptionV3 (in_channels=1) 
- **Fusion**: Various fusion strategies (early, middle, late)
- **Volume estimation**: Optional food volume calculation from depth images

**Experiments**:
1. **InceptionV3 - Middle Fusion**: RGB and Depth features concatenated at feature map level, then fused with 1×1 conv
2. **InceptionV3 - Early Fusion**: RGB and Depth concatenated at input level (4 channels), processed by single encoder
3. **InceptionV3 - Late Fusion**: RGB and Depth processed separately, features concatenated after global pooling
4. **InceptionV3 - Image+Volume**: RGB encoder only + volume estimate from depth as additional signal
5. **InceptionV3 - Middle+Volume**: Middle fusion (RGB+Depth) + volume estimate as additional signal

## 3.1 InceptionV3 - Middle Fusion

In [None]:
# Define experiment hyperparamers
# BATCH_SIZE = 32
NUM_EPOCHS = 40
DROPOUT_RATE = 0.4
LEARNING_RATE = 3e-4
WEIGHT_DECAY = 1e-6
EARLY_STOPPING_PATIENCE = 15
WARMUP_RATIO = 0.1
MIN_LR_RATIO = 0.05
FUSION_CHANNELS = 2048

def train_nutrition5k_model(fusion_type='middle'):
    """Train the Nutrition5k model with InceptionV3 and specified fusion type"""
    
    print("="*60)
    print(f"TRAINING: Nutrition5k InceptionV3 + {fusion_type.capitalize()} Fusion")
    print("="*60)
    
    # Create datasets
    train_dataset = Nutrition5KDataset(
        csv_path=train_csv,
        data_root=DATA_ROOT,
        split='train',
        augment=False,
        img_size=IMG_SIZE,
    )
    
    val_dataset = Nutrition5KDataset(
        csv_path=val_csv,
        data_root=DATA_ROOT,
        split='val',
        augment=False,
        img_size=IMG_SIZE,
    )
    
    # Create data loaders
    train_loader = DataLoader(
        train_dataset,
        batch_size=BATCH_SIZE,
        shuffle=True,
        num_workers=NUM_WORKERS,
        pin_memory=True if torch.cuda.is_available() else False,
        drop_last=True
    )
    
    val_loader = DataLoader(
        val_dataset,
        batch_size=BATCH_SIZE,
        shuffle=False,
        num_workers=NUM_WORKERS,
        pin_memory=True if torch.cuda.is_available() else False
    )
    
    # Build model with specified fusion type
    model = build_nutrition5k_model(
        fusion=fusion_type,
        pretrained=False,
        dropout_rate=DROPOUT_RATE,
        fusion_channels=FUSION_CHANNELS
    )
    model = model.to(device)
    
    print(f"Model parameters: {model.get_num_parameters():,}")
    print(f"Training samples: {len(train_dataset)}")
    print(f"Validation samples: {len(val_dataset)}")
    
    # Loss function
    criterion = nn.MSELoss()
    
    # Optimizer
    optimizer = optim.AdamW(
        model.parameters(),
        lr=LEARNING_RATE,
        weight_decay=WEIGHT_DECAY
    )
    
    print(f"Learning rate: {LEARNING_RATE}")
    print(f"Weight decay: {WEIGHT_DECAY}")
    
    # Learning rate scheduler
    steps_per_epoch = len(train_loader)
    total_steps = NUM_EPOCHS * steps_per_epoch
    warmup_steps = int(total_steps * WARMUP_RATIO)
    
    scheduler = get_warmup_cosine_scheduler(
        optimizer, 
        warmup_steps=warmup_steps, 
        total_steps=total_steps,
        min_lr_ratio=MIN_LR_RATIO
    )
    
    # Create experiment directory
    exp_name = f"inceptionv3_{fusion_type}_fusion_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
    exp_dir = os.path.join(OUTPUT_DIR, 'nutrition5k_experiments', exp_name)
    os.makedirs(exp_dir, exist_ok=True)
    
    # Save experiment configuration
    config = {
        'fusion': fusion_type,
        'pretrained': False,
        'dropout_rate': DROPOUT_RATE,
        'fusion_channels': FUSION_CHANNELS,
        'learning_rate': LEARNING_RATE,
        'weight_decay': WEIGHT_DECAY,
        'batch_size': BATCH_SIZE,
        'img_size': IMG_SIZE,
        'num_epochs': NUM_EPOCHS
    }
    
    with open(os.path.join(exp_dir, 'config.json'), 'w') as f:
        json.dump(config, f, indent=4)
    
    # Create trainer
    trainer = Trainer(
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        device=device,
        output_dir=exp_dir,
        early_stopping_patience=EARLY_STOPPING_PATIENCE,
        scheduler_step_on_batch=False
    )
    
    # Train the model
    trainer.train(NUM_EPOCHS)
    
    print(f"\nExperiment completed! Results saved to: {exp_dir}")
    return trainer.best_metrics

# Run an experiment with middle fusion
middle_fusion_results = train_nutrition5k_model(fusion_type='middle')

TRAINING: Nutrition5k InceptionV3 + Middle Fusion
Loaded 2804 valid samples out of 2805
Loaded 495 valid samples out of 495
Model parameters: 53,143,873
Training samples: 2804
Validation samples: 495
Learning rate: 0.0003
Weight decay: 1e-06
Starting training for 40 epochs...

Epoch 1/40


Training: 100%|██████████| 87/87 [00:15<00:00,  5.72it/s, Loss=128568.6797]
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.13it/s]


Train Loss: 99179.5700
Val Loss: 107412.9351
MAE: 240.64

Epoch 2/40


Training: 100%|██████████| 87/87 [00:14<00:00,  6.18it/s, Loss=14551.3506] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.31it/s]


Train Loss: 44613.2511
Val Loss: 22115.9072
MAE: 103.60

Epoch 3/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.24it/s, Loss=8529.9629] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.52it/s]


Train Loss: 14990.2540
Val Loss: 15339.2216
MAE: 89.52

Epoch 4/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.24it/s, Loss=10611.9082]
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.42it/s]


Train Loss: 12844.5238
Val Loss: 14614.1202
MAE: 86.15

Epoch 5/40


Training: 100%|██████████| 87/87 [00:14<00:00,  6.19it/s, Loss=10989.2109]
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.62it/s]


Train Loss: 12333.9567
Val Loss: 13903.3691
MAE: 84.82

Epoch 6/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.34it/s, Loss=7037.2222] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.65it/s]


Train Loss: 13701.8433
Val Loss: 15968.9327
MAE: 87.62

Epoch 7/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.43it/s, Loss=9634.6934] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.52it/s]


Train Loss: 9885.7110
Val Loss: 13036.7188
MAE: 78.05

Epoch 8/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.32it/s, Loss=15831.9697]
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.27it/s]


Train Loss: 13061.2367
Val Loss: 37848.2272
MAE: 150.94

Epoch 9/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.28it/s, Loss=15873.6582]
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.62it/s]


Train Loss: 13630.2193
Val Loss: 26709.6769
MAE: 118.80

Epoch 10/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.29it/s, Loss=12883.2695]
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.69it/s]


Train Loss: 11182.7567
Val Loss: 16951.6092
MAE: 94.59

Epoch 11/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.38it/s, Loss=8178.9229] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.57it/s]


Train Loss: 8048.6062
Val Loss: 12620.6396
MAE: 74.06

Epoch 12/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.34it/s, Loss=7492.9854] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.49it/s]


Train Loss: 10334.1747
Val Loss: 15044.2647
MAE: 86.57

Epoch 13/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.30it/s, Loss=3736.9170] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.67it/s]


Train Loss: 7712.2835
Val Loss: 10110.7316
MAE: 70.29

Epoch 14/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.35it/s, Loss=12397.4609]
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.67it/s]


Train Loss: 4862.7000
Val Loss: 9211.0626
MAE: 63.99

Epoch 15/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.34it/s, Loss=4223.6538] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.44it/s]


Train Loss: 4543.4875
Val Loss: 9900.7881
MAE: 65.54

Epoch 16/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.34it/s, Loss=2259.6072] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.78it/s]


Train Loss: 4426.4736
Val Loss: 9328.8389
MAE: 66.59

Epoch 17/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.31it/s, Loss=1298.2256] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.48it/s]


Train Loss: 3814.8753
Val Loss: 8683.5598
MAE: 63.18

Epoch 18/40


Training: 100%|██████████| 87/87 [00:14<00:00,  6.18it/s, Loss=4756.7666] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.51it/s]


Train Loss: 4460.9755
Val Loss: 9520.8630
MAE: 66.56

Epoch 19/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.25it/s, Loss=3241.6943] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.34it/s]


Train Loss: 4638.1852
Val Loss: 8618.1980
MAE: 63.57

Epoch 20/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.23it/s, Loss=2759.9041] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.33it/s]


Train Loss: 4441.7072
Val Loss: 10380.2209
MAE: 66.44

Epoch 21/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.29it/s, Loss=8066.5176] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.26it/s]


Train Loss: 4397.2711
Val Loss: 9014.0540
MAE: 64.05

Epoch 22/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.23it/s, Loss=2299.1699] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.47it/s]


Train Loss: 3708.5228
Val Loss: 8564.3510
MAE: 61.92

Epoch 23/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.30it/s, Loss=3101.3701] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.50it/s]


Train Loss: 3872.7702
Val Loss: 9195.6286
MAE: 63.51

Epoch 24/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.28it/s, Loss=1211.8209] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.49it/s]


Train Loss: 3619.2267
Val Loss: 8286.5531
MAE: 59.58

Epoch 25/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.31it/s, Loss=4860.5879] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.64it/s]


Train Loss: 4642.6053
Val Loss: 11539.1972
MAE: 76.86

Epoch 26/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.31it/s, Loss=5091.5918] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.65it/s]


Train Loss: 5310.7747
Val Loss: 9601.1501
MAE: 65.47

Epoch 27/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.28it/s, Loss=3598.9797]
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.57it/s]


Train Loss: 4402.4465
Val Loss: 8586.3208
MAE: 60.98

Epoch 28/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.28it/s, Loss=3286.5671] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.72it/s]


Train Loss: 4379.7765
Val Loss: 9463.2135
MAE: 64.45

Epoch 29/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.28it/s, Loss=3635.2405] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.29it/s]


Train Loss: 3820.1549
Val Loss: 7934.8172
MAE: 57.35

Epoch 30/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.23it/s, Loss=9945.4971] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.53it/s]


Train Loss: 5754.6514
Val Loss: 10848.7680
MAE: 67.77

Epoch 31/40


Training: 100%|██████████| 87/87 [00:14<00:00,  6.12it/s, Loss=2592.0410] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.46it/s]


Train Loss: 4085.6081
Val Loss: 8705.7453
MAE: 62.18

Epoch 32/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.28it/s, Loss=1845.0674] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.59it/s]


Train Loss: 3637.0483
Val Loss: 8562.5231
MAE: 62.51

Epoch 33/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.27it/s, Loss=5132.6309] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.37it/s]


Train Loss: 3630.7866
Val Loss: 9332.6526
MAE: 62.17

Epoch 34/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.27it/s, Loss=1935.2677] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.50it/s]


Train Loss: 3029.1384
Val Loss: 7578.5881
MAE: 56.28

Epoch 35/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.31it/s, Loss=5809.5571] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.51it/s]


Train Loss: 4695.0795
Val Loss: 9082.4254
MAE: 66.55

Epoch 36/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.28it/s, Loss=2360.2610] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.70it/s]


Train Loss: 4250.1567
Val Loss: 8774.8670
MAE: 62.02

Epoch 37/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.41it/s, Loss=5048.2212]
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.41it/s]


Train Loss: 3479.9186
Val Loss: 9305.3121
MAE: 62.28

Epoch 38/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.33it/s, Loss=1282.8358] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.34it/s]


Train Loss: 3230.0179
Val Loss: 7757.0370
MAE: 56.59

Epoch 39/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.33it/s, Loss=3625.0686] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.55it/s]


Train Loss: 4668.2630
Val Loss: 12126.1959
MAE: 82.30

Epoch 40/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.33it/s, Loss=8918.3311] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.40it/s]

Train Loss: 5625.5605
Val Loss: 14431.2035
MAE: 85.84

Training completed!
Best validation loss: 7578.5881

Experiment completed! Results saved to: ../experiments/nutrition5k_experiments/inceptionv3_middle_fusion_20251024_130923





## 3.2 InceptionV3 - Early Fusion

In [None]:
# Define experiment hyperparamers
BATCH_SIZE = 32
NUM_EPOCHS = 40
DROPOUT_RATE = 0.4
LEARNING_RATE = 3e-4
WEIGHT_DECAY = 1e-6
EARLY_STOPPING_PATIENCE = 15
WARMUP_RATIO = 0.1
MIN_LR_RATIO = 0.05
FUSION_CHANNELS = 2048

def train_nutrition5k_model(fusion_type='middle'):
    """Train the Nutrition5k model with InceptionV3 and specified fusion type"""
    
    print("="*60)
    print(f"TRAINING: Nutrition5k InceptionV3 + {fusion_type.capitalize()} Fusion")
    print("="*60)
    
    # Create datasets
    train_dataset = Nutrition5KDataset(
        csv_path=train_csv,
        data_root=DATA_ROOT,
        split='train',
        augment=False,
        img_size=IMG_SIZE,
    )
    
    val_dataset = Nutrition5KDataset(
        csv_path=val_csv,
        data_root=DATA_ROOT,
        split='val',
        augment=False,
        img_size=IMG_SIZE,
    )
    
    # Create data loaders
    train_loader = DataLoader(
        train_dataset,
        batch_size=BATCH_SIZE,
        shuffle=True,
        num_workers=NUM_WORKERS,
        pin_memory=True if torch.cuda.is_available() else False,
        drop_last=True
    )
    
    val_loader = DataLoader(
        val_dataset,
        batch_size=BATCH_SIZE,
        shuffle=False,
        num_workers=NUM_WORKERS,
        pin_memory=True if torch.cuda.is_available() else False
    )
    
    # Build model with specified fusion type
    model = build_nutrition5k_model(
        fusion=fusion_type,
        pretrained=False,
        dropout_rate=DROPOUT_RATE,
        fusion_channels=FUSION_CHANNELS
    )
    model = model.to(device)
    
    print(f"Model parameters: {model.get_num_parameters():,}")
    print(f"Training samples: {len(train_dataset)}")
    print(f"Validation samples: {len(val_dataset)}")
    
    # Loss function
    criterion = nn.MSELoss()
    
    # Optimizer
    optimizer = optim.AdamW(
        model.parameters(),
        lr=LEARNING_RATE,
        weight_decay=WEIGHT_DECAY
    )
    
    print(f"Learning rate: {LEARNING_RATE}")
    print(f"Weight decay: {WEIGHT_DECAY}")
    
    # Learning rate scheduler
    steps_per_epoch = len(train_loader)
    total_steps = NUM_EPOCHS * steps_per_epoch
    warmup_steps = int(total_steps * WARMUP_RATIO)
    
    scheduler = get_warmup_cosine_scheduler(
        optimizer, 
        warmup_steps=warmup_steps, 
        total_steps=total_steps,
        min_lr_ratio=MIN_LR_RATIO
    )
    
    # Create experiment directory
    exp_name = f"inceptionv3_{fusion_type}_fusion_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
    exp_dir = os.path.join(OUTPUT_DIR, 'nutrition5k_experiments', exp_name)
    os.makedirs(exp_dir, exist_ok=True)
    
    # Save experiment configuration
    config = {
        'fusion': fusion_type,
        'pretrained': False,
        'dropout_rate': DROPOUT_RATE,
        'fusion_channels': FUSION_CHANNELS,
        'learning_rate': LEARNING_RATE,
        'weight_decay': WEIGHT_DECAY,
        'batch_size': BATCH_SIZE,
        'img_size': IMG_SIZE,
        'num_epochs': NUM_EPOCHS
    }
    
    with open(os.path.join(exp_dir, 'config.json'), 'w') as f:
        json.dump(config, f, indent=4)
    
    # Create trainer
    trainer = Trainer(
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        device=device,
        output_dir=exp_dir,
        early_stopping_patience=EARLY_STOPPING_PATIENCE,
        scheduler_step_on_batch=False
    )
    
    # Train the model
    trainer.train(NUM_EPOCHS)
    
    print(f"\nExperiment completed! Results saved to: {exp_dir}")
    return trainer.best_metrics

# Run an experiment with middle fusion
early_fusion_results = train_nutrition5k_model(fusion_type='early')

TRAINING: Nutrition5k InceptionV3 + Early Fusion
Loaded 2804 valid samples out of 2805
Loaded 495 valid samples out of 495
Model parameters: 22,966,465
Training samples: 2804
Validation samples: 495
Learning rate: 0.0003
Weight decay: 1e-06
Starting training for 40 epochs...

Epoch 1/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.49it/s, Loss=66747.9062] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.54it/s]


Train Loss: 99325.6495
Val Loss: 107376.2798
MAE: 240.56

Epoch 2/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.57it/s, Loss=22843.2305] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.52it/s]


Train Loss: 42928.3323
Val Loss: 19980.8986
MAE: 103.48

Epoch 3/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.55it/s, Loss=10708.5781]
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.70it/s]


Train Loss: 17106.7717
Val Loss: 19836.2162
MAE: 95.29

Epoch 4/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.48it/s, Loss=12754.4180]
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.69it/s]


Train Loss: 17037.9287
Val Loss: 29315.9156
MAE: 112.77

Epoch 5/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.52it/s, Loss=11109.4961]
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.56it/s]


Train Loss: 12419.4974
Val Loss: 12350.2427
MAE: 78.81

Epoch 6/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.60it/s, Loss=8752.6113] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.47it/s]


Train Loss: 13488.8264
Val Loss: 12729.3925
MAE: 76.98

Epoch 7/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.46it/s, Loss=8419.4541] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.58it/s]


Train Loss: 12929.6196
Val Loss: 21976.7961
MAE: 100.01

Epoch 8/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.56it/s, Loss=4679.3589] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.54it/s]


Train Loss: 10146.2328
Val Loss: 10747.8688
MAE: 71.96

Epoch 9/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.50it/s, Loss=5635.6797] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.61it/s]


Train Loss: 8792.4451
Val Loss: 10783.5305
MAE: 71.93

Epoch 10/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.62it/s, Loss=5006.7964] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.74it/s]


Train Loss: 7525.5879
Val Loss: 11707.3728
MAE: 71.07

Epoch 11/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.63it/s, Loss=4415.2920] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.77it/s]


Train Loss: 8775.2512
Val Loss: 11203.9194
MAE: 70.39

Epoch 12/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.59it/s, Loss=8206.9492] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.67it/s]


Train Loss: 7488.8060
Val Loss: 11448.6037
MAE: 70.75

Epoch 13/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.58it/s, Loss=20127.1152]
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.67it/s]


Train Loss: 7698.7578
Val Loss: 19493.1765
MAE: 90.35

Epoch 14/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.55it/s, Loss=12658.3789]
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.54it/s]


Train Loss: 8682.6980
Val Loss: 18340.8633
MAE: 85.01

Epoch 15/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.58it/s, Loss=5315.3403] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.64it/s]


Train Loss: 7846.1895
Val Loss: 13579.7655
MAE: 80.14

Epoch 16/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.62it/s, Loss=7562.2861] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.60it/s]


Train Loss: 8193.6605
Val Loss: 10145.4527
MAE: 69.51

Epoch 17/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.41it/s, Loss=2269.3503] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.70it/s]


Train Loss: 5541.4729
Val Loss: 7903.5556
MAE: 58.91

Epoch 18/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.57it/s, Loss=8156.2046] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.61it/s]


Train Loss: 6543.4155
Val Loss: 8652.4755
MAE: 62.92

Epoch 19/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.58it/s, Loss=2912.4287] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.59it/s]


Train Loss: 4890.6906
Val Loss: 8625.3224
MAE: 62.41

Epoch 20/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.47it/s, Loss=3772.5403] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.49it/s]


Train Loss: 5196.3465
Val Loss: 10165.2951
MAE: 63.64

Epoch 21/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.46it/s, Loss=1960.0081] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.59it/s]


Train Loss: 3562.4957
Val Loss: 7652.7168
MAE: 54.76

Epoch 22/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.45it/s, Loss=17051.5156]
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.66it/s]


Train Loss: 5509.5268
Val Loss: 12240.1665
MAE: 74.17

Epoch 23/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.50it/s, Loss=3583.2676] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.69it/s]


Train Loss: 6604.5801
Val Loss: 12108.0193
MAE: 69.45

Epoch 24/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.45it/s, Loss=7632.9282] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.51it/s]


Train Loss: 6506.6554
Val Loss: 14720.9593
MAE: 77.30

Epoch 25/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.45it/s, Loss=2055.4907] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.74it/s]


Train Loss: 4572.6020
Val Loss: 9063.0943
MAE: 59.88

Epoch 26/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.50it/s, Loss=1919.8328] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.54it/s]


Train Loss: 3421.9659
Val Loss: 7674.3759
MAE: 56.12

Epoch 27/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.42it/s, Loss=1848.4658] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.59it/s]


Train Loss: 4576.4738
Val Loss: 9714.7449
MAE: 61.45

Epoch 28/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.40it/s, Loss=2453.5884] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.59it/s]


Train Loss: 3644.3707
Val Loss: 7877.2721
MAE: 55.54

Epoch 29/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.46it/s, Loss=4258.2334] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.57it/s]


Train Loss: 4365.7270
Val Loss: 11066.9588
MAE: 69.82

Epoch 30/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.44it/s, Loss=7192.0464]
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.64it/s]


Train Loss: 3430.3935
Val Loss: 7680.3667
MAE: 57.24

Epoch 31/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.50it/s, Loss=3328.4053] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.66it/s]


Train Loss: 4283.5907
Val Loss: 8796.7368
MAE: 64.82

Epoch 32/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.56it/s, Loss=2363.3750] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.39it/s]


Train Loss: 3478.3367
Val Loss: 7675.7208
MAE: 58.12

Epoch 33/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.48it/s, Loss=5590.2734] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.74it/s]


Train Loss: 3638.4839
Val Loss: 9331.1797
MAE: 65.83

Epoch 34/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.28it/s, Loss=4025.5542] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.63it/s]


Train Loss: 3395.1593
Val Loss: 7289.9086
MAE: 54.76

Epoch 35/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.47it/s, Loss=1952.4146] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.61it/s]


Train Loss: 4409.1664
Val Loss: 10487.2523
MAE: 68.48

Epoch 36/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.56it/s, Loss=5168.2134] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.73it/s]


Train Loss: 3580.7155
Val Loss: 7592.5793
MAE: 55.65

Epoch 37/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.44it/s, Loss=5320.6187] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.52it/s]


Train Loss: 4219.9357
Val Loss: 9092.4391
MAE: 63.40

Epoch 38/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.43it/s, Loss=2338.5122]
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.63it/s]


Train Loss: 2817.4882
Val Loss: 8004.2764
MAE: 54.92

Epoch 39/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.47it/s, Loss=2916.2383]
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.73it/s]


Train Loss: 3088.7458
Val Loss: 8371.1610
MAE: 58.62

Epoch 40/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.51it/s, Loss=2650.1313] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.55it/s]

Train Loss: 2973.2887
Val Loss: 9626.2067
MAE: 60.45

Training completed!
Best validation loss: 7289.9086

Experiment completed! Results saved to: ../experiments/nutrition5k_experiments/inceptionv3_early_fusion_20251024_132025





## 3.3 InceptionV3 - Late Fusion

In [None]:
# Define experiment hyperparamers
BATCH_SIZE = 32
NUM_EPOCHS = 40
DROPOUT_RATE = 0.4
LEARNING_RATE = 3e-4
WEIGHT_DECAY = 1e-6
EARLY_STOPPING_PATIENCE = 15
WARMUP_RATIO = 0.1
MIN_LR_RATIO = 0.05
FUSION_CHANNELS = 2048

def train_nutrition5k_model(fusion_type='middle'):
    """Train the Nutrition5k model with InceptionV3 and specified fusion type"""
    
    print("="*60)
    print(f"TRAINING: Nutrition5k InceptionV3 + {fusion_type.capitalize()} Fusion")
    print("="*60)
    
    # Create datasets
    train_dataset = Nutrition5KDataset(
        csv_path=train_csv,
        data_root=DATA_ROOT,
        split='train',
        augment=False,
        img_size=IMG_SIZE,
    )
    
    val_dataset = Nutrition5KDataset(
        csv_path=val_csv,
        data_root=DATA_ROOT,
        split='val',
        augment=False,
        img_size=IMG_SIZE,
    )
    
    # Create data loaders
    train_loader = DataLoader(
        train_dataset,
        batch_size=BATCH_SIZE,
        shuffle=True,
        num_workers=NUM_WORKERS,
        pin_memory=True if torch.cuda.is_available() else False,
        drop_last=True
    )
    
    val_loader = DataLoader(
        val_dataset,
        batch_size=BATCH_SIZE,
        shuffle=False,
        num_workers=NUM_WORKERS,
        pin_memory=True if torch.cuda.is_available() else False
    )
    
    # Build model with specified fusion type
    model = build_nutrition5k_model(
        fusion=fusion_type,
        pretrained=False,
        dropout_rate=DROPOUT_RATE,
        fusion_channels=FUSION_CHANNELS
    )
    model = model.to(device)
    
    print(f"Model parameters: {model.get_num_parameters():,}")
    print(f"Training samples: {len(train_dataset)}")
    print(f"Validation samples: {len(val_dataset)}")
    
    # Loss function
    criterion = nn.MSELoss()
    
    # Optimizer
    optimizer = optim.AdamW(
        model.parameters(),
        lr=LEARNING_RATE,
        weight_decay=WEIGHT_DECAY
    )
    
    print(f"Learning rate: {LEARNING_RATE}")
    print(f"Weight decay: {WEIGHT_DECAY}")
    
    # Learning rate scheduler
    steps_per_epoch = len(train_loader)
    total_steps = NUM_EPOCHS * steps_per_epoch
    warmup_steps = int(total_steps * WARMUP_RATIO)
    
    scheduler = get_warmup_cosine_scheduler(
        optimizer, 
        warmup_steps=warmup_steps, 
        total_steps=total_steps,
        min_lr_ratio=MIN_LR_RATIO
    )
    
    # Create experiment directory
    exp_name = f"inceptionv3_{fusion_type}_fusion_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
    exp_dir = os.path.join(OUTPUT_DIR, 'nutrition5k_experiments', exp_name)
    os.makedirs(exp_dir, exist_ok=True)
    
    # Save experiment configuration
    config = {
        'fusion': fusion_type,
        'pretrained': False,
        'dropout_rate': DROPOUT_RATE,
        'fusion_channels': FUSION_CHANNELS,
        'learning_rate': LEARNING_RATE,
        'weight_decay': WEIGHT_DECAY,
        'batch_size': BATCH_SIZE,
        'img_size': IMG_SIZE,
        'num_epochs': NUM_EPOCHS
    }
    
    with open(os.path.join(exp_dir, 'config.json'), 'w') as f:
        json.dump(config, f, indent=4)
    
    # Create trainer
    trainer = Trainer(
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        device=device,
        output_dir=exp_dir,
        early_stopping_patience=EARLY_STOPPING_PATIENCE,
        scheduler_step_on_batch=False
    )
    
    # Train the model
    trainer.train(NUM_EPOCHS)
    
    print(f"\nExperiment completed! Results saved to: {exp_dir}")
    return trainer.best_metrics

# Run an experiment with middle fusion
late_fusion_results = train_nutrition5k_model(fusion_type='late')

TRAINING: Nutrition5k InceptionV3 + Late Fusion
Loaded 2804 valid samples out of 2805
Loaded 495 valid samples out of 495
Model parameters: 45,799,745
Training samples: 2804
Validation samples: 495
Learning rate: 0.0003
Weight decay: 1e-06
Starting training for 40 epochs...

Epoch 1/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.40it/s, Loss=106439.0938]
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.52it/s]


Train Loss: 98389.8398
Val Loss: 107367.1013
MAE: 240.54

Epoch 2/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.25it/s, Loss=28412.5645]
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.45it/s]


Train Loss: 41690.2808
Val Loss: 31997.4932
MAE: 123.44

Epoch 3/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.32it/s, Loss=11473.9180]
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.50it/s]


Train Loss: 17350.7568
Val Loss: 19154.0181
MAE: 99.54

Epoch 4/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.25it/s, Loss=10099.2783]
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.26it/s]


Train Loss: 14514.1249
Val Loss: 18284.8318
MAE: 101.39

Epoch 5/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.37it/s, Loss=8358.5098] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.55it/s]


Train Loss: 13459.6902
Val Loss: 13000.7582
MAE: 83.92

Epoch 6/40


Training: 100%|██████████| 87/87 [00:14<00:00,  6.19it/s, Loss=11738.3379]
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.38it/s]


Train Loss: 11558.1775
Val Loss: 14764.5721
MAE: 93.13

Epoch 7/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.37it/s, Loss=5837.0171] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.55it/s]


Train Loss: 7952.4247
Val Loss: 10476.4639
MAE: 72.17

Epoch 8/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.38it/s, Loss=5708.1650] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.49it/s]


Train Loss: 6212.3241
Val Loss: 10303.3420
MAE: 67.74

Epoch 9/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.28it/s, Loss=7737.3613] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.55it/s]


Train Loss: 5673.2633
Val Loss: 8837.4902
MAE: 63.97

Epoch 10/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.35it/s, Loss=9406.4307] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.46it/s]


Train Loss: 5494.6568
Val Loss: 9756.0814
MAE: 69.59

Epoch 11/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.27it/s, Loss=4610.4297] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.54it/s]


Train Loss: 5146.7232
Val Loss: 8753.4821
MAE: 63.10

Epoch 12/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.35it/s, Loss=5909.7256] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.66it/s]


Train Loss: 4907.9320
Val Loss: 9570.8997
MAE: 65.66

Epoch 13/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.37it/s, Loss=4963.5718] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.56it/s]


Train Loss: 4037.6672
Val Loss: 8444.3814
MAE: 61.17

Epoch 14/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.28it/s, Loss=7843.6504] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.30it/s]


Train Loss: 4759.5694
Val Loss: 9681.8932
MAE: 66.88

Epoch 15/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.27it/s, Loss=9072.1475] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.67it/s]


Train Loss: 4229.9857
Val Loss: 8599.4233
MAE: 62.60

Epoch 16/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.22it/s, Loss=2737.9417] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.62it/s]


Train Loss: 4172.8494
Val Loss: 10331.9440
MAE: 70.47

Epoch 17/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.33it/s, Loss=3295.8223] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.60it/s]


Train Loss: 4371.7250
Val Loss: 8629.5308
MAE: 65.55

Epoch 18/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.30it/s, Loss=4097.1982] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.34it/s]


Train Loss: 4424.5678
Val Loss: 8790.1743
MAE: 65.26

Epoch 19/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.38it/s, Loss=2633.3958] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.63it/s]


Train Loss: 4350.4276
Val Loss: 8730.1980
MAE: 62.17

Epoch 20/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.25it/s, Loss=4055.3335] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.38it/s]


Train Loss: 4177.5882
Val Loss: 14836.8188
MAE: 79.11

Epoch 21/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.23it/s, Loss=2228.0410] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.32it/s]


Train Loss: 4229.4994
Val Loss: 9068.2230
MAE: 66.28

Epoch 22/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.30it/s, Loss=1770.4275] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.48it/s]


Train Loss: 3978.3809
Val Loss: 8781.0372
MAE: 61.05

Epoch 23/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.32it/s, Loss=3093.6758] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.57it/s]


Train Loss: 3067.3215
Val Loss: 8732.8710
MAE: 64.11

Epoch 24/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.42it/s, Loss=3645.4475] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.35it/s]


Train Loss: 3630.3558
Val Loss: 8726.5923
MAE: 62.54

Epoch 25/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.28it/s, Loss=1093.2750] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.34it/s]


Train Loss: 3741.8051
Val Loss: 9379.7649
MAE: 62.13

Epoch 26/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.33it/s, Loss=1971.1019] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.48it/s]


Train Loss: 3466.8906
Val Loss: 8442.0509
MAE: 58.94

Epoch 27/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.32it/s, Loss=4339.6250] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.71it/s]


Train Loss: 4190.1686
Val Loss: 8390.8783
MAE: 63.37

Epoch 28/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.33it/s, Loss=2052.4866] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.62it/s]


Train Loss: 3881.3878
Val Loss: 9256.8487
MAE: 65.92

Epoch 29/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.30it/s, Loss=12281.3994]
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.64it/s]


Train Loss: 3442.6623
Val Loss: 7819.5536
MAE: 57.46

Epoch 30/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.38it/s, Loss=4406.4668] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.42it/s]


Train Loss: 5363.2380
Val Loss: 13414.7968
MAE: 77.36

Epoch 31/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.43it/s, Loss=4942.8740] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.79it/s]


Train Loss: 6517.6254
Val Loss: 24391.3788
MAE: 103.60

Epoch 32/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.48it/s, Loss=4491.8784] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.41it/s]


Train Loss: 5878.0402
Val Loss: 10430.6566
MAE: 65.04

Epoch 33/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.23it/s, Loss=2186.6104] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.49it/s]


Train Loss: 4494.0169
Val Loss: 8213.3772
MAE: 59.08

Epoch 34/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.37it/s, Loss=7051.7349] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.73it/s]


Train Loss: 3761.5112
Val Loss: 10694.6189
MAE: 66.91

Epoch 35/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.29it/s, Loss=2668.6448]
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.37it/s]


Train Loss: 3711.7728
Val Loss: 7766.9089
MAE: 59.69

Epoch 36/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.28it/s, Loss=3103.6143] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.69it/s]


Train Loss: 4424.6941
Val Loss: 12087.1532
MAE: 74.20

Epoch 37/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.33it/s, Loss=3766.8242] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.64it/s]


Train Loss: 5107.7292
Val Loss: 16110.6986
MAE: 81.93

Epoch 38/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.27it/s, Loss=6010.6538] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.42it/s]


Train Loss: 4091.8209
Val Loss: 8399.1342
MAE: 57.81

Epoch 39/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.40it/s, Loss=2086.4343] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.74it/s]


Train Loss: 4500.5079
Val Loss: 9125.9881
MAE: 62.44

Epoch 40/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.41it/s, Loss=1422.8608]
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.55it/s]

Train Loss: 3323.3737
Val Loss: 8567.3797
MAE: 60.29

Training completed!
Best validation loss: 7766.9089

Experiment completed! Results saved to: ../experiments/nutrition5k_experiments/inceptionv3_late_fusion_20251024_133101





## 3.4 InceptionV3 - Image + Volume

This section implements the food volume estimation method as described in the Nutrition5k paper. The method:

1. **Estimates food volume from overhead depth images** using:
   - Camera distance: 35.9 cm
   - Per-pixel surface area: 5.957 × 10⁻³ cm²
   
2. **Uses binary foreground/background segmentation** to identify food pixels

3. **Calculates volume** by summing per-pixel volumes (depth × surface_area) over all food pixels

4. **Concatenates volume estimate** to the InceptionV3 backbone output before FC layers

We implement three variants:
- **Image+Volume**: RGB + volume estimate as additional signal  
- **Middle+Volume**: RGB + Depth fusion + volume estimate


In [None]:
# Define experiment hyperparamers
BATCH_SIZE = 32
NUM_EPOCHS = 40
DROPOUT_RATE = 0.4
LEARNING_RATE = 5e-4
WEIGHT_DECAY = 1e-6
EARLY_STOPPING_PATIENCE = 15
WARMUP_RATIO = 0.1
MIN_LR_RATIO = 0.05
FUSION_CHANNELS = 2048

# Training function with volume estimation support
def train_nutrition5k_with_volume(fusion_type='image_volume'):
    """
    Train the Nutrition5k model with volume estimation
    
    Args:
        fusion_type: 'image_only', 'image_volume', 'middle', etc.
        use_segmentation: Whether to use learned segmentation for volume estimation
    """
    
    print("="*60)
    print(f"TRAINING: Nutrition5k InceptionV3 + {fusion_type.upper()}")

    print("="*60)
    
    # Create datasets
    train_dataset = Nutrition5KDataset(
        csv_path=train_csv,
        data_root=DATA_ROOT,
        split='train',
        augment=False,
        img_size=IMG_SIZE,
    )
    
    val_dataset = Nutrition5KDataset(
        csv_path=val_csv,
        data_root=DATA_ROOT,
        split='val',
        augment=False,
        img_size=IMG_SIZE,
    )
    
    # Create data loaders
    train_loader = DataLoader(
        train_dataset,
        batch_size=BATCH_SIZE,
        shuffle=True,
        num_workers=NUM_WORKERS,
        pin_memory=True if torch.cuda.is_available() else False,
        drop_last=True
    )
    
    val_loader = DataLoader(
        val_dataset,
        batch_size=BATCH_SIZE,
        shuffle=False,
        num_workers=NUM_WORKERS,
        pin_memory=True if torch.cuda.is_available() else False
    )
    

   # Use Volume    
    model = build_nutrition5k_model(
        fusion=fusion_type,
        pretrained=False,
        dropout_rate=DROPOUT_RATE,
        fusion_channels=FUSION_CHANNELS,
        use_volume=True,
    )
    model = model.to(device)
    
    print(f"Model parameters: {model.get_num_parameters():,}")
    print(f"Training samples: {len(train_dataset)}")
    print(f"Validation samples: {len(val_dataset)}")
    
    # Loss function
    criterion = nn.MSELoss()
    
    # Optimizer
    optimizer = optim.AdamW(
        model.parameters(),
        lr=LEARNING_RATE,
        weight_decay=WEIGHT_DECAY
    )
    
    print(f"Learning rate: {LEARNING_RATE}")
    print(f"Weight decay: {WEIGHT_DECAY}")
    
    # Learning rate scheduler
    steps_per_epoch = len(train_loader)
    total_steps = NUM_EPOCHS * steps_per_epoch
    warmup_steps = int(total_steps * WARMUP_RATIO)
    
    scheduler = get_warmup_cosine_scheduler(
        optimizer, 
        warmup_steps=warmup_steps, 
        total_steps=total_steps,
        min_lr_ratio=MIN_LR_RATIO
    )
    
    # Create experiment directory
    exp_name = f"inceptionv3_{fusion_type}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
    exp_dir = os.path.join(OUTPUT_DIR, 'nutrition5k_experiments', exp_name)
    os.makedirs(exp_dir, exist_ok=True)
    
    # Save experiment configuration
    config = {
        'fusion': fusion_type,
        'use_volume': True,
        'pretrained': False,
        'dropout_rate': DROPOUT_RATE,
        'fusion_channels': FUSION_CHANNELS,
        'learning_rate': LEARNING_RATE,
        'weight_decay': WEIGHT_DECAY,
        'batch_size': BATCH_SIZE,
        'img_size': IMG_SIZE,
        'num_epochs': NUM_EPOCHS
    }
    
    with open(os.path.join(exp_dir, 'config.json'), 'w') as f:
        json.dump(config, f, indent=4)
    
    # Create trainer
    trainer = Trainer(
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        device=device,
        output_dir=exp_dir,
        early_stopping_patience=EARLY_STOPPING_PATIENCE,
        scheduler_step_on_batch=False
    )
    
    # Train the model
    trainer.train(NUM_EPOCHS)
    
    print(f"\nExperiment completed! Results saved to: {exp_dir}")
    return trainer.best_metrics


image_volume_result = train_nutrition5k_with_volume(fusion_type='image_volume')

TRAINING: Nutrition5k InceptionV3 + IMAGE_VOLUME
Loaded 2804 valid samples out of 2805
Loaded 495 valid samples out of 495
Model parameters: 21,916,833
Training samples: 2804
Validation samples: 495
Learning rate: 0.0005
Weight decay: 1e-06
Starting training for 40 epochs...

Epoch 1/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.43it/s, Loss=108259.2578]
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.60it/s]


Train Loss: 98651.2712
Val Loss: 107381.6948
MAE: 240.57

Epoch 2/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.54it/s, Loss=16013.5312] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.67it/s]


Train Loss: 65919.9727
Val Loss: 29319.6537
MAE: 118.82

Epoch 3/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.54it/s, Loss=21910.8262]
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.68it/s]


Train Loss: 26365.2399
Val Loss: 25831.0439
MAE: 110.35

Epoch 4/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.54it/s, Loss=28824.6973]
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.61it/s]


Train Loss: 19369.3317
Val Loss: 26086.2035
MAE: 106.78

Epoch 5/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.52it/s, Loss=12186.3164]
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.66it/s]


Train Loss: 16064.0011
Val Loss: 22472.0131
MAE: 101.50

Epoch 6/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.43it/s, Loss=7965.6699] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.70it/s]


Train Loss: 14435.9392
Val Loss: 12856.3794
MAE: 76.76

Epoch 7/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.66it/s, Loss=11353.6094]
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.74it/s]


Train Loss: 14900.7881
Val Loss: 16189.5482
MAE: 85.86

Epoch 8/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.52it/s, Loss=11691.4023]
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.78it/s]


Train Loss: 13149.4884
Val Loss: 11781.9545
MAE: 76.72

Epoch 9/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.55it/s, Loss=11914.6602]
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.57it/s]


Train Loss: 12836.9574
Val Loss: 12081.9814
MAE: 75.09

Epoch 10/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.65it/s, Loss=5833.5947] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.66it/s]


Train Loss: 12259.6983
Val Loss: 34023.9789
MAE: 135.16

Epoch 11/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.51it/s, Loss=7867.6704] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.57it/s]


Train Loss: 10627.7787
Val Loss: 8857.1537
MAE: 64.40

Epoch 12/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.45it/s, Loss=8124.0371] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.55it/s]


Train Loss: 8637.2702
Val Loss: 12734.0982
MAE: 73.18

Epoch 13/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.48it/s, Loss=5287.5591] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.62it/s]


Train Loss: 12329.5391
Val Loss: 11926.0453
MAE: 73.37

Epoch 14/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.53it/s, Loss=4309.2856] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.63it/s]


Train Loss: 10885.5472
Val Loss: 13142.9994
MAE: 78.21

Epoch 15/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.51it/s, Loss=20139.8066]
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.49it/s]


Train Loss: 10598.2898
Val Loss: 14689.3955
MAE: 81.68

Epoch 16/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.41it/s, Loss=7086.2871] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.63it/s]


Train Loss: 8293.4044
Val Loss: 10231.8171
MAE: 66.58

Epoch 17/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.57it/s, Loss=12543.9473]
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.57it/s]


Train Loss: 6399.1024
Val Loss: 9016.7161
MAE: 60.06

Epoch 18/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.55it/s, Loss=5665.5361] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.41it/s]


Train Loss: 5826.5098
Val Loss: 8145.8707
MAE: 57.01

Epoch 19/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.39it/s, Loss=5068.1343] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.63it/s]


Train Loss: 7035.3614
Val Loss: 9229.9055
MAE: 61.73

Epoch 20/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.56it/s, Loss=5287.8306] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.55it/s]


Train Loss: 6057.8679
Val Loss: 7616.8182
MAE: 56.79

Epoch 21/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.43it/s, Loss=4746.9902] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.51it/s]


Train Loss: 7072.8501
Val Loss: 24474.4791
MAE: 99.65

Epoch 22/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.53it/s, Loss=13119.5117]
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.67it/s]


Train Loss: 7839.2704
Val Loss: 10457.4959
MAE: 65.65

Epoch 23/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.62it/s, Loss=3085.6042] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.72it/s]


Train Loss: 6021.8229
Val Loss: 7191.0435
MAE: 55.98

Epoch 24/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.61it/s, Loss=6064.8857] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.70it/s]


Train Loss: 7473.2339
Val Loss: 9309.3742
MAE: 63.62

Epoch 25/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.55it/s, Loss=3918.4976] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.78it/s]


Train Loss: 6366.5307
Val Loss: 7951.3516
MAE: 57.45

Epoch 26/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.59it/s, Loss=3958.7363] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.55it/s]


Train Loss: 5714.5977
Val Loss: 8676.6589
MAE: 61.97

Epoch 27/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.59it/s, Loss=4025.3110] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.61it/s]


Train Loss: 6011.6670
Val Loss: 8741.7768
MAE: 62.08

Epoch 28/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.63it/s, Loss=3402.2810] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.65it/s]


Train Loss: 5449.7801
Val Loss: 9306.1301
MAE: 59.73

Epoch 29/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.50it/s, Loss=3335.1790] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.67it/s]


Train Loss: 4545.7973
Val Loss: 7175.3438
MAE: 54.02

Epoch 30/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.53it/s, Loss=10892.3789]
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.67it/s]


Train Loss: 7070.0363
Val Loss: 13045.5674
MAE: 77.76

Epoch 31/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.51it/s, Loss=5397.4434] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.51it/s]


Train Loss: 6838.4158
Val Loss: 10400.0666
MAE: 64.40

Epoch 32/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.44it/s, Loss=4656.6797] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.39it/s]


Train Loss: 5807.6003
Val Loss: 8157.4585
MAE: 59.51

Epoch 33/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.52it/s, Loss=1558.7113] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.75it/s]


Train Loss: 5879.0825
Val Loss: 8046.5217
MAE: 57.06

Epoch 34/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.62it/s, Loss=7381.2354] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.74it/s]


Train Loss: 6066.1867
Val Loss: 9070.3390
MAE: 60.64

Epoch 35/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.55it/s, Loss=3927.4146] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.77it/s]


Train Loss: 4413.8461
Val Loss: 7681.4189
MAE: 54.37

Epoch 36/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.51it/s, Loss=9086.3516] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.72it/s]


Train Loss: 5761.2919
Val Loss: 9406.7413
MAE: 65.93

Epoch 37/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.50it/s, Loss=6872.7788] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.60it/s]


Train Loss: 4781.9159
Val Loss: 7606.9587
MAE: 55.86

Epoch 38/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.54it/s, Loss=6143.3491] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.68it/s]


Train Loss: 5480.3070
Val Loss: 10385.1789
MAE: 63.28

Epoch 39/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.49it/s, Loss=3758.9685] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.61it/s]


Train Loss: 4822.6640
Val Loss: 7814.0743
MAE: 54.46

Epoch 40/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.51it/s, Loss=2849.0405] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.48it/s]

Train Loss: 5745.1829
Val Loss: 9136.1897
MAE: 59.73

Training completed!
Best validation loss: 7175.3438

Experiment completed! Results saved to: ../experiments/nutrition5k_experiments/inceptionv3_image_volume_20251024_140149





: 

## 3.5 InceptionV3 - Image + Depth + Volume

In [9]:
# Configure experiment settings
BATCH_SIZE = 32
NUM_EPOCHS = 40
DROPOUT_RATE = 0.4
LEARNING_RATE = 5e-4
WEIGHT_DECAY = 1e-6
EARLY_STOPPING_PATIENCE = 15
WARMUP_RATIO = 0.1
MIN_LR_RATIO = 0.05
FUSION_CHANNELS = 2048  # InceptionV3 output channels

# Training function with volume estimation support
def train_nutrition5k_with_volume(fusion_type='image_volume'):
    """
    Train the Nutrition5k model with volume estimation
    
    Args:
        fusion_type: 'image_only', 'image_volume', 'middle', etc.
        use_segmentation: Whether to use learned segmentation for volume estimation
    """
    
    print("="*60)
    print(f"TRAINING: Nutrition5k InceptionV3 + {fusion_type.upper()}")

    print("="*60)
    
    # Create datasets
    train_dataset = Nutrition5KDataset(
        csv_path=train_csv,
        data_root=DATA_ROOT,
        split='train',
        augment=False,
        img_size=IMG_SIZE,
    )
    
    val_dataset = Nutrition5KDataset(
        csv_path=val_csv,
        data_root=DATA_ROOT,
        split='val',
        augment=False,
        img_size=IMG_SIZE,
    )
    
    # Create data loaders
    train_loader = DataLoader(
        train_dataset,
        batch_size=BATCH_SIZE,
        shuffle=True,
        num_workers=NUM_WORKERS,
        pin_memory=True if torch.cuda.is_available() else False,
        drop_last=True
    )
    
    val_loader = DataLoader(
        val_dataset,
        batch_size=BATCH_SIZE,
        shuffle=False,
        num_workers=NUM_WORKERS,
        pin_memory=True if torch.cuda.is_available() else False
    )
    
    # Use Volume  
    model = build_nutrition5k_model(
        fusion=fusion_type,
        pretrained=False,
        dropout_rate=DROPOUT_RATE,
        fusion_channels=FUSION_CHANNELS,
        use_volume=True,
    )
    model = model.to(device)
    
    print(f"Model parameters: {model.get_num_parameters():,}")
    print(f"Training samples: {len(train_dataset)}")
    print(f"Validation samples: {len(val_dataset)}")
    
    # Loss function
    criterion = nn.MSELoss()
    
    # Optimizer
    optimizer = optim.AdamW(
        model.parameters(),
        lr=LEARNING_RATE,
        weight_decay=WEIGHT_DECAY
    )
    
    print(f"Learning rate: {LEARNING_RATE}")
    print(f"Weight decay: {WEIGHT_DECAY}")
    
    # Learning rate scheduler
    steps_per_epoch = len(train_loader)
    total_steps = NUM_EPOCHS * steps_per_epoch
    warmup_steps = int(total_steps * WARMUP_RATIO)
    
    scheduler = get_warmup_cosine_scheduler(
        optimizer, 
        warmup_steps=warmup_steps, 
        total_steps=total_steps,
        min_lr_ratio=MIN_LR_RATIO
    )
    
    # Create experiment directory
    exp_name = f"inceptionv3_{fusion_type}_volume_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
    exp_dir = os.path.join(OUTPUT_DIR, 'nutrition5k_experiments', exp_name)
    os.makedirs(exp_dir, exist_ok=True)
    
    # Save experiment configuration
    config = {
        'fusion': fusion_type,
        'use_volume': True,
        'pretrained': False,
        'dropout_rate': DROPOUT_RATE,
        'fusion_channels': FUSION_CHANNELS,
        'learning_rate': LEARNING_RATE,
        'weight_decay': WEIGHT_DECAY,
        'batch_size': BATCH_SIZE,
        'img_size': IMG_SIZE,
        'num_epochs': NUM_EPOCHS
    }
    
    with open(os.path.join(exp_dir, 'config.json'), 'w') as f:
        json.dump(config, f, indent=4)
    
    # Create trainer
    trainer = Trainer(
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        device=device,
        output_dir=exp_dir,
        early_stopping_patience=EARLY_STOPPING_PATIENCE,
        scheduler_step_on_batch=False
    )
    
    # Train the model
    trainer.train(NUM_EPOCHS)
    
    print(f"\nExperiment completed! Results saved to: {exp_dir}")
    return trainer.best_metrics


volume_with_middle_fusion_result = train_nutrition5k_with_volume(fusion_type='middle')

TRAINING: Nutrition5k InceptionV3 + MIDDLE
Loaded 2804 valid samples out of 2805
Loaded 495 valid samples out of 495
Model parameters: 52,094,465
Training samples: 2804
Validation samples: 495
Learning rate: 0.0005
Weight decay: 1e-06
Starting training for 40 epochs...

Epoch 1/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.41it/s, Loss=80694.2109] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.57it/s]


Train Loss: 98966.1618
Val Loss: 107401.5198
MAE: 240.62

Epoch 2/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.39it/s, Loss=32316.2949] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.52it/s]


Train Loss: 69272.0114
Val Loss: 24299.0708
MAE: 107.20

Epoch 3/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.40it/s, Loss=14151.7119]
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.67it/s]


Train Loss: 23787.9891
Val Loss: 16633.3892
MAE: 88.82

Epoch 4/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.36it/s, Loss=15007.1816]
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.57it/s]


Train Loss: 14682.7154
Val Loss: 12146.5886
MAE: 75.05

Epoch 5/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.33it/s, Loss=19958.5156]
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.73it/s]


Train Loss: 15840.9862
Val Loss: 17202.8272
MAE: 94.55

Epoch 6/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.49it/s, Loss=16198.6484]
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.30it/s]


Train Loss: 13342.0799
Val Loss: 15637.7800
MAE: 82.39

Epoch 7/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.39it/s, Loss=13739.9629]
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.32it/s]


Train Loss: 10382.3783
Val Loss: 10612.7254
MAE: 68.76

Epoch 8/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.38it/s, Loss=4878.6045] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.41it/s]


Train Loss: 9570.5377
Val Loss: 10028.6673
MAE: 66.16

Epoch 9/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.37it/s, Loss=8596.0879] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.64it/s]


Train Loss: 8189.9623
Val Loss: 9066.1782
MAE: 64.52

Epoch 10/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.46it/s, Loss=9611.9727] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.72it/s]


Train Loss: 7434.2731
Val Loss: 9510.0068
MAE: 64.08

Epoch 11/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.41it/s, Loss=8335.9004] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.35it/s]


Train Loss: 7191.1866
Val Loss: 9110.0953
MAE: 62.14

Epoch 12/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.39it/s, Loss=9409.6133] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.75it/s]


Train Loss: 6747.8446
Val Loss: 10531.1671
MAE: 65.82

Epoch 13/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.30it/s, Loss=9208.3613] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.63it/s]


Train Loss: 7328.7460
Val Loss: 11288.8193
MAE: 73.27

Epoch 14/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.40it/s, Loss=8933.9141] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.43it/s]


Train Loss: 9884.4828
Val Loss: 11184.1066
MAE: 71.21

Epoch 15/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.43it/s, Loss=5830.4062] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.42it/s]


Train Loss: 9719.4533
Val Loss: 12809.6439
MAE: 74.93

Epoch 16/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.37it/s, Loss=6700.2168] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.36it/s]


Train Loss: 12226.5881
Val Loss: 14135.2347
MAE: 77.15

Epoch 17/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.27it/s, Loss=8846.7324] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.45it/s]


Train Loss: 10628.8874
Val Loss: 17854.5973
MAE: 85.47

Epoch 18/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.48it/s, Loss=6236.5356] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.38it/s]


Train Loss: 9605.7054
Val Loss: 11989.8474
MAE: 75.97

Epoch 19/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.37it/s, Loss=5089.1396] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.28it/s]


Train Loss: 10650.8845
Val Loss: 12936.2934
MAE: 78.96

Epoch 20/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.36it/s, Loss=10002.7715]
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.66it/s]


Train Loss: 10585.4611
Val Loss: 20954.0295
MAE: 93.90

Epoch 21/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.41it/s, Loss=9147.0654] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.65it/s]


Train Loss: 7982.7874
Val Loss: 12198.2485
MAE: 69.25

Epoch 22/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.37it/s, Loss=9361.1641] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.61it/s]


Train Loss: 8563.8640
Val Loss: 12346.0579
MAE: 82.81

Epoch 23/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.33it/s, Loss=13987.2080]
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.46it/s]


Train Loss: 9314.2981
Val Loss: 15242.2944
MAE: 78.82

Epoch 24/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.41it/s, Loss=3405.4075] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.27it/s]


Train Loss: 7241.1247
Val Loss: 8901.5778
MAE: 59.49

Epoch 25/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.34it/s, Loss=5384.7852] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.63it/s]


Train Loss: 5968.9408
Val Loss: 8102.0835
MAE: 57.21

Epoch 26/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.42it/s, Loss=6591.0449] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.71it/s]


Train Loss: 6917.0128
Val Loss: 12186.5141
MAE: 68.78

Epoch 27/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.45it/s, Loss=8429.6357] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.57it/s]


Train Loss: 8189.9398
Val Loss: 9175.6458
MAE: 66.36

Epoch 28/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.37it/s, Loss=14604.1523]
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.38it/s]


Train Loss: 6946.9857
Val Loss: 8362.9894
MAE: 59.92

Epoch 29/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.41it/s, Loss=2387.0117] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.53it/s]


Train Loss: 6338.9732
Val Loss: 10491.9875
MAE: 64.81

Epoch 30/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.33it/s, Loss=5547.8535] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.67it/s]


Train Loss: 6336.7661
Val Loss: 7986.1679
MAE: 57.02

Epoch 31/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.41it/s, Loss=9401.0254] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.53it/s]


Train Loss: 6179.5254
Val Loss: 10851.4927
MAE: 73.47

Epoch 32/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.39it/s, Loss=6818.7988] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.60it/s]


Train Loss: 6252.9587
Val Loss: 8419.3591
MAE: 61.60

Epoch 33/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.37it/s, Loss=2115.7651] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.46it/s]


Train Loss: 5743.9070
Val Loss: 7800.9672
MAE: 58.19

Epoch 34/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.44it/s, Loss=5256.0479] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.72it/s]


Train Loss: 6294.0270
Val Loss: 8742.9595
MAE: 63.41

Epoch 35/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.43it/s, Loss=3182.2451] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.53it/s]


Train Loss: 6482.3907
Val Loss: 8548.3289
MAE: 61.67

Epoch 36/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.51it/s, Loss=4070.2322] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.71it/s]


Train Loss: 4683.8792
Val Loss: 8047.7720
MAE: 60.68

Epoch 37/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.40it/s, Loss=4029.4165] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.32it/s]


Train Loss: 6090.3191
Val Loss: 9532.8971
MAE: 65.66

Epoch 38/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.43it/s, Loss=3800.9351] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.67it/s]


Train Loss: 5001.6133
Val Loss: 8364.4256
MAE: 57.44

Epoch 39/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.45it/s, Loss=3780.6094] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.70it/s]


Train Loss: 5878.9752
Val Loss: 8378.5119
MAE: 59.38

Epoch 40/40


Training: 100%|██████████| 87/87 [00:13<00:00,  6.44it/s, Loss=6472.6729] 
Validation: 100%|██████████| 16/16 [00:02<00:00,  6.64it/s]

Train Loss: 5337.3297
Val Loss: 9411.3538
MAE: 60.59

Training completed!
Best validation loss: 7800.9672

Experiment completed! Results saved to: ./experiments/nutrition5k_experiments/inceptionv3_middle_volume_20251025_171018





# Test Inference

In [11]:
class TestDataset(Dataset):
    """Dataset class for test set inference"""
    
    def __init__(self, test_root, img_size=256):
        self.test_root = test_root
        self.img_size = img_size
        
        # Paths to subdirectories
        self.color_dir = os.path.join(test_root, 'color')
        self.depth_raw_dir = os.path.join(test_root, 'depth_raw')
        
        # Get all dish IDs from color directory
        self.dish_ids = sorted([d for d in os.listdir(self.color_dir) 
                              if os.path.isdir(os.path.join(self.color_dir, d))])
        
        print(f"Found {len(self.dish_ids)} test samples")
        
        # Color normalization (same as training)
        self.color_normalize = T.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225]
        )
    
    def __len__(self):
        return len(self.dish_ids)
    
    def _resize_and_center_crop(self, img, target_size=256):
        """Resize and center crop (same as training)"""
        width, height = img.size
        
        if width < height:
            new_width = target_size
            new_height = int(target_size * height / width)
        else:
            new_height = target_size
            new_width = int(target_size * width / height)
        
        img = img.resize((new_width, new_height), Image.LANCZOS)
        
        left = (new_width - target_size) // 2
        top = (new_height - target_size) // 2
        right = left + target_size
        bottom = top + target_size
        
        return img.crop((left, top, right, bottom))
    
    def __getitem__(self, idx):
        dish_id = self.dish_ids[idx]
        
        # Load images
        rgb_path = os.path.join(self.color_dir, dish_id, 'rgb.png')
        depth_path = os.path.join(self.depth_raw_dir, dish_id, 'depth_raw.png')
        
        rgb_img = Image.open(rgb_path).convert('RGB')
        depth_img = Image.open(depth_path).convert('L')
        
        # Resize and center crop
        rgb_img = self._resize_and_center_crop(rgb_img, target_size=self.img_size)
        depth_img = self._resize_and_center_crop(depth_img, target_size=self.img_size)
        
        # Convert to tensors
        rgb_tensor = TF.to_tensor(rgb_img)
        depth_tensor = TF.to_tensor(depth_img)
        
        # Normalize
        rgb_tensor = self.color_normalize(rgb_tensor)
        depth_tensor = depth_tensor / 255.0
        
        return {
            'rgb': rgb_tensor,
            'depth': depth_tensor,
            'dish_id': dish_id
        }

In [12]:
def load_model_from_checkpoint(checkpoint_path, device='cuda'):
    """Load trained model from checkpoint"""
    
    # Load checkpoint
    checkpoint = torch.load(checkpoint_path, map_location=device, weights_only=False)
    
    # Get the directory to look for config
    model_dir = os.path.dirname(checkpoint_path)
    config_path = os.path.join(model_dir, 'config.json')
    
    # Load config
    if os.path.exists(config_path):
        with open(config_path, 'r') as f:
            config = json.load(f)
        print(f"Loaded config: {config}")
    else:
        raise FileNotFoundError(f"Config file not found: {config_path}")
    
    # Build model with the same configuration
    model = build_nutrition5k_model(
        fusion=config.get('fusion', 'middle'),
        pretrained=False,
        dropout_rate=config.get('dropout_rate', 0.4),
        fusion_channels=config.get('fusion_channels', 2048),
        use_volume=config.get('use_volume', False)
    )
    
    # Load state dict
    if 'model_state_dict' in checkpoint:
        model.load_state_dict(checkpoint['model_state_dict'])
        print(f"Loaded model from epoch {checkpoint.get('epoch', 'unknown')}")
        if 'val_loss' in checkpoint:
            print(f"Validation loss: {checkpoint['val_loss']:.4f}")
            print(f"MAE: {checkpoint.get('mae', 'N/A')}")
    else:
        model.load_state_dict(checkpoint)
    
    model = model.to(device)
    model.eval()
    
    return model

In [13]:
@torch.no_grad()
def run_inference(model, test_dataset, batch_size=32, device='cuda'):
    """Run inference on test set"""
    
    test_loader = DataLoader(
        test_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=NUM_WORKERS,
        pin_memory=True if torch.cuda.is_available() else False
    )
    
    predictions = {}
    
    print("Running inference...")
    for batch in tqdm(test_loader):
        rgb = batch['rgb'].to(device)
        depth = batch['depth'].to(device)
        dish_ids = batch['dish_id']
        
        # Forward pass
        calorie_pred = model(rgb, depth)
        
        # Store predictions
        calorie_pred = calorie_pred.cpu().numpy().flatten()
        for i, dish_id in enumerate(dish_ids):
            predictions[dish_id] = float(calorie_pred[i])
    
    return predictions

def create_submission(predictions, output_path='submission.csv'):
    """Create submission CSV file"""
    
    submission_data = []
    for dish_id in sorted(predictions.keys()):
        submission_data.append({
            'ID': dish_id,
            'Value': predictions[dish_id]
        })
    
    submission_df = pd.DataFrame(submission_data)
    submission_df.to_csv(output_path, index=False)
    
    print(f"Submission saved to: {output_path}")
    print(f"Total predictions: {len(submission_data)}")
    print(f"\nSample predictions:")
    print(submission_df.head(10))
    
    # Statistics
    values = submission_df['Value'].values
    print(f"\nPrediction Statistics:")
    print(f"  Min: {values.min():.2f}")
    print(f"  Max: {values.max():.2f}")
    print(f"  Mean: {values.mean():.2f}")
    print(f"  Median: {np.median(values):.2f}")
    
    return submission_df

In [14]:
# Configuration
TEST_ROOT = './Nutrition5K/Nutrition5K/test'  # Path to test data
MODEL_PATH = './experiments/nutrition5k_experiments/inceptionv3_middle_volume_20251025_163021/best_model.pth'  # Your best model
OUTPUT_PATH = './submission.csv'

# Load test dataset
test_dataset = TestDataset(
    test_root=TEST_ROOT,
    img_size=IMG_SIZE
)

# Load model
model = load_model_from_checkpoint(MODEL_PATH, device=device)

# Run inference
predictions = run_inference(
    model=model,
    test_dataset=test_dataset,
    batch_size=BATCH_SIZE,
    device=device
)

# Create submission
submission_df = create_submission(predictions, output_path=OUTPUT_PATH)

Found 189 test samples
Loaded config: {'fusion': 'middle', 'use_volume': True, 'pretrained': False, 'dropout_rate': 0.4, 'fusion_channels': 2048, 'learning_rate': 0.0005, 'weight_decay': 1e-06, 'batch_size': 32, 'img_size': 256, 'num_epochs': 40}
Loaded model from epoch 17
Validation loss: 6987.1615
MAE: 54.757686614990234
Running inference...


100%|██████████| 6/6 [00:01<00:00,  4.13it/s]

Submission saved to: ./submission.csv
Total predictions: 189

Sample predictions:
          ID       Value
0  dish_3301  948.794556
1  dish_3302   16.694901
2  dish_3303   27.170441
3  dish_3304  222.738480
4  dish_3305  466.522308
5  dish_3306    3.652470
6  dish_3307  472.011383
7  dish_3308   44.091991
8  dish_3309  544.475403
9  dish_3310  253.851929

Prediction Statistics:
  Min: 2.25
  Max: 948.79
  Mean: 248.84
  Median: 206.76



