# Nutrition5k InceptionV2 Implementation

This notebook implements the InceptionV2-based model architecture as described in the Nutrition5k paper. It includes:

- Data loading and preprocessing
- InceptionV2 model architecture
- Different fusion methods (early, middle, late)
- Training and validation loops
- Memory optimization to prevent OOM errors


In [1]:
# Standard imports
import os
import sys
import json
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from tqdm import tqdm
import warnings
import gc
warnings.filterwarnings('ignore')

# PyTorch imports
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter

# Data processing imports
import pandas as pd
from PIL import Image
import torchvision.transforms as T
import torchvision.transforms.functional as TF
from collections import OrderedDict
import random
from typing import Tuple, Optional

# Pick the GPU when PyTorch can see one; otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

# Seed every RNG this notebook draws from (Python, NumPy, PyTorch CPU and CUDA)
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# Device info function
def print_device_info():
    """Print the name, CUDA version and total memory of GPU 0.

    Prints nothing when CUDA is unavailable.
    """
    if not torch.cuda.is_available():
        return

    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"CUDA version: {torch.version.cuda}")
    total_bytes = torch.cuda.get_device_properties(0).total_memory
    print(f"GPU memory: {total_bytes / 1e9:.2f} GB")


Using device: cuda


In [2]:
# Configuration - Update these paths to match your setup
DATA_ROOT = '../Nutrition5K/train'  # Path to training data directory
CSV_PATH = '../Nutrition5K/nutrition5k_train.csv'  # Path to training CSV
OUTPUT_DIR = '../experiments'  # Directory to save experiment results

# Global training hyperparameters (based on the Nutrition5k paper; deviations are noted inline)
BATCH_SIZE = 16  # Reduced batch size to prevent OOM errors
NUM_EPOCHS = 40
VAL_RATIO = 0.15  # Fraction of CSV rows held out for validation
IMG_SIZE = 256  # Paper specifies 256x256 input
NUM_WORKERS = 4  # Reduced workers to prevent memory issues
DROPOUT_RATE = 0.4
LEARNING_RATE = 1e-3  # NOTE: 10x the lr=1e-4 that get_optimizer's docstring attributes to the paper
MOMENTUM = 0.9  # As specified in the paper
DECAY = 0.9  # As specified in the paper
EPSILON = 1e-8  # NOTE(review): get_optimizer's docstring says the paper uses 1.0 - confirm which is intended
FEATURE_DIM = 2048  # Feature map dimension from InceptionV2
FC_DIM = 1024  # Using smaller FC dimensions since we're training from scratch (original paper used 4096)

# Echo the effective configuration so it is captured in the cell output
print("Configuration (based on Nutrition5k paper):")
print(f"  Data root: {DATA_ROOT}")
print(f"  CSV path: {CSV_PATH}")
print(f"  Output directory: {OUTPUT_DIR}")
print(f"  Batch size: {BATCH_SIZE}")
print(f"  Number of epochs: {NUM_EPOCHS}")
print(f"  Image size: {IMG_SIZE}x{IMG_SIZE}")
print(f"  Workers: {NUM_WORKERS}")
print(f"  Learning rate: {LEARNING_RATE}")
print(f"  Optimizer: RMSProp (momentum={MOMENTUM}, decay={DECAY}, epsilon={EPSILON})")
print(f"  FC dimensions: {FC_DIM}")

# Create output directory
os.makedirs(OUTPUT_DIR, exist_ok=True)


Configuration (based on Nutrition5k paper):
  Data root: ../Nutrition5K/train
  CSV path: ../Nutrition5K/nutrition5k_train.csv
  Output directory: ../experiments
  Batch size: 16
  Number of epochs: 40
  Image size: 256x256
  Workers: 4
  Learning rate: 0.001
  Optimizer: RMSProp (momentum=0.9, decay=0.9, epsilon=1e-08)
  FC dimensions: 1024


In [3]:
# Nutrition5k Dataset Class
class Nutrition5KDataset(Dataset):
    """Nutrition5K samples as (RGB image, depth image, calorie target).

    Each CSV row names a dish (``ID``) whose images are expected at
    ``<data_root>/color/<ID>/rgb.png`` and
    ``<data_root>/depth_raw/<ID>/depth_raw.png``. Rows whose files are missing
    or fail ``PIL.Image.verify`` are dropped once, at construction time.
    """

    def __init__(
        self,
        csv_path: str,
        data_root: str,
        split: str = 'train',
        augment: bool = True,
        img_size: int = 256
    ):
        """
        Args:
            csv_path: CSV with an ``ID`` column and a ``calories`` (or ``Value``) column
            data_root: Root directory containing color/ and depth_raw/ subdirectories
            split: 'train' or 'val'; augmentation is only ever applied to 'train'
            augment: Whether to apply random flip/rotation (ignored unless split == 'train')
            img_size: Side length, in pixels, that both images are resized to

        Raises:
            ValueError: If the CSV has neither a 'calories' nor a 'Value' column
        """
        self.data_root = data_root
        self.split = split
        self.img_size = img_size
        self.augment = augment and split == 'train'

        # Load CSV
        self.df = pd.read_csv(csv_path)

        # Accept 'Value' as an alias for the target column
        if 'Value' in self.df.columns and 'calories' not in self.df.columns:
            self.df = self.df.rename(columns={'Value': 'calories'})

        if 'calories' not in self.df.columns:
            raise ValueError("CSV file must contain a 'calories' column or a 'Value' column")

        # Drop rows whose calorie value is 3000 or more (outlier filter)
        self.df = self.df[self.df['calories'] < 3000].reset_index(drop=True)

        # Image directories
        self.color_dir = os.path.join(data_root, 'color')
        self.depth_raw_dir = os.path.join(data_root, 'depth_raw')

        # Keep only rows whose two images exist and pass verification
        self.valid_indices = self._validate_dataset()
        print(f"Loaded {len(self.valid_indices)} valid samples out of {len(self.df)}")

        # Per-channel RGB normalisation (these match the commonly used ImageNet statistics)
        self.color_normalize = T.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225]
        )

    def _sample_paths(self, dish_id) -> Tuple[str, str]:
        """Return (rgb_path, depth_path) for a dish ID."""
        rgb_path = os.path.join(self.color_dir, dish_id, 'rgb.png')
        depth_path = os.path.join(self.depth_raw_dir, dish_id, 'depth_raw.png')
        return rgb_path, depth_path

    def _validate_dataset(self):
        """Return the positional indices of ``self.df`` rows with usable images.

        A row is usable when both image files exist and ``Image.verify`` raises
        for neither. Problems are reported through ``warnings.warn``.
        """
        valid_indices = []

        # Iterate the ID column directly instead of materialising a row per index
        for idx, dish_id in enumerate(self.df['ID']):
            rgb_path, depth_path = self._sample_paths(dish_id)

            if not os.path.exists(rgb_path):
                warnings.warn(f"Missing RGB image: {rgb_path}")
                continue
            if not os.path.exists(depth_path):
                warnings.warn(f"Missing depth image: {depth_path}")
                continue

            # verify() checks file integrity without decoding the full image
            try:
                with Image.open(rgb_path) as img:
                    img.verify()
                with Image.open(depth_path) as img:
                    img.verify()
            except Exception as e:
                warnings.warn(f"Corrupt image for {dish_id}: {e}")
                continue

            valid_indices.append(idx)

        return valid_indices

    def __len__(self):
        """Number of usable samples."""
        return len(self.valid_indices)

    def _load_image_safe(self, path: str, mode: str = 'RGB') -> Optional[Image.Image]:
        """Open ``path`` and convert it to ``mode``; return None (with a warning) on failure."""
        try:
            with Image.open(path) as img:
                # copy() detaches the pixel data from the file handle closed by the with-block
                return img.convert(mode).copy()
        except Exception as e:
            warnings.warn(f"Failed to load image {path}: {e}")
            return None

    def _apply_preprocessing(self, rgb_img, depth_img):
        """Resize both images and, for augmented training data, flip/rotate them together.

        The same random flip and rotation angle are applied to RGB and depth so
        the two stay pixel-aligned.
        """
        target_size = (self.img_size, self.img_size)
        rgb_img = TF.resize(rgb_img, target_size)
        depth_img = TF.resize(depth_img, target_size)

        if self.split == 'train' and self.augment:
            # Random horizontal flip (50% probability)
            if random.random() > 0.5:
                rgb_img = TF.hflip(rgb_img)
                depth_img = TF.hflip(depth_img)

            # Random rotation of up to +/-10 degrees (50% probability)
            if random.random() > 0.5:
                angle = random.uniform(-10, 10)
                rgb_img = TF.rotate(rgb_img, angle)
                depth_img = TF.rotate(depth_img, angle)

        return rgb_img, depth_img

    def __getitem__(self, idx):
        """Return one sample.

        Returns:
            dict with keys 'dish_id', 'rgb' (3, H, W) normalised float tensor,
            'depth' (1, H, W) float tensor in [0, 1], and 'calorie' (float32 scalar tensor)
        """
        row = self.df.iloc[self.valid_indices[idx]]
        dish_id = row['ID']
        calorie = float(row['calories'])

        rgb_path, depth_path = self._sample_paths(dish_id)
        rgb_img = self._load_image_safe(rgb_path, 'RGB')
        # NOTE(review): depth_raw.png is presumably a 16-bit image; converting it to
        # 8-bit 'L' may clip or lose depth resolution - verify against the data.
        depth_img = self._load_image_safe(depth_path, 'L')

        if rgb_img is None or depth_img is None:
            # Best-effort fallback: keep the batch intact with all-black inputs.
            # The real calorie label is still returned for this sample.
            rgb_img = Image.new('RGB', (self.img_size, self.img_size), (0, 0, 0))
            depth_img = Image.new('L', (self.img_size, self.img_size), 0)

        rgb_img, depth_img = self._apply_preprocessing(rgb_img, depth_img)

        # to_tensor scales 8-bit PIL images to floats in [0, 1]
        rgb_tensor = self.color_normalize(TF.to_tensor(rgb_img))  # (3, H, W)

        # The depth image is 8-bit 'L', so to_tensor has already mapped it to [0, 1].
        # Dividing by 255 again (as before) squashed depth into [0, 1/255].
        depth_tensor = TF.to_tensor(depth_img)  # (1, H, W)

        return {
            'dish_id': dish_id,
            'rgb': rgb_tensor,
            'depth': depth_tensor,
            'calorie': torch.tensor(calorie, dtype=torch.float32)
        }


In [4]:
# Function to create train/validation split
def create_train_val_split(csv_path: str, val_ratio: float = 0.15, random_seed: int = 42):
    """
    Shuffle the rows of a CSV and write them out as two split files.

    The first ``int(n_rows * val_ratio)`` shuffled rows become the validation
    set; the rest become the training set. Both files are written next to the
    source CSV as ``train_split.csv`` and ``val_split.csv`` (overwriting any
    existing files of those names).

    Args:
        csv_path: Path to the original CSV file
        val_ratio: Fraction of rows assigned to validation
        random_seed: Seed for the shuffle, so the split is repeatable

    Returns:
        Tuple of (train_csv_path, val_csv_path)
    """
    shuffled = (
        pd.read_csv(csv_path)
        .sample(frac=1, random_state=random_seed)
        .reset_index(drop=True)
    )

    # Validation takes the head of the shuffled frame, training takes the tail
    n_val = int(len(shuffled) * val_ratio)
    val_part = shuffled.iloc[:n_val]
    train_part = shuffled.iloc[n_val:]

    out_dir = os.path.dirname(csv_path)
    train_csv = os.path.join(out_dir, 'train_split.csv')
    val_csv = os.path.join(out_dir, 'val_split.csv')

    for frame, destination in ((train_part, train_csv), (val_part, val_csv)):
        frame.to_csv(destination, index=False)

    return train_csv, val_csv

# Write the train/validation split CSVs next to the source CSV
print("Creating train/validation split...")
train_csv, val_csv = create_train_val_split(CSV_PATH, val_ratio=VAL_RATIO, random_seed=SEED)

print(f"Train CSV: {train_csv}")
print(f"Validation CSV: {val_csv}")

# Smoke test: build the (un-augmented) training dataset and inspect its first sample
sample_dataset = Nutrition5KDataset(
    train_csv,
    DATA_ROOT,
    split='train',
    augment=False,
    img_size=IMG_SIZE,
)

print(f"\nDataset loaded successfully!")
print(f"Training samples: {len(sample_dataset)}")
if len(sample_dataset) > 0:
    sample = sample_dataset[0]
    print(f"RGB shape: {sample['rgb'].shape}")
    print(f"Depth shape: {sample['depth'].shape}")
    print(f"Calorie value: {sample['calorie'].item():.1f}")


Creating train/validation split...
Train CSV: ../Nutrition5K/train_split.csv
Validation CSV: ../Nutrition5K/val_split.csv
Loaded 2804 valid samples out of 2805

Dataset loaded successfully!
Training samples: 2804
RGB shape: torch.Size([3, 256, 256])
Depth shape: torch.Size([1, 256, 256])
Calorie value: 88.5


In [5]:
# InceptionV2 Model Implementation
# First, let's implement the basic building blocks

class BasicConv2d(nn.Module):
    """Bias-free Conv2d followed by BatchNorm2d (eps=0.001) and an in-place ReLU.

    Any extra keyword arguments (kernel_size, stride, padding, ...) are passed
    straight through to ``nn.Conv2d``.
    """

    def __init__(self, in_channels, out_channels, **kwargs):
        super().__init__()
        # No conv bias: the BatchNorm that follows has its own learnable shift
        self.conv = nn.Conv2d(in_channels, out_channels, bias=False, **kwargs)
        self.bn = nn.BatchNorm2d(out_channels, eps=0.001)

    def forward(self, x):
        normalized = self.bn(self.conv(x))
        return F.relu(normalized, inplace=True)


class InceptionModule(nn.Module):
    """Four-branch Inception block built from BasicConv2d (conv + BatchNorm + ReLU).

    Branches, all preserving spatial size:
      1. 1x1 conv                          -> ch1x1 channels
      2. 1x1 reduce, then 3x3 conv         -> ch3x3 channels
      3. 1x1 reduce, then 5x5 conv         -> ch5x5 channels
      4. 3x3 max-pool, then 1x1 projection -> pool_proj channels

    The branch outputs are concatenated along the channel axis, so the block
    emits ch1x1 + ch3x3 + ch5x5 + pool_proj channels.
    """

    def __init__(self, in_channels, ch1x1, ch3x3red, ch3x3, ch5x5red, ch5x5, pool_proj):
        super().__init__()

        # Branch 1: pointwise convolution
        self.branch1 = BasicConv2d(in_channels, ch1x1, kernel_size=1)

        # Branch 2: channel reduction followed by a 3x3 convolution
        reduce_3x3 = BasicConv2d(in_channels, ch3x3red, kernel_size=1)
        conv_3x3 = BasicConv2d(ch3x3red, ch3x3, kernel_size=3, padding=1)
        self.branch2 = nn.Sequential(reduce_3x3, conv_3x3)

        # Branch 3: channel reduction followed by a 5x5 convolution
        reduce_5x5 = BasicConv2d(in_channels, ch5x5red, kernel_size=1)
        conv_5x5 = BasicConv2d(ch5x5red, ch5x5, kernel_size=5, padding=2)
        self.branch3 = nn.Sequential(reduce_5x5, conv_5x5)

        # Branch 4: stride-1 max-pool followed by a pointwise projection
        pool = nn.MaxPool2d(kernel_size=3, stride=1, padding=1)
        projection = BasicConv2d(in_channels, pool_proj, kernel_size=1)
        self.branch4 = nn.Sequential(pool, projection)

    def forward(self, x):
        branches = (self.branch1, self.branch2, self.branch3, self.branch4)
        outputs = [branch(x) for branch in branches]
        return torch.cat(outputs, dim=1)


In [6]:
# InceptionV2 Encoder Implementation

class InceptionV2Encoder(nn.Module):
    """Inception-style convolutional encoder producing a 2048-channel feature map.

    The network downsamples its input by a factor of 32 overall (stride-2 stem
    conv plus four stride-2 max-pools). The last Inception block is widened so
    the encoder emits ``out_channels`` = 2048 channels.
    """

    def __init__(self, pretrained: bool = False, in_channels: int = 3):
        """
        Args:
            pretrained: Accepted for interface compatibility only. No pretrained
                weights are loaded; a warning is issued if True is passed.
            in_channels: Number of channels in the input image tensor
        """
        super().__init__()

        if pretrained:
            # Previously this flag was silently ignored; make that visible to the caller.
            warnings.warn(
                "InceptionV2Encoder: pretrained=True was requested, but this "
                "implementation does not load pretrained weights; the encoder "
                "is randomly initialised.",
                stacklevel=2,
            )

        # Channel count of the final feature map (sum of inception5b's branch widths)
        self.out_channels = 2048

        # Stem: 7x7/2 conv, pool, 1x1 conv, 3x3 conv, pool  (overall stride 8)
        self.conv1 = nn.Sequential(
            BasicConv2d(in_channels, 64, kernel_size=7, stride=2, padding=3),
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
            BasicConv2d(64, 64, kernel_size=1),
            BasicConv2d(64, 192, kernel_size=3, padding=1),
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        )

        # Stage 3: 192 -> 256 -> 480 channels
        self.inception3a = InceptionModule(192, 64, 96, 128, 16, 32, 32)
        self.inception3b = InceptionModule(256, 128, 128, 192, 32, 96, 64)

        self.maxpool3 = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        # Stage 4: 480 -> 512 -> 512 -> 512 -> 528 -> 832 channels
        self.inception4a = InceptionModule(480, 192, 96, 208, 16, 48, 64)
        self.inception4b = InceptionModule(512, 160, 112, 224, 24, 64, 64)
        self.inception4c = InceptionModule(512, 128, 128, 256, 24, 64, 64)
        self.inception4d = InceptionModule(512, 112, 144, 288, 32, 64, 64)
        self.inception4e = InceptionModule(528, 256, 160, 320, 32, 128, 128)

        self.maxpool4 = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        # Stage 5: 832 -> 832 -> 2048 channels
        self.inception5a = InceptionModule(832, 256, 160, 320, 32, 128, 128)
        # Branch widths here are enlarged so the output is 768+768+256+256 = 2048 channels
        self.inception5b = InceptionModule(832, 768, 192, 768, 48, 256, 256)

        self._initialize_weights()

    def _initialize_weights(self):
        """Kaiming-normal (fan_out, ReLU) for convs; BatchNorm scale=1, shift=0."""
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

    def forward(self, x):
        """
        Args:
            x: Input tensor (B, C, H, W)
        Returns:
            Feature map (B, 2048, H/32, W/32)
        """
        x = self.conv1(x)

        x = self.inception3a(x)
        x = self.inception3b(x)
        x = self.maxpool3(x)

        x = self.inception4a(x)
        x = self.inception4b(x)
        x = self.inception4c(x)
        x = self.inception4d(x)
        x = self.inception4e(x)
        x = self.maxpool4(x)

        x = self.inception5a(x)
        x = self.inception5b(x)

        return x


In [7]:
# Regression Head for calorie prediction
class RegressionHead(nn.Module):
    """
    Pooling + MLP head that maps a feature map to a single calorie value.

    Layout: global average pool -> flatten ->
    Linear(in_channels, fc_dim) -> Linear(fc_dim, fc_dim) ->
    Linear(fc_dim, fc_dim // 2) -> Linear(fc_dim // 2, 1),
    with ReLU and Dropout after each of the first three linear layers
    (2048 -> 1024 -> 1024 -> 512 -> 1 with the defaults).
    Deliberately smaller than the 4096-wide head the original paper used with
    a pretrained backbone.
    """

    def __init__(self, in_channels: int = 2048, fc_dim: int = 1024, dropout_rate: float = 0.4):
        super().__init__()

        # Collapse each channel's spatial map to a single value
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))

        half_dim = fc_dim // 2
        hidden_shapes = (
            (in_channels, fc_dim),
            (fc_dim, fc_dim),
            (fc_dim, half_dim),
        )

        # Three hidden blocks of Linear -> ReLU -> Dropout, then the scalar output layer
        stack = [nn.Flatten()]
        for width_in, width_out in hidden_shapes:
            stack.append(nn.Linear(width_in, width_out))
            stack.append(nn.ReLU(inplace=True))
            stack.append(nn.Dropout(dropout_rate))
        stack.append(nn.Linear(half_dim, 1))

        self.fc_layers = nn.Sequential(*stack)

    def forward(self, x):
        pooled = self.avgpool(x)  # (B, C, 1, 1)
        return self.fc_layers(pooled)  # (B, 1)


In [8]:
# Full InceptionV2 Model for Calorie Prediction
class CalorieInceptionV2Model(nn.Module):
    """
    RGB-only calorie regressor: an InceptionV2Encoder feeding a RegressionHead.

    The head's input width is taken from the encoder's ``out_channels``, so the
    two stay in sync.
    """

    def __init__(self, pretrained=False, dropout_rate=0.4, fc_dim=1024):
        super().__init__()

        # Feature extractor over 3-channel (RGB) input
        backbone = InceptionV2Encoder(pretrained=pretrained, in_channels=3)
        self.encoder = backbone

        # MLP head producing one value per image
        self.regression_head = RegressionHead(
            in_channels=backbone.out_channels,
            fc_dim=fc_dim,
            dropout_rate=dropout_rate
        )

    def forward(self, x):
        """
        Run the encoder and then the regression head.

        Args:
            x: Input RGB image tensor (B, 3, H, W)

        Returns:
            Calorie prediction (B, 1)
        """
        return self.regression_head(self.encoder(x))

    def get_num_parameters(self):
        """Count the elements of every parameter that has requires_grad set."""
        trainable = 0
        for param in self.parameters():
            if param.requires_grad:
                trainable += param.numel()
        return trainable


In [9]:
# Implement RMSProp optimizer with paper parameters
def get_optimizer(model_params, lr=1e-4, momentum=0.9, weight_decay=0.0, epsilon=1.0, decay=None):
    """
    Create an RMSProp optimizer.

    Args:
        model_params: Model parameters to optimize
        lr: Learning rate (paper uses 1e-4)
        momentum: Momentum factor (paper uses 0.9)
        weight_decay: Weight decay for L2 regularization
        epsilon: Epsilon value to prevent division by zero (paper uses 1.0)
        decay: Smoothing constant for the running average of squared gradients
            (RMSprop's ``alpha``; paper uses 0.9). When None, the module-level
            ``DECAY`` constant is used, which is the previous behaviour.

    Returns:
        Configured torch.optim.RMSprop instance
    """
    # Fall back to the notebook-wide constant only when the caller gives no value,
    # so a caller-supplied decay can actually reach the optimizer.
    alpha = DECAY if decay is None else decay

    return torch.optim.RMSprop(
        model_params,
        lr=lr,
        momentum=momentum,
        alpha=alpha,
        eps=epsilon,
        weight_decay=weight_decay
    )

# Training and validation loss function
def get_loss_fn():
    """
    Build the training/validation criterion.

    Returns a mean-squared-error loss (MSE is used here rather than MAE).
    """
    criterion = nn.MSELoss()
    return criterion


In [10]:

# Early Stopping Helper
class EarlyStopping:
    """Tracks a monitored score and signals when it has stopped improving."""

    def __init__(self, patience=15, min_delta=0.1, mode='min'):
        """
        Args:
            patience: Consecutive non-improving calls tolerated before stopping
            min_delta: Margin by which the score must beat the best to count as improved
            mode: 'min' when lower is better (loss); anything else means higher is better
        """
        self.patience = patience
        self.min_delta = min_delta
        self.mode = mode
        self.counter = 0          # non-improving calls since the last improvement
        self.best_score = None    # None until the first score is seen
        self.best_epoch = 0
        self.early_stop = False   # latched: never reset once set

    def __call__(self, score, epoch):
        """Record ``score`` for ``epoch``; return True if training should stop."""
        # The very first score becomes the baseline and never triggers a stop
        if self.best_score is None:
            self.best_score, self.best_epoch = score, epoch
            return False

        lower_is_better = self.mode == 'min'
        improved = (
            score < (self.best_score - self.min_delta)
            if lower_is_better
            else score > (self.best_score + self.min_delta)
        )

        if not improved:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
            return self.early_stop

        self.best_score, self.best_epoch = score, epoch
        self.counter = 0
        return self.early_stop


# Trainer class for InceptionV2 model
class Trainer:
    """Runs training/validation epochs for an RGB calorie-regression model.

    Handles TensorBoard logging, best-checkpoint saving (by validation loss)
    and early stopping.
    """

    def __init__(
        self,
        model,
        train_loader,
        val_loader,
        criterion,
        optimizer,
        device,
        output_dir,
        early_stopping_patience=15
    ):
        """
        Args:
            model: Module mapping an RGB batch to per-sample predictions; must
                also provide ``get_num_parameters()``
            train_loader: Iterable of batches (dicts with 'rgb' and 'calorie' tensors)
            val_loader: Same format as train_loader, used for validation
            criterion: Loss callable taking (predictions, targets)
            optimizer: Optimizer over the model's parameters
            device: Device that batches are moved to
            output_dir: Directory for TensorBoard logs and the best checkpoint
            early_stopping_patience: Non-improving epochs tolerated before stopping
        """
        self.model = model
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.criterion = criterion
        self.optimizer = optimizer
        self.device = device
        self.output_dir = output_dir

        # Early stopping on validation loss (lower is better)
        self.early_stopping = EarlyStopping(
            patience=early_stopping_patience,
            min_delta=0.1,
            mode='min'
        )

        # Tensorboard
        self.writer = SummaryWriter(log_dir=os.path.join(output_dir, 'tensorboard'))

        # Tracking
        self.best_val_loss = float('inf')
        self.best_metrics = {}

    def _predict(self, rgb):
        """Run the model and flatten its output to shape (B,).

        reshape(-1) is used instead of squeeze() because squeeze() collapses a
        batch of one sample to a 0-d tensor, which cannot be iterated when the
        predictions are collected during validation.
        """
        return self.model(rgb).reshape(-1)

    def train_epoch(self):
        """Train for one epoch and return the mean per-batch loss."""
        self.model.train()
        total_loss = 0.0

        pbar = tqdm(self.train_loader, desc="Training")
        for batch in pbar:
            rgb = batch['rgb'].to(self.device)
            calorie = batch['calorie'].to(self.device)

            self.optimizer.zero_grad()
            calorie_pred = self._predict(rgb)
            loss = self.criterion(calorie_pred, calorie)
            loss.backward()

            # Gradient clipping for stability
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)

            self.optimizer.step()

            total_loss += loss.item()
            pbar.set_postfix({'Loss': f'{loss.item():.4f}'})

        return total_loss / len(self.train_loader)

    @torch.no_grad()
    def validate_epoch(self):
        """Evaluate on the validation loader.

        Returns:
            (val_loss, mae, mse): mean per-batch criterion value, and the
            per-sample mean absolute / mean squared error as Python floats
        """
        self.model.eval()
        total_loss = 0.0
        all_predictions = []
        all_targets = []

        pbar = tqdm(self.val_loader, desc="Validation")
        for batch in pbar:
            rgb = batch['rgb'].to(self.device)
            calorie = batch['calorie'].to(self.device)

            calorie_pred = self._predict(rgb)

            loss = self.criterion(calorie_pred, calorie)
            total_loss += loss.item()

            # Both tensors are 1-D here, so extend() adds one entry per sample
            all_predictions.extend(calorie_pred.cpu().numpy())
            all_targets.extend(calorie.cpu().numpy())

            pbar.set_postfix({'Loss': f'{loss.item():.4f}'})

        val_loss = total_loss / len(self.val_loader)
        errors = np.array(all_predictions) - np.array(all_targets)
        # Cast to built-in float so the metrics stay serialisable (e.g. with json)
        mae = float(np.mean(np.abs(errors)))
        mse = float(np.mean(np.square(errors)))

        return val_loss, mae, mse

    def train(self, num_epochs):
        """Run up to ``num_epochs`` epochs, checkpointing the best model.

        The best checkpoint is chosen by strictly lower validation loss.
        Training stops early when the EarlyStopping helper says so. The
        TensorBoard writer is closed even if an epoch raises.
        """
        print(f"Starting training for {num_epochs} epochs...")

        # Create experiment directory if it doesn't exist
        os.makedirs(self.output_dir, exist_ok=True)

        print(f"Model has {self.model.get_num_parameters():,} trainable parameters")

        try:
            for epoch in range(num_epochs):
                print(f"\nEpoch {epoch+1}/{num_epochs}")

                train_loss = self.train_epoch()
                val_loss, mae, mse = self.validate_epoch()

                # Log metrics
                self.writer.add_scalar('Loss/Train', train_loss, epoch)
                self.writer.add_scalar('Loss/Val', val_loss, epoch)
                self.writer.add_scalar('MAE', mae, epoch)
                self.writer.add_scalar('MSE', mse, epoch)

                # Print epoch results
                print(f"Train Loss: {train_loss:.4f}")
                print(f"Val Loss: {val_loss:.4f}")
                print(f"MAE: {mae:.2f}")

                # Save best model
                if val_loss < self.best_val_loss:
                    self.best_val_loss = val_loss
                    self.best_metrics = {
                        'epoch': epoch + 1,
                        'val_loss': val_loss,
                        'mae': mae,
                        'mse': mse,
                    }

                    self._save_checkpoint(epoch)
                    print(f"✓ New best model saved! (Val Loss: {val_loss:.4f})")

                # Early stopping
                if self.early_stopping(val_loss, epoch):
                    print(f"Early stopping triggered after {epoch+1} epochs")
                    print(f"Best epoch: {self.early_stopping.best_epoch+1}")
                    break
        finally:
            # Flush and release the event file whether or not training completed
            self.writer.close()

        print(f"\nTraining completed!")
        print(f"Best validation loss: {self.best_val_loss:.4f}")

    def _save_checkpoint(self, epoch):
        """Write model/optimizer state to ``<output_dir>/best_model.pth`` (overwrites)."""
        checkpoint = {
            'epoch': epoch + 1,
            'model_state_dict': self.model.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'val_loss': self.best_val_loss,
        }

        checkpoint_path = os.path.join(self.output_dir, 'best_model.pth')
        torch.save(checkpoint, checkpoint_path)


In [None]:
# Main experiment runner for training InceptionV2 model on Nutrition5k
def train_inceptionv2_model(
    data_root=DATA_ROOT,
    train_csv=None,
    val_csv=None,
    output_dir=OUTPUT_DIR,
    batch_size=BATCH_SIZE,
    num_epochs=NUM_EPOCHS,
    learning_rate=LEARNING_RATE,
    momentum=MOMENTUM,
    decay=DECAY,
    epsilon=EPSILON,
    dropout_rate=DROPOUT_RATE,
    fc_dim=FC_DIM,
    img_size=IMG_SIZE,
    num_workers=NUM_WORKERS,
    early_stopping_patience=15,
    experiment_name=None,
    pretrained=False,
    weight_decay=1e-6
):
    """
    Train the InceptionV2 model for calorie prediction.

    Builds the datasets, data loaders, model, optimizer and loss, writes the
    run configuration to ``<output_dir>/<experiment_name>/config.json``, then
    hands everything to ``Trainer`` and calls ``trainer.train(num_epochs)``.

    Args:
        data_root: Path to the dataset directory
        train_csv/val_csv: Paths to train/val CSV files. If either one is
            None, BOTH are regenerated from CSV_PATH via
            create_train_val_split (VAL_RATIO, SEED).
        output_dir: Directory under which the experiment directory is created
        batch_size: Batch size for both data loaders
        num_epochs: Number of training epochs passed to the trainer
        learning_rate: Learning rate for optimizer
        momentum: Momentum for RMSProp
        decay: Decay rate for RMSProp. Saved to config.json and printed; this
            function does not pass it to get_optimizer.
        epsilon: Epsilon value for RMSProp
        dropout_rate: Dropout rate for regularization
        fc_dim: Dimension of fully connected layers
        img_size: Input image size
        num_workers: Number of data loading workers
        early_stopping_patience: Patience for early stopping
        experiment_name: Name for this experiment run; a timestamped name is
            generated when None
        pretrained: Whether to use pretrained weights
        weight_decay: Weight decay passed to get_optimizer (default 1e-6,
            the value that was previously hard-coded)

    Returns:
        ``trainer.best_metrics`` as left by the trainer after training
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Generate experiment name if not provided
    if experiment_name is None:
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        experiment_name = f"inceptionv2_calorie_{timestamp}"

    # Create experiment directory
    exp_dir = os.path.join(output_dir, experiment_name)
    os.makedirs(exp_dir, exist_ok=True)

    # Create train/val splits if not provided
    if train_csv is None or val_csv is None:
        print("Creating train/validation split...")
        train_csv, val_csv = create_train_val_split(
            CSV_PATH,
            val_ratio=VAL_RATIO,
            random_seed=SEED
        )

    print(f"Train CSV: {train_csv}")
    print(f"Validation CSV: {val_csv}")

    # Create datasets (augmentation only on the training split)
    print("\nLoading datasets...")
    train_dataset = Nutrition5KDataset(
        csv_path=train_csv,
        data_root=data_root,
        split='train',
        augment=True,
        img_size=img_size
    )

    val_dataset = Nutrition5KDataset(
        csv_path=val_csv,
        data_root=data_root,
        split='val',
        augment=False,
        img_size=img_size
    )

    # Pinned host memory is only requested when CUDA is available
    use_pinned_memory = torch.cuda.is_available()

    # Create data loaders; the training loader drops the last incomplete batch
    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers,
        pin_memory=use_pinned_memory,
        drop_last=True
    )

    val_loader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
        pin_memory=use_pinned_memory
    )

    # Build the model
    print("\nBuilding InceptionV2 model...")
    model = CalorieInceptionV2Model(
        pretrained=pretrained,
        dropout_rate=dropout_rate,
        fc_dim=fc_dim
    )
    model = model.to(device)

    # Initialize optimizer and loss function
    # NOTE(review): `decay` is saved and printed below but is not forwarded to
    # get_optimizer here -- confirm whether get_optimizer applies the RMSProp
    # decay on its own or whether it should be passed explicitly.
    optimizer = get_optimizer(
        model.parameters(),
        lr=learning_rate,
        momentum=momentum,
        weight_decay=weight_decay,  # Small weight decay for regularization
        epsilon=epsilon
    )

    criterion = get_loss_fn()

    # Save configuration, including the data paths and weight decay so the
    # run can be reproduced from config.json alone
    config = {
        'batch_size': batch_size,
        'num_epochs': num_epochs,
        'learning_rate': learning_rate,
        'momentum': momentum,
        'decay': decay,
        'epsilon': epsilon,
        'dropout_rate': dropout_rate,
        'fc_dim': fc_dim,
        'img_size': img_size,
        'pretrained': pretrained,
        'num_workers': num_workers,
        'early_stopping_patience': early_stopping_patience,
        'weight_decay': weight_decay,
        # str() keeps the dump JSON-serializable even if paths are Path objects
        'data_root': str(data_root),
        'train_csv': str(train_csv),
        'val_csv': str(val_csv)
    }

    with open(os.path.join(exp_dir, 'config.json'), 'w') as f:
        json.dump(config, f, indent=4)

    # Print training information
    print("\n" + "="*60)
    print("TRAINING: InceptionV2 Calorie Prediction Model")
    print("="*60)
    print(f"Model parameters: {model.get_num_parameters():,}")
    print(f"Training samples: {len(train_dataset)}")
    print(f"Validation samples: {len(val_dataset)}")
    print(f"Learning rate: {learning_rate}")
    print(f"RMSProp parameters: momentum={momentum}, decay={decay}, epsilon={epsilon}")
    print(f"FC dimensions: {fc_dim}")
    print(f"Output directory: {exp_dir}")
    print("="*60)

    # Create trainer
    trainer = Trainer(
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        criterion=criterion,
        optimizer=optimizer,
        device=device,
        output_dir=exp_dir,
        early_stopping_patience=early_stopping_patience
    )

    # Train the model
    trainer.train(num_epochs)

    print(f"\nExperiment completed! Results saved to: {exp_dir}")
    return trainer.best_metrics


: 

In [None]:
# Run the experiment
if __name__ == "__main__":
    # Timestamped name so repeated runs write to separate experiment directories
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    experiment_name = f"exp_inceptionv2_calorie_{timestamp}"

    # Optional: build a throwaway model just to report its parameter count
    model = CalorieInceptionV2Model(pretrained=False, dropout_rate=DROPOUT_RATE, fc_dim=FC_DIM)
    print(f"Model parameter count: {model.get_num_parameters():,}")
    del model  # Free up memory

    # Print current configuration
    print("\nTraining configuration:")
    print(f"  Batch size: {BATCH_SIZE}")
    print(f"  FC dimensions: {FC_DIM}")
    print(f"  Learning rate: {LEARNING_RATE}")
    print(f"  Image size: {IMG_SIZE}x{IMG_SIZE}")
    print(f"  Number of workers: {NUM_WORKERS}")

    # Report which device the run will use
    if torch.cuda.is_available():
        print(f"\nGPU available: {torch.cuda.get_device_name(0)}")
        print(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
    else:
        print("\nNo GPU available, using CPU")

    # Run the training
    print("\nStarting training...")
    try:
        # Only the training call is guarded, so a problem while printing the
        # results below is not misreported as a training failure.
        metrics = train_inceptionv2_model(
            experiment_name=experiment_name,
            batch_size=BATCH_SIZE,
            num_epochs=NUM_EPOCHS,
            learning_rate=LEARNING_RATE,
            momentum=MOMENTUM,
            decay=DECAY,
            epsilon=EPSILON,
            dropout_rate=DROPOUT_RATE,
            fc_dim=FC_DIM,
            img_size=IMG_SIZE,
            num_workers=NUM_WORKERS,
            early_stopping_patience=15
        )
    except Exception as e:
        print(f"\nTraining failed with error: {e}")
        # Re-raise so the cell is marked as failed (and "Run All" stops)
        # instead of continuing as if training had succeeded; the notebook
        # front end / interpreter prints the full traceback.
        raise

    # Print final results
    print("\nTraining completed successfully!")
    # The trainer stores these metrics at the epoch with the lowest validation
    # loss, so the MAE shown is the MAE of that epoch, not the minimum MAE seen.
    # The guard covers a run that never recorded a best epoch
    # (assumption: best_metrics is then empty or None -- TODO confirm in Trainer).
    if metrics and 'mae' in metrics:
        print(f"Validation MAE at best epoch: {metrics['mae']:.2f}")
        print(f"Best epoch: {metrics['epoch']}")
    else:
        print("No best-model metrics were recorded during training.")


Model parameter count: 10,899,105

Training configuration:
  Batch size: 16
  FC dimensions: 1024
  Learning rate: 0.001
  Image size: 256x256
  Number of workers: 4

GPU available: NVIDIA H100 80GB HBM3
GPU memory: 84.93 GB

Starting training...
Creating train/validation split...
Train CSV: ../Nutrition5K/train_split.csv
Validation CSV: ../Nutrition5K/val_split.csv

Loading datasets...
Loaded 2804 valid samples out of 2805
Loaded 495 valid samples out of 495

Building InceptionV2 model...

TRAINING: InceptionV2 Calorie Prediction Model
Model parameters: 10,899,105
Training samples: 2804
Validation samples: 495
Learning rate: 0.001
RMSProp parameters: momentum=0.9, decay=0.9, epsilon=1e-08
FC dimensions: 1024
Output directory: ../experiments/exp_inceptionv2_calorie_20251024_013408
Starting training for 40 epochs...
Model has 10,899,105 trainable parameters

Epoch 1/40


Training: 100%|██████████| 175/175 [00:12<00:00, 13.61it/s, Loss=68227.3750] 
Validation: 100%|██████████| 31/31 [00:02<00:00, 13.46it/s, Loss=76793.5234] 


Train Loss: 60590.8357
Val Loss: 79799.3223
MAE: 231.87
✓ New best model saved! (Val Loss: 79799.3223)

Epoch 2/40


Training: 100%|██████████| 175/175 [00:15<00:00, 11.60it/s, Loss=174895.9375]
Validation: 100%|██████████| 31/31 [00:02<00:00, 12.95it/s, Loss=33989.1562]


Train Loss: 52333.2610
Val Loss: 37045.2835
MAE: 149.19
✓ New best model saved! (Val Loss: 37045.2835)

Epoch 3/40


Training: 100%|██████████| 175/175 [00:12<00:00, 13.51it/s, Loss=59037.8711] 
Validation: 100%|██████████| 31/31 [00:02<00:00, 12.46it/s, Loss=18268.6973]


Train Loss: 41021.6421
Val Loss: 27750.2133
MAE: 130.20
✓ New best model saved! (Val Loss: 27750.2133)

Epoch 4/40


Training: 100%|██████████| 175/175 [00:12<00:00, 14.37it/s, Loss=44151.4766] 
Validation: 100%|██████████| 31/31 [00:02<00:00, 13.95it/s, Loss=403822.0312]


Train Loss: 38051.5855
Val Loss: 489878.2319
MAE: 633.32

Epoch 5/40


Training: 100%|██████████| 175/175 [00:12<00:00, 14.55it/s, Loss=43070.8164] 
Validation: 100%|██████████| 31/31 [00:02<00:00, 14.02it/s, Loss=42383.3477] 


Train Loss: 34272.2087
Val Loss: 54472.6598
MAE: 188.58

Epoch 6/40


Training: 100%|██████████| 175/175 [00:12<00:00, 14.31it/s, Loss=11864.2705] 
Validation: 100%|██████████| 31/31 [00:03<00:00, 10.29it/s, Loss=14228.9395] 


Train Loss: 33957.9909
Val Loss: 32353.5599
MAE: 119.49

Epoch 7/40


Training: 100%|██████████| 175/175 [00:14<00:00, 12.12it/s, Loss=78010.6875] 
Validation: 100%|██████████| 31/31 [00:02<00:00, 12.97it/s, Loss=41916.4688] 


Train Loss: 31511.1000
Val Loss: 67346.4707
MAE: 181.93

Epoch 8/40


Training: 100%|██████████| 175/175 [00:15<00:00, 11.52it/s, Loss=67764.5938]
Validation: 100%|██████████| 31/31 [00:02<00:00, 12.86it/s, Loss=122949.8672]


Train Loss: 29892.3315
Val Loss: 183503.3233
MAE: 351.11

Epoch 9/40


Training: 100%|██████████| 175/175 [00:12<00:00, 13.65it/s, Loss=15350.0020] 
Validation: 100%|██████████| 31/31 [00:02<00:00, 12.89it/s, Loss=29600.4648] 


Train Loss: 44860.7932
Val Loss: 65140.8335
MAE: 176.76

Epoch 10/40


Training: 100%|██████████| 175/175 [00:12<00:00, 14.36it/s, Loss=35037.4883] 
Validation: 100%|██████████| 31/31 [00:02<00:00, 14.02it/s, Loss=5828.2930] 


Train Loss: 28742.7750
Val Loss: 21591.0838
MAE: 100.05
✓ New best model saved! (Val Loss: 21591.0838)

Epoch 11/40


Training: 100%|██████████| 175/175 [00:12<00:00, 14.36it/s, Loss=39142.5742] 
Validation: 100%|██████████| 31/31 [00:02<00:00, 14.19it/s, Loss=46806.8594] 


Train Loss: 57493.4731
Val Loss: 91353.6022
MAE: 214.64

Epoch 12/40


Training: 100%|██████████| 175/175 [00:12<00:00, 14.37it/s, Loss=36335.6836] 
Validation: 100%|██████████| 31/31 [00:02<00:00, 14.21it/s, Loss=146467.1719]


Train Loss: 36895.7647
Val Loss: 188349.9892
MAE: 349.42

Epoch 13/40


Training: 100%|██████████| 175/175 [00:12<00:00, 14.44it/s, Loss=26678.4727] 
Validation: 100%|██████████| 31/31 [00:02<00:00, 14.54it/s, Loss=7983.7964] 


Train Loss: 29952.5399
Val Loss: 17092.1983
MAE: 88.75
✓ New best model saved! (Val Loss: 17092.1983)

Epoch 14/40


Training: 100%|██████████| 175/175 [00:12<00:00, 14.36it/s, Loss=37150.3750] 
Validation: 100%|██████████| 31/31 [00:02<00:00, 14.18it/s, Loss=11522.4912] 


Train Loss: 31204.4808
Val Loss: 33076.2534
MAE: 126.24

Epoch 15/40


Training: 100%|██████████| 175/175 [00:14<00:00, 12.10it/s, Loss=30233.2207] 
Validation: 100%|██████████| 31/31 [00:02<00:00, 13.26it/s, Loss=20755.5156] 


Train Loss: 32146.3525
Val Loss: 43072.0595
MAE: 144.58

Epoch 16/40


Training: 100%|██████████| 175/175 [00:13<00:00, 13.33it/s, Loss=16361.7314] 
Validation: 100%|██████████| 31/31 [00:02<00:00, 13.71it/s, Loss=14130.3008]


Train Loss: 31654.8319
Val Loss: 14916.1169
MAE: 89.77
✓ New best model saved! (Val Loss: 14916.1169)

Epoch 17/40


Training: 100%|██████████| 175/175 [00:12<00:00, 14.08it/s, Loss=20925.4297] 
Validation: 100%|██████████| 31/31 [00:02<00:00, 14.32it/s, Loss=12406.3896]


Train Loss: 33182.6127
Val Loss: 28242.9974
MAE: 114.82

Epoch 18/40


Training: 100%|██████████| 175/175 [00:12<00:00, 14.52it/s, Loss=59182.8359]
Validation: 100%|██████████| 31/31 [00:02<00:00, 14.44it/s, Loss=9853.0830] 


Train Loss: 26671.7349
Val Loss: 19636.1633
MAE: 107.91

Epoch 19/40


Training: 100%|██████████| 175/175 [00:12<00:00, 14.58it/s, Loss=23733.7637]
Validation: 100%|██████████| 31/31 [00:02<00:00, 14.45it/s, Loss=9972.2793] 


Train Loss: 30601.6517
Val Loss: 17461.9334
MAE: 97.82

Epoch 20/40


Training: 100%|██████████| 175/175 [00:11<00:00, 14.69it/s, Loss=27010.7539] 
Validation: 100%|██████████| 31/31 [00:02<00:00, 14.31it/s, Loss=17598.0215] 


Train Loss: 35336.3669
Val Loss: 42270.0524
MAE: 144.53

Epoch 21/40


Training: 100%|██████████| 175/175 [00:12<00:00, 14.56it/s, Loss=18762.7266]
Validation: 100%|██████████| 31/31 [00:02<00:00, 14.12it/s, Loss=11622.4873]


Train Loss: 29740.1647
Val Loss: 19684.9223
MAE: 103.93

Epoch 22/40


Training: 100%|██████████| 175/175 [00:11<00:00, 14.61it/s, Loss=25524.6777] 
Validation: 100%|██████████| 31/31 [00:02<00:00, 13.61it/s, Loss=5511.2534] 


Train Loss: 28751.4916
Val Loss: 18673.1548
MAE: 89.91

Epoch 23/40


Training: 100%|██████████| 175/175 [00:12<00:00, 14.51it/s, Loss=16321.3018] 
Validation: 100%|██████████| 31/31 [00:02<00:00, 13.60it/s, Loss=36588.4141]


Train Loss: 41094.0699
Val Loss: 35407.2401
MAE: 134.51

Epoch 24/40


Training: 100%|██████████| 175/175 [00:12<00:00, 14.40it/s, Loss=15222.2227] 
Validation: 100%|██████████| 31/31 [00:02<00:00, 13.49it/s, Loss=28102.8809] 


Train Loss: 37452.4287
Val Loss: 62971.8583
MAE: 176.25

Epoch 25/40


Training: 100%|██████████| 175/175 [00:13<00:00, 12.60it/s, Loss=6665.3281] 
Validation: 100%|██████████| 31/31 [00:02<00:00, 13.27it/s, Loss=6459.4058] 


Train Loss: 28815.7880
Val Loss: 14860.5781
MAE: 89.16
✓ New best model saved! (Val Loss: 14860.5781)

Epoch 26/40


Training:  74%|███████▎  | 129/175 [00:09<00:04, 10.61it/s, Loss=13466.7812] 