## Environment Setup

In [1]:
import torch
from PIL import Image, UnidentifiedImageError
from torch.utils.data import Dataset, DataLoader, random_split
import os
import pandas as pd
from pathlib import Path
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
from torchvision import transforms
import random
import torchvision.transforms.functional as TF
#from sklearn.model_selection import KFold 
import numpy as np


## DataLoader

### Load Image

In [2]:
class MyDataset(Dataset):
    """Paired RGB / depth image dataset with an optional scalar regression target.

    Expected layout on disk (all names are configurable):

        root/<data_dir>/<color_dir>/dish_*/<rgb_name>
        root/<data_dir>/<depth_dir>/<dish id>/<depth_name>
        root/<csv_name>            (training only; columns "ID" and "Value")

    Args:
        root: Dataset root directory (``str`` or ``Path``).
        if_train: When True the CSV targets are loaded, a random 90-degree
            rotation is applied, and items are ``(rgb, depth, target)``.
            When False items are ``(rgb, depth)``.
        data_dir: Sub-directory of ``root`` that holds this split.
        color_dir: Sub-directory of the split that holds the RGB dish folders.
        depth_dir: Sub-directory of the split that holds the depth dish folders.
        rgb_name: File name of the RGB image inside each dish folder.
        depth_name: File name of the depth image inside each dish folder.
        csv_name: Target CSV file name, relative to ``root``.
        transform: Stored on the instance but NOT applied; the pipeline in
            ``_load_sample`` is fixed. Kept for interface compatibility.

    Raises:
        RuntimeError: If no usable sample is found.
    """

    # Upper bound on how many random replacement samples __getitem__ tries
    # before giving up, so a broken dataset fails loudly instead of recursing
    # until the interpreter's recursion limit is hit.
    MAX_RETRIES = 10

    def __init__(self,
                 root,
                 if_train = True,
                 data_dir = "train",
                 color_dir = "color",
                 depth_dir = "depth_raw",
                 rgb_name = "rgb.png",
                 depth_name="depth_raw.png",
                 csv_name = "nutrition5k_train.csv",
                 transform = None,
                 ):
        self.root = Path(root)  # accept plain strings as well as Path objects
        self.data = self.root / data_dir
        self.color_dir = self.data / color_dir
        self.depth_dir = self.data / depth_dir
        self.rgb_name = rgb_name
        self.depth_name = depth_name
        self.if_train = if_train
        self.transform = transform  # currently unused, see class docstring

        self.to_tensor = transforms.ToTensor()
        self.normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        # Defined for experimentation; not applied anywhere in this class.
        self.color_jitter = transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1)

        if self.if_train:
            df = pd.read_csv(self.root / csv_name)
            # Map dish id (string) -> target value (float).
            self.id2cal = dict(zip(df["ID"].astype(str), df["Value"].astype(float)))

        rgb_paths = sorted(
            p / rgb_name for p in self.color_dir.glob("dish_*") if (p / rgb_name).exists()
        )
        if not rgb_paths:
            raise RuntimeError(f"Found 0 images in {self.color_dir}.")

        self.samples = []
        for rgb_path in rgb_paths:
            dish_id = rgb_path.parent.name
            depth_path = self.depth_dir / dish_id / self.depth_name
            if self.if_train:
                if dish_id not in self.id2cal:
                    # A dish without a label cannot be trained on; skip it
                    # instead of crashing with a KeyError right after the warning.
                    print(f"Warning: {dish_id} not found in CSV, skipping.")
                    continue
                target = self.id2cal[dish_id]
            else:
                target = None
            self.samples.append((rgb_path, depth_path, target))

        if not self.samples:
            raise RuntimeError(f"Found 0 labelled images in {self.color_dir}.")

    def __len__(self):
        return len(self.samples)

    def _load_sample(self, idx):
        """Load, crop, augment and tensorise the sample stored at ``idx``."""
        rgb_path, depth_path, cal = self.samples[idx]

        # Context managers close the underlying file handles deterministically.
        with Image.open(rgb_path) as rgb_file:
            rgb = rgb_file.convert("RGB")
        # NOTE(review): convert("L") produces an 8-bit image; if depth_raw.png is
        # a 16-bit depth map this discards range - confirm against the data.
        with Image.open(depth_path) as depth_file:
            depth = depth_file.convert("L")

        # Center crop to 480x480 for every split.
        rgb = TF.center_crop(rgb, (480, 480))
        depth = TF.center_crop(depth, (480, 480))

        if self.if_train:
            # Random rotation by a multiple of 90 degrees, applied identically
            # to both modalities (nearest-neighbour for depth).
            k = random.randint(0, 3)  # 0: 0°, 1: 90°, 2: 180°, 3: 270°
            if k > 0:
                rgb = TF.rotate(rgb, angle=90 * k, interpolation=TF.InterpolationMode.BILINEAR)
                depth = TF.rotate(depth, angle=90 * k, interpolation=TF.InterpolationMode.NEAREST)

        rgb = self.normalize(self.to_tensor(rgb))
        depth = self.to_tensor(depth)

        if self.if_train:
            return rgb, depth, torch.tensor(cal, dtype=torch.float32)
        return rgb, depth

    def __getitem__(self, idx):
        """Return the sample at ``idx``; fall back to a random sample on load errors.

        Raises:
            IndexError: If ``idx`` is out of range (so plain iteration terminates).
            RuntimeError: If ``MAX_RETRIES`` consecutive replacement samples also fail.
        """
        n_samples = len(self.samples)
        if not -n_samples <= idx < n_samples:
            raise IndexError(f"index {idx} out of range for dataset of size {n_samples}")

        current = idx
        last_error = None
        for _ in range(self.MAX_RETRIES + 1):
            try:
                return self._load_sample(current)
            except Exception as e:  # best effort: unreadable/missing files are replaced
                last_error = e
                print(f"waring: index {current} sample processing fail, will be taken place by original sample。error: {e}")
                current = random.randint(0, n_samples - 1)

        raise RuntimeError(
            f"Failed to load a valid sample after {self.MAX_RETRIES} replacement attempts."
        ) from last_error
        

In [3]:
# Show the current working directory; the datasets are created with root=Path.cwd().
print(Path.cwd())

/mnt/c/Users/lhg45/Desktop/COMP90086/PJ/Nutrition5K


In [4]:
# Seed for the train/validation split so the partition is identical on every run.
SPLIT_SEED = 42

# `transform` is not passed: MyDataset does not apply it.
train_set = MyDataset(root=Path.cwd(), if_train=True)
test_set = MyDataset(root=Path.cwd(), data_dir="test", if_train=False)

dataset_size = len(train_set)
val_ratio = 0.1
val_size = int(dataset_size * val_ratio)
train_size = dataset_size - val_size

# A dedicated generator makes the split reproducible without touching the
# global RNG state used elsewhere (weight init, shuffling, augmentation).
split_generator = torch.Generator().manual_seed(SPLIT_SEED)
train_subset, val_subset = random_split(
    train_set, [train_size, val_size], generator=split_generator
)
# NOTE: val_subset is a view on train_set (if_train=True), so validation
# samples also receive the random 90-degree rotation applied during training.

train_loader = DataLoader(train_subset, batch_size=8, shuffle=True,
                          num_workers=8, pin_memory=True)
val_loader = DataLoader(val_subset, batch_size=8, shuffle=False,
                        num_workers=8, pin_memory=True)
test_loader = DataLoader(test_set, batch_size=8, shuffle=False,
                         num_workers=8, pin_memory=True)

# Model Definition

In [5]:
def conv_block(c_in, c_out, k=3, s=1, p=1, bias=False):
    """Build a Conv2d -> BatchNorm2d -> ReLU stack.

    Args:
        c_in: Number of input channels.
        c_out: Number of output channels.
        k: Convolution kernel size.
        s: Convolution stride.
        p: Convolution padding.
        bias: Whether the convolution has a bias term.

    Returns:
        An ``nn.Sequential`` holding the three layers in order.
    """
    layers = [
        nn.Conv2d(c_in, c_out, kernel_size=k, stride=s, padding=p, bias=bias),
        nn.BatchNorm2d(c_out),
        nn.ReLU(inplace=True),
    ]
    return nn.Sequential(*layers)


class PreActResidualBlock(nn.Module):
    """Residual block with two (BatchNorm -> ReLU -> 3x3 Conv) units.

    The skip path is an empty ``nn.Sequential`` (identity) when the input and
    output shapes agree, otherwise a bias-free 1x1 convolution with the same
    stride as the first unit.

    Args:
        in_channels: Channels of the input feature map.
        out_channels: Channels of the output feature map.
        stride: Stride of the first convolution (and of the projection shortcut).
    """

    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()

        # First unit: may change channel count and spatial resolution.
        self.preact_conv1 = nn.Sequential(
            nn.BatchNorm2d(in_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(in_channels, out_channels,
                      kernel_size=3, stride=stride, padding=1, bias=False),
        )

        # Second unit: keeps shape.
        self.preact_conv2 = nn.Sequential(
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(out_channels, out_channels,
                      kernel_size=3, stride=1, padding=1, bias=False),
        )

        needs_projection = stride != 1 or in_channels != out_channels
        if needs_projection:
            self.shortcut = nn.Conv2d(in_channels, out_channels,
                                      kernel_size=1, stride=stride, bias=False)
        else:
            self.shortcut = nn.Sequential()

    def forward(self, x):
        """Return ``shortcut(x) + conv2(conv1(x))``."""
        skip = self.shortcut(x)
        residual = self.preact_conv2(self.preact_conv1(x))
        return residual + skip
    
class AttentionPooling(nn.Module):
    """Collapse a token sequence into one vector using learned softmax weights.

    A single linear layer scores each token; the scores are normalised over
    the token axis and used for a weighted sum of the tokens.
    """

    def __init__(self, dim_model: int):
        super().__init__()
        self.attn_fc = nn.Linear(dim_model, 1)

    def forward(self, x: torch.Tensor):
        """Pool ``x`` of shape (B, num_tokens, dim_model) to (B, dim_model)."""
        scores = self.attn_fc(x)                 # (B, num_tokens, 1)
        weights = torch.softmax(scores, dim=1)   # normalise across tokens
        return (weights * x).sum(dim=1)
    
class TransformerHead(nn.Module):
    """Regression head: expand a feature vector into tokens, encode, pool, regress.

    The input vector is linearly projected to ``num_tokens`` tokens of width
    ``dim_model``, a learned positional embedding is added, the tokens pass
    through a 2-layer Transformer encoder, are pooled by ``AttentionPooling``
    and finally mapped to a single scalar by an MLP.

    Args:
        in_features: Width of the input feature vector.
        num_tokens: Number of tokens the input vector is expanded into.
        dim_model: Token width used by the encoder, the pooling and the MLP.
        nhead: Number of attention heads in each encoder layer.
        dropout: Dropout rate inside the encoder layers.
    """

    def __init__(
        self,
        in_features: int = 512,
        num_tokens: int = 8,
        dim_model: int = 512,
        nhead: int = 8,
        dropout: float = 0.1
    ):
        super().__init__()
        self.num_tokens = num_tokens
        self.dim_model  = dim_model

        self.token_embed = nn.Linear(in_features, num_tokens * dim_model)
        self.pos_embed = nn.Parameter(torch.zeros(1, num_tokens, dim_model))

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=dim_model,
            nhead=nhead,
            dropout=dropout,
            batch_first=True,
            activation='gelu'
        )
        self.transformer = nn.TransformerEncoder(
            encoder_layer,
            num_layers=2
        )

        self.attention_pool = AttentionPooling(dim_model)

        # The pooled feature has width dim_model, so the MLP must be sized from
        # it; a hard-coded 512 here breaks every non-default dim_model.
        self.fc = nn.Sequential(
            nn.LayerNorm(dim_model),
            nn.Linear(dim_model, 256),
            nn.GELU(),
            nn.Dropout(0.2),
            nn.Linear(256, 64),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(64, 1)
        )

    def forward(self, x):
        """Map ``x`` of shape (B, in_features) to predictions of shape (B,)."""
        B = x.size(0)
        tokens = self.token_embed(x)                     # (B, num_tokens * dim_model)
        tokens = tokens.view(B, self.num_tokens, self.dim_model)
        tokens = tokens + self.pos_embed
        tokens = self.transformer(tokens)                # (B, num_tokens, dim_model)
        feat = self.attention_pool(tokens)               # (B, dim_model)
        return self.fc(feat).squeeze(1)
    
class MLPHead(nn.Module):
    """Three-layer MLP that regresses one scalar per input feature vector.

    Layout: LayerNorm -> Linear(in, 256) -> GELU -> Dropout(0.2)
            -> Linear(256, 64) -> ReLU -> Dropout(0.1) -> Linear(64, 1).
    """

    def __init__(self, in_features: int = 128):
        super().__init__()
        layers = [
            nn.LayerNorm(in_features),
            # hidden layer 1
            nn.Linear(in_features, 256),
            nn.GELU(),
            nn.Dropout(0.2),
            # hidden layer 2
            nn.Linear(256, 64),
            nn.ReLU(),
            nn.Dropout(0.1),
            # scalar output
            nn.Linear(64, 1),
        ]
        self.fc = nn.Sequential(*layers)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Map ``x`` of shape (B, in_features) to predictions of shape (B,)."""
        prediction = self.fc(x)
        return prediction.squeeze(1)

    
    
class BFPFusion(nn.Module):
    """Fuse a feature pyramid into one map by resizing, averaging and refining.

    Every level is resized (nearest neighbour) to the spatial size of the
    level at index ``refine_level``; the resized maps are averaged element-wise
    and passed through a 3x3 convolution.

    Args:
        out_channels: Channel count shared by all pyramid levels.
        refine_level: Index of the level whose resolution the output adopts.
    """

    def __init__(self, out_channels=256, refine_level=2):
        super().__init__()
        self.refine_level = refine_level
        self.refine_conv = nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1)

    def forward(self, feats):
        """Return the refined average of ``feats`` as a (B, C, H, W) tensor."""
        height_width = feats[self.refine_level].shape[2:]

        aligned = []
        for level in feats:
            aligned.append(F.interpolate(level, size=height_width, mode="nearest"))

        averaged = torch.stack(aligned, dim=0).mean(dim=0)  # (B, C, H, W)
        return self.refine_conv(averaged)


class RGBBranch(nn.Module):
    """Convolutional backbone that returns four feature maps (C2..C5).

    The stem (stride-2 conv + stride-2 max-pool) reduces the input by 4x and
    every stage halves the resolution again, so relative to the input the
    outputs are at 1/8, 1/16, 1/32 and 1/64 scale with 32, 64, 128 and 256
    channels respectively.

    Args:
        in_ch: Number of input image channels.
    """

    def __init__(self, in_ch=3):
        super().__init__()
        self.stem = nn.Sequential(
            nn.Conv2d(in_ch, 16, kernel_size=5, stride=2, padding=2, bias=False),
            nn.BatchNorm2d(16),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
        )
        # Four stages, corresponding to C2..C5.
        self.layer1 = self._make_stage(16, 32)
        self.layer2 = self._make_stage(32, 64)
        self.layer3 = self._make_stage(64, 128)
        self.layer4 = self._make_stage(128, 256)

    @staticmethod
    def _make_stage(c_in, c_out):
        """One stage: a stride-2 residual block followed by a stride-1 block."""
        return nn.Sequential(
            PreActResidualBlock(c_in, c_out, stride=2),
            PreActResidualBlock(c_out, c_out, stride=1),
        )

    def forward(self, x):
        """Return ``[c2, c3, c4, c5]`` for input batch ``x``."""
        x = self.stem(x)  # 1/4 of the input resolution
        pyramid = []
        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            x = stage(x)  # each stage halves the resolution
            pyramid.append(x)
        return pyramid


class DepthBranch(RGBBranch):
    """Same backbone as ``RGBBranch``, defaulting to a single input channel."""

    def __init__(self, in_ch=1):
        super().__init__(in_ch=in_ch)


class FPN(nn.Module):
    """Feature Pyramid Network top-down pathway over any number of levels.

    Each input level is projected to ``out_channels`` with a 1x1 lateral
    convolution. Starting from the coarsest (last) level, every level receives
    the nearest-neighbour-upsampled map of the next coarser level added to its
    lateral projection. Each merged map is then smoothed by a 3x3 convolution.

    Args:
        in_channels: Channel count of every input level, ordered fine -> coarse.
        out_channels: Channel count of every output level.
    """

    def __init__(self, in_channels=(32, 64, 128, 256), out_channels=128):
        super().__init__()
        in_channels = list(in_channels)  # immutable default; accept any sequence
        self.lateral_convs = nn.ModuleList([
            nn.Conv2d(c, out_channels, kernel_size=1) for c in in_channels
        ])
        self.output_convs = nn.ModuleList([
            nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1) for _ in in_channels
        ])

    def forward(self, features):
        """Return one refined map per input level, in the same (fine -> coarse) order.

        Raises:
            ValueError: If the number of feature maps differs from the number of
                levels the module was built for.
        """
        features = list(features)
        if len(features) != len(self.lateral_convs):
            raise ValueError(
                f"expected {len(self.lateral_convs)} feature maps, got {len(features)}"
            )

        merged = [lateral(f) for lateral, f in zip(self.lateral_convs, features)]

        # Top-down pass: walk from the coarsest level towards the finest and add
        # the (already merged) coarser map, upsampled to the finer resolution.
        for level in range(len(merged) - 1, 0, -1):
            finer = merged[level - 1]
            merged[level - 1] = finer + F.interpolate(
                merged[level], size=finer.shape[2:], mode="nearest"
            )

        return [smooth(p) for smooth, p in zip(self.output_convs, merged)]
    
class CBAM(nn.Module):
    """Channel attention followed by spatial attention.

    Channel attention: global average pool -> 1x1 conv bottleneck -> sigmoid,
    multiplied onto the input. Spatial attention: per-pixel channel mean and
    channel max are concatenated, passed through a 7x7 conv and a sigmoid, and
    multiplied onto the channel-attended map.

    Args:
        channels: Number of channels of the input feature map.
        reduction: Bottleneck reduction factor of the channel-attention branch.
    """

    def __init__(self, channels, reduction=16):
        super().__init__()
        # Keep at least one hidden channel: `channels // reduction` is 0 when
        # channels < reduction, which would create a zero-width bottleneck.
        hidden = max(1, channels // reduction)
        self.channel_att = nn.Sequential(
            nn.AdaptiveAvgPool2d(1),
            nn.Conv2d(channels, hidden, 1),
            nn.ReLU(),
            nn.Conv2d(hidden, channels, 1),
            nn.Sigmoid()
        )
        self.spatial_att = nn.Sequential(
            nn.Conv2d(2, 1, kernel_size=7, padding=3),
            nn.Sigmoid()
        )

    def forward(self, x):
        """Return ``x`` re-weighted by channel and then spatial attention (same shape)."""
        ca = self.channel_att(x)                         # (B, C, 1, 1)
        x = x * ca
        avg_out = torch.mean(x, dim=1, keepdim=True)     # (B, 1, H, W)
        max_out, _ = torch.max(x, dim=1, keepdim=True)   # (B, 1, H, W)
        sa = self.spatial_att(torch.cat([avg_out, max_out], dim=1))
        x = x * sa
        return x


class RGBDNet(nn.Module):
    """Two-stream RGB + depth regressor.

    Each modality runs through its own backbone and FPN; the two pyramids are
    summed level by level, fused into a single map by ``BFPFusion``, globally
    average-pooled and regressed to one scalar per sample by ``MLPHead``.

    Args:
        num_tokens, dim_model, nhead, dropout: Only relevant to the disabled
            ``TransformerHead`` alternative; unused by the active MLP head.
    """

    def __init__(self, num_tokens=16, dim_model=512, nhead=8, dropout=0.1):
        super().__init__()
        fusion_dim = 128
        backbone_channels = [32, 64, 128, 256]

        self.rgb = RGBBranch(3)
        self.depth = DepthBranch(1)

        self.fpn_rgb = FPN(backbone_channels, fusion_dim)
        self.fpn_depth = FPN(backbone_channels, fusion_dim)
        self.bfp = BFPFusion(out_channels=fusion_dim, refine_level=2)

        # Registered but not called in forward() (the call is disabled there).
        self.cbam = CBAM(channels=fusion_dim)
        self.head = MLPHead(in_features=fusion_dim)

        # Disabled alternative head:
        # self.head = TransformerHead(
        #     in_features=fusion_dim,
        #     num_tokens=num_tokens,
        #     dim_model=dim_model,
        #     nhead=nhead,
        #     dropout=dropout
        # )

    def forward(self, rgb, depth):
        """Predict one value per sample from an RGB batch and a depth batch."""
        rgb_feats = self.rgb(rgb)          # [c2, c3, c4, c5]
        depth_feats = self.depth(depth)

        rgb_pyramid = self.fpn_rgb(rgb_feats)        # [p2, p3, p4, p5]
        depth_pyramid = self.fpn_depth(depth_feats)

        # Element-wise sum of the two modalities at every pyramid level.
        summed = []
        for rgb_level, depth_level in zip(rgb_pyramid, depth_pyramid):
            summed.append(rgb_level + depth_level)

        fused_map = self.bfp(summed)
        # fused_map = self.cbam(fused_map)

        pooled = F.adaptive_avg_pool2d(fused_map, 1).flatten(1)   # (B, fusion_dim)
        return self.head(pooled)



# Training

In [None]:
class DualThresholdEarlyStopping:
    """Latch that trips once train and validation loss are both below their thresholds.

    Call the instance with the latest losses after every epoch and read
    ``early_stop``. Once set to True the flag is never cleared.
    """

    def __init__(self, train_threshold=7000.0, val_threshold=7000.0):
        self.early_stop = False
        self.train_threshold = train_threshold
        self.val_threshold = val_threshold

    def __call__(self, train_loss, val_loss):
        """Set ``early_stop`` when both losses are strictly below their thresholds."""
        if not train_loss < self.train_threshold:
            return
        if not val_loss < self.val_threshold:
            return
        self.early_stop = True



# ---- Device selection -------------------------------------------------------
if torch.cuda.is_available():
    print(f"detect {torch.cuda.device_count()} GPU avaliable。")
    device = torch.device('cuda')
else:
    print("No GPU detect, runing on cpu")
    device = torch.device('cpu')

# ---- Model, optimiser, LR schedule, loss ------------------------------------
model = RGBDNet()
model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4, weight_decay=1e-4)
# Halves the learning rate when the validation loss has not improved for 5
# epochs. It only acts if scheduler.step(val_loss) is called once per epoch.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='min', factor=0.5, patience=5, min_lr=1e-6
)
criterion = nn.MSELoss()
early_stopper = DualThresholdEarlyStopping(train_threshold=9000, val_threshold=7500)


def _smape_sum(pred, target, eps=1e-8):
    """Sum over the batch of 2*|pred - target| / (|pred| + |target| + eps)."""
    pred = pred.detach()
    ratio = 2 * torch.abs(pred - target) / (torch.abs(pred) + torch.abs(target) + eps)
    return ratio.sum().item()


# Per-epoch history, consumed by the plotting cell (SMAPE stored as a fraction).
train_losses, val_losses = [], []
train_smapes, val_smapes = [], []

num_epochs = 100

for epoch in range(num_epochs):
    # ---- Training pass ------------------------------------------------------
    model.train()
    running_loss, smape_total, image_count = 0.0, 0.0, 0
    print("Start epoch: " + str(epoch+1))

    for rgb, depth, cal in train_loader:
        rgb, depth, cal = rgb.to(device), depth.to(device), cal.to(device)
        pred = model(rgb, depth)
        loss = criterion(pred, cal)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Weight by batch size so a shorter last batch is not over-represented.
        running_loss += loss.item() * rgb.size(0)
        smape_total += _smape_sum(pred, cal)

        image_count += rgb.size(0)
        print(f"Processed {image_count} images", end='\r')

    train_loss = running_loss / len(train_loader.dataset)
    train_smape = smape_total / image_count
    train_losses.append(train_loss)
    train_smapes.append(train_smape)

    # ---- Validation pass ----------------------------------------------------
    model.eval()
    val_loss, smape_total, val_count = 0.0, 0.0, 0
    with torch.no_grad():
        for rgb, depth, cal in val_loader:
            rgb, depth, cal = rgb.to(device), depth.to(device), cal.to(device)
            pred = model(rgb, depth)
            loss = criterion(pred, cal)
            val_loss += loss.item() * rgb.size(0)
            smape_total += _smape_sum(pred, cal)
            val_count += rgb.size(0)

    val_loss /= len(val_loader.dataset)
    val_smape = smape_total / val_count
    val_losses.append(val_loss)
    val_smapes.append(val_smape)

    # Drive the plateau scheduler; without this call the LR never decays.
    scheduler.step(val_loss)

    torch.cuda.empty_cache()

    print(f"Epoch [{epoch+1}/{num_epochs}] | "
          f"Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, "
          f"Train SMAPE: {train_smape*100:.3f}%, Val SMAPE: {val_smape*100:.3f}%")

    early_stopper(train_loss, val_loss)
    if early_stopper.early_stop:
        print("✅ Early stopping: both train and val loss thresholds met.")
        break




detect 1 GPU avaliable。
Start epoch: 1
Epoch [1/100] | Train Loss: 59971.7185, Val Loss: 24684.7790, Train SMAPE: 102.852%, Val MAPE: 67.438%
Start epoch: 2
Epoch [2/100] | Train Loss: 32091.5080, Val Loss: 30632.5761, Train SMAPE: 70.748%, Val MAPE: 65.278%
Start epoch: 3
Epoch [3/100] | Train Loss: 28321.9925, Val Loss: 25133.7538, Train SMAPE: 64.502%, Val MAPE: 63.107%
Start epoch: 4
Epoch [4/100] | Train Loss: 27105.2035, Val Loss: 20334.5119, Train SMAPE: 64.199%, Val MAPE: 61.326%
Start epoch: 5
Epoch [5/100] | Train Loss: 26305.0285, Val Loss: 16376.5678, Train SMAPE: 63.230%, Val MAPE: 55.928%
Start epoch: 6
Epoch [6/100] | Train Loss: 26680.3582, Val Loss: 16305.8887, Train SMAPE: 62.826%, Val MAPE: 57.591%
Start epoch: 7
Epoch [7/100] | Train Loss: 24558.6700, Val Loss: 15347.6377, Train SMAPE: 63.141%, Val MAPE: 54.883%
Start epoch: 8
Epoch [8/100] | Train Loss: 24765.5336, Val Loss: 15745.6844, Train SMAPE: 62.693%, Val MAPE: 55.303%
Start epoch: 9
Epoch [9/100] | Train Lo

In [None]:
save_path = "checkpoints/FFN_BFN_epoch100_10_25.pth"
# torch.save does not create missing directories; make sure the target folder
# exists so the trained weights are not lost to a path error.
Path(save_path).parent.mkdir(parents=True, exist_ok=True)
torch.save(model.state_dict(), save_path)
print(f"Model weights saved to {save_path}")


## Plot

In [None]:
# Loss and SMAPE history side by side, one point per completed epoch.
fig, (ax_loss, ax_smape) = plt.subplots(1, 2, figsize=(10, 4))

ax_loss.plot(train_losses, label='Train Loss')
ax_loss.plot(val_losses, label='Val Loss')
# The training criterion is nn.MSELoss, so label the curve accordingly.
ax_loss.set(title='MSE Loss Curve', xlabel='Epoch', ylabel='Loss')
ax_loss.legend()

# SMAPE is stored as a fraction; convert to percent for display.
ax_smape.plot([m*100 for m in train_smapes], label='Train SMAPE')
ax_smape.plot([m*100 for m in val_smapes], label='Val SMAPE')
ax_smape.set(title='SMAPE Curve', xlabel='Epoch', ylabel='SMAPE (%)')
ax_smape.legend()

fig.tight_layout()
plt.show()

# Testing

In [None]:
# Run the model over the test set in dataset order (test_loader is not shuffled).
model.eval()
predictions = []
with torch.no_grad():
    for rgb, depth in test_loader:
        rgb, depth = rgb.to(device), depth.to(device)
        pred = model(rgb, depth)
        predictions.extend(pred.cpu().tolist())  # plain Python floats

# Take the IDs from the dataset itself (the dish folder names, in the same
# order the loader iterates) rather than synthesising them from a counter;
# a gap or rename in the test folders would otherwise misalign every later row.
dish_ids = [rgb_path.parent.name for rgb_path, _, _ in test_set.samples]
assert len(dish_ids) == len(predictions), "prediction count does not match test set size"

submission_df = pd.DataFrame({
    "ID": dish_ids,
    # Predictions below 5 are clamped to 0.
    "Value": [0.0 if value < 5 else value for value in predictions],
})



In [None]:
# Write the predictions to CSV without the DataFrame index column.
output_filepath = "submission.csv"
submission_df.to_csv(path_or_buf=output_filepath, index=False)