In [6]:
import os
import re
import json
import numpy as np
import cv2
import matplotlib.pyplot as plt

path_to = "../.."

# Set your local paths to each folder
ipd_val_dir = f"{path_to}/ipd_val/val"       # Validation images and labels
ipd_base_dir = f"{path_to}/ipd_base"     # Base archive: camera parameters, dataset_info, etc.
ipd_models_dir = f"{path_to}/ipd_models" # 3D object models (PLY files)

# Utility to extract image ID from filename (e.g., "000123.png" -> "123" -> "123")
def get_image_id(filename):
    numeric_str = re.sub(r'\D', '', filename)
    return str(int(numeric_str)) if numeric_str else None

def parse_scene_jsons(scene_path):
    """
    Parse the JSON files in a single scene folder.
    Returns dictionaries for:
      - Camera parameters: cam_params_dict[cam_idx][img_id]
      - Ground-truth poses: gt_dict[cam_idx][img_id]
      - Ground-truth info (bounding boxes): gt_info_dict[cam_idx][img_id]
    """
    # Initialize dictionaries for cameras 0..9 (adjust if needed)
    cam_params_dict = {i: {} for i in range(10)}
    gt_dict         = {i: {} for i in range(10)}
    gt_info_dict    = {i: {} for i in range(10)}
    
    json_files = sorted([f for f in os.listdir(scene_path) if f.endswith(".json")])
    for json_file in json_files:
        file_path = os.path.join(scene_path, json_file)
        with open(file_path, 'r') as f:
            data = json.load(f)
        # Find camera index in filename, e.g. "scene_camera_cam1.json"
        match = re.search(r'_cam(\d+)', json_file)
        if not match:
            continue
        cam_idx = int(match.group(1))
        
        if "scene_camera_cam" in json_file:
            for img_id_str, cam_params in data.items():
                K = np.array(cam_params["cam_K"]).reshape(3, 3)
                R = np.array(cam_params["cam_R_w2c"]).reshape(3, 3)
                t = np.array(cam_params["cam_t_w2c"])
                depth_scale = cam_params["depth_scale"]
                cam_params_dict[cam_idx][img_id_str] = {
                    "cam_K": K,
                    "cam_R_w2c": R,
                    "cam_t_w2c": t,
                    "depth_scale": depth_scale
                }
        elif "scene_gt_cam" in json_file:
            for img_id_str, annotations in data.items():
                pose_list = []
                for ann in annotations:
                    R_m2c = np.array(ann["cam_R_m2c"]).reshape(3, 3)
                    t_m2c = np.array(ann["cam_t_m2c"])
                    obj_id = ann["obj_id"]
                    pose_list.append({
                        "cam_R_m2c": R_m2c,
                        "cam_t_m2c": t_m2c,
                        "obj_id": obj_id
                    })
                gt_dict[cam_idx][img_id_str] = pose_list
        elif "scene_gt_info_cam" in json_file:
            for img_id_str, info_list in data.items():
                gt_info_dict[cam_idx][img_id_str] = info_list
    
    return {
        "cam_params": cam_params_dict,
        "gt": gt_dict,
        "gt_info": gt_info_dict
    }

def parse_all_scenes(val_path):
    """
    Iterate over all scene folders in val_path (e.g. "000000", "000001", ...),
    parse their JSON files, and return a dict keyed by scene ID.
    """
    scene_dict = {}
    scene_folders = sorted([d for d in os.listdir(val_path) if d.isdigit()])
    for scene_id in scene_folders:
        scene_folder = os.path.join(val_path, scene_id)
        if not os.path.isdir(scene_folder):
            continue
        print(f"Parsing scene {scene_id} ...")
        scene_data = parse_scene_jsons(scene_folder)
        scene_dict[scene_id] = scene_data
    return scene_dict

# Parse all scenes from the validation directory (assumed to contain scene folders)
all_scenes = parse_all_scenes(f"{ipd_val_dir}")
print("Parsed scenes:", list(all_scenes.keys()))


Parsing scene 000000 ...
Parsing scene 000001 ...
Parsing scene 000002 ...
Parsing scene 000003 ...
Parsing scene 000004 ...
Parsing scene 000005 ...
Parsing scene 000006 ...
Parsing scene 000007 ...
Parsing scene 000008 ...
Parsing scene 000009 ...
Parsing scene 000010 ...
Parsing scene 000011 ...
Parsing scene 000012 ...
Parsing scene 000013 ...
Parsing scene 000014 ...
Parsed scenes: ['000000', '000001', '000002', '000003', '000004', '000005', '000006', '000007', '000008', '000009', '000010', '000011', '000012', '000013', '000014']


In [7]:

import torch
from torch.utils.data import Dataset, DataLoader

class DualModalDataset(Dataset):
    def __init__(self, scene_dict, scene_id, cam_idx, val_folder,
                 rgb_folder_template="rgb_cam{}", depth_folder_template="depth_cam{}",
                 transform=None):
        """
        Args:
            scene_dict (dict): Parsed scenes dictionary.
            scene_id (str): Scene ID (e.g., "000000").
            cam_idx (int): Camera index to use (here, we use 1).
            rgb_folder_template (str): Template for the RGB folder name.
            depth_folder_template (str): Template for the depth folder name.
            transform (callable, optional): Optional transform to apply on images.
        """
        self.scene_data = scene_dict[scene_id]
        self.cam_idx = cam_idx
        self.rgb_folder = os.path.join(val_folder, scene_id, rgb_folder_template.format(cam_idx))
        self.depth_folder = os.path.join(val_folder, scene_id, depth_folder_template.format(cam_idx))
        self.image_files = sorted([f for f in os.listdir(self.rgb_folder) if f.endswith((".png", ".jpg"))])
        self.transform = transform
        
    def __len__(self):
        return len(self.image_files)
    
    def __getitem__(self, idx):
        # Load RGB image
        rgb_file = self.image_files[idx]
        rgb_path = os.path.join(self.rgb_folder, rgb_file)
        rgb_img = cv2.imread(rgb_path)
        if rgb_img is None:
            raise ValueError(f"Failed to load RGB image: {rgb_path}")
        rgb_img = cv2.cvtColor(rgb_img, cv2.COLOR_BGR2RGB)
        
        # Load corresponding depth image (assumes same filename in depth folder)
        depth_path = os.path.join(self.depth_folder, rgb_file)
        depth_img = cv2.imread(depth_path, cv2.IMREAD_UNCHANGED)  # depth may be single-channel
        if depth_img is None:
            raise ValueError(f"Failed to load Depth image: {depth_path}")
        # If depth image has more than one channel, take the first channel
        if len(depth_img.shape) == 3:
            depth_img = depth_img[:, :, 0]
        
        # Normalize images:
        rgb_img = rgb_img.astype(np.float32) / 255.0  # RGB in [0,1]
        depth_img = depth_img.astype(np.float32)
        # Optionally, scale depth image (using a depth scale from camera parameters could be used here)
        depth_img = depth_img / np.max(depth_img) if np.max(depth_img) > 0 else depth_img
        
        # Create a dummy segmentation mask from bounding boxes
        # Initialize mask with zeros (binary segmentation)
        mask = np.zeros((rgb_img.shape[0], rgb_img.shape[1]), dtype=np.uint8)
        img_id = str(int(get_image_id(rgb_file)))
        gt_info = self.scene_data["gt_info"].get(self.cam_idx, {}).get(img_id, [])
        for obj in gt_info:
            if "bbox_visib" in obj:
                x, y, w, h = obj["bbox_visib"]
            elif "bbox_obj" in obj:
                x, y, w, h = obj["bbox_obj"]
            else:
                continue
            x, y, w, h = map(int, [x, y, w, h])
            cv2.rectangle(mask, (x, y), (x+w, y+h), 1, -1)  # fill rectangle with 1
        
        # Convert images and mask to torch tensors
        rgb_tensor = torch.from_numpy(rgb_img).permute(2, 0, 1)  # (3, H, W)
        depth_tensor = torch.from_numpy(depth_img).unsqueeze(0)    # (1, H, W)
        mask_tensor = torch.from_numpy(mask).long()                # (H, W) with class labels {0,1}
        
        if self.transform:
            rgb_tensor = self.transform(rgb_tensor)
            depth_tensor = self.transform(depth_tensor)
        
        # Return a tuple: (rgb, depth, segmentation mask)
        return rgb_tensor, depth_tensor, mask_tensor



In [None]:

import torch.nn as nn
import torch.nn.functional as F

class FuseNetDual(nn.Module):
    def __init__(self, num_classes=2):
        super(FuseNetDual, self).__init__()
        # RGB encoder branch (input: 3 channels)
        self.rgb_conv1 = nn.Sequential(
            nn.Conv2d(3, 16, kernel_size=3, stride=2, padding=1),
            nn.BatchNorm2d(16),
            nn.ReLU(inplace=True)
        )
        self.rgb_conv2 = nn.Sequential(
            nn.Conv2d(16, 32, kernel_size=3, stride=2, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(inplace=True)
        )
        # Depth encoder branch (input: 1 channel)
        self.depth_conv1 = nn.Sequential(
            nn.Conv2d(1, 8, kernel_size=3, stride=2, padding=1),
            nn.BatchNorm2d(8),
            nn.ReLU(inplace=True)
        )
        self.depth_conv2 = nn.Sequential(
            nn.Conv2d(8, 16, kernel_size=3, stride=2, padding=1),
            nn.BatchNorm2d(16),
            nn.ReLU(inplace=True)
        )
        # Fusion: concatenate features from both branches
        # After encoder, assume RGB branch gives 32 channels and depth branch gives 16 channels
        self.fusion_conv = nn.Sequential(
            nn.Conv2d(32+16, 64, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True)
        )
        # Decoder: simple upsampling to original resolution
        self.decoder = nn.Sequential(
            nn.ConvTranspose2d(64, 32, kernel_size=3, stride=2, padding=1, output_padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(inplace=True),
            nn.ConvTranspose2d(32, 16, kernel_size=3, stride=2, padding=1, output_padding=1),
            nn.BatchNorm2d(16),
            nn.ReLU(inplace=True),
            nn.ConvTranspose2d(16, num_classes, kernel_size=3, stride=2, padding=1, output_padding=1)
        )
        
    def forward(self, rgb, depth):
        # Encode RGB
        x_rgb = self.rgb_conv1(rgb)    # (B, 16, H/2, W/2)
        x_rgb = self.rgb_conv2(x_rgb)   # (B, 32, H/4, W/4)
        # Encode depth
        x_depth = self.depth_conv1(depth)  # (B, 8, H/2, W/2)
        x_depth = self.depth_conv2(x_depth)  # (B, 16, H/4, W/4)
        # Fuse: concatenate along channel dimension
        x = torch.cat([x_rgb, x_depth], dim=1)  # (B, 48, H/4, W/4)
        x = self.fusion_conv(x)                 # (B, 64, H/4, W/4)
        # Decode back to segmentation map (assume num_classes channels)
        x = self.decoder(x)  # (B, num_classes, H, W)
        return x

In [9]:
import torch.optim as optim

# Configuration dictionary for hyperparameters and training
config = {
    "num_classes": 2,
    "learning_rate": 1e-3,
    "num_epochs": 5,
    "batch_size": 4,
    "device": "cuda" if torch.cuda.is_available() else "cpu"
}

# Initialize the model and move it to the device
model = FuseNetDual(num_classes=config["num_classes"]).to(config["device"])
print("Model initialized on device:", config["device"])

# Loss: CrossEntropyLoss for segmentation (target shape: (B, H, W) with class labels)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=config["learning_rate"])

Model initialized on device: cpu


In [None]:
# Training Loop on the Validation Dataset
def train_model(model, dataloader, criterion, optimizer, num_epochs, device):
    model.train()
    epoch_losses = []
    for epoch in range(num_epochs):
        running_loss = 0.0
        for i, (rgb, depth, target) in enumerate(dataloader):
            rgb = rgb.to(device)
            depth = depth.to(device)
            # For CrossEntropyLoss, target should be of shape (B, H, W) and of type long.
            target = target.to(device)
            
            optimizer.zero_grad()
            outputs = model(rgb, depth)  # outputs: (B, num_classes, H, W)
            # CrossEntropyLoss expects raw logits and target of shape (B, H, W)
            loss = criterion(outputs, target)
            loss.backward()
            optimizer.step()
            
            running_loss += loss.item() * rgb.size(0)
        
        avg_loss = running_loss / len(dataloader.dataset)
        epoch_losses.append(avg_loss)
        print(f"Epoch {epoch+1}/{num_epochs} - Average Loss: {avg_loss:.4f}")
    return epoch_losses

scene_id = "000000"
cam_idx = 1

# Create dataset and dataloader for dual-modal data
dataset = DualModalDataset(all_scenes, scene_id, cam_idx, ipd_val_dir,
                           rgb_folder_template="rgbloss_cam{}", depth_folder_template="depth_cam{}")
dataloader = DataLoader(dataset, batch_size=config["batch_size"], shuffle=True, num_workers=0)

# Train the model
epoch_losses = train_model(model, dataloader, criterion, optimizer, config["num_epochs"], config["device"])


RuntimeError: size mismatch (got input: [4, 2, 4320, 7680] , target: [4, 2160, 3840]

In [None]:
plt.figure(figsize=(8, 5))
plt.plot(range(1, config["num_epochs"]+1), epoch_losses, marker="o", linestyle="--")
plt.xlabel("Epoch")
plt.ylabel("Average Loss")
plt.title("Averaged Training Loss Across Epochs")
plt.grid(True)
plt.show()
