# 6D Pose Estimation

## Set up the project

We will work with a portion of the LineMOD dataset, which you can find here: https://drive.google.com/drive/folders/19ivHpaKm9dOrr12fzC8IDFczWRPFxho7

In [None]:
# Step 1: Prepare the download location for the LineMOD dataset.
# `mkdir -p` creates the nested directories and does not fail if they already exist.
!mkdir -p datasets/linemod/
# Move the notebook's working directory there; later cells use relative paths.
%cd datasets/linemod/

Check working directory

In [None]:
# Print the current working directory to confirm the directory change took effect.
!pwd

In [None]:
# Download the DenseFusion Drive folder (which includes a portion of the LineMOD dataset)
!gdown --folder "https://drive.google.com/drive/folders/19ivHpaKm9dOrr12fzC8IDFczWRPFxho7"

In [None]:
# Make sure the DenseFusion/ directory exists (no error if it already does) ...
!mkdir -p DenseFusion/
# ... and make it the working directory for the following cells.
%cd DenseFusion/

In [None]:
# Extract the archive located in the parent directory into the current directory.
!unzip ../Linemod_preprocessed.zip

Install requirements

In [None]:
# Install the Python dependencies listed in requirements.txt, three directories above the working directory.
!pip install -r ../../../requirements.txt

Get working directory

In [None]:
# IPython captures the output of `!pwd` as a list of output lines ...
path = !pwd
# ... so keep only the first line, i.e. the working directory as a string.
path = path[0]

In [None]:
import os
import yaml
import torch
import torchvision
import open3d as o3d
import itertools
import shutil
from torch.utils.data import Dataset
from torch import nn, optim
from PIL import Image
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import numpy as np
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
import matplotlib.patches as patches

# Image size in pixels; used to normalise bounding-box coordinates and to
# convert them back to pixels when plotting.
IMG_WIDTH = 640
IMG_HEIGHT = 480

## Data Exploration

Load an image

In [None]:
# Quick sanity check: open the first RGB frame of object folder 01 and show it.
img_path = f"{path}/Linemod_preprocessed/data/01/rgb/0000.png"
img = Image.open(img_path)
img = img.convert("RGB")
plt.imshow(img)
plt.show()

In [None]:
class CustomDataset(Dataset):
    """LineMOD-style dataset yielding RGB, depth, point cloud and 6D pose per sample.

    Samples are ``(folder_id, sample_id)`` pairs discovered under
    ``<dataset_root>/data/<folder_id>/rgb``. They are partitioned with a
    stratified (per folder) split: ``train_ratio`` of the samples go to
    training and the remainder is halved between validation and test.
    """

    def __init__(self, dataset_root, split='train', train_ratio=0.7, seed=42):
        """
        Args:
            dataset_root (str): Path to the dataset directory.
            split (str): 'train', 'validation' or 'test'. Any value other than
                'train' or 'validation' selects the test split.
            train_ratio (float): Percentage of data used for training (default 70%).
            seed (int): Random seed for reproducibility.

        Raises:
            ValueError: If no samples are found under ``dataset_root``.
        """
        self.dataset_root = dataset_root
        self.split = split
        self.train_ratio = train_ratio
        self.seed = seed

        # Parsed YAML files keyed by path, so each file is read from disk only once
        # instead of once per sample access.
        self._yaml_cache = {}

        # Get list of all samples (folder_id, sample_id)
        self.samples = self.get_all_samples()

        # Check if samples were found
        if not self.samples:
            raise ValueError(f"No samples found in {self.dataset_root}. Check the dataset path and structure.")

        # Split into training and validation+test sets, stratified on the folder id
        labels = [el[0] for el in self.samples]
        self.train_samples, self.val_test_samples = train_test_split(
            self.samples, train_size=self.train_ratio, random_state=self.seed, stratify=labels
        )

        # Split validation+test set (by default 30% of the original dataset) into validation and test sets
        labels = [el[0] for el in self.val_test_samples]
        self.val_samples, self.test_samples = train_test_split(
            self.val_test_samples, train_size=0.5, random_state=self.seed, stratify=labels
        )

        # Select the appropriate split
        if split == "train":
            self.samples = self.train_samples
        elif split == "validation":
            self.samples = self.val_samples
        else:
            self.samples = self.test_samples

        # Define image transformations
        self.transform = transforms.Compose([
            transforms.ToTensor(),
        ])

    def _read_yaml(self, yaml_path):
        """Parse ``yaml_path`` once and return the cached result on later calls.

        The returned object is shared between calls; callers must not mutate it.
        """
        # setdefault keeps this usable on instances created without __init__.
        cache = self.__dict__.setdefault("_yaml_cache", {})
        if yaml_path not in cache:
            with open(yaml_path, 'r') as f:
                # NOTE(review): FullLoader is kept from the original code; only
                # point this at trusted dataset files.
                cache[yaml_path] = yaml.load(f, Loader=yaml.FullLoader)
        return cache[yaml_path]

    def get_samples_id(self):
        """Return the list of ``(folder_id, sample_id)`` pairs of the selected split."""
        return self.samples

    def get_all_samples(self):
        """Retrieve the list of all available sample indices from all folders."""
        samples = []
        for folder_id in range(1, 16):  # Assuming folders are named 01 to 15
            folder_path = os.path.join(self.dataset_root, 'data', f"{folder_id:02d}", "rgb")
            if os.path.exists(folder_path):
                # The numeric file stem of every PNG is the sample id
                sample_ids = sorted(int(f.split('.')[0]) for f in os.listdir(folder_path) if f.endswith('.png'))
                samples.extend((folder_id, sid) for sid in sample_ids)  # Store (folder_id, sample_id)
        return samples

    def load_config(self, folder_id):
        """Load YAML configuration files for camera intrinsics and object info for a specific folder.

        Returns:
            tuple: ``(camera_intrinsics, objects_info)`` as parsed from
            ``data/<folder_id>/info.yml`` and ``models/models_info.yml``.
        """
        camera_intrinsics_path = os.path.join(self.dataset_root, 'data', f"{folder_id:02d}", 'info.yml')
        objects_info_path = os.path.join(self.dataset_root, 'models', "models_info.yml")

        camera_intrinsics = self._read_yaml(camera_intrinsics_path)
        objects_info = self._read_yaml(objects_info_path)

        return camera_intrinsics, objects_info

    # Define here some useful functions to access the data
    def load_image(self, img_path):
        """Load an RGB image and convert to tensor."""
        # The context manager closes the underlying file once the pixels are converted.
        with Image.open(img_path) as raw:
            img = raw.convert("RGB")
        return self.transform(img)

    def load_depth(self, depth_path):
        """Load a depth image and convert to a float32 tensor."""
        with Image.open(depth_path) as raw:
            depth = np.array(raw)
        # Cast in NumPy first: the PNG may decode to uint16, which not every
        # torch version can turn into a tensor directly.
        return torch.from_numpy(depth.astype(np.float32))

    def load_point_cloud(self, depth, intrinsics):
        """Convert depth image to point cloud using Open3D.

        Args:
            depth: 2-D array of depth values (divided by 1000 below to get meters).
            intrinsics: Parsed ``info.yml``; the flat ``cam_K`` entry of key 0 is used.

        Returns:
            o3d.geometry.PointCloud: One 3D point per depth pixel.
        """
        intrinsics = intrinsics[0]['cam_K']  # take intrinsics of the first image
        h, w = depth.shape
        # focal lengths and principal centers
        fx, fy, cx, cy = intrinsics[0], intrinsics[4], intrinsics[2], intrinsics[5]

        # Generate 3D points
        xmap, ymap = np.meshgrid(np.arange(w), np.arange(h))
        z = depth / 1000.0  # Convert to meters
        x = (xmap - cx) * z / fx
        y = (ymap - cy) * z / fy

        points = np.stack((x, y, z), axis=-1).reshape(-1, 3)
        point_cloud = o3d.geometry.PointCloud()
        point_cloud.points = o3d.utility.Vector3dVector(points)

        return point_cloud

    def load_6d_pose(self, folder_id, sample_id):
        """Load the 6D pose (translation and rotation) for the object in this sample.

        Returns:
            tuple: ``(translation, rotation, bbox, obj_id)`` where ``translation``
            has 3 values, ``rotation`` is 3x3, ``bbox`` is
            ``[x_center, y_center, width, height]`` normalized by the image size,
            and ``obj_id`` is the object label.

        Raises:
            KeyError: If ``sample_id`` is not in ``gt.yml`` or none of its poses
                belongs to the object ``folder_id``.
        """
        pose_file = os.path.join(self.dataset_root, 'data', f"{folder_id:02d}", "gt.yml")

        # Ground-truth poses: a dictionary where each key corresponds to a frame with pose info
        pose_data = self._read_yaml(pose_file)

        # We assume sample_id corresponds to the key in pose_data
        if sample_id not in pose_data:
            raise KeyError(f"Sample ID {sample_id} not found in gt.yml for folder {folder_id}.")

        # There can be more than one pose per sample, but take the one of label=folder_id
        pose = next(
            (candidate for candidate in pose_data[sample_id] if int(candidate['obj_id']) == int(folder_id)),
            None,
        )
        if pose is None:
            # Previously this fell through to an UnboundLocalError at the return statement.
            raise KeyError(f"No pose with obj_id {folder_id} found for sample {sample_id} in gt.yml for folder {folder_id}.")

        # Extract translation and rotation
        translation = np.array(pose['cam_t_m2c'], dtype=np.float32)  # [3] ---> (x,y,z)
        rotation = np.array(pose['cam_R_m2c'], dtype=np.float32).reshape(3, 3)  # [3x3] ---> rotation matrix
        # bbox is top left corner and width and height info, YOLO needs center coordinates and width and height
        x_min, y_min, width, height = np.array(pose['obj_bb'], dtype=np.float32)  # [4] ---> x_min, y_min, width, height
        # compute initial center
        x_center = x_min + width/2
        y_center = y_min + height/2

        # move the center when outside image and adjust width and height accordingly
        if x_center < 0:
            width += 2 * x_center  # x_center negative, subtract its absolute value * 2 from width
            x_center = 0
        elif x_center > IMG_WIDTH:
            width -= 2 * (x_center - IMG_WIDTH)
            x_center = IMG_WIDTH

        if y_center < 0:
            height += 2 * y_center  # y_center negative, subtract its absolute value * 2 from height
            y_center = 0
        elif y_center > IMG_HEIGHT:
            height -= 2 * (y_center - IMG_HEIGHT)
            y_center = IMG_HEIGHT

        # ensure width and height are not negative, this happens when bounding box is completely outside image (it should never happen)
        width = max(0, width)
        height = max(0, height)

        # store coordinates of the center and width and height of the bounding box normalized to the
        # image width=640 pixels and height=480 pixels
        bbox = np.array([x_center/IMG_WIDTH, y_center/IMG_HEIGHT, width/IMG_WIDTH, height/IMG_HEIGHT], dtype=np.float32)

        obj_id = np.array(pose['obj_id'], dtype=np.float32)  # [1] ---> label

        return translation, rotation, bbox, obj_id

    def __len__(self):
        """Return the total number of samples in the selected split."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Load the dataset sample at position ``idx`` of the selected split.

        Returns:
            dict: Keys ``rgb``, ``depth``, ``point_cloud``, ``camera_intrinsics``,
            ``objects_info``, ``translation``, ``rotation``, ``bbox`` and ``obj_id``.
        """
        folder_id, sample_id = self.samples[idx]

        # Load the correct camera intrinsics and object info for this folder
        camera_intrinsics, objects_info = self.load_config(folder_id)

        img_path = os.path.join(self.dataset_root, 'data', f"{folder_id:02d}", f"rgb/{sample_id:04d}.png")
        depth_path = os.path.join(self.dataset_root, 'data', f"{folder_id:02d}", f"depth/{sample_id:04d}.png")

        img = self.load_image(img_path)
        depth = self.load_depth(depth_path)
        point_cloud = self.load_point_cloud(depth.numpy(), camera_intrinsics)
        point_cloud = torch.tensor(np.asarray(point_cloud.points), dtype=torch.float32)
        translation, rotation, bbox, obj_id = self.load_6d_pose(folder_id, sample_id)

        # Dictionary with all the data
        return {
            "rgb": img,
            # load_depth already returns a float32 tensor; re-wrapping it with
            # torch.tensor() would only copy it and emit a warning.
            "depth": depth,
            "point_cloud": point_cloud,
            "camera_intrinsics": camera_intrinsics[0]['cam_K'],
            "objects_info": objects_info,
            "translation": torch.tensor(translation),
            "rotation": torch.tensor(rotation),
            "bbox": torch.tensor(bbox),
            "obj_id": torch.tensor(obj_id)
        }

In [None]:
# Build one dataset object per split from the same root, then report the sizes.
dataset_root = "./Linemod_preprocessed"

train_dataset = CustomDataset(dataset_root, split="train")
val_dataset = CustomDataset(dataset_root, split="validation")
test_dataset = CustomDataset(dataset_root, split="test")

for caption, split_dataset in (
    ("Training", train_dataset),
    ("Validation", val_dataset),
    ("Testing", test_dataset),
):
    print(f"{caption} samples: {len(split_dataset)}")

## Data Preprocessing

Structure the data such that
```
datasets/
├── data.yaml
│
├── train/
│   ├── images/
│   │
│   └── labels/
│  
├── val/
│
└── test/
```

In [None]:
# divide the dataset into training, validation and testing set
train_samples = train_dataset.get_samples_id()
validation_samples = val_dataset.get_samples_id()
test_samples = test_dataset.get_samples_id() # test folder is optional for training YOLO

Create a new folder containing everything the object detection model needs: for each sample, just the RGB image and a text file with the class label and bounding box.
The ```Linemod_preprocessed``` folder is not removed, because it contains the translation and rotation information that is needed for pose estimation, even though the object detection model does not use it.

The working directory is the ```DenseFusion``` folder.

In [None]:
# create a folder to contain the dataset for YOLO model
os.makedirs("../YOLO/datasets", exist_ok=True)

# count number of distinct classes
number_classes = 0
class_names = []
for el in os.scandir("./Linemod_preprocessed/data"):
    # if entry is a directory and its name is an integer value (this is just to avoid counting non directories or other directories)
    if (el.is_dir() and el.name.isdigit()):
        class_names.append(el.name)
        number_classes += 1

# get string of all class names
class_names.sort() # sort the names
names = "["
for index, el in enumerate(class_names):
    # if last element don't add comma
    if index == number_classes-1:
        names += f"'{str(el)}'"
    else:
        names += f"'{str(el)}',"
names += "]"

# create data.yaml (as class names use ids of the folder)
content = f"""train: ./train/images
val: ./val/images
test: ./test/images

nc: {number_classes}
names: {names}"""
# write to file
with open("../YOLO/datasets/data.yaml", "w") as fout:
    fout.write(content)
fout.close()

While creating the folder structure, each class id has to be replaced by its index in the `names` list written to ```data.yaml```.

In [None]:
# create a dictionary to have easily access to the index
index_dict = dict()
for index, el in enumerate(class_names):
    index_dict[int(el)] = index

Create the folders. Note that an image may contain multiple objects: for instance, in ```data/02/gt.yml``` a single image has several annotated objects. Only the object whose class matches the folder is kept.

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm
import os

# Export every split as a YOLO dataset: one copied image plus one label file
# per sample, while counting how often each class appears in each split.
folder_names = ["train", "val", "test"]
split_samples = [train_samples, validation_samples, test_samples]

classes = range(0, number_classes)
counter_df = pd.DataFrame()
for folder_name, dataset in zip(folder_names, split_samples):
    print(f"------------------------------{folder_name.upper()}------------------------------")
    images_dir = f"../YOLO/datasets/{folder_name}/images"
    labels_dir = f"../YOLO/datasets/{folder_name}/labels"
    os.makedirs(images_dir, exist_ok=True)
    os.makedirs(labels_dir, exist_ok=True)

    classCount = {label_object: 0 for label_object in index_dict}  # initialize dictionary for counting
    total = 0  # used to normalize count
    for folder_id, sample_id in tqdm(dataset, desc="Moving..."):
        # load_6d_pose only depends on the ids passed in, so the training
        # dataset object can serve the samples of every split.
        _, _, bbox, obj_id = train_dataset.load_6d_pose(folder_id, sample_id)
        class_id = int(obj_id)

        # Prefix the sample id with the folder id: sample ids repeat across
        # folders and would otherwise overwrite each other.
        stem = f"{folder_id:02d}_{sample_id:04d}"
        shutil.copy(f"./Linemod_preprocessed/data/{folder_id:02d}/rgb/{sample_id:04d}.png", f"{images_dir}/{stem}.png")

        # Label file with the same name as the image:
        # "<class index> <x_center> <y_center> <width> <height>" (normalized bbox).
        # The context manager closes the file, so no explicit close() is needed.
        with open(f"{labels_dir}/{stem}.txt", "w") as fout:
            fout.write(f"{index_dict[class_id]} {bbox[0]} {bbox[1]} {bbox[2]} {bbox[3]}\n")

        classCount[class_id] += 1
        total += 1

    # store the relative class frequencies of this split in the dataframe
    values = pd.array(list(classCount.values()))/total
    counter_df[folder_name] = values.copy()

In [None]:
# plot distribution of labels in training, validation and test set
fig, axes = plt.subplots(1,3,figsize=(15,6),sharey=True)
for index, column in enumerate(counter_df.columns):
    axes[index].barh([str(el) for el in index_dict.keys()], counter_df[column],color="orange", edgecolor='gray')
    axes[index].set_title(column.capitalize())
    # add line that represents the uniform distribution of the labels
    axes[index].axvline(x=1/number_classes, color="blue")
    axes[index].text(x=1/number_classes,y=-0.5,s=f"{1/number_classes: .5f}", color="blue")

fig.supxlabel("Frequency")
fig.supylabel("Labels")
plt.subplots_adjust(left=0.07, wspace=0.1)
plt.suptitle("Labels Distribution over the Training, Validation and Test sets")
plt.savefig("../../../images/YOLO_dataset_distribution.png")
plt.show()

### Visualize data

Visualize depth image

In [None]:
# Display the first depth frame of object folder 02.
img_path = "./Linemod_preprocessed/data/02/depth/0000.png"
img = Image.open(img_path)
depth_fig, depth_ax = plt.subplots()
depth_ax.imshow(img)
plt.show()

In [None]:
import matplotlib.pyplot as plt
import matplotlib.patches as patches
# Plot image with bounding box

# Load the frame explicitly instead of relying on the global `img` left behind
# by another cell (a later cell rebinds `img` to an RGB batch element).
background = Image.open("./Linemod_preprocessed/data/02/depth/0000.png")

# Load the ground truth poses from the gt.yml file
with open("./Linemod_preprocessed/data/02/gt.yml", 'r') as f:
    pose_data = yaml.load(f, Loader=yaml.FullLoader)
pose = pose_data[0][1] # access image 0 (start counting from 0) and get second object in that image (in case of multiple objects)

# obj_bb is [x_min, y_min, width, height] in pixels
bbox = np.array(pose['obj_bb'], dtype=np.float32) #[4]
obj_id = np.array(pose['obj_id'], dtype=np.float32) #[1]

fig, ax = plt.subplots()
ax.imshow(background)

# Create a rectangle patch
rect = patches.Rectangle(
    (bbox[0], bbox[1]),  # (x, y)
    bbox[2],             # width
    bbox[3],             # height
    linewidth=2,
    edgecolor='red',
    facecolor='none'
)

# Add the rectangle to the plot
ax.add_patch(rect)

# Optionally add object ID label (write a bit above the top left corner)
ax.text(bbox[0], bbox[1] - 10, f'ID: {int(obj_id)}', color='yellow', fontsize=12, backgroundcolor='black')

plt.axis('off')
plt.show()

In [None]:
# Wrap each split in a DataLoader; only the training loader shuffles its samples.
loader_batch = 4
train_loader = DataLoader(train_dataset, batch_size=loader_batch, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=loader_batch, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=loader_batch, shuffle=False)

# len() of a loader is its number of batches.
for caption, loader in (
    ("Training", train_loader),
    ("Validation", val_loader),
    ("Test", test_loader),
):
    print(f"{caption} loader: {len(loader)}")

In [None]:
import itertools

# Number of leading batches to materialise from each loader (one each).
train_subset_num_batches = val_subset_num_batches = test_subset_num_batches = 1

# islice stops after the requested number of batches, so the rest of each
# loader is never iterated.
train_subset = [*itertools.islice(train_loader, train_subset_num_batches)]
val_subset = [*itertools.islice(val_loader, val_subset_num_batches)]
test_subset = [*itertools.islice(test_loader, test_subset_num_batches)]

In [None]:
import matplotlib.pyplot as plt
import matplotlib.patches as patches

# Get one batch from the train loader (4 images)
batch = next(iter(train_loader)) # it uses load_6d_pose, so one pose per object

# Extract relevant data
rgb_images = batch["rgb"]         # (B, 3, H, W)
bboxes = batch["bbox"]            # (B, 4) normalized: x_center, y_center, width, height
obj_ids = batch["obj_id"]         # (B,)

# Convert to numpy and rearrange channels
rgb_images = rgb_images.permute(0, 2, 3, 1).numpy()  # (B, H, W, 3)
bboxes = bboxes.numpy()
obj_ids = obj_ids.numpy()

# Plot settings
batch_size = rgb_images.shape[0]
cols = min(4, batch_size)
rows = (batch_size + cols - 1) // cols

# squeeze=False always returns a 2-D array of Axes; without it a 1x1 grid
# yields a bare Axes object and the flatten() below would fail.
fig, axes = plt.subplots(rows, cols, figsize=(12, 3 * rows), squeeze=False)
axes = axes.flatten()

for i in range(batch_size):
    ax = axes[i]
    img = rgb_images[i]
    # each element is [x_center/IMG_WIDTH, y_center/IMG_HEIGHT, width/IMG_WIDTH, height/IMG_HEIGHT]
    x_center, y_center, width, height = bboxes[i]
    # remove normalization
    x_center = x_center*IMG_WIDTH
    y_center = y_center*IMG_HEIGHT
    width = width*IMG_WIDTH
    height = height*IMG_HEIGHT
    # Rectangle expects the top-left corner rather than the center
    x_min = x_center-(width/2)
    y_min = y_center-(height/2)
    obj_id = obj_ids[i]

    ax.imshow(img)
    ax.axis('off')
    ax.set_title(f"Sample {i}")

    # Draw bounding box
    rect = patches.Rectangle(
        (x_min, y_min),   # (x_min, y_min)
        width,              # width
        height,              # height
        linewidth=2,
        edgecolor='red',
        facecolor='none'
    )
    ax.add_patch(rect)

    # Add object ID as label
    ax.text(
        x_min,
        y_min - 10,
        f'ID: {int(obj_id)}',
        color='yellow',
        fontsize=10,
        backgroundcolor='black'
    )

# Hide unused axes if batch_size < cols * rows
for j in range(batch_size, len(axes)):
    axes[j].axis('off')

plt.tight_layout()
plt.show()

## Training Object Detection model

Check if CUDA available, otherwise try with MPS and then CPU

In [None]:
# Pick the compute backend in order of preference: CUDA, then MPS, then CPU.
if torch.cuda.is_available():
    backend_name, backend_message = "cuda", "Cuda"
elif torch.backends.mps.is_available():
    backend_name, backend_message = "mps", "Cuda not available, use mps"
else:
    backend_name, backend_message = "cpu", "Use CPU"

print(backend_message)
device = torch.device(backend_name)

In [None]:
# Switch the working directory to the sibling YOLO/ folder, where the
# datasets/ directory for the detector was created.
%cd ../YOLO/

In [None]:
# Re-capture the working directory after the `%cd`: `!pwd` yields a list of
# output lines, of which the first is the directory path.
path = !pwd
path = path[0]

In [None]:
from ultralytics import YOLO

# Start from the pretrained checkpoint and fine-tune it on the exported dataset.
model_path = "../../../checkpoints/yolo11n.pt"
model = YOLO(model_path)
epochs = 20
batch_size = 64
IMG_SIZE = 640

# Augmentation settings, forwarded unchanged to model.train().
augmentation_args = {
    "augment": True,
    "flipud": 0.5,
    "fliplr": 0.5,
    "hsv_h": 0.4,
    "hsv_s": 0.4,
    "hsv_v": 0.4,
    "degrees": 120,
    "translate": 0.1,
    "scale": 0.5,
    "shear": 20,
    "perspective": 0.0001,
}

# model will automatically scale the image and related bounding box according to imgsz
results = model.train(
    data=f"{path}/datasets/data.yaml",
    epochs=epochs,
    batch=batch_size,
    device=device,
    imgsz=IMG_SIZE,
    **augmentation_args,
)

Copy model file to ```checkpoints```

In [None]:
# Ultralytics writes each run to a new directory (train, train2, ...), so ask
# the trainer of the model that was just trained where its best weights are.
# Fall back to the default first-run location when no trainer is attached
# (for example when `model` was re-created from a checkpoint).
trainer = getattr(model, "trainer", None)
best_weights = trainer.best if trainer is not None else "./runs/detect/train/weights/best.pt"
shutil.copy(best_weights, "../../../checkpoints/best.pt")

Validate model

In [None]:
# Reload the best checkpoint and evaluate it with the same data configuration.
model_path = "../../../checkpoints/best.pt"
model = YOLO(model_path)

validation_args = {
    "data": f"{path}/datasets/data.yaml",
    "epochs": epochs,
    "batch": batch_size,
    "imgsz": IMG_SIZE,
    "device": device,
}
results = model.val(**validation_args)

Test model

In [None]:
import random

# Run the detector on the exported test images in random order and display
# each prediction.
pathTest = f"{path}/datasets/test/images"
filenames = os.listdir(pathTest)
random.shuffle(filenames)

for filename in filenames:
    # Use a dedicated name: rebinding the global `path` here would break the
    # data.yaml paths of the training/validation cells and make this cell
    # fail when it is run a second time.
    image_path = os.path.join(pathTest, filename)
    results = model.predict(image_path)
    for result in results:
        result.show()