# Transform dataset

Transform COCO dataset to include only selected classes.

Filters annotations down to the target (furniture) classes and maps their COCO class labels to our own class indices (as defined in the dataset .yaml file).

In [20]:
import time
from tqdm import tqdm
from pathlib import Path

In [21]:
# Dataset location; the labels and images sub-directories live directly under it
DATASET_ROOT_DIR: str = "../datasets/coco"

# Name of the sub-directory holding the annotation (.txt) files
LABELS_DIR: str = "labels"

# Name of the sub-directory holding the image (.jpg) files
IMAGES_DIR: str = "images"

# Source class ids to keep. List order matters: the position of an id in this
# list is the class index it gets remapped to.
# NOTE(review): presumably zero-based COCO class ids for furniture-like
# objects -- confirm the ids against the dataset .yaml file.
COCO_FURNITURE_CLASSES: list = [13, 56, 57, 58, 59, 60, 61, 71, 72, 75]

# Source class id -> target class index, both stored as strings so they can be
# matched against / written into label files without conversion.
# Built from the list above so the two definitions cannot drift apart.
COCO_AEKI_CLASS_MAP: dict = {
    str(coco_id): str(new_id)
    for new_id, coco_id in enumerate(COCO_FURNITURE_CLASSES)
}

# Filter classes

In [23]:
# WARNING: this cell is destructive and NOT idempotent.
# It deletes images/labels from disk and rewrites the kept label files in place.
# After one run the kept labels carry the remapped class indices, so running the
# cell again would find none of the original COCO ids and delete everything that
# is left. Always run it on a fresh copy of the dataset.
start = time.perf_counter()

labels_root = Path(DATASET_ROOT_DIR) / LABELS_DIR
images_root = Path(DATASET_ROOT_DIR) / IMAGES_DIR

# Set membership is O(1); the list is only needed for its ordering elsewhere
furniture_classes = set(COCO_FURNITURE_CLASSES)

# Label files that contain at least one furniture object, and their images
furniture_labels = []
furniture_images = []

# Read labels and get only those for furniture objects
labels_paths = list(labels_root.rglob("*.txt"))

print("Reading labels...")
for label in tqdm(labels_paths):
    with open(label, 'r') as f:
        for line in f:
            fields = line.split()
            # Skip blank lines instead of crashing on int("")
            if not fields:
                continue
            if int(fields[0]) in furniture_classes:
                furniture_labels.append(label)
                # Mirror the label's relative location under the images root.
                # Done on path components (not by string replacement) so a
                # "labels" substring elsewhere in the path is left untouched.
                furniture_images.append(
                    images_root / label.relative_to(labels_root).with_suffix(".jpg")
                )
                # One furniture object is enough to keep the file
                break

# Remove images that have no furniture object
images_paths = list(images_root.rglob("*.jpg"))

print("Removing images...")
for img in tqdm(list(set(images_paths) - set(furniture_images))):
    img.unlink()

# Remove labels that have no furniture object
print("Removing labels...")
for label in tqdm(list(set(labels_paths) - set(furniture_labels))):
    label.unlink()

# Remove lines with classes not in list and rename class name
print("Removing classes from labels and mapping classes...")
for label in tqdm(furniture_labels):
    with open(label, 'r+') as f:
        lines = f.readlines()
        # Set file handle at the beginning of the file to overwrite in place
        f.seek(0)
        for line in lines:
            fields = line.split(maxsplit=1)
            # Drop blank lines
            if not fields:
                continue
            bbox_class = fields[0]
            class_id = int(bbox_class)
            if class_id in furniture_classes:
                # Replace only the FIRST occurrence, i.e. the leading class
                # token. An unbounded replace would also rewrite coordinates
                # that happen to end in the same digits (e.g. "0.75 " for
                # class 75) and silently corrupt the bounding box.
                f.write(line.replace(bbox_class, COCO_AEKI_CLASS_MAP[str(class_id)], 1))
        # Truncate file to current stream position to drop leftover old content
        f.truncate()

end = time.perf_counter()

print(f"Done! Finished in {end - start} seconds.")


Reading labels...


100%|██████████| 122218/122218 [00:07<00:00, 16162.80it/s]


Removing images...


100%|██████████| 83938/83938 [00:13<00:00, 6352.93it/s]


Removing files...


100%|██████████| 82869/82869 [00:13<00:00, 6330.56it/s]


Removing classes from labels and renaming classes...


100%|██████████| 39349/39349 [00:06<00:00, 6332.01it/s]

Done! Finished in 41.22528312500799 seconds.



