In [None]:
# Imports and setup: standard-library pretty-printer used by later cells
import pprint

# Optional: shared PrettyPrinter (indent=2) so nested dicts print cleanly via pp.pprint(...)
pp = pprint.PrettyPrinter(indent=2)

## Load dataset

The package provides a `Dataset` class to load and manage datasets. It supports various ways of creating a dataset and is practical for working with sets of object-detection images, whether for inspection, for preparing training and validation data, or for running predictions. Below is a guide to creating a dataset and to its most useful methods.

### 📁 From folder

In [None]:
# Build a Dataset from a folder of images
import os
from ct_detector.data.dataset import Dataset
from ct_detector.model import DATASETS_DIR

# Folder "1/val/images" under DATASETS_DIR
dataset_root = os.path.join(DATASETS_DIR, "1")
folder_path = os.path.join(dataset_root, "val", "images")

# `dset` is the dataset used by the inspection cells further down
dset = Dataset.from_folder(folder_path)
print(f"Dataset loaded with {dset.size} images.")


### 📄 From .txt file with image paths

In [None]:
# Build a Dataset from a .txt file containing image paths
# (this example reads the val.txt of dataset "1")
import os
from ct_detector.data.dataset import Dataset
from ct_detector.model import DATASETS_DIR

dataset_root = os.path.join(DATASETS_DIR, "1")
txt_path = os.path.join(dataset_root, "val.txt")

dataset_txt = Dataset.from_txt(txt_path)
print(f"Loaded from .txt file: {dataset_txt.size} samples")

### 📄 From .yaml file (YOLO format)

In [None]:
# Build a Dataset from a standard YOLO YAML file (train/val/test keys)
import os
from ct_detector.data.dataset import Dataset
from ct_detector.model import DATASETS_DIR

yaml_path = os.path.join(DATASETS_DIR, "1.yaml")
dataset_yaml = Dataset.from_yaml(yaml_path)

print(f"Loaded from .yaml file: {dataset_yaml.size} samples")
# Names of the splits the loaded dataset reports
split_names = dataset_yaml.dataset_names
print("Dataset splits:", split_names)

### 📂 From list of paths

In [None]:
# Build a Dataset from an explicit list of image paths
# (the list could equally be written by hand or taken from another Dataset)
from pathlib import Path
from ct_detector.data.dataset import Dataset
from ct_detector.model import DATASETS_DIR

val_images_dir = Path(DATASETS_DIR) / "1" / "val" / "images"
# Keep at most the first 10 *.jpg matches; glob results are not sorted,
# so which files are picked may differ between machines.
some_paths = list(val_images_dir.glob("*.jpg"))[:10]

dataset_paths = Dataset.from_paths(some_paths)
print(f"Loaded from list of paths: {dataset_paths.size} samples")

### 🧬 Merge multiple datasets

In [None]:
# Merge datasets: combine the .txt-based and the path-list-based datasets
# created in the cells above into a single Dataset.
datasets_to_merge = [dataset_txt, dataset_paths]

merged = Dataset.from_datasets(datasets_to_merge)
print(f"Merged dataset: {merged.size} samples")

### 🧹 Optional: Filter out by names

In [None]:
# Load the val.txt of dataset "1" again, this time passing an exclusion list.
# Relies on `os`, `Dataset` and `DATASETS_DIR` imported in the cells above.
dataset_root = os.path.join(DATASETS_DIR, "1")

# A .txt file with image filenames to exclude
blacklist_path = os.path.join(dataset_root, "blacklist.txt")
txt_path = os.path.join(dataset_root, "val.txt")

filtered = Dataset.from_txt(txt_path, exclude_names_path=blacklist_path)
print(f"Filtered dataset: {filtered.size} samples")

## Inspect dataset

In [None]:
# Inspect dataset keys: walk the (key, value) pairs and print only the keys
for key, _value in dset.items():
    print(f"Key: {key}")

In [None]:
# Basic inspection: per-split sizes, then counts with / without detections
split_summary = f"Train: {dset.train_size}, Val: {dset.val_size}, Test: {dset.test_size}"
print(split_summary)

detection_summary = f"With detections: {dset.with_detection}, Without detections: {dset.without_detection}"
print(detection_summary)


In [None]:
# Print a sample entry: take the first key the dataset yields and pretty-print its value.
# `key` is reused by the visualization cell below.
key = next(iter(dset.keys()))  # raises StopIteration if the dataset yields no keys
sample_entry = dset[key]
pp.pprint(sample_entry)


In [None]:
# Visualize an image and its boxes.
# `key` is the sample key selected in the previous cell. "RGBA2BGR" is the color
# conversion requested for display — presumably one of the names in
# ct_detector.display.COLOR_CONVERSIONS (listed in the next cell); confirm.
dset.visualize(key, color_conversion="RGBA2BGR")

In [None]:
# If the colors of a visualized image look wrong, try a different color_conversion mode.
# This cell lists every entry of COLOR_CONVERSIONS as "name: code".
from ct_detector.display import COLOR_CONVERSIONS

print("Available color conversions:")
for conversion_name, conversion_code in COLOR_CONVERSIONS.items():
    print(f"{conversion_name}: {conversion_code}")

In [None]:
# Sanity check: sanity_check() returns two collections, reported here as
# missing labels and corrupted images; only their lengths are printed.
missing, corrupted = dset.sanity_check()

missing_count = len(missing)
corrupted_count = len(corrupted)
print(f"Missing labels: {missing_count}\nCorrupted images: {corrupted_count}")


In [None]:
# Class distribution.
# The id -> name mapping below is only an example; adapt it to your dataset.
# `class_names` is reused later when generating the YOLO files.
class_names = {
    0: "elephant",
    1: "human",
    2: "cat",
    3: "dog",
}

dist = dset.class_distribution(class_names)
pp.pprint(dist)


## Filter dataset

In [None]:
# Filter by name, using a single example filename.
# NOTE(review): the size is re-read from `dset` right after the call, which suggests
# filter_by modifies `dset` in place — confirm. If so, every later cell works on the
# filtered dataset.
target_names = ["0002_jpg.rf.1e66a3c788c21cd312d09a6288c36f4d.jpg"]
dset.filter_by("name", target_names)
print(f"After filtering: {dset.size} images")


## Subset dataset

In [None]:
# Subset and random subset: the "val" subset, plus a random subset of the dataset
RANDOM_SUBSET_SIZE = 10  # value passed to get_random_subset; presumably the number of images — confirm

val_subset = dset.get_subset("val")
random_subset = dset.get_random_subset(RANDOM_SUBSET_SIZE)
print(f"Subset sizes: Val = {val_subset.size}, Random = {random_subset.size}")

## Split dataset

In [None]:
# Split dataset randomly.
# Presumably the three values are the train / val / test shares, in that order — confirm
# against Dataset.split_dataset.
split_fractions = (0.7, 0.2, 0.1)
dset.split_dataset(*split_fractions)
print(f"Split sizes: Train = {dset.train_size}, Val = {dset.val_size}, Test = {dset.test_size}")


In [None]:
# Generate YOLO txt and yaml files for the dataset under demo_output/
dset.generate_yolo_files(
    "demo_output",
    classes=class_names,  # id -> name mapping defined in the class-distribution cell
    abs_paths=True,
    write_yaml=True,
)
print("YOLO files generated in demo_output/")


In [None]:
# Reorganize files: place the dataset's files under reorg_output/, grouped by dataset
# split (by="dataset").
# NOTE(review): "copied" is taken from the message printed below — confirm that
# reorganize_files copies rather than moves the original files.
dset.reorganize_files("reorg_output", by="dataset")
print("Files copied by dataset split to reorg_output/")


## Balance dataset

In [None]:
# Balance dataset by class; the result is bound to a new name (d_bal) and its size printed.
# NOTE(review): target_size=5 is presumably the per-class target count — confirm against
# Dataset.balance_by_class.
d_bal = dset.balance_by_class(target_size=5)
print(f"Balanced dataset size: {d_bal.size}")
