In [None]:
!pip install datasets -q
print("✅ 'datasets' library installed.")

✅ 'datasets' library installed.


In [None]:
from datasets import load_dataset

# Open the COCO training split with streaming enabled and keep the handle in
# `coco_data` for the processing cell below.
coco_data = load_dataset(
    "detection-datasets/coco",
    split="train",
    streaming=True,
)

print("✅ COCO dataset is ready to stream.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/58.0 [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/40 [00:00<?, ?it/s]

dataset_infos.json: 0.00B [00:00, ?B/s]

✅ COCO dataset is ready to stream.


In [None]:
import os
import time

# Class map: source category id -> new contiguous class id.
# Only the categories listed here are kept; everything else is filtered out.
# Order matters: the position in this tuple becomes the new class id.
_KEPT_CATEGORY_IDS = (
    0,  # person     -> 0
    1,  # bicycle    -> 1
    2,  # car        -> 2
    3,  # motorcycle -> 3
    5,  # bus        -> 4
    7,  # truck      -> 5
)

CLASS_MAP = {old_id: new_id for new_id, old_id in enumerate(_KEPT_CATEGORY_IDS)}



def convert_and_filter_yolo(example, index, class_map, output_dir="yolo_data", bbox_format="xywh"):
    """Write one example as a YOLO image/label pair if it contains wanted classes.

    Objects whose category is not a key of ``class_map`` are dropped. Each kept
    box is clamped to the image, converted to normalised
    ``class x_center y_center width height`` form, and written one per line.
    Boxes that have no area left after clamping are dropped as well.

    Args:
        example: Mapping with an ``'image'`` entry (an object exposing ``.size``
            as ``(width, height)`` and a ``.save(path)`` method) and an
            ``'objects'`` entry holding parallel ``'category'`` and ``'bbox'``
            lists.
        index: Integer used to build the zero-padded output file names.
        class_map: Dict mapping source category ids to the class ids to write.
        output_dir: Root folder; files go to ``<output_dir>/images`` and
            ``<output_dir>/labels`` (both are created if missing).
        bbox_format: Layout of each source box. ``"xywh"`` (default, the
            original behaviour) reads ``[x_min, y_min, width, height]``;
            ``"xyxy"`` reads ``[x_min, y_min, x_max, y_max]``.
            NOTE(review): confirm which layout the streamed dataset really
            uses -- if its boxes are corner-style, callers must pass
            ``bbox_format="xyxy"`` or every label will be wrong.

    Returns:
        True if at least one label was kept and the image and label files were
        written; False if the example was skipped.

    Raises:
        ValueError: If ``bbox_format`` is neither ``"xywh"`` nor ``"xyxy"``.
    """
    if bbox_format not in ("xywh", "xyxy"):
        raise ValueError(f"bbox_format must be 'xywh' or 'xyxy', got {bbox_format!r}")

    output_images_dir = os.path.join(output_dir, "images")
    output_labels_dir = os.path.join(output_dir, "labels")
    os.makedirs(output_images_dir, exist_ok=True)
    os.makedirs(output_labels_dir, exist_ok=True)

    image = example['image']
    image_width, image_height = image.size
    objects = example['objects']

    # Collect one YOLO label line per object we want to keep.
    yolo_labels = []
    for category_id, bbox in zip(objects['category'], objects['bbox']):
        if category_id not in class_map:
            continue

        x_min, y_min, third, fourth = bbox
        if bbox_format == "xywh":
            x_max, y_max = x_min + third, y_min + fourth
        else:
            x_max, y_max = third, fourth

        # Clamp to the image so the normalised values stay inside [0, 1];
        # annotations can extend slightly past the border.
        x_min = min(max(x_min, 0), image_width)
        y_min = min(max(y_min, 0), image_height)
        x_max = min(max(x_max, 0), image_width)
        y_max = min(max(y_max, 0), image_height)

        width = x_max - x_min
        height = y_max - y_min
        if width <= 0 or height <= 0:
            # Nothing left of the box inside the image: not a usable label.
            continue

        x_center = (x_min + width / 2) / image_width
        y_center = (y_min + height / 2) / image_height
        norm_width = width / image_width
        norm_height = height / image_height

        yolo_labels.append(
            f"{class_map[category_id]} {x_center:.6f} {y_center:.6f} "
            f"{norm_width:.6f} {norm_height:.6f}"
        )

    # No relevant objects: skip the example without writing anything.
    if not yolo_labels:
        return False

    stem = f"coco_filtered_{index:05d}"

    # Save the image, then its label file under the same stem.
    image.save(os.path.join(output_images_dir, f"{stem}.jpg"))
    with open(os.path.join(output_labels_dir, f"{stem}.txt"), 'w') as f:
        f.write("\n".join(yolo_labels))

    return True


# --- Configuration ---
NUM_SAMPLES_TO_SAVE = 5000  # stop once this many images have actually been saved
dataset_iterator = iter(coco_data)
start_time = time.time()

processed_count = 0  # images saved so far
image_index = 0      # images pulled from the stream so far (also used in file names)

print("--- Starting to filter and process images... ---")
print(f"--- Looking for classes (Old ID -> New ID): {CLASS_MAP} ---")
print(f"--- Goal: Save {NUM_SAMPLES_TO_SAVE} valid images. ---")

while processed_count < NUM_SAMPLES_TO_SAVE:
    # Only fetching and converting can fail, so only those sit inside the try.
    try:
        example = next(dataset_iterator)
        was_saved = convert_and_filter_yolo(example, image_index, CLASS_MAP)
    except StopIteration:
        # The stream ran dry before the target was reached.
        print(f"\nReached the end of the dataset after checking {image_index} images.")
        break
    except Exception as e:
        # Best effort: report the failure, count the image as checked, move on.
        print(f"An error occurred on image {image_index} (skipping): {e}")
        image_index += 1
        continue

    if was_saved:
        processed_count += 1
        # Progress line every 100 saved images.
        if processed_count % 100 == 0:
            print(f"    ... saved {processed_count} / {NUM_SAMPLES_TO_SAVE} valid images (checked {image_index + 1} total).")

    image_index += 1

end_time = time.time()
total_time = end_time - start_time

print("--- Processing complete! ---")
print(f"Saved {processed_count} valid images in {total_time:.2f} seconds.")
print("Your filtered data is ready in the 'yolo_data' folder.")

--- Starting to filter and process images... ---
--- Looking for classes (Old ID -> New ID): {0: 0, 1: 1, 2: 2, 3: 3, 5: 4, 7: 5} ---
--- Goal: Save 5000 valid images. ---
    ... saved 100 / 5000 valid images (checked 176 total).
    ... saved 200 / 5000 valid images (checked 344 total).
    ... saved 300 / 5000 valid images (checked 524 total).
    ... saved 400 / 5000 valid images (checked 689 total).
    ... saved 500 / 5000 valid images (checked 840 total).
    ... saved 600 / 5000 valid images (checked 995 total).
    ... saved 700 / 5000 valid images (checked 1164 total).
    ... saved 800 / 5000 valid images (checked 1328 total).
    ... saved 900 / 5000 valid images (checked 1493 total).
    ... saved 1000 / 5000 valid images (checked 1661 total).
    ... saved 1100 / 5000 valid images (checked 1845 total).
    ... saved 1200 / 5000 valid images (checked 2013 total).
    ... saved 1300 / 5000 valid images (checked 2187 total).
    ... saved 1400 / 5000 valid images (checked 23

In [None]:
import shutil
from google.colab import drive

# Step 1: make Google Drive reachable under /content/drive.
print("Mounting Google Drive...")
drive.mount('/content/drive')

# Step 2: pack the 'yolo_data' folder into 'filtered_coco_data.zip'
# in the current working directory.
print("Zipping the 'yolo_data' folder...")
shutil.make_archive(base_name='filtered_coco_data', format='zip', root_dir='yolo_data')
print("Zip file created.")

# --- Define your save path ---
# Change this if the archive should land somewhere else on Drive.
drive_save_path = '/content/drive/My Drive/VisionAssist-Dataset/filtered_coco_data.zip'

# Step 3: ensure the destination folder exists, then copy the archive into it.
drive_save_dir = os.path.dirname(drive_save_path)
os.makedirs(drive_save_dir, exist_ok=True)

print(f"Copying zip file to: {drive_save_path}")
shutil.copy('filtered_coco_data.zip', drive_save_path)

print("--- All Done! ---")
print("Your processed data is now safely in Google Drive.")

Mounting Google Drive...
Mounted at /content/drive
Zipping the 'yolo_data' folder...
Zip file created.
Copying zip file to: /content/drive/My Drive/VisionAssist-Dataset/filtered_coco_data.zip
--- All Done! ---
Your processed data is now safely in Google Drive.


In [None]:
%%writefile my_coco_dataset.yaml
# This is a YAML file, not Python.
# Dataset description: image folder locations, class count, and class names.
# Define the paths to your training and validation images
# NOTE(review): the export cell in this notebook writes everything under
# yolo_data/images and yolo_data/labels with no train/valid split -- confirm
# that the two folders below exist relative to where training is launched.
train: ../train/images
val: ../valid/images

# Number of classes (must match the count in your CLASS_MAP)
nc: 6

# Class index -> display name. The indices are the new ids (the values) that
# CLASS_MAP assigns, so the order here must stay in sync with it.
names:
  0: person
  1: bicycle
  2: car
  3: motorcycle
  4: bus
  5: truck

Writing my_coco_dataset.yaml
