In [1]:
import os
import requests
from tqdm import tqdm
import zipfile

# Download locations, keyed by the artifact each archive provides.
# Both panoptic annotation files ship inside the same zip, so the two keys
# share one URL. The instance-annotation and image archives are disabled:
#     "instances_train2017.json": "http://images.cocodataset.org/annotations/annotations_trainval2017.zip",
#     "instances_val2017.json": "http://images.cocodataset.org/annotations/annotations_trainval2017.zip",
#     "train2017": "http://images.cocodataset.org/zips/train2017.zip",
#     "val2017": "http://images.cocodataset.org/zips/val2017.zip",
urls = dict.fromkeys(
    ("panoptic_train2017.json", "panoptic_val2017.json"),
    "http://images.cocodataset.org/annotations/panoptic_annotations_trainval2017.zip",
)

# Root folder that receives the downloaded archives and extracted files;
# created up front so later steps can write into it unconditionally.
output_dir = "./COCO"
os.makedirs(output_dir, exist_ok=True)

def download_file(url, output_path, chunk_size=1024 * 1024, timeout=30):
    """Download ``url`` to ``output_path`` while showing a tqdm progress bar.

    The body is streamed into ``<output_path>.part`` and only renamed to
    ``output_path`` once the transfer finished, so an interrupted download
    never leaves a truncated file under the final name.

    Args:
        url: Address of the file to fetch.
        output_path: Destination path of the completed download.
        chunk_size: Number of bytes requested per streamed chunk.
        timeout: Seconds to wait for the connection and for each read before
            ``requests`` gives up.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeouts.
        OSError: If the destination cannot be written.
    """
    partial_path = f"{output_path}.part"
    with requests.get(url, stream=True, timeout=timeout) as response:
        # Without this an error page would be saved as if it were the archive.
        response.raise_for_status()
        total_size = int(response.headers.get("content-length", 0))
        try:
            with open(partial_path, "wb") as file, tqdm(
                desc=f"Downloading {os.path.basename(output_path)}",
                # An unknown size is passed as None so tqdm shows a plain counter.
                total=total_size or None,
                unit="B",
                unit_scale=True,
                unit_divisor=1024,
            ) as bar:
                for data in response.iter_content(chunk_size=chunk_size):
                    if not data:  # skip keep-alive chunks
                        continue
                    file.write(data)
                    bar.update(len(data))
        except BaseException:
            # Also covers a kernel interrupt: drop the incomplete file, then re-raise.
            if os.path.exists(partial_path):
                os.remove(partial_path)
            raise
    # Atomic on the same filesystem; replaces any stale file at output_path.
    os.replace(partial_path, output_path)

def extract_file(zip_path, extract_to, specific_files=None):
    """Extract selected members, or everything, from a zip archive.

    Args:
        zip_path: Path of the zip archive to read.
        extract_to: Directory that receives the extracted files.
        specific_files: Member names to extract. When ``None`` or empty the
            whole archive is extracted.

    Requested members that are not present in the archive are skipped, and a
    single ``UserWarning`` listing them is emitted so the omission is visible.

    Raises:
        zipfile.BadZipFile: If ``zip_path`` is not a valid zip archive.
    """
    import warnings  # local import keeps this block independent of the import cell

    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        if not specific_files:
            zip_ref.extractall(extract_to)
            return

        # Build the member lookup once instead of re-listing the archive per file.
        available = set(zip_ref.namelist())
        missing = []
        for member in specific_files:
            if member in available:
                zip_ref.extract(member, extract_to)
            else:
                missing.append(member)

    if missing:
        warnings.warn(
            f"Not found in archive {zip_path}: {', '.join(map(str, missing))}",
            stacklevel=2,
        )

# Step 1: Download and Extract Annotations
annotations_zip_path = os.path.join(output_dir, "panoptic_annotations_trainval2017.zip")
# is_zipfile() is False for a missing file and normally also for a truncated
# archive, so a leftover from an interrupted run is fetched again, not reused.
if not zipfile.is_zipfile(annotations_zip_path):
    download_file(urls["panoptic_train2017.json"], annotations_zip_path)

panoptic_annotation_files = [
    "annotations/panoptic_train2017.json",
    "annotations/panoptic_val2017.json",
]
extract_file(
    annotations_zip_path,
    output_dir,
    specific_files=panoptic_annotation_files,
)

# extract_file skips members that are absent from the archive, so confirm the
# expected files really exist before reporting success.
missing_annotation_files = [
    name
    for name in panoptic_annotation_files
    if not os.path.isfile(os.path.join(output_dir, name))
]
if missing_annotation_files:
    raise FileNotFoundError(
        f"Expected annotation files were not extracted: {missing_annotation_files}"
    )

# Step 2: Download and Extract Images (Train) -- currently disabled
# train_zip_path = os.path.join(output_dir, "train2017.zip")
# if not os.path.exists(train_zip_path):
#     download_file(urls["train2017"], train_zip_path)

# extract_file(train_zip_path, output_dir)

# Step 3: Download and Extract Images (Validation) -- currently disabled
# val_zip_path = os.path.join(output_dir, "val2017.zip")
# if not os.path.exists(val_zip_path):
#     download_file(urls["val2017"], val_zip_path)

# extract_file(val_zip_path, output_dir)

# Final output paths
annotations_dir = os.path.join(output_dir, "annotations")
# train_images_dir = os.path.join(output_dir, "train2017")
# val_images_dir = os.path.join(output_dir, "val2017")

print("Dataset downloaded and extracted:")
print(f"Annotations: {annotations_dir}")
# print(f"Train images: {train_images_dir}")
# print(f"Validation images: {val_images_dir}")


Downloading panoptic_annotations_trainval2017.zip: 100%|██████████| 821M/821M [01:15<00:00, 11.4MB/s] 


Dataset downloaded and extracted:
Annotations: ./COCO/annotations


In [2]:
import json
from pycocotools.coco import COCO

# Define the path to the original COCO dataset annotation file.
# NOTE(review): the download cell of this notebook only extracts the
# panoptic_*.json annotation files (the instances archive is commented out),
# so this input must already be on disk from an earlier run -- confirm.
input_json_path = './COCO/annotations/instances_train2017.json'
output_json_path = 'filtered_instances.json'

# Define the category IDs to keep.
# The previous literals `[1, 2, 3, ..., 80]` were five-element lists holding
# the Ellipsis object, so only ids 1, 2, 3, 80 (and 81, 82, 83, 133) matched.
# The ranges below spell out the intended 1..80 and 81..133 spans.
# NOTE(review): replace with the actual ids if the category ids in the input
# file are not contiguous.
things_ids = list(range(1, 81))    # 1..80 inclusive
stuff_ids = list(range(81, 134))   # 81..133 inclusive
category_ids_to_keep = set(things_ids + stuff_ids)

# Load the original COCO dataset
coco = COCO(input_json_path)

# Keep only the annotations whose category was selected
filtered_annotations = [
    ann for ann in coco.dataset['annotations']
    if ann['category_id'] in category_ids_to_keep
]

# Keep only the images that still have at least one annotation
image_ids_to_keep = {ann['image_id'] for ann in filtered_annotations}
filtered_images = [img for img in coco.dataset['images'] if img['id'] in image_ids_to_keep]

# Keep only the selected category definitions
filtered_categories = [cat for cat in coco.dataset['categories'] if cat['id'] in category_ids_to_keep]

# Save the filtered dataset. The optional 'info' and 'licenses' sections are
# carried over unchanged when the input has them, so the output remains a
# complete COCO file.
filtered_data = {
    key: coco.dataset[key]
    for key in ('info', 'licenses')
    if key in coco.dataset
}
filtered_data.update({
    'images': filtered_images,
    'annotations': filtered_annotations,
    'categories': filtered_categories,
})
with open(output_json_path, 'w') as f:
    json.dump(filtered_data, f)

print(f"Filtered dataset saved to {output_json_path}")


loading annotations into memory...
Done (t=13.22s)
creating index...
index created!
Filtered dataset saved to filtered_instances.json
