In [None]:
import os
import requests
from tqdm import tqdm
import zipfile

# URLs for the required files
urls = {
    "instances_train2017.json": "http://images.cocodataset.org/annotations/annotations_trainval2017.zip",
    "instances_val2017.json": "http://images.cocodataset.org/annotations/annotations_trainval2017.zip",
    "train2017": "http://images.cocodataset.org/zips/train2017.zip",
    "val2017": "http://images.cocodataset.org/zips/val2017.zip",
    "panoptic_train2017.json": "http://images.cocodataset.org/annotations/panoptic_annotations_trainval2017.zip",
    "panoptic_val2017.json": "http://images.cocodataset.org/annotations/panoptic_annotations_trainval2017.zip",
}

# Directory to store the dataset
output_dir = "./"
os.makedirs(output_dir, exist_ok=True)

def download_file(url, output_path):
    """Downloads a file with a progress bar."""
    response = requests.get(url, stream=True)
    total_size = int(response.headers.get('content-length', 0))
    with open(output_path, "wb") as file, tqdm(
        desc=f"Downloading {os.path.basename(output_path)}",
        total=total_size,
        unit="B",
        unit_scale=True,
        unit_divisor=1024,
    ) as bar:
        for data in response.iter_content(chunk_size=1024):
            file.write(data)
            bar.update(len(data))

def extract_file(zip_path, extract_to, specific_files=None):
    """Extracts specific files or all files from a zip archive."""
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        if specific_files:
            for file in specific_files:
                if file in zip_ref.namelist():
                    zip_ref.extract(file, extract_to)
        else:
            zip_ref.extractall(extract_to)

# Step 1: Download and Extract Annotations

#File1: panoptic annotations.json
annotations_zip_path = os.path.join(output_dir, "panoptic_annotations_trainval2017.zip")
if not os.path.exists(annotations_zip_path):
    download_file(urls["panoptic_train2017.json"], annotations_zip_path)

extract_file(
    annotations_zip_path,
    output_dir,
    specific_files=["annotations/panoptic_train2017.json", "annotations/panoptic_val2017.json",]
)
#File2: instance_train2017 json
instance_annotations_zip_path = os.path.join(output_dir, "annotations_trainval2017.zip")
if not os.path.exists(instance_annotations_zip_path):
    download_file(urls["instances_train2017.json"], instance_annotations_zip_path)

extract_file(
    instance_annotations_zip_path, 
    output_dir,
    specific_files=["annotations/instances_train2017.json", "annotations/instances_val2017.json"]
)

# # Step 2: Download and Extract Images (Train)
train_zip_path = os.path.join(output_dir, "train2017.zip")
if not os.path.exists(train_zip_path):
    download_file(urls["train2017"], train_zip_path)

extract_file(train_zip_path, output_dir)

# # Step 3: Download and Extract Images (Validation)
val_zip_path = os.path.join(output_dir, "val2017.zip")
if not os.path.exists(val_zip_path):
    download_file(urls["val2017"], val_zip_path)

extract_file(val_zip_path, output_dir)

# Step4: Download the Extract Images (panoptic_annotations_trainval2017)

path1 = os.path.join(output_dir,"annotations", "panoptic_train2017.zip")
path2 = os.path.join(output_dir, "annotations","panoptic_val2017.zip")

extract_file(path1, output_dir)
extract_file(path2, output_dir)


# Final output paths
annotations_dir = os.path.join(output_dir, "annotations")
train_images_dir = os.path.join(output_dir, "train2017")
val_images_dir = os.path.join(output_dir, "val2017")

print("Dataset downloaded and extracted:")
#print(f"Annotations: {annotations_dir}")
# print(f"Train images: {train_images_dir}")
# print(f"Validation images: {val_images_dir}")


Downloading panoptic_annotations_trainval2017.zip: 100%|██████████| 821M/821M [02:02<00:00, 7.04MB/s]   
Downloading annotations_trainval2017.zip: 100%|██████████| 241M/241M [00:43<00:00, 5.80MB/s] 
Downloading train2017.zip: 100%|██████████| 18.0G/18.0G [34:40<00:00, 9.29MB/s]  
Downloading val2017.zip: 100%|██████████| 778M/778M [01:35<00:00, 8.53MB/s] 


Dataset downloaded and extracted:


In [8]:
import json
import os
import random
from shutil import copy2
current_directory = os.getcwd()
print(current_directory)


TRAIN_PATH = './'

# Add these debugging lines before your loop
import glob
possible_paths = [
    os.path.join(TRAIN_PATH, 'panoptic_train2017'),
    os.path.join(TRAIN_PATH, 'annotations', 'panoptic_train2017'),
    os.path.join(TRAIN_PATH, 'panoptic_train2017', 'panoptic_train2017')
]

for path in possible_paths:
    if os.path.exists(path):
        print(f"Found path: {path}")
        print(f"Sample files: {glob.glob(os.path.join(path, '*.png'))[:3]}")
# Paths
original_json_path = os.path.join(TRAIN_PATH, 'annotations', 'panoptic_train2017.json')
subset_json_path = os.path.join(TRAIN_PATH, 'annotations', 'panoptic_train_subset.json')
original_img_dir = os.path.join(TRAIN_PATH, 'train2017')
subset_img_dir = os.path.join(TRAIN_PATH, 'train_subset')
subset_panoptic_dir = os.path.join(TRAIN_PATH, 'panoptic_train_subset')

# Parameters
num_samples = 10000

# Create directories for subset
os.makedirs(subset_img_dir, exist_ok=True)
os.makedirs(subset_panoptic_dir, exist_ok=True)

# Load original JSON
with open(original_json_path, 'r') as f:
    panoptic_data = json.load(f)

# Randomly sample images
sampled_images = random.sample(panoptic_data['images'], num_samples)
sampled_image_ids = {img['id'] for img in sampled_images}

# Filter annotations for sampled images
sampled_annotations = [ann for ann in panoptic_data['annotations'] if ann['image_id'] in sampled_image_ids]

# Copy sampled images and annotations
for img in sampled_images:
    src_img_path = os.path.join(original_img_dir, img['file_name'])
    dest_img_path = os.path.join(subset_img_dir, img['file_name'])
    copy2(src_img_path, dest_img_path)

for ann in sampled_annotations:
    src_ann_path = os.path.join(TRAIN_PATH,'panoptic_train2017', ann['file_name'])
    print(src_ann_path)
    dest_ann_path = os.path.join(subset_panoptic_dir, ann['file_name'])
    print(dest_ann_path)
    copy2(src_ann_path, dest_ann_path)

# Save the new JSON
subset_data = {
    'images': sampled_images,
    'annotations': sampled_annotations,
    'categories': panoptic_data['categories'],
}
with open(subset_json_path, 'w') as f:
    json.dump(subset_data, f)

print(f"Subset dataset created with {num_samples} samples.")


s:\Research\MaskUnet\data\COCO
Found path: ./panoptic_train2017
Sample files: ['./panoptic_train2017\\000000000009.png', './panoptic_train2017\\000000000025.png', './panoptic_train2017\\000000000030.png']
./panoptic_train2017\000000000036.png
./panoptic_train_subset\000000000036.png
./panoptic_train2017\000000000049.png
./panoptic_train_subset\000000000049.png
./panoptic_train2017\000000000077.png
./panoptic_train_subset\000000000077.png
./panoptic_train2017\000000000089.png
./panoptic_train_subset\000000000089.png
./panoptic_train2017\000000000136.png
./panoptic_train_subset\000000000136.png
./panoptic_train2017\000000000138.png
./panoptic_train_subset\000000000138.png
./panoptic_train2017\000000000201.png
./panoptic_train_subset\000000000201.png
./panoptic_train2017\000000000208.png
./panoptic_train_subset\000000000208.png
./panoptic_train2017\000000000486.png
./panoptic_train_subset\000000000486.png
./panoptic_train2017\000000000514.png
./panoptic_train_subset\000000000514.png
./pan