In [156]:
import os
import shutil
import random
import json

# Maximum number of images sampled from each date folder.
PICTURE_PER_SUBFOLDER = 17
# Fixed seed for the `random` module so the sampling can be repeated.
random.seed(420)

# Set paths
root_path = './rgb'
subset_path = os.path.join(root_path, 'subset')
coco_json_path = os.path.join(root_path, 'merged_annotations.coco.json')
subset_coco_json_path = os.path.join(subset_path, 'subset_dataset.coco.json')

# Set this flag to True if you want images directly in the 'subset' folder, or False if you want subdirectories for categories and dates
use_flat_structure = True

# Start from an empty subset directory: drop any previous one, then recreate it.
# The removal is guarded because shutil.rmtree raises FileNotFoundError when the
# directory does not exist yet (e.g. on the very first run).
if os.path.isdir(subset_path):
    shutil.rmtree(subset_path)
os.makedirs(subset_path, exist_ok=True)

# One {'original_path': ..., 'new_path': ...} entry is recorded per copied image.
path_mapping = []

# Iterate through categories (beans and corn).
# os.listdir returns entries in arbitrary order, so every listing is sorted;
# otherwise the seeded random.sample could pick different images from run to
# run or machine to machine.
for category in sorted(os.listdir(root_path)):
    if category == 'subset':
        # Never sample from the output directory itself.
        continue
    category_path = os.path.join(root_path, category)
    if not os.path.isdir(category_path):
        # Plain files next to the category folders are ignored.
        continue

    subset_category_path = subset_path if use_flat_structure else os.path.join(subset_path, category)
    os.makedirs(subset_category_path, exist_ok=True)

    # Date folders (e.g., 2023-05-24) inside the category
    for date_folder in sorted(os.listdir(category_path)):
        date_path = os.path.join(category_path, date_folder)
        if not os.path.isdir(date_path):
            # A stray file here would make os.listdir(date_path) raise.
            continue

        subset_date_path = subset_path if use_flat_structure else os.path.join(subset_category_path, date_folder)
        os.makedirs(subset_date_path, exist_ok=True)

        # Randomly select up to PICTURE_PER_SUBFOLDER .png images from this date folder
        image_files = sorted(f for f in os.listdir(date_path) if f.endswith('.png'))
        selected_images = random.sample(image_files, min(PICTURE_PER_SUBFOLDER, len(image_files)))

        for image in selected_images:
            original_image_path = os.path.join(date_path, image)
            new_image_path = os.path.join(subset_date_path, image)
            # NOTE: in flat mode every image lands directly in subset_path, so two
            # source folders containing the same file name overwrite each other.
            shutil.copy(original_image_path, new_image_path)

            path_mapping.append({
                'original_path': original_image_path,
                'new_path': new_image_path
            })

# Dump the original -> new path pairs, one tab-separated line per copied image
mapping_txt_path = os.path.join(subset_path, 'path_mapping.txt')
with open(mapping_txt_path, 'w') as out:
    out.writelines(
        f"Original: {entry['original_path']}\tNew: {entry['new_path']}\n"
        for entry in path_mapping
    )

# Read the full COCO annotation file and parse it into Python objects
with open(coco_json_path, 'r') as source:
    coco_data = json.loads(source.read())


In [157]:
# Filter and create a subset of the coco data
#
# An image is kept when its 'file_name' equals a trailing run of whole path
# components of a copied image's original path. Matching whole components
# (instead of a raw str.endswith) keeps '1.png' from matching '.../11.png',
# and the set lookup keeps each image at most once even if several copied
# paths match it.
copied_path_suffixes = set()
for mapping in path_mapping:
    # Normalise separators so paths built with os.path.join on Windows
    # compare equal to forward-slash file names.
    parts = mapping['original_path'].replace('\\', '/').split('/')
    for start in range(len(parts)):
        copied_path_suffixes.add('/'.join(parts[start:]))

subset_coco_data = {'images': [], 'annotations': [], 'categories': coco_data['categories']}
for image_info in coco_data['images']:
    if image_info['file_name'].replace('\\', '/') in copied_path_suffixes:
        subset_coco_data['images'].append(image_info)

# Keep only the annotations that belong to a retained image; the id set
# replaces a nested scan over all retained images for every annotation.
subset_image_ids = {image['id'] for image in subset_coco_data['images']}
subset_coco_data['annotations'] = [
    annotation
    for annotation in coco_data['annotations']
    if annotation['image_id'] in subset_image_ids
]


In [158]:
# Serialise the filtered COCO structure and write it next to the copied images
with open(subset_coco_json_path, 'w') as out:
    out.write(json.dumps(subset_coco_data))
