In [None]:
import os
import random
from shutil import copyfile

In [1]:
# Subsample annotation files by sampling the same fraction from every class
def subsample_annotations(input_dir, output_dir, subsample_ratio, seed=None):
    """Copy a per-class random subsample of annotation files to ``output_dir``.

    Every ``.txt`` file in ``input_dir`` is read, and the first
    whitespace-separated token of each non-blank line is taken as a class id.
    For each class, ``int(n_files * subsample_ratio)`` of the files containing
    that class are drawn at random; the union of all draws is copied to
    ``output_dir`` under the same file names.

    A file that contains several classes can be drawn through any of them, so
    the overall fraction of files copied may exceed ``subsample_ratio``.

    Args:
        input_dir: Directory holding the original annotation files.
        output_dir: Destination directory; created if it does not exist.
            Files already present there are not removed.
        subsample_ratio: Fraction (0 to 1 inclusive) of each class's files
            to draw.
        seed: Optional seed for a private random generator, making the draw
            reproducible. When ``None`` the module-level ``random`` state is
            used, so a prior ``random.seed(...)`` call is still honoured.

    Raises:
        ValueError: If ``subsample_ratio`` is outside the range 0 to 1.
    """
    # Fail early with a clear message instead of deep inside random.sample.
    if not 0 <= subsample_ratio <= 1:
        raise ValueError(
            f"subsample_ratio must be between 0 and 1, got {subsample_ratio!r}"
        )

    # Create the output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Sorted so that a seeded draw does not depend on os.listdir ordering.
    annotation_files = sorted(
        f for f in os.listdir(input_dir) if f.endswith('.txt')
    )

    # Map each class id to the files that contain it (each file at most once).
    class_files = {}
    for annotation_file in annotation_files:
        with open(os.path.join(input_dir, annotation_file), 'r') as f:
            # Blank lines carry no class id; the set ensures a file with many
            # annotations of one class is listed only once for that class.
            classes_in_file = {line.split()[0] for line in f if line.strip()}
        for class_id in classes_in_file:
            class_files.setdefault(class_id, []).append(annotation_file)

    rng = random if seed is None else random.Random(seed)

    # Draw per class in a fixed class order (set iteration order above is not
    # stable between interpreter runs) and pool the picks.
    selected_files = set()
    for class_id in sorted(class_files):
        files = class_files[class_id]
        num_subsampled = int(len(files) * subsample_ratio)
        selected_files.update(rng.sample(files, num_subsampled))

    # Copy each selected file exactly once, even if several classes drew it.
    for annotation_file in sorted(selected_files):
        copyfile(
            os.path.join(input_dir, annotation_file),
            os.path.join(output_dir, annotation_file),
        )

# Verify the class split in the subsampled data
def analyze_class_distribution(data_dir):
    """Print the number of annotation lines per class found in ``data_dir``.

    Every ``.txt`` file in ``data_dir`` is read; the first
    whitespace-separated token of each non-blank line is taken as the class
    id and counted. The counts are printed under a "Class Distribution:"
    header, ordered by class id (numeric ids in numeric order first, then any
    non-numeric ids alphabetically).

    Args:
        data_dir: Directory containing the annotation files to analyse.
    """
    # Collect the list of annotation files
    annotation_files = sorted(
        f for f in os.listdir(data_dir) if f.endswith('.txt')
    )

    # Count the annotations per class
    class_counts = {}
    for annotation_file in annotation_files:
        with open(os.path.join(data_dir, annotation_file), 'r') as f:
            for line in f:
                fields = line.split()
                if not fields:
                    # Blank / whitespace-only line: no class id to count.
                    continue
                class_counts[fields[0]] = class_counts.get(fields[0], 0) + 1

    def class_sort_key(class_id):
        # Numeric ids sort by value (so "2" precedes "10"); others go last.
        try:
            return (0, int(class_id), class_id)
        except ValueError:
            return (1, 0, class_id)

    # Print the class distribution
    print("Class Distribution:")
    for class_id in sorted(class_counts, key=class_sort_key):
        print(f"Class {class_id}: {class_counts[class_id]} annotations")

In [7]:
# Example usage
input_directory = "E:/datasets/zooniverse/labels/train"  # Directory containing original annotations
# NOTE(review): the folder name says "025" while the ratio below is 0.75 — confirm which one is intended.
output_directory = 'E:/datasets/zooniverse/subsample_025/'  # Directory to store subsampled annotations
subsample_ratio = 0.75  # Fraction of each class's annotations to sample, e.g., 0.5 for 50% of the original data

# Seed the global RNG so that re-running this cell on the same input
# directory draws the same subsample.
random.seed(42)

subsample_annotations(input_directory, output_directory, subsample_ratio)
# NOTE: the output directory is not emptied first, so any .txt files left
# there by earlier runs are included in the counts printed below.
analyze_class_distribution(output_directory)