Subsample crowdsourced annotations by a percentage of either the total number of labels in the training pool or the number of original images (not tiles) in the training pool.

In [1]:
#Import necessary modules
import os
import random
from shutil import copyfile

In [13]:
# Subsample annotations by applying the same ratio to every class
def subsample_annotations(input_dir, output_dir, subsample_ratio, seed=None):
    """Copy a per-class random subsample of annotation files into ``output_dir``.

    Every non-blank line of every ``.txt`` file in ``input_dir`` counts as one
    annotation whose first whitespace-separated token is its class id. For each
    class, ``int(count * subsample_ratio)`` annotations are drawn at random and
    the files containing them are copied, unmodified, to ``output_dir``.

    Note:
        Files are copied whole, so the output holds *every* annotation of each
        selected file, not only the sampled ones. The realised fraction of
        labels can therefore be larger than ``subsample_ratio``.

    Args:
        input_dir: Directory holding the original ``.txt`` annotation files.
        output_dir: Directory to copy the selected files into (created if
            missing).
        subsample_ratio: Fraction in ``[0, 1]`` of each class's annotations to
            draw.
        seed: Optional seed for a private random generator, making the draw
            reproducible. When ``None`` the module-level ``random`` state is
            used, as before.

    Raises:
        ValueError: If ``subsample_ratio`` is outside ``[0, 1]``.
    """
    if not 0 <= subsample_ratio <= 1:
        raise ValueError(
            f"subsample_ratio must be between 0 and 1, got {subsample_ratio!r}"
        )

    rng = random if seed is None else random.Random(seed)

    # Create the output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # os.listdir order is arbitrary; sort so a fixed seed gives a fixed draw.
    annotation_files = sorted(f for f in os.listdir(input_dir) if f.endswith('.txt'))

    # Map each class id to the file name of every annotation of that class
    # (one list entry per annotation line, so files repeat).
    class_annotations = {}
    for annotation_file in annotation_files:
        with open(os.path.join(input_dir, annotation_file), 'r') as f:
            for line in f:
                fields = line.split()
                if not fields:
                    # Blank / whitespace-only lines carry no annotation.
                    continue
                class_annotations.setdefault(fields[0], []).append(annotation_file)

    # Draw the per-class samples and pool the files they point at; a set keeps
    # each file from being copied more than once.
    selected_files = set()
    for annotations in class_annotations.values():
        num_subsampled_annotations = int(len(annotations) * subsample_ratio)
        selected_files.update(rng.sample(annotations, num_subsampled_annotations))

    # Copy the selected annotation files to the output directory
    for annotation_file in sorted(selected_files):
        copyfile(
            os.path.join(input_dir, annotation_file),
            os.path.join(output_dir, annotation_file),
        )

#Verify class split in subsampled data
def analyze_class_distribution(data_dir):
    """Print how many annotations of each class the ``.txt`` files in ``data_dir`` hold.

    Each non-blank line counts as one annotation; its first
    whitespace-separated token is taken as the class id. Classes are printed
    in the order they are first encountered, with files read in sorted name
    order so the output is stable between runs.

    Args:
        data_dir: Directory containing the annotation ``.txt`` files.
    """
    # Sorted so the first-seen class order does not depend on os.listdir.
    annotation_files = sorted(f for f in os.listdir(data_dir) if f.endswith('.txt'))

    # Count the annotations per class
    class_counts = {}
    for annotation_file in annotation_files:
        with open(os.path.join(data_dir, annotation_file), 'r') as f:
            for line in f:
                fields = line.split()
                if not fields:
                    # Blank / whitespace-only lines carry no annotation.
                    continue
                class_counts[fields[0]] = class_counts.get(fields[0], 0) + 1

    # Print the class distribution
    print("Class Distribution:")
    for class_id, count in class_counts.items():
        print(f"Class {class_id}: {count} annotations")

In [19]:
# Subsample by a percentage of the annotations in the original dataset.
input_directory = "E:/datasets/zooniverse/labels/train/"  # Directory containing original annotations
output_directory = 'E:/datasets/zooniverse/test_001/'  # Directory to store subsampled annotations
subsample_ratio = 0.01  # Fraction of each class's annotations to draw, e.g., 0.5 for 50%

# Draw the subsample, then print the per-class annotation counts of the copied files.
subsample_annotations(input_directory, output_directory,subsample_ratio)
analyze_class_distribution(output_directory)

Class Distribution:
Class 2: 527 annotations
Class 1: 3859 annotations
Class 0: 1541 annotations


In [13]:
# Subsample based on the number of original images

import os
import random
import shutil
from collections import defaultdict

def derive_original_image_names(txt_filenames):
    """Return the distinct original-image names behind a set of tile label files.

    The file extension is removed first and then the last two
    ``_``-separated fields are dropped, which is the same derivation
    ``copy_subsampled_annotations`` applies when matching files, so every
    name returned here can be matched there.

    Args:
        txt_filenames: Iterable of label file names (not full paths).

    Returns:
        Sorted list of unique image names. Sorting makes the result
        independent of set/hash ordering, so a seeded sample drawn from it is
        reproducible.
    """
    return sorted(
        {os.path.splitext(filename)[0].rsplit('_', 2)[0] for filename in txt_filenames}
    )

def subsample_image_names(image_names, ratio, seed=None):
    """Randomly pick a fraction of the image names, without replacement.

    At least one name is returned whenever ``image_names`` is non-empty, even
    if ``ratio`` would round down to zero.

    Args:
        image_names: Sequence of image names to draw from.
        ratio: Fraction in ``[0, 1]`` of the names to keep.
        seed: Optional seed for a private random generator, making the draw
            reproducible. When ``None`` the module-level ``random`` state is
            used.

    Returns:
        List of the selected names; empty when ``image_names`` is empty.

    Raises:
        ValueError: If ``ratio`` is outside ``[0, 1]``.
    """
    if not 0 <= ratio <= 1:
        raise ValueError(f"ratio must be between 0 and 1, got {ratio!r}")

    num_images = len(image_names)
    if num_images == 0:
        # Nothing to draw from; asking random.sample for 1 item would raise.
        return []

    rng = random if seed is None else random.Random(seed)
    num_subsampled = max(1, int(num_images * ratio))
    return rng.sample(image_names, num_subsampled)

def copy_subsampled_annotations(source_dir, target_dir, subsampled_image_names):
    """Copy the label files of the selected images and tally their classes.

    A ``.txt`` file in ``source_dir`` belongs to an image when its name, with
    the extension removed and the last two ``_``-separated fields dropped,
    equals that image's name. Matching files are copied into ``target_dir``
    (created if missing). Entries that are not ``.txt`` files are ignored.

    Args:
        source_dir: Directory holding the original label files.
        target_dir: Directory to copy the matching label files into.
        subsampled_image_names: Iterable of image names to keep.

    Returns:
        ``defaultdict(int)`` mapping integer class id to the number of
        annotation lines copied for that class.

    Raises:
        ValueError: If the first token of a non-blank line is not an integer.
    """
    os.makedirs(target_dir, exist_ok=True)
    class_distribution = defaultdict(int)
    # Set membership is O(1); the input may be a long list.
    wanted_names = set(subsampled_image_names)

    for txt_file in os.listdir(source_dir):
        filename, file_extension = os.path.splitext(txt_file)
        if file_extension != '.txt':
            # Only label files are parsed and copied.
            continue
        image_name = filename.rsplit('_', 2)[0]
        if image_name not in wanted_names:
            continue

        source_path = os.path.join(source_dir, txt_file)
        with open(source_path, 'r') as f:
            for line in f:
                fields = line.split()
                if not fields:
                    # Blank / whitespace-only lines carry no annotation.
                    continue
                class_distribution[int(fields[0])] += 1
        shutil.copy(source_path, target_dir)

    return class_distribution

In [19]:
if __name__ == "__main__":
    # Where the full label set lives and where the subset is written.
    source_directory = "E:/datasets/zooniverse/labels/train/"
    target_directory = "E:/datasets/zooniverse_img_001/labels/train/"
    # Fraction of original images to keep.
    subsample_ratio = 0.01

    # Only label files take part in deriving the original image names.
    txt_files = list(
        filter(lambda entry: entry.endswith('.txt'), os.listdir(source_directory))
    )
    original_image_names = derive_original_image_names(txt_files)
    subsampled_image_names = subsample_image_names(
        original_image_names, subsample_ratio
    )

    # Copy the label files of the chosen images and collect per-class counts.
    class_distribution = copy_subsampled_annotations(
        source_directory,
        target_directory,
        subsampled_image_names,
    )

    print("Subsampled class distribution:")
    for class_id, count in class_distribution.items():
        print(f"Class {class_id}: {count} instances")



Subsampled class distribution:
Class 2: 46 instances
Class 1: 530 instances
Class 0: 502 instances
