In [2]:
# Check integrity of the data at important stages especially before kicking off the ML 
"""
Script Name: Dataset Integrity Checker
Description: This script checks a folder of subfolders for:
    - Missing image-annotation pairs
    - Presence of files with "frame" in their names
    - Summarizes the findings for each subfolder

Dependencies:
    - Python 3
    - Standard libraries: os

"""
#=====================================================================================================================================#

import os

def delete_orphan_images(whichfolder, folder):

    print(f"Startin pattern for {folder}")
    path = os.path.join(whichfolder, folder)
    if not os.path.exists(path):
        print(f"Error: The folder {path} does not exist.")
        return
    files = os.listdir(path)
    image_files = sorted([f for f in files if f.lower().endswith(('.png', '.jpg', '.jpeg'))])
    text_files = sorted([f for f in files if f.lower().endswith('.txt')])
    paired_files = []
    for image_file in image_files:
        base_name = os.path.splitext(image_file)[0]
        annotation_file = f"{base_name}.txt"
        if annotation_file in text_files:
            paired_files.append((image_file, annotation_file))
        else:
            print(f"identified orphan file {image_file}")
            os.remove(os.path.join(path, image_file))
    for text_file in text_files:
        base_name = os.path.splitext(text_file)[0]
        image_file = f"{base_name}.PNG"
        if image_file in image_files:
            paired_files.append((image_file, text_file))
            #print(f"Paired files {image_file}")
        else:
            print(f"identified orphan file {text_file}")
            os.remove(os.path.join(path, text_file))
            
print(f"Finito")

#============================================================================================================================================

def check_missing_files(folder_path):
    """
    Check for missing annotations and missing images in a folder.

    Args:
        folder_path (str): Path to the folder containing images and annotations.

    Returns:
        None. Prints missing files and a summary.
    """
    # Collect all image and annotation filenames
    images = [
        os.path.splitext(f)[0] for f in os.listdir(folder_path)
        if f.lower().endswith(('.png', '.jpg', '.jpeg'))
    ]
    annotations = [
        os.path.splitext(f)[0] for f in os.listdir(folder_path)
        if f.lower().endswith('.txt')
    ]

    # Find missing annotations and images
    missing_annotations = [img for img in images if img not in annotations]
    missing_images = [ann for ann in annotations if ann not in images]

    # Print results
    print("\n--- Missing Annotations ---")
    if missing_annotations:
        for img in missing_annotations:
            print(f" ---------------------------------Annotation missing for image: {img}")
    else:
       print("No missing annotations.")

    print("\n--- Missing Images ---")
    if missing_images:
        for ann in missing_images:
            print(f"-------------------------------- Image missing for annotation: {ann}")
    else:
        print("No missing images.")

    # Summary
    print("\n--- Summary ---")
    print(f"Total images: {len(images)}")
    print(f"Total annotations: {len(annotations)}")
    print(f"Missing annotations: {len(missing_annotations)}")
    print(f"Missing images: {len(missing_images)}")

#===================================================================================================================================================================#

def check_missing_files_in_folders(parent_folder):
    """
    Check for missing annotations, images, and files with "frame" in the name in a folder containing multiple subfolders.

    Args:
        parent_folder (str): Path to the parent folder containing subfolders.

    Returns:
        None. Prints missing files and a summary for each subfolder.
    """
    for subfolder in os.listdir(parent_folder):
        subfolder_path = os.path.join(parent_folder, subfolder)

        if not os.path.isdir(subfolder_path):
            continue  # Skip if not a folder

        print(f"\nChecking folder: {subfolder}")

        # Collect all image and annotation filenames
        images = [
            os.path.splitext(f)[0] for f in os.listdir(subfolder_path)
            if f.lower().endswith(('.png', '.jpg', '.jpeg'))
        ]
        annotations = [
            os.path.splitext(f)[0] for f in os.listdir(subfolder_path)
            if f.lower().endswith('.txt')
        ]

        # Find missing annotations and images
        missing_annotations = [img for img in images if img not in annotations]
        missing_images = [ann for ann in annotations if ann not in images]

        # Check for files with "frame" in the name
        frame_files = [f for f in os.listdir(subfolder_path) if "frame" in f.lower()]

        # Print results
        # print("------------------------------------------------------------ Missing Annotations ---")
        if missing_annotations:
            for img in missing_annotations:
                print(f"                                                   Annotation missing for image: {img}")
        #else:
        #    print("No missing annotations.")

        # print("---------------------------------------------------------    Missing Images ---")
        if missing_images:
            for ann in missing_images:
                print(f"Image missing for annotation: {ann}")
        #else:
        #    print("No missing images.")

        if frame_files:
            print("-------------------------------------------------         Alert: Files with 'frame' in the name ---")
            for frame_file in frame_files:
                print(f"File: {frame_file}")

        # Summary
        print("--- Summary ---")
        #print(f"Total images: {len(images)}")
        #print(f"Total annotations: {len(annotations)}")
        print(f"Missing annotations: {len(missing_annotations)}")
        print(f"Missing images: {len(missing_images)}")


base_path_tgt = 'D:/FlagDetectionDatasets/ExportedDatasetsSelectedML'

#delete_orphan_images(base_path_tgt, 'Job_76')
#check_missing_files(base_path_tgt)
check_missing_files_in_folders('D:/FlagDetectionDatasets/ExportedDatasetsSelectedML')


Finito

Checking folder: Job_7
--- Summary ---
Missing annotations: 0
Missing images: 0

Checking folder: Job_11
--- Summary ---
Missing annotations: 0
Missing images: 0

Checking folder: Job_12
--- Summary ---
Missing annotations: 0
Missing images: 0

Checking folder: Job_13
--- Summary ---
Missing annotations: 0
Missing images: 0

Checking folder: Job_14
--- Summary ---
Missing annotations: 0
Missing images: 0

Checking folder: Job_15
--- Summary ---
Missing annotations: 0
Missing images: 0

Checking folder: Job_16
--- Summary ---
Missing annotations: 0
Missing images: 0

Checking folder: Job_21
--- Summary ---
Missing annotations: 0
Missing images: 0

Checking folder: Job_17
--- Summary ---
Missing annotations: 0
Missing images: 0

Checking folder: Job_18
--- Summary ---
Missing annotations: 0
Missing images: 0

Checking folder: Job_24
--- Summary ---
Missing annotations: 0
Missing images: 0

Checking folder: Job_24_filter
--- Summary ---
Missing annotations: 0
Missing images: 0

Ch