# In [3]:
# count_selected() expects the data in Job_<number> subfolders, which it sorts by the numeric part of the name.
# count_selected_flat() is suitable once the dataset has been split, e.g. parent > train, test, val.
"""
Script Name: Job Folder Statistics Generator
Description: This script processes a parent folder containing subfolders (jobs) and generates a CSV report 
             with detailed statistics about images and annotations in each subfolder. It also verifies the 
             presence of image-annotation pairs and counts class occurrences.

Features:
    - Reads class names from an `obj.names` file.
    - Handles subfolders that may contain an `obj_train_data` folder or directly contain images and annotations.
    - Calculates total statistics across all job folders.
    - Logs any problematic files or folders.

Input:
    - Parent folder containing job subfolders (e.g., `Job_1`, `Job_2`, etc.).
    - Each job folder may contain:
        - Images (.png, .jpg, .jpeg).
        - Annotations (.txt files).
        - Optionally, a subfolder named `obj_train_data`.

Output:
    - A CSV file summarizing statistics for each job folder, including:
        - Job name.
        - Number of images and annotations.
        - Total image size in MB.
        - Whether all image-annotation pairs are present.
        - Class-wise counts.
    - A final TOTAL row summing the counts across all job folders.

Dependencies:
    - Python 3
    - Libraries: os, re, pandas, collections.defaultdict
"""
import os
import re
import pandas as pd
from collections import defaultdict

def read_classes_from_file(file_path):
    """Read class names from a text file, one name per line.

    The position of a name in the returned list is used by callers as the
    class id, so blank lines in the middle of the file are kept (as empty
    names) to preserve that numbering. Blank lines at the end of the file
    are dropped so that trailing newlines do not create unnamed classes.

    Args:
        file_path (str): Path to the class-names file (e.g. ``obj.names``).

    Returns:
        list[str]: Whitespace-stripped class names in file order, or an empty
        list if the file does not exist (a message is printed in that case).
    """
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        return []
    with open(file_path, 'r') as f:
        names = [line.strip() for line in f]
    # Only trailing blanks are removed: dropping interior ones would shift
    # every later class onto the wrong id.
    while names and not names[-1]:
        names.pop()
    return names

def count_selected_flat(parent_folder, output_csv="job_statistics_selected.csv"):
    """
    Write a CSV of image/annotation statistics for every subfolder of a parent folder.

    Every immediate subdirectory of ``parent_folder`` is reported, in sorted
    path order; folder names do not need to contain a job number. For each
    folder, files are read from its ``obj_train_data`` subdirectory when that
    exists, otherwise from the folder itself.

    Class names are read from the file named by the module-level
    ``obj_names_path`` variable, which must be defined before this function
    is called. A class id in an annotation is the first whitespace-separated
    token of a line and is matched to a class name by its index in that file.

    The CSV has one row per folder plus a final ``TOTAL`` row. Annotation
    files containing a line whose first token is not an integer are listed
    on stdout after the CSV is written; lines read before the bad one still
    count towards the class totals.

    Args:
        parent_folder (str): Path to the parent folder.
        output_csv (str): Path to save the CSV file with the statistics.

    Returns:
        None. Returns early, after printing a message, when no class names
        or no subfolders are found.
    """
    print(obj_names_path)
    classes = read_classes_from_file(obj_names_path)
    if not classes:
        print("No classes found in obj.names. Exiting.")
        return

    # All immediate subdirectories; sorted so the CSV row order is stable
    # (os.scandir yields entries in arbitrary order).
    job_folders = sorted(f.path for f in os.scandir(parent_folder) if f.is_dir())

    if not job_folders:
        print(f"No folders found in {parent_folder}. Exiting.")
        return

    data = []
    skipped_files = []  # (folder, annotation file, error text) for unparsable files

    for job_folder in job_folders:
        job_name = os.path.basename(job_folder)

        obj_train_data_folder = os.path.join(job_folder, "obj_train_data")
        image_folder = obj_train_data_folder if os.path.isdir(obj_train_data_folder) else job_folder

        # List the directory once and classify entries by extension.
        entries = os.listdir(image_folder)
        images = [f for f in entries if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
        annotations = [f for f in entries if f.lower().endswith('.txt')]

        # Total size of the images, converted from bytes to MB
        total_image_size = sum(
            os.path.getsize(os.path.join(image_folder, img)) for img in images
        ) / (1024 * 1024)

        # True when every image has a same-named .txt file next to it
        all_pairs_present = all(
            os.path.exists(os.path.join(image_folder, os.path.splitext(img)[0] + ".txt"))
            for img in images
        )

        # Count class ids across all annotation files of this folder
        class_counts = defaultdict(int)
        for annotation in annotations:
            try:
                with open(os.path.join(image_folder, annotation), 'r') as file:
                    for line in file:
                        fields = line.split()
                        if not fields:
                            # Blank line: nothing to count, keep reading the file.
                            continue
                        class_counts[str(int(fields[0]))] += 1
            except ValueError as e:
                skipped_files.append((job_folder, annotation, str(e)))

        row = {
            "Folder Name": job_name,
            "Image Count": len(images),
            "Annotation Count": len(annotations),
            "Total Image Size (MB)": round(total_image_size, 2),
            "All Image/Annotation Pairs Present": all_pairs_present,
        }
        for i, cls in enumerate(classes):
            row[cls] = class_counts.get(str(i), 0)
        data.append(row)

    # Totals row (computed before it is appended, so it never sums itself)
    total_image_size_mb = sum(row["Total Image Size (MB)"] for row in data)
    total_image_size_gb = total_image_size_mb / 1024
    totals = {
        "Folder Name": "TOTAL",
        "Image Count": sum(row["Image Count"] for row in data),
        "Annotation Count": sum(row["Annotation Count"] for row in data),
        "Total Image Size (MB)": f"{round(total_image_size_mb, 2)} MB ({round(total_image_size_gb, 2)} GB)",
        "All Image/Annotation Pairs Present": "",
    }
    for cls in classes:
        totals[cls] = sum(row[cls] for row in data)
    data.append(totals)

    # Write data to CSV
    header = [
        "Folder Name", "Image Count", "Annotation Count", "Total Image Size (MB)", "All Image/Annotation Pairs Present"
    ] + classes
    df = pd.DataFrame(data, columns=header)
    df.to_csv(output_csv, index=False)
    print(f"CSV file created: {output_csv}")

    # Report annotation files that could not be parsed
    if skipped_files:
        print("\nSkipped files due to errors:")
        for job_folder, annotation, error in skipped_files:
            print(f"Job: {job_folder}, File: {annotation}, Error: {error}")

# Used this when data is in Job_No folders - up to the stage of splitting the data 
def count_selected(parent_folder, output_csv="job_statistics_selected.csv"):
    """
    Write a CSV of image/annotation statistics for numbered job folders.

    Only immediate subdirectories of ``parent_folder`` whose name contains an
    underscore followed by digits (e.g. ``Job_12``) are reported; they are
    ordered by that number. Other subdirectories are ignored. For each job,
    files are read from its ``obj_train_data`` subdirectory when that exists,
    otherwise from the job folder itself.

    Class names are read from the file named by the module-level
    ``obj_names_path`` variable, which must be defined before this function
    is called. A class id in an annotation is the first whitespace-separated
    token of a line and is matched to a class name by its index in that file.

    The CSV has one row per job plus a final ``TOTAL`` row. Annotation files
    containing a line whose first token is not an integer are listed on
    stdout after the CSV is written; lines read before the bad one still
    count towards the class totals.

    Args:
        parent_folder (str): Path to the parent folder.
        output_csv (str): Path to save the CSV file with job statistics.

    Returns:
        None. Returns early, after printing a message, when no class names
        or no numbered job folders are found.
    """
    def extract_job_number(folder_name):
        """Return the first run of digits following an underscore, or None."""
        match = re.search(r'_(\d+)', folder_name)
        return int(match.group(1)) if match else None

    print(obj_names_path)
    classes = read_classes_from_file(obj_names_path)
    if not classes:
        print("No classes found in obj.names. Exiting.")
        return

    # Keep only subdirectories that carry a job number, ordered by that number.
    numbered_folders = []
    for entry in os.scandir(parent_folder):
        if not entry.is_dir():
            continue
        job_number = extract_job_number(entry.name)
        if job_number is not None:
            numbered_folders.append((entry.path, job_number))
    numbered_folders.sort(key=lambda pair: pair[1])
    job_folders = [folder for folder, _ in numbered_folders]

    if not job_folders:
        print(f"No job folders found in {parent_folder}. Exiting.")
        return

    data = []
    skipped_files = []  # (folder, annotation file, error text) for unparsable files

    for job_folder in job_folders:
        job_name = os.path.basename(job_folder)

        obj_train_data_folder = os.path.join(job_folder, "obj_train_data")
        image_folder = obj_train_data_folder if os.path.isdir(obj_train_data_folder) else job_folder

        # List the directory once and classify entries by extension.
        entries = os.listdir(image_folder)
        images = [f for f in entries if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
        annotations = [f for f in entries if f.lower().endswith('.txt')]

        # Total size of the images, converted from bytes to MB
        total_image_size = sum(
            os.path.getsize(os.path.join(image_folder, img)) for img in images
        ) / (1024 * 1024)

        # True when every image has a same-named .txt file next to it
        all_pairs_present = all(
            os.path.exists(os.path.join(image_folder, os.path.splitext(img)[0] + ".txt"))
            for img in images
        )

        # Count class ids across all annotation files of this job
        class_counts = defaultdict(int)
        for annotation in annotations:
            try:
                with open(os.path.join(image_folder, annotation), 'r') as file:
                    for line in file:
                        fields = line.split()
                        if not fields:
                            # Blank line: nothing to count, keep reading the file.
                            continue
                        class_counts[str(int(fields[0]))] += 1
            except ValueError as e:
                skipped_files.append((job_folder, annotation, str(e)))

        row = {
            "Job Name": job_name,
            "Image Count": len(images),
            "Annotation Count": len(annotations),
            "Total Image Size (MB)": round(total_image_size, 2),
            "All Image/Annotation Pairs Present": all_pairs_present,
        }
        for i, cls in enumerate(classes):
            row[cls] = class_counts.get(str(i), 0)
        data.append(row)

    # Totals row (computed before it is appended, so it never sums itself)
    total_image_size_mb = sum(row["Total Image Size (MB)"] for row in data)
    total_image_size_gb = total_image_size_mb / 1024
    totals = {
        "Job Name": "TOTAL",
        "Image Count": sum(row["Image Count"] for row in data),
        "Annotation Count": sum(row["Annotation Count"] for row in data),
        "Total Image Size (MB)": f"{round(total_image_size_mb, 2)} MB ({round(total_image_size_gb, 2)} GB)",
        "All Image/Annotation Pairs Present": "",
    }
    for cls in classes:
        totals[cls] = sum(row[cls] for row in data)
    data.append(totals)

    # Write data to CSV
    header = [
        "Job Name", "Image Count", "Annotation Count", "Total Image Size (MB)", "All Image/Annotation Pairs Present"
    ] + classes
    df = pd.DataFrame(data, columns=header)
    df.to_csv(output_csv, index=False)
    print(f"CSV file created: {output_csv}")

    # Report annotation files that could not be parsed
    if skipped_files:
        print("\nSkipped files due to errors:")
        for job_folder, annotation, error in skipped_files:
            print(f"Job: {job_folder}, File: {annotation}, Error: {error}")

#===================================================================================================================================#
# Driver section: choose a dataset folder and an output CSV name, then run the report.
#
# obj_names_path is read directly (as a module-level variable) by count_selected() and
# count_selected_flat(), so it must be assigned before either function is called.
obj_names_path = 'D:/FlagDetectionDatasets/ExportedDatasetsReduced/obj.names'

# Alternative dataset/output pairs, left commented out; uncomment one pair to run it instead.
#parent_folder = 'D:/FlagDetectionDatasets/ExportedDatasetsExtracted'
#output_csv = "job_statistics_data_export_from_cvat.csv"
#count_selected(parent_folder, output_csv)

#parent_folder = 'D:/FlagDetectionDatasets/ExportedDatasetsReduced'
#output_csv = "job_statistics_reduced_dataset1.csv"
#count_selected(parent_folder, output_csv)

#parent_folder = 'D:/FlagDetectionDatasets/ExportedDatasetsSelectedML'
#output_csv = "job_statistics_selected_ML.csv"
#count_selected(parent_folder, output_csv)

# Active run.
# NOTE(review): the output recorded with this cell reports "No job folders found" for this
# folder, i.e. none of its subfolders matched the `_<number>` pattern count_selected() needs.
# If this dataset is already split (train/test/val), count_selected_flat() may be the
# intended call -- confirm.
parent_folder = 'D:/FlagDetectionDatasets/ExportedDatasetsSelectedMLROBIN'
output_csv = "job_statistics_selected_MLROBIN.csv"
count_selected(parent_folder, output_csv)

#parent_folder = 'D:/FlagDetectionDatasets/ExportedDatasetsReduced'
#output_csv = "job_statistics_reduced.csv"

#parent_folder = 'D:/FlagDetectionDatasets/ExportedDatasetsReduced'
#output_csv = "job_statistics_reduced_dataset1.csv"
#count_selected(parent_folder, output_csv)

#count_selected(parent_folder, output_csv)

# Count images in datasets
# NOTE(review): count_images_and_classes() and process_job_folders() are not defined in this
# cell -- presumably they live in another cell or an earlier version; verify before re-enabling.
#parent_folder = r'D:\FlagDetectionDatasets\ExportedDatasetsReduced'
#output_csv = "csv\image_counts_by_job4.csv"
#output_csv = "image_counts_and_classes.csv"  # Output CSV file
#count_images_and_classes(parent_folder, output_csv)

#process_job_folders(folder_path, output_csv, image_subfolder_name="obj_train_data")


# Output of the run above:
# D:/FlagDetectionDatasets/ExportedDatasetsReduced/obj.names
# No job folders found in D:/FlagDetectionDatasets/ExportedDatasetsSelectedMLROBIN. Exiting.
