In [8]:
import os
import pandas as pd
from IPython.display import display

def compare_folders_of_folders(folder1, folder2):
    """
    Compare two folders of folders by image count, annotation count and size.

    Each immediate subdirectory of ``folder1`` and ``folder2`` is summarised,
    and subdirectories are matched across the two roots by name. A
    subdirectory present under only one root is compared against zeros.

    Only regular files directly inside each subdirectory are considered
    (the scan is not recursive). Nested directories and broken symlinks are
    ignored, so they neither inflate the size nor get counted as images or
    annotations because of their name.

    Args:
        folder1 (str): Path to the first folder.
        folder2 (str): Path to the second folder.

    Returns:
        dict: Maps each subfolder name (in sorted order) to a dict with keys
            ``folder1_images``, ``folder2_images``, ``image_difference``
            (bool, True when the counts differ), ``folder1_annotations``,
            ``folder2_annotations``, ``annotation_difference`` (bool),
            ``folder1_size``, ``folder2_size`` (bytes) and
            ``size_difference`` (bytes, folder1 minus folder2).

    Raises:
        OSError: If either root folder cannot be listed (for example
            ``FileNotFoundError`` when it does not exist).
    """
    image_extensions = ('.png', '.jpg', '.jpeg')
    annotation_extension = '.txt'

    def get_folder_details(folder_path):
        """
        Get image count, annotation count and size for each subfolder.

        Args:
            folder_path (str): Path to the folder whose subfolders are scanned.

        Returns:
            dict: Maps subfolder name to a dict with ``image_count``,
                ``annotation_count`` and ``folder_size`` (bytes).
        """
        details = {}
        with os.scandir(folder_path) as subfolders:
            for subfolder in subfolders:
                if not subfolder.is_dir():
                    continue
                image_count = 0
                annotation_count = 0
                folder_size = 0
                # One directory scan per subfolder instead of three listings.
                with os.scandir(subfolder.path) as entries:
                    for entry in entries:
                        # Skip nested directories and broken symlinks: their
                        # "size" is not file content and their names must not
                        # be counted as images or annotations.
                        if not entry.is_file():
                            continue
                        lowered_name = entry.name.lower()
                        if lowered_name.endswith(image_extensions):
                            image_count += 1
                        elif lowered_name.endswith(annotation_extension):
                            annotation_count += 1
                        folder_size += entry.stat().st_size
                details[subfolder.name] = {
                    "image_count": image_count,
                    "annotation_count": annotation_count,
                    "folder_size": folder_size
                }
        return details

    # Get details of both folders
    details1 = get_folder_details(folder1)
    details2 = get_folder_details(folder2)

    # Stand-in for a subfolder that exists under only one of the two roots.
    missing = {"image_count": 0, "annotation_count": 0, "folder_size": 0}

    # Compare folders; sorting makes the row order reproducible between runs
    # (iteration order of a set of strings is not).
    comparison = {}
    for folder in sorted(details1.keys() | details2.keys()):
        folder1_details = details1.get(folder, missing)
        folder2_details = details2.get(folder, missing)
        comparison[folder] = {
            "folder1_images": folder1_details["image_count"],
            "folder2_images": folder2_details["image_count"],
            "image_difference": folder1_details["image_count"] != folder2_details["image_count"],
            "folder1_annotations": folder1_details["annotation_count"],
            "folder2_annotations": folder2_details["annotation_count"],
            "annotation_difference": folder1_details["annotation_count"] != folder2_details["annotation_count"],
            "folder1_size": folder1_details["folder_size"],
            "folder2_size": folder2_details["folder_size"],
            "size_difference": folder1_details["folder_size"] - folder2_details["folder_size"]
        }
    return comparison

# The two dataset roots being compared
folder1 = "D:/FlagDetectionDatasets/ExportedDatasetsSelected"
folder2 = "D:/FlagDetectionDatasets/ExportedDatasetsSelectedML"

# Run the comparison and turn the per-folder dict into a table, one row per
# subfolder, with the subfolder name in its own "Folder Name" column
comparison_result = compare_folders_of_folders(folder1, folder2)
comparison_df = (
    pd.DataFrame.from_dict(comparison_result, orient='index')
    .reset_index()
    .rename(columns={"index": "Folder Name"})
)

# Build the TOTAL row: numeric columns are summed, while the boolean
# "difference" flags have no meaningful total and are shown as "N/A"
report_columns = [
    "folder1_images",
    "folder2_images",
    "image_difference",
    "folder1_annotations",
    "folder2_annotations",
    "annotation_difference",
    "folder1_size",
    "folder2_size",
    "size_difference",
]
flag_columns = {"image_difference", "annotation_difference"}
totals = {"Folder Name": "TOTAL"}
for column in report_columns:
    if column in flag_columns:
        totals[column] = "N/A"
    else:
        totals[column] = comparison_df[column].sum()
comparison_df = pd.concat([comparison_df, pd.DataFrame([totals])], ignore_index=True)


def _bytes_to_mb(value):
    """Convert an integer byte count to MB (2 decimals); pass other values through."""
    if isinstance(value, int):
        return round(value / (1024 * 1024), 2)
    return value


# Express every size column in MB instead of bytes for readability
for size_column in ("folder1_size", "folder2_size", "size_difference"):
    comparison_df[size_column] = comparison_df[size_column].apply(_bytes_to_mb)

# Show the full table without pandas truncating rows
with pd.option_context('display.max_rows', None):
    display(comparison_df)

# Result for default PNG format and quality-10 JPEGs: folder2 was 807.30 MB larger than folder1 in total (size_difference = folder1 - folder2).
# Folder Name	folder1_images	folder2_images	image_difference	folder1_annotations	folder2_annotations	annotation_difference	folder1_size	folder2_size	size_difference
#  TOTAL	   8721	           8721	          N/A	                8720	             8720	                   N/A	            3517.14	        4324.44	       -807.30


Unnamed: 0,Folder Name,folder1_images,folder2_images,image_difference,folder1_annotations,folder2_annotations,annotation_difference,folder1_size,folder2_size,size_difference
0,Job_125_Aug,37,37,False,37,37,False,8.02,18.33,-10.3
1,Job_159,17,17,False,17,17,False,33.67,36.48,-2.81
2,Job_114,96,96,False,96,96,False,48.64,53.66,-5.02
3,Job_143_Aug,13,13,False,13,13,False,5.62,6.57,-0.94
4,Job_120,58,58,False,58,58,False,37.9,41.16,-3.26
5,Job_108,5,5,False,5,5,False,3.28,3.51,-0.23
6,Job_117_Aug,49,49,False,49,49,False,11.87,13.6,-1.73
7,Job_60,54,54,False,54,54,False,19.3,22.96,-3.66
8,Job_13,49,49,False,49,49,False,6.92,8.84,-1.92
9,Job_22,101,101,False,101,101,False,33.02,39.42,-6.4
