In [1]:
import os
import glob
import plotly.graph_objects as go

def analyze_disease_directory(directory):
    """
    Analyzes a directory containing disease folders and images.

    Args:
        directory: The path to the main directory.

    Returns:
        A dictionary where keys are disease names (folder names) and 
        values are the number of images in each folder.  Returns an
        empty dictionary if the directory doesn't exist or if there
        are problems accessing it.  Also prints informative messages
        to the console.
    """

    try:
        if not os.path.isdir(directory):
            print(f"Error: Directory '{directory}' not found.")
            return {}

        disease_counts = {}
        disease_folders = [f for f in os.listdir(directory) if os.path.isdir(os.path.join(directory, f))]


        if not disease_folders:
            print(f"No disease folders found in '{directory}'.")
            return {}


        for disease_folder in disease_folders:
            disease_path = os.path.join(directory, disease_folder)
            image_extensions = ["*.jpg", "*.jpeg", "*.png", "*.gif", "*.bmp"]  # Add more if needed
            image_count = 0

            for ext in image_extensions:
                image_count += len(glob.glob(os.path.join(disease_path, ext)))  # More robust image counting

            disease_counts[disease_folder] = image_count
            print(f"Disease: {disease_folder}, Images: {image_count}")

        return disease_counts

    except Exception as e:
        print(f"An error occurred: {e}")
        return {}


def plot_disease_counts(disease_counts):
    """Plots disease counts in a pie chart using Plotly."""

    if not disease_counts:
        print("No data to plot.")
        return

    labels = list(disease_counts.keys())
    values = list(disease_counts.values())

    fig = go.Figure(data=[go.Pie(labels=labels, values=values, hoverinfo='label+percent', textinfo='value')])
    fig.update_layout(title="Disease Image Distribution")  # Add a title
    fig.show()



# Example usage:
main_directory = "/Users/anyhow/projects/data_science/Medical-Report-Generation/processed_dataset"  # Replace with the actual path
results = analyze_disease_directory(main_directory)

if results:
    print("\nSummary:")
    for disease, count in results.items():
        print(f"- {disease}: {count} images")

    total_diseases = len(results)
    total_images = sum(results.values())
    print(f"\nTotal Diseases: {total_diseases}")
    print(f"Total Images: {total_images}")

    plot_disease_counts(results) # Call the plotting function

Disease: no_lung_opacity___not_normal, Images: 11821
Disease: hernia, Images: 227
Disease: mass, Images: 5782
Disease: cardiomegaly, Images: 2776
Disease: atelectasis, Images: 11559
Disease: effusion, Images: 13317
Disease: pneumothorax, Images: 5302
Disease: nodule, Images: 6331
Disease: edema, Images: 2303
Disease: lung_opacity, Images: 6036
Disease: pleural_thickening, Images: 3385
Disease: pneumonia, Images: 5704
Disease: normal, Images: 10434
Disease: emphysema, Images: 2516
Disease: infiltration, Images: 19894
Disease: consolidation, Images: 4667
Disease: fibrosis, Images: 1686

Summary:
- no_lung_opacity___not_normal: 11821 images
- hernia: 227 images
- mass: 5782 images
- cardiomegaly: 2776 images
- atelectasis: 11559 images
- effusion: 13317 images
- pneumothorax: 5302 images
- nodule: 6331 images
- edema: 2303 images
- lung_opacity: 6036 images
- pleural_thickening: 3385 images
- pneumonia: 5704 images
- normal: 10434 images
- emphysema: 2516 images
- infiltration: 19894 imag

In [8]:
import os
import cv2
import pydicom
import numpy as np

def analyze_image_folder(root_folder):
    """Analyzes images in a folder and its subfolders (disease folders)."""

    all_image_types = set()
    all_dimensions = set()
    total_image_count = 0

    for disease_folder in os.listdir(root_folder):
        disease_path = os.path.join(root_folder, disease_folder)

        if os.path.isdir(disease_path):  # Check if it's a directory (disease folder)
            image_types, dimensions, image_count = _analyze_images_in_folder(disease_path) # Helper function
            all_image_types.update(image_types)  # Use update for sets
            all_dimensions.update(dimensions)
            total_image_count += image_count
            print(f"  Disease: {disease_folder}")
            print(f"    Image Count: {image_count}")
            print(f"    Image Types: {image_types}")
            print(f"    Dimensions: {dimensions}")
            print("-" * 20)

    return all_image_types, all_dimensions, total_image_count



def _analyze_images_in_folder(folder_path): # Helper function
    """Analyzes images in a single folder (no subfolders)."""
    image_types = set()
    dimensions = set()
    image_count = 0

    for filename in os.listdir(folder_path):
        filepath = os.path.join(folder_path, filename)

        if os.path.isfile(filepath):
            try:
                img = cv2.imread(filepath, cv2.IMREAD_UNCHANGED)
                if img is not None:
                    image_count += 1
                    image_types.add(filename.split('.')[-1].lower())
                    dimensions.add(img.shape)
                else:
                    try:
                        ds = pydicom.dcmread(filepath)
                        img = ds.pixel_array
                        image_count += 1
                        image_types.add("dcm")
                        dimensions.add(img.shape)
                    except Exception as dicom_err:
                        pass # Or handle DICOM errors as needed
            except Exception as e:
                pass # Or handle other errors as needed
    return image_types, dimensions, image_count




# Example usage:
folders = [
    "/Users/anyhow/projects/data_science/Medical-Report-Generation/processed_dataset/train",
    "/Users/anyhow/projects/data_science/Medical-Report-Generation/processed_dataset/test",
    "/Users/anyhow/projects/data_science/Medical-Report-Generation/processed_dataset/val",
]

for folder in folders:
    types, dims, count = analyze_image_folder(folder)
    print(f"Folder: {folder}")
    print(f"  Total Image Count: {count}")
    print(f"  Overall Image Types: {types}")
    print(f"  Overall Dimensions: {dims}")
    print("=" * 30)  # Separator between folders

  Disease: no_lung_opacity___not_normal
    Image Count: 5910
    Image Types: {'png'}
    Dimensions: {(224, 224)}
--------------------
  Disease: hernia
    Image Count: 113
    Image Types: {'png'}
    Dimensions: {(224, 224)}
--------------------
  Disease: mass
    Image Count: 2891
    Image Types: {'png'}
    Dimensions: {(224, 224)}
--------------------
  Disease: cardiomegaly
    Image Count: 1388
    Image Types: {'png'}
    Dimensions: {(224, 224)}
--------------------
  Disease: atelectasis
    Image Count: 5779
    Image Types: {'png'}
    Dimensions: {(224, 224)}
--------------------
  Disease: effusion
    Image Count: 6658
    Image Types: {'png'}
    Dimensions: {(224, 224)}
--------------------
  Disease: pneumothorax
    Image Count: 2651
    Image Types: {'png'}
    Dimensions: {(224, 224)}
--------------------
  Disease: nodule
    Image Count: 3165
    Image Types: {'png'}
    Dimensions: {(224, 224)}
--------------------
  Disease: edema
    Image Count: 1151
   