# Phase 0-3: Dataset Statistics

Computes summary statistics for a YOLO-format dataset: class distribution, bounding-box dimensions, and image resolutions.

In [None]:
from pathlib import Path
from collections import defaultdict
import os
import json
from datetime import datetime

import numpy as np
from PIL import Image
import matplotlib.pyplot as plt

# Sanity check: reaching this line means every import above succeeded.
# Also shows the resolved current working directory.
print(f"Imports OK. CWD: {Path().resolve()}")

In [None]:
# Paths are machine-specific. Adjust PROJECT_ROOT before running on a new machine.
PROJECT_ROOT = "/Users/tyreecruse/Desktop/CS230/Project/Data/Original"
# Or just use the current directory:
# PROJECT_ROOT = os.getcwd()

# Settings shared by the analysis and reporting functions defined in this notebook.
CONFIG = {
    # Dataset root; expected to contain "images/" and "labels/" sub-directories.
    "dataset_path": os.path.join(PROJECT_ROOT, "master_dataset_pool"),
    # Upper bound on how many images are opened for the resolution statistics.
    "sample_size": 1000,
    # When true, main() writes a JSON summary into "report_dir".
    "save_report": True,
    "report_dir": os.path.join(PROJECT_ROOT, "dataset_reports"),
}

print("Config")
print("------")
# Dict insertion order matches the order the settings are declared above.
for key, value in CONFIG.items():
    print(f"{key:>12}: {value}")

dataset_path = Path(CONFIG["dataset_path"])
images_dir = dataset_path / "images"
labels_dir = dataset_path / "labels"

if dataset_path.exists():
    # Count regular files only, so this preview agrees with the image total that
    # analyze_dataset() reports (sub-directories inside images/ are not images).
    n_img = (
        sum(1 for entry in images_dir.glob("*") if entry.is_file())
        if images_dir.is_dir()
        else 0
    )
    n_lbl = sum(1 for _ in labels_dir.glob("*.txt")) if labels_dir.is_dir() else 0
    print(f"\nDataset found at {dataset_path}")
    print(f"  images: {n_img:,}")
    print(f"  labels: {n_lbl:,}")
else:
    print(f"\n[warning] dataset not found at {dataset_path}")

In [None]:
def analyze_class_distribution(labels_dir):
    """Return class-count stats for YOLO label files in labels_dir.

    Every ``*.txt`` entry directly inside ``labels_dir`` is treated as the
    annotations of one image. A line counts as a box when it has exactly five
    whitespace-separated fields and the first one parses as an integer class
    id; the four coordinate fields are not validated here. All other lines
    are ignored.

    Entries that cannot be read as text (I/O or decoding errors) are skipped
    entirely and contribute no ``boxes_per_image`` entry.

    Args:
        labels_dir: Directory holding the label files (``str`` or ``Path``).
            A missing or non-directory path yields the all-zero summary.

    Returns:
        dict with the keys:
            class_counts: ``{class_id: number_of_boxes}``.
            num_classes: number of distinct class ids seen.
            total_boxes: total number of counted boxes.
            boxes_per_image: box count per readable label file, in sorted
                file order.
            avg_boxes_per_image / std_boxes_per_image: mean and population
                standard deviation of ``boxes_per_image`` (0.0 when empty).
            min_boxes_per_image / max_boxes_per_image: extremes (0 when empty).
    """
    labels_dir = Path(labels_dir)

    class_counts = defaultdict(int)
    boxes_per_image = []

    # A missing directory just means "no files": the single return below then
    # produces the all-zero summary, so no separate early-return literal is needed.
    label_files = sorted(labels_dir.glob("*.txt")) if labels_dir.is_dir() else []

    for label_path in label_files:
        try:
            lines = label_path.read_text().splitlines()
        except (OSError, UnicodeDecodeError):
            # Unreadable entry (permissions, a directory named *.txt, bad encoding).
            continue

        boxes_in_image = 0
        for line in lines:
            parts = line.split()
            if len(parts) != 5:
                continue
            try:
                class_id = int(parts[0])
            except ValueError:
                continue
            class_counts[class_id] += 1
            boxes_in_image += 1

        # Label files with zero valid boxes still count as an image with 0 boxes.
        boxes_per_image.append(boxes_in_image)

    if boxes_per_image:
        arr = np.asarray(boxes_per_image, dtype=float)
        avg = float(arr.mean())
        std = float(arr.std())
        min_b = int(arr.min())
        max_b = int(arr.max())
    else:
        avg = std = 0.0
        min_b = max_b = 0

    return {
        "class_counts": dict(class_counts),
        "num_classes": len(class_counts),
        "total_boxes": sum(boxes_per_image),
        "boxes_per_image": boxes_per_image,
        "avg_boxes_per_image": avg,
        "std_boxes_per_image": std,
        "min_boxes_per_image": min_b,
        "max_boxes_per_image": max_b,
    }

In [None]:
def analyze_bbox_dimensions(labels_dir, max_files=1000):
    """Return width/height/area/aspect stats for YOLO boxes.

    Reads ``*.txt`` files directly inside ``labels_dir`` in sorted order. A
    line is used when it has exactly five whitespace-separated fields that
    all parse as numbers; the last two are taken as box width and height.
    Values are used as stored in the files; no rescaling is applied.

    Lines whose width or height is ``nan``/``inf`` are skipped. ``float()``
    accepts those spellings, and a single one would turn every average into
    NaN/inf, which also serialises to invalid JSON.

    Args:
        labels_dir: Directory holding the label files (``str`` or ``Path``).
        max_files: Read at most this many files. ``None`` or a negative value
            means no limit; ``0`` reads no files.

    Returns:
        dict with the raw lists ``widths``, ``heights``, ``areas`` and
        ``aspect_ratios`` (width / height, only for boxes with height > 0),
        plus their means ``avg_width``, ``avg_height``, ``avg_area`` and
        ``avg_aspect_ratio`` (0.0 when the corresponding list is empty).
    """
    labels_dir = Path(labels_dir)

    label_files = sorted(labels_dir.glob("*.txt"))
    # Only a non-negative limit truncates. Used directly as a slice bound, a
    # negative value would silently drop files from the END of the list.
    if max_files is not None and max_files >= 0:
        label_files = label_files[:max_files]

    widths, heights, areas, aspect_ratios = [], [], [], []

    for label_path in label_files:
        try:
            lines = label_path.read_text().splitlines()
        except (OSError, UnicodeDecodeError):
            # Unreadable entry; skip it and keep going.
            continue

        for line in lines:
            parts = line.split()
            if len(parts) != 5:
                continue
            try:
                # All five fields must be numeric; only width and height are used.
                _, _, _, w, h = map(float, parts)
            except ValueError:
                continue
            if not (np.isfinite(w) and np.isfinite(h)):
                continue

            widths.append(w)
            heights.append(h)
            areas.append(w * h)
            # Zero-height boxes have no defined aspect ratio.
            if h > 0:
                aspect_ratios.append(w / h)

    def mean_or_zero(values):
        if not values:
            return 0.0
        return float(np.asarray(values, dtype=float).mean())

    return {
        "widths": widths,
        "heights": heights,
        "areas": areas,
        "aspect_ratios": aspect_ratios,
        "avg_width": mean_or_zero(widths),
        "avg_height": mean_or_zero(heights),
        "avg_area": mean_or_zero(areas),
        "avg_aspect_ratio": mean_or_zero(aspect_ratios),
    }

In [None]:
def analyze_image_resolutions(images_dir, sample_size=1000):
    """Summarise the pixel dimensions of the files found in images_dir.

    Regular files directly inside ``images_dir`` are taken in sorted order.
    When ``sample_size`` is a positive number only the first ``sample_size``
    of them are opened; ``None``, ``0`` or a negative value means all files.
    Files that cannot be opened as images are skipped.

    Returns a dict with the raw ``widths``, ``heights`` and ``aspect_ratios``
    lists (width / height, only when height > 0), the means ``avg_width`` and
    ``avg_height`` (0.0 when nothing was read), and the integer extremes
    ``min_width``, ``max_width``, ``min_height``, ``max_height`` (0 when
    nothing was read).
    """

    def _pixel_size(path):
        # Best effort: any failure while opening/reading yields None.
        try:
            with Image.open(path) as handle:
                width, height = handle.size
            return width, height
        except Exception:
            return None

    def _average(values):
        return float(np.asarray(values, dtype=float).mean()) if values else 0.0

    def _extreme(pick, values):
        return int(pick(values)) if values else 0

    candidates = sorted(
        entry for entry in Path(images_dir).glob("*") if entry.is_file()
    )
    if sample_size and sample_size > 0:
        candidates = candidates[:sample_size]

    sizes = [size for size in map(_pixel_size, candidates) if size is not None]

    widths = [width for width, _ in sizes]
    heights = [height for _, height in sizes]
    aspect_ratios = [width / height for width, height in sizes if height > 0]

    return {
        "widths": widths,
        "heights": heights,
        "aspect_ratios": aspect_ratios,
        "avg_width": _average(widths),
        "avg_height": _average(heights),
        "min_width": _extreme(min, widths),
        "max_width": _extreme(max, widths),
        "min_height": _extreme(min, heights),
        "max_height": _extreme(max, heights),
    }

In [None]:
def analyze_dataset(dataset_path, config):
    """Compute basic counts plus class, bbox, and resolution stats.

    Prints a human-readable summary while running and returns all results in
    one dict.

    Args:
        dataset_path: Dataset root (``str`` or ``Path``). Images are looked up
            in ``images/`` and label files in ``labels/`` beneath it; a missing
            sub-directory is treated as empty.
        config: Mapping of settings. Only ``"sample_size"`` is read here
            (default 1000) and passed to ``analyze_image_resolutions``. The
            bounding-box pass uses ``analyze_bbox_dimensions``' own default
            file limit.

    Returns:
        dict with the keys ``timestamp`` (ISO-8601, second precision, local
        time), ``dataset_path``, ``basic_stats``, ``class_distribution``,
        ``bbox_dimensions`` and ``image_resolutions``. ``basic_stats`` holds
        ``total_images``, ``total_labels`` and ``coverage``: the percentage
        of image files that have a label file with the same stem.

    Raises:
        FileNotFoundError: If ``dataset_path`` does not exist.
    """
    dataset_path = Path(dataset_path)
    images_dir = dataset_path / "images"
    labels_dir = dataset_path / "labels"

    if not dataset_path.exists():
        raise FileNotFoundError(f"Dataset path does not exist: {dataset_path}")

    images = sorted(p for p in images_dir.glob("*") if p.is_file()) if images_dir.is_dir() else []
    labels = sorted(labels_dir.glob("*.txt")) if labels_dir.is_dir() else []

    # Coverage = share of images with a same-stem label file. A plain
    # labels/images ratio is inflated by label files that belong to no image
    # (e.g. orphans or a classes.txt) and can even exceed 100%.
    label_stems = {p.stem for p in labels}
    labeled_images = sum(1 for p in images if p.stem in label_stems)

    basic = {
        "total_images": len(images),
        "total_labels": len(labels),
        "coverage": 100.0 * labeled_images / len(images) if images else 0.0,
    }

    print("\nDataset analysis")
    print("----------------")
    print(f" images : {basic['total_images']:,}")
    print(f" labels : {basic['total_labels']:,}")
    print(f" coverage: {basic['coverage']:.1f}%")

    class_stats = analyze_class_distribution(labels_dir)
    print(f"\n classes: {class_stats['num_classes']}")
    print(f" total boxes: {class_stats['total_boxes']:,}")
    print(f" avg boxes/image: {class_stats['avg_boxes_per_image']:.2f}")

    bbox_stats = analyze_bbox_dimensions(labels_dir)
    print(f"\n bbox avg width : {bbox_stats['avg_width']:.3f}")
    print(f" bbox avg height: {bbox_stats['avg_height']:.3f}")
    print(f" bbox avg area  : {bbox_stats['avg_area']:.3f}")

    res_stats = analyze_image_resolutions(images_dir, config.get("sample_size", 1000))
    # avg_width is 0.0 when no image could be read; skip the size lines then.
    if res_stats["avg_width"]:
        print(f"\n image avg size : {res_stats['avg_width']:.0f} x {res_stats['avg_height']:.0f}")
        print(f" width range    : {res_stats['min_width']} - {res_stats['max_width']}")
        print(f" height range   : {res_stats['min_height']} - {res_stats['max_height']}")

    analysis = {
        "timestamp": datetime.now().isoformat(timespec="seconds"),
        "dataset_path": str(dataset_path),
        "basic_stats": basic,
        "class_distribution": class_stats,
        "bbox_dimensions": bbox_stats,
        "image_resolutions": res_stats,
    }

    return analysis

In [None]:
def plot_class_distribution(analysis):
    """Show a bar chart of box counts per class id.

    Reads ``analysis["class_distribution"]["class_counts"]``. If that mapping
    is missing or empty, a notice is printed and nothing is drawn.
    """
    counts_by_class = analysis.get("class_distribution", {}).get("class_counts", {})

    if counts_by_class:
        class_ids = sorted(counts_by_class)
        bar_heights = [counts_by_class[class_id] for class_id in class_ids]

        fig, ax = plt.subplots(figsize=(6, 4))
        ax.bar(class_ids, bar_heights)
        ax.set_xlabel("Class id")
        ax.set_ylabel("Box count")
        ax.set_title("Class distribution")
        fig.tight_layout()
        plt.show()
    else:
        print("No class data to plot.")


def plot_bbox_area_hist(analysis, bins=30):
    """Show a histogram of bounding-box areas (width * height).

    Reads ``analysis["bbox_dimensions"]["areas"]``. If that list is missing or
    empty, a notice is printed and nothing is drawn. ``bins`` is passed
    straight to the histogram call.
    """
    box_areas = analysis.get("bbox_dimensions", {}).get("areas", [])

    if box_areas:
        fig, ax = plt.subplots(figsize=(6, 4))
        ax.hist(box_areas, bins=bins)
        ax.set_xlabel("Normalized area (w*h)")
        ax.set_ylabel("Count")
        ax.set_title("Bounding-box area distribution")
        fig.tight_layout()
        plt.show()
    else:
        print("No bbox area data to plot.")


def plot_image_resolution_scatter(analysis):
    """Show a scatter plot of image width against image height.

    Reads the ``widths`` and ``heights`` lists from
    ``analysis["image_resolutions"]``. If either is missing or empty, a
    notice is printed and nothing is drawn.
    """
    resolution_stats = analysis.get("image_resolutions", {})
    xs = resolution_stats.get("widths", [])
    ys = resolution_stats.get("heights", [])

    if xs and ys:
        fig, ax = plt.subplots(figsize=(6, 4))
        ax.scatter(xs, ys, s=8, alpha=0.5)
        ax.set_xlabel("Width (px)")
        ax.set_ylabel("Height (px)")
        ax.set_title("Image resolutions")
        fig.tight_layout()
        plt.show()
    else:
        print("No resolution data to plot.")


def save_analysis_report(analysis, config):
    """Write a compact JSON summary of ``analysis`` and return its path.

    The file is created as ``dataset_stats_<YYYYmmdd_HHMMSS>.json`` inside
    ``config["report_dir"]``, which is created if needed. The per-item raw
    lists are left out of each statistics section so the report stays small;
    every other entry of those sections is copied as is.
    """
    # Raw arrays to omit, per statistics section.
    bulky_keys = {
        "class_distribution": ["boxes_per_image"],
        "bbox_dimensions": ["widths", "heights", "areas", "aspect_ratios"],
        "image_resolutions": ["widths", "heights", "aspect_ratios"],
    }

    out_dir = Path(config["report_dir"])
    out_dir.mkdir(parents=True, exist_ok=True)

    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    report_file = out_dir / f"dataset_stats_{stamp}.json"

    # Top-level scalar entries are copied verbatim, in this order.
    payload = {
        name: analysis[name]
        for name in ("timestamp", "dataset_path", "basic_stats")
    }
    for section, dropped in bulky_keys.items():
        payload[section] = {
            key: value
            for key, value in analysis[section].items()
            if key not in dropped
        }

    with report_file.open("w") as handle:
        json.dump(payload, handle, indent=2)

    print(f"\nReport saved to {report_file}")
    return report_file

In [None]:
def main():
    """Analyse the dataset named in CONFIG and return the analysis dict.

    When ``CONFIG["save_report"]`` is truthy, the JSON report is also written
    via ``save_analysis_report``.
    """
    results = analyze_dataset(CONFIG["dataset_path"], CONFIG)

    wants_report = CONFIG.get("save_report", False)
    if wants_report:
        save_analysis_report(results, CONFIG)

    return results


# Keep the result in a module-level name so later cells can reuse it.
if __name__ == "__main__":
    analysis_results = main()