# KozAlma AI — Dataset Analysis

Notebook for exploring the dataset: class distribution, bounding-box statistics, corrupted images, and near-duplicate images.

In [None]:
# Notebook setup: extend the import path, pull in the data-check helpers,
# and load the dataset configuration used by every later cell.
import sys
# Put '../backend' first on the module search path (relative to the notebook's
# working directory); presumably this is where the `scripts` package lives.
sys.path.insert(0, '../backend')

from scripts.data_checks import (
    load_config, analyze_class_distribution, analyze_bbox_sizes,
    check_corrupted, find_duplicates, plot_class_distribution, plot_bbox_stats,
    OUTPUT_DIR,
)
from pathlib import Path

# Resolve the dataset YAML to an absolute path so later path arithmetic does
# not depend on the current working directory.
data_path = Path('../data/data.yaml').resolve()
config = load_config(str(data_path))
# Dataset root: the config's 'path' entry joined onto the YAML file's
# directory, falling back to that directory itself when 'path' is absent.
base_path = (data_path.parent / config.get('path', '.')).resolve()
print(f'Dataset base: {base_path}')
# NOTE(review): 'nc' is presumably the number of classes — confirm against data.yaml.
print(f'Classes: {config["nc"]}')

In [None]:
# Per-class annotation counts: print the overall total, then the (at most)
# ten entries returned by most_common(10).
class_counts = analyze_class_distribution(config, base_path)
total_annotations = sum(class_counts.values())
print(f'Total annotations: {total_annotations}')
for class_name, annotation_count in class_counts.most_common(10):
    print(f'  {class_name}: {annotation_count}')

In [None]:
# Plot the class distribution and show the resulting image inline.
import matplotlib.pyplot as plt
# IPython magic: render matplotlib figures inside the notebook output.
%matplotlib inline

# Make sure the output directory exists before plotting.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
plot_class_distribution(class_counts)

# NOTE(review): presumably plot_class_distribution saves
# 'class_distribution.png' into OUTPUT_DIR — confirm in scripts.data_checks.
# Image and display are also used by the bbox-statistics cell further down.
from IPython.display import Image, display
display(Image(filename=str(OUTPUT_DIR / 'class_distribution.png')))

In [None]:
# Bounding-box summary: always report the box count; when at least one box
# exists, also print the mean width and height, plot the statistics, and
# display the 'bbox_stats.png' image from OUTPUT_DIR (presumably written by
# plot_bbox_stats).
bbox_stats = analyze_bbox_sizes(config, base_path)
bbox_total = bbox_stats['count']
print(f'Total bboxes: {bbox_total}')
if bbox_total > 0:
    mean_width = bbox_stats['widths'].mean()
    print(f'Avg width:  {mean_width:.4f}')
    mean_height = bbox_stats['heights'].mean()
    print(f'Avg height: {mean_height:.4f}')
    plot_bbox_stats(bbox_stats)
    bbox_figure_path = OUTPUT_DIR / 'bbox_stats.png'
    display(Image(filename=str(bbox_figure_path)))

In [None]:
# Integrity check: report how many entries check_corrupted returned and
# preview at most the first five of them.
corrupted = check_corrupted(config, base_path)
corrupted_total = len(corrupted)
print(f'Corrupted: {corrupted_total}')
for bad_entry in corrupted[:5]:
    print(f'  {bad_entry}')

In [None]:
# Duplicate scan: report how many pairs find_duplicates returned and show
# the file names of at most the first five pairs.
duplicates = find_duplicates(config, base_path)
duplicate_total = len(duplicates)
print(f'Duplicates: {duplicate_total}')
for first_path, second_path in duplicates[:5]:
    first_name = Path(first_path).name
    second_name = Path(second_path).name
    print(f'  {first_name} <-> {second_name}')