In [None]:
# Assemble a per-object annotation table; the statistics and plots below read from it.
sns.set(style="whitegrid", rc={"figure.figsize": (8, 5)})

eda_out = out_root / "eda"
eda_out.mkdir(parents=True, exist_ok=True)

# Annotation columns that are left empty (None) for an image without any object.
# The order matches the per-object records so the DataFrame column order is stable.
_empty_annotation_fields = (
    "class_name", "class_id",
    "xmin", "ymin", "xmax", "ymax",
    "img_w", "img_h",
    "bbox_w", "bbox_h",
    "bbox_area", "bbox_area_norm", "bbox_ar",
)

# One record per annotated object; an image whose XML yields no objects still
# contributes a single placeholder record so it is counted as an image.
rows = []
for img_path, xml_path in tqdm(list(zip(IMGS_PATH_LS, XML_PATH_LS)), desc="Parsing XMLs"):
    try:
        objs = parse_xml_coordiantes(xml_path)
    except Exception as e:
        # Report the failure and move on; the image is left out of the table.
        print(f"Failed to parse {xml_path}: {e}")
        continue

    if len(objs) == 0:
        placeholder = {"image_path": str(img_path), "xml_path": str(xml_path)}
        placeholder.update(dict.fromkeys(_empty_annotation_fields))
        rows.append(placeholder)
        continue

    for name, xmin, ymin, xmax, ymax, w, h in objs:
        cid = get_class_id(name)
        bw = xmax - xmin
        bh = ymax - ymin
        area = bw * bh

        # Box area as a fraction of the image area; undefined for a zero-sized image.
        img_area = w * h
        if img_area > 0:
            norm_area = area / img_area
        else:
            norm_area = None

        # Width/height aspect ratio; undefined when the box has no positive height.
        if bh > 0:
            ar = bw / bh
        else:
            ar = None

        rows.append(dict(
            image_path=str(img_path),
            xml_path=str(xml_path),
            class_name=name,
            class_id=cid,
            xmin=xmin, ymin=ymin, xmax=xmax, ymax=ymax,
            img_w=w, img_h=h,
            bbox_w=bw, bbox_h=bh,
            bbox_area=area, bbox_area_norm=norm_area, bbox_ar=ar,
        ))

df = pd.DataFrame(rows)
# Headline numbers: distinct images, annotated objects (rows with a class name),
# and the number of classes declared in breed_names.
num_images = df["image_path"].nunique()
num_annotations = df["class_name"].count()
num_classes = len(breed_names)
print(f"Images: {num_images}, Annotated objects: {num_annotations}, Classes (declared): {num_classes}")

# Instance count per class; placeholder rows (class_name missing) are not counted.
class_freq = df["class_name"].value_counts()
class_freq.to_csv(eda_out / "class_frequency.csv")
print("Top classes:\n", class_freq.head(10))

# Number of annotated objects in each image (placeholder rows count as 0 objects).
obj_per_image = df.groupby("image_path")["class_name"].count()
obj_per_image.describe().to_csv(eda_out / "objects_per_image_stats.csv")

# Histogram with one unit-wide bin per object count, from 0 up to the maximum.
obj_fig, obj_ax = plt.subplots()
unit_bins = range(0, int(obj_per_image.max()) + 2)
sns.histplot(obj_per_image, bins=unit_bins, kde=False, ax=obj_ax)
obj_ax.set_title("Objects per image")
obj_ax.set_xlabel("Number objects")
obj_ax.set_ylabel("Number images")
obj_fig.tight_layout()
obj_fig.savefig(eda_out / "objects_per_image_hist.png")
plt.close(obj_fig)

# Plot: horizontal bar chart of the (up to) 30 most frequent classes.
plt.figure(figsize=(10, 6))
top_n = min(30, len(class_freq))
top_class_counts = class_freq.values[:top_n]
top_class_names = class_freq.index[:top_n]
# seaborn deprecated passing `palette` without `hue` (to be removed in v0.14).
# Assigning the y variable to `hue` and disabling the legend keeps the same
# per-bar viridis colouring without emitting the warning.
sns.barplot(
    x=top_class_counts,
    y=top_class_names,
    hue=top_class_names,
    palette="viridis",
    legend=False,
)
plt.title("Top classes by instance count")
plt.xlabel("Instance count")
plt.tight_layout()
plt.savefig(eda_out / "class_frequency_top.png")
plt.close()

# Distribution of bounding-box area expressed as a fraction of the image area.
area_fig, area_ax = plt.subplots()
sns.histplot(df["bbox_area_norm"].dropna(), bins=50, kde=True, ax=area_ax)
area_ax.set_title("Normalized bbox area distribution")
area_ax.set_xlabel("BBox area (normalized to image)")
area_fig.tight_layout()
area_fig.savefig(eda_out / "bbox_area_norm_hist.png")
plt.close(area_fig)

# Distribution of bounding-box aspect ratio; only the count axis is log-scaled.
ar_fig, ar_ax = plt.subplots()
sns.histplot(df["bbox_ar"].dropna(), bins=50, log_scale=(False, True), ax=ar_ax)
ar_ax.set_title("BBox aspect ratio (width/height) distribution")
ar_ax.set_xlabel("Aspect ratio (w/h)")
ar_fig.tight_layout()
ar_fig.savefig(eda_out / "bbox_aspect_ratio_hist.png")
plt.close(ar_fig)

# Scatter of image dimensions, one point per distinct (image, width, height) row
# that has a known size.
img_sizes = df[["image_path", "img_w", "img_h"]].drop_duplicates().dropna()
size_fig, size_ax = plt.subplots(figsize=(6, 6))
sns.scatterplot(x="img_w", y="img_h", data=img_sizes, alpha=0.6, ax=size_ax)
size_ax.set_title("Image width x height distribution")
size_ax.set_xlabel("Width")
size_ax.set_ylabel("Height")
size_fig.tight_layout()
size_fig.savefig(eda_out / "image_size_scatter.png")
plt.close(size_fig)

# Images per class: number of distinct images that contain at least one object
# of the class (placeholder rows without a class are excluded first).
images_per_class = df.dropna(subset=["class_name"]).groupby("class_name")["image_path"].nunique()
images_per_class.sort_values(ascending=False).head(20).to_csv(eda_out / "images_per_class_top20.csv")

# Plain-text summary of the headline numbers.
# The "Top 10 classes by instance count" section now includes the instance
# count itself; previously only the running index and the class name were
# written, so the file never showed the counts its heading refers to.
# The encoding is pinned so the file does not depend on the platform default.
with open(eda_out / "summary.txt", "w", encoding="utf-8") as fh:
    fh.write(f"Total unique images: {num_images}\n")
    fh.write(f"Total annotated objects: {num_annotations}\n")
    fh.write(f"Declared classes (names list length): {num_classes}\n")
    fh.write("Top 10 classes by instance count:\n")
    top10_lines = [
        f"{i}: {name} ({count})"
        for i, (name, count) in enumerate(class_freq.head(10).items())
    ]
    fh.write("\n".join(top10_lines) + "\n")

# Column-wise descriptive statistics and the first 200 rows of the table,
# saved for quick inspection outside the notebook.
df.describe(include="all").to_csv(eda_out / "df_describe.csv")
df.head(200).to_csv(eda_out / "annotations_sample.csv", index=False)

# Create sample visualizations: for top classes, draw boxes on one image per class
def draw_boxes_on_image(image_path, ann_rows, out_path, label_col="class_name"):
    """Draw annotation boxes and labels on an image and save the result.

    Args:
        image_path: Path of the image to read with ``cv2.imread``.
        ann_rows: DataFrame whose rows carry ``xmin``, ``ymin``, ``xmax``,
            ``ymax`` and the ``label_col`` column. Rows with any missing
            coordinate are skipped.
        out_path: Destination passed to ``cv2.imwrite`` (converted to ``str``).
        label_col: Column whose value is drawn as the text label of each box.

    Returns:
        bool: ``False`` if the image could not be read; otherwise the success
        flag reported by ``cv2.imwrite``. Earlier versions returned ``None``
        in every case, so callers that ignore the result are unaffected.
    """
    img = cv2.imread(str(image_path))
    if img is None:
        # Say so instead of returning silently, so a missing sample is noticed.
        print(f"Could not read image, skipping: {image_path}")
        return False

    color = (0, 255, 0)
    coord_cols = ["xmin", "ymin", "xmax", "ymax"]
    for _, r in ann_rows.iterrows():
        # int() cannot convert NaN/None, so skip rows missing any coordinate
        # (for example the placeholder row of an image without objects).
        if pd.isna(r[coord_cols]).any():
            continue
        x1, y1, x2, y2 = (int(r[c]) for c in coord_cols)
        cv2.rectangle(img, (x1, y1), (x2, y2), color, 2)
        label = str(r[label_col])
        # Text baseline sits 5 px above the box but never higher than y=15,
        # presumably so labels of boxes touching the top edge stay visible.
        cv2.putText(img, label, (x1, max(15, y1-5)), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 1)

    written = bool(cv2.imwrite(str(out_path), img))
    if not written:
        print(f"Could not write image: {out_path}")
    return written

# Fixed seed for picking the sample image of each class. Without it every
# run of the notebook chose different images, so the saved sample_*.jpg files
# were not reproducible under Restart & Run All.
SAMPLE_SEED = 42

# For each of the 6 most frequent classes, render one image containing that
# class with all of that image's annotations drawn on it.
top_classes = class_freq.index[:6]
for cls in top_classes:
    subset = df[df["class_name"] == cls]
    if subset.empty:
        continue
    # Pick a sample image for this class (deterministic for a given df).
    sample_row = subset.sample(1, random_state=SAMPLE_SEED).iloc[0]
    img_path = sample_row["image_path"]
    # Use every annotation of the chosen image, not only those of `cls`.
    ann_for_image = df[df["image_path"] == img_path]
    # '/' in a class name would otherwise be read as a directory separator.
    out_img_path = eda_out / f"sample_{cls.replace('/','_')}.jpg"
    draw_boxes_on_image(img_path, ann_for_image, out_img_path)

print(f"EDA outputs saved to {eda_out}")

Parsing XMLs: 100%|██████████| 20580/20580 [00:00<00:00, 22331.48it/s]

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=class_freq.values[:top_n], y=class_freq.index[:top_n], palette="viridis")


Images: 20580, Annotated objects: 22126, Classes (declared): 120
Top classes:
 class_name
Afghan_hound          287
Maltese_dog           264
Irish_wolfhound       263
Leonberg              256
Scottish_deerhound    246
Samoyed               241
EntleBucher           236
whippet               235
Sealyham_terrier      234
basenji               234
Name: count, dtype: int64
EDA outputs saved to dogs_yolo_dataset/eda


In [None]:
# Show the EDA output directory as the cell result.
eda_out

PosixPath('dogs_yolo_dataset/eda')