In [3]:
import csv
import json
import random
from pathlib import Path
from collections import Counter

# --- Paths -------------------------------------------------------------------
# Everything is resolved relative to one root under the current user's home
# directory; change PROJECT_ROOT if the project lives somewhere else.
PROJECT_ROOT = Path.home() / "Documents" / "rice_project"

# Raw images; the audit cell treats each immediate sub-folder as one class.
RAW_DIR = PROJECT_ROOT / "data" / "raw"
# NOTE(review): presumably where the generated split files are written —
# it is only printed in this cell, confirm against the cell that uses it.
SPLITS_DIR = PROJECT_ROOT / "data" / "splits"

# --- Split parameters --------------------------------------------------------
# NOTE(review): SEED is only defined here, it is not applied to any RNG in
# this cell — confirm the splitting code actually seeds with it.
SEED = 20260115
TRAIN_FRAC = 0.60
VAL_FRAC = 0.20
TEST_FRAC = 0.20

# Fail fast if someone edits one fraction and forgets the others: fractions
# that do not add up to 1 would silently drop or double-count samples.
_frac_total = TRAIN_FRAC + VAL_FRAC + TEST_FRAC
assert min(TRAIN_FRAC, VAL_FRAC, TEST_FRAC) >= 0, "split fractions must be non-negative"
assert abs(_frac_total - 1.0) < 1e-9, (
    f"TRAIN_FRAC + VAL_FRAC + TEST_FRAC must equal 1.0, got {_frac_total}"
)

# File suffixes counted as images; compared against the lower-cased suffix,
# so matching is case-insensitive.
IMG_EXTS = {".jpg", ".jpeg", ".png", ".bmp", ".tif", ".tiff"}

print("PROJECT_ROOT:", PROJECT_ROOT)
print("RAW_DIR exists:", RAW_DIR.exists())
print("SPLITS_DIR:", SPLITS_DIR)


PROJECT_ROOT: /home/arijit/Documents/rice_project
RAW_DIR exists: True
SPLITS_DIR: /home/arijit/Documents/rice_project/data/splits


### Dataset audit (classes + image counts)

In [4]:
# Number of images every class folder is expected to contain; classes that
# deviate from it are reported at the end of this cell.
EXPECTED_PER_CLASS = 500

# list class folders: every immediate sub-directory of RAW_DIR is one class
class_dirs = sorted(p for p in RAW_DIR.iterdir() if p.is_dir())
print("Number of class folders:", len(class_dirs))
print("First 10 class names:", [p.name for p in class_dirs[:10]])

if not class_dirs:
    # Without this guard the min()/max() calls below fail with an opaque
    # "min() arg is an empty sequence" error.
    raise ValueError(f"No class folders found in {RAW_DIR}")

# count images per class (direct children only; suffix match is case-insensitive)
class_counts = {}
for cdir in class_dirs:
    files = [p for p in cdir.iterdir() if p.is_file() and p.suffix.lower() in IMG_EXTS]
    class_counts[cdir.name] = len(files)

# print summary
counts = list(class_counts.values())
unique_counts = sorted(set(counts))
print("Min images/class:", min(counts))
print("Max images/class:", max(counts))
# Show at most 10 distinct counts; the "..." marks a truncated list.
print("Unique counts:", unique_counts[:10], "..." if len(unique_counts) > 10 else "")

# show any classes whose count differs from the expected one
bad = {k: v for k, v in class_counts.items() if v != EXPECTED_PER_CLASS}
print(f"Classes with count != {EXPECTED_PER_CLASS}:", bad if bad else "None ✅")


Number of class folders: 38
First 10 class names: ['BD30', 'BD33', 'BD39', 'BD49', 'BD51', 'BD52', 'BD56', 'BD57', 'BD70', 'BD72']
Min images/class: 500
Max images/class: 500
Unique counts: [500] 
Classes with count != 500: None ✅
