# Bisleri Product Detection - Training Notebook

Fine-tune YOLOv8 on rural Indian product categories for the Bisleri marketplace.

**Runtime:** GPU (T4 or better) — go to Runtime > Change runtime type > GPU

**Optimized for free-tier Colab** (12GB RAM, T4 16GB VRAM). Caps images per category to keep memory and disk usage manageable.

**Categories (20):** handicraft, textile, pottery, jewelry, food_grain, spice, pickle, oil, basket_weaving, embroidery, leather_craft, metal_craft, wood_craft, bamboo_craft, jute_product, honey, dairy_product, organic_produce, herbal_product, handloom

## 1. Install dependencies

In [None]:
!pip install -q ultralytics pillow tqdm kaggle

## 2. Kaggle credentials

Upload your `kaggle.json` or paste your credentials below.

Get it from: https://www.kaggle.com/settings > API > Create New Token

In [None]:
import os
from pathlib import Path

# Make sure ~/.kaggle/kaggle.json exists and is private before the Kaggle API
# is used. On Colab the file is requested through an upload prompt; anywhere
# else it must already be in place.
kaggle_dir = Path.home() / ".kaggle"
kaggle_dir.mkdir(parents=True, exist_ok=True)
kaggle_json = kaggle_dir / "kaggle.json"

try:
    from google.colab import files
    in_colab = True
except ImportError:
    in_colab = False

if in_colab and not kaggle_json.exists():
    print("Upload your kaggle.json:")
    uploaded = files.upload()
    if uploaded:
        # Prefer the file literally named kaggle.json; otherwise take the
        # first upload (the browser may rename it, e.g. "kaggle (1).json").
        payload = uploaded.get("kaggle.json")
        if payload is None:
            payload = next(iter(uploaded.values()))
        kaggle_json.write_bytes(payload)

if not kaggle_json.exists():
    # Fail with an actionable message instead of an opaque chmod error.
    raise FileNotFoundError(
        f"{kaggle_json} not found. Create an API token at "
        "https://www.kaggle.com/settings and place kaggle.json there."
    )

# Restrict the token to the current user (read/write for owner only).
os.chmod(kaggle_json, 0o600)
print("Kaggle credentials ready")

## 3. Configuration

In [None]:
# Directory the raw Kaggle downloads are unpacked into (one sub-directory per dataset).
DATASETS_DIR = Path("datasets/raw")
# Root of the converted YOLO dataset (images/, labels/ and dataset.yaml live here).
OUTPUT_DIR = Path("datasets/products")
# Probability that a converted image is assigned to the train split (else val).
TRAIN_SPLIT = 0.8
# Images are resized to IMG_SIZE x IMG_SIZE; also passed to training/export as imgsz.
IMG_SIZE = 640
EPOCHS = 50  # passed to model.train(epochs=...)
BATCH_SIZE = 8  # passed to model.train(batch=...); lowered to 4 when the GPU reports < 10 GB
WORKERS = 2  # passed to model.train(workers=...)

# Registry of Kaggle datasets to download and convert.
#
# Each key is a local name (used as the sub-directory under DATASETS_DIR).
# Each value has:
#   "slug"       - the Kaggle "<owner>/<dataset>" identifier passed to the API.
#   "categories" - mapping of source folder name -> target category (a key of
#                  CATEGORY_TO_ID). The special key "*" means every image in
#                  the dataset is assigned to that single category.
KAGGLE_DATASETS = {
    # --- textile / handloom ---
    "fashion_products": {
        "slug": "paramaggarwal/fashion-product-images-dataset",
        "categories": {
            "Saree": "handloom", "Kurta": "textile", "Dupatta": "textile",
            "Lehenga Choli": "textile", "Tops": "textile",
            "Dresses": "textile", "Shirts": "textile",
        },
    },
    "textile": {
        "slug": "saurabhshahane/textile-dataset",
        "categories": {
            "cotton": "textile", "silk": "handloom", "wool": "textile",
            "polyester": "textile", "denim": "textile",
        },
    },
    "clothing": {
        "slug": "agrigorev/clothing-dataset-full",
        "categories": {
            "Dress": "textile", "T-Shirt": "textile", "Shirt": "textile",
            "Shorts": "textile", "Skirt": "textile",
        },
    },
    "ten_fabrics": {
        "slug": "saharshakir/ten-fabrics-dataset-tfd",
        "categories": {"*": "handloom"},
    },

    # --- embroidery / patterns ---
    "traditional_decor": {
        "slug": "olgabelitskaya/traditional-decor-patterns",
        "categories": {"*": "embroidery"},
    },
    "dress_patterns": {
        "slug": "nguyngiabol/dress-pattern-dataset",
        "categories": {"*": "embroidery"},
    },

    # --- food / spice / pickle ---
    "indian_food": {
        "slug": "l33tc0d3r/indian-food-classification",
        "categories": {
            "dal": "food_grain", "rice": "food_grain", "chapati": "food_grain",
            "pickle": "pickle", "chutney": "pickle",
            "ladoo": "food_grain", "jalebi": "food_grain", "samosa": "food_grain",
        },
    },
    "indian_food_images": {
        "slug": "iamsouravbanerjee/indian-food-images-dataset",
        "categories": {
            "dal_makhani": "food_grain", "kadai_paneer": "food_grain",
            "pakode": "food_grain",
        },
    },
    "spices": {
        "slug": "jchymdvok/spices",
        "categories": {"*": "spice"},
    },
    "indonesian_spices": {
        "slug": "albertnathaniel12/indonesian-spices-dataset",
        "categories": {"*": "spice"},
    },
    "fruits_vegetables": {
        "slug": "kritikseth/fruit-and-vegetable-image-recognition",
        "categories": {"*": "organic_produce"},
    },

    # --- dairy / grocery ---
    "grocery": {
        "slug": "validmodel/grocery-store-dataset",
        "categories": {
            "Juice": "organic_produce", "Milk": "dairy_product",
            "Yoghurt": "dairy_product", "Cheese": "dairy_product",
            "Fruit": "organic_produce", "Vegetables": "organic_produce",
        },
    },

    # --- pottery ---
    "pottery": {
        "slug": "harasysodi/iranian-pottery",
        "categories": {"*": "pottery"},
    },

    # --- jewelry ---
    "tanishq_jewelry": {
        "slug": "sapnilpatel/tanishq-jewellery-dataset",
        "categories": {"*": "jewelry"},
    },
    "jewelry": {
        "slug": "shauryachichra5/jewellery-dataset",
        "categories": {"*": "jewelry"},
    },
    "jewelry_db": {
        "slug": "harshjangid0015/jewelry-database",
        "categories": {"*": "jewelry"},
    },

    # --- leather ---
    "handbags": {
        "slug": "dataclusterlabs/handbag-image-dataset-luggage-dataset",
        "categories": {"*": "leather_craft"},
    },
    "bags": {
        "slug": "ravirajsinh45/bags-classification",
        "categories": {"*": "leather_craft"},
    },
    "shoes": {
        "slug": "utkarshsaxenadn/shoes-classification-dataset-13k-images",
        "categories": {"*": "leather_craft"},
    },

    # --- herbal / medicinal ---
    "medicinal_leaves": {
        "slug": "aryashah2k/indian-medicinal-leaves-dataset",
        "categories": {"*": "herbal_product"},
    },
    "medicinal_plants": {
        "slug": "warcoder/indian-medicinal-plant-image-dataset",
        "categories": {"*": "herbal_product"},
    },
    "ayurvedic": {
        "slug": "kagglekirti123/ayurgenixai-ayurvedic-dataset",
        "categories": {"*": "herbal_product"},
    },

    # --- oil (bottles) ---
    "bottles_cans": {
        "slug": "moezabid/bottles-and-cans",
        "categories": {"*": "oil"},
    },
    "bottles_cups": {
        "slug": "dataclusterlabs/bottles-and-cups-dataset",
        "categories": {"*": "oil"},
    },

    # --- handicraft (general products) ---
    "product_images": {
        "slug": "freshersstaff/product-images-dataset",
        "categories": {"*": "handicraft"},
    },
}

# Category names in YOLO class-id order: a name's position in this tuple is
# its class id.
_CATEGORY_ORDER = (
    "handicraft", "textile", "pottery", "jewelry",
    "food_grain", "spice", "pickle", "oil",
    "basket_weaving", "embroidery", "leather_craft",
    "metal_craft", "wood_craft", "bamboo_craft",
    "jute_product", "honey", "dairy_product",
    "organic_produce", "herbal_product", "handloom",
)

# Forward lookup: category name -> class id.
CATEGORY_TO_ID = {label: class_id for class_id, label in enumerate(_CATEGORY_ORDER)}

# Reverse lookup: class id -> category name.
ID_TO_CATEGORY = dict(enumerate(_CATEGORY_ORDER))
# Echo the effective configuration so the cell output records what was used.
print(f"{len(CATEGORY_TO_ID)} categories, {len(KAGGLE_DATASETS)} datasets")
print(f"Batch size {BATCH_SIZE}, {WORKERS} workers")

## 4. Download datasets from Kaggle

In [None]:
import gc
import zipfile
from tqdm.notebook import tqdm
from kaggle.api.kaggle_api_extended import KaggleApi

import shutil  # local import: used to discard partially downloaded datasets

# Download every dataset in KAGGLE_DATASETS into DATASETS_DIR/<name>, unpacking
# and deleting each archive as soon as it arrives to keep disk usage low.
# A dataset that fails (removed slug, network error, corrupt archive) is
# reported and skipped so the remaining datasets are still fetched.
api = KaggleApi()
api.authenticate()
DATASETS_DIR.mkdir(parents=True, exist_ok=True)

total = len(KAGGLE_DATASETS)
failed = []  # local names of datasets that could not be fetched
for i, (name, info) in enumerate(KAGGLE_DATASETS.items(), 1):
    dest = DATASETS_DIR / name
    if dest.exists() and any(dest.iterdir()):
        print(f"[{i}/{total}] {name} -- already exists, skipping")
        continue

    dest.mkdir(parents=True, exist_ok=True)
    slug = info["slug"]

    # Best-effort size lookup, shown in the progress line only.
    size_str = ""
    try:
        for ds in api.dataset_list(search=slug.split("/")[-1]):
            if str(ds) == slug:
                size_str = f" ({ds.totalBytes / 1024 / 1024:.0f} MB)"
                break
    except Exception:
        pass  # the size is cosmetic; never block the download on it

    print(f"[{i}/{total}] Downloading {slug}{size_str}...")
    try:
        api.dataset_download_files(slug, path=str(dest), quiet=False)

        for zf in dest.glob("*.zip"):
            print(f"  Extracting {zf.name}...")
            with zipfile.ZipFile(zf, "r") as z:
                members = z.namelist()
                for member in tqdm(members, desc="  Extracting", unit="file", leave=False):
                    z.extract(member, dest)
            zf.unlink()  # delete zip immediately to free disk
    except Exception as exc:
        # Remove the partial directory so a re-run retries this dataset
        # instead of treating it as "already exists".
        shutil.rmtree(dest, ignore_errors=True)
        failed.append(name)
        print(f"[{i}/{total}] FAILED: {name} ({exc!r})")
        continue
    except BaseException:
        # Interrupted (e.g. KeyboardInterrupt): clean up, then propagate.
        shutil.rmtree(dest, ignore_errors=True)
        raise

    gc.collect()
    print(f"[{i}/{total}] Done: {name}")

if failed:
    print(f"\n{total - len(failed)}/{total} datasets available; failed: {', '.join(failed)}")
else:
    print(f"\nAll {total} datasets downloaded.")

## 5. Convert to YOLO format

In [None]:
import gc
import random
import shutil
from PIL import Image
from tqdm.notebook import tqdm

# Fixed seed so the train/val assignment and the synthetic box sizes drawn
# from `random` below are reproducible between runs.
random.seed(42)
# NOTE(review): per Pillow's documentation, setting this to None disables the
# decompression-bomb size check. The inputs are third-party downloads, so
# confirm this trade-off is intended.
Image.MAX_IMAGE_PIXELS = None


def find_images(directory, extensions=(".jpg", ".jpeg", ".png", ".webp", ".bmp")):
    """Yield every image file found anywhere below *directory*.

    The tree is walked once and the file suffix is compared
    case-insensitively, so each file is yielded at most once and mixed-case
    extensions such as ``.Jpg`` are included. Directories whose names happen
    to end in an image extension are ignored.

    Args:
        directory: ``pathlib.Path`` of the directory to search recursively.
        extensions: File suffixes (with leading dot) treated as images.

    Yields:
        ``pathlib.Path`` objects for the matching files, in traversal order.
    """
    wanted = frozenset(ext.lower() for ext in extensions)
    for path in directory.rglob("*"):
        if path.suffix.lower() in wanted and path.is_file():
            yield path


def iter_jobs(info, raw_dir):
    """Yield one ``(image_path, class_id, category)`` tuple per image to convert.

    Args:
        info: A ``KAGGLE_DATASETS`` entry; only ``info["categories"]`` is read.
        raw_dir: ``pathlib.Path`` of the unpacked dataset.

    Yields:
        Tuples of the source image path, the numeric class id looked up in
        ``CATEGORY_TO_ID`` and the category name.

    Raises:
        KeyError: If a mapped category is not a key of ``CATEGORY_TO_ID``.
    """
    categories = info["categories"]

    if "*" in categories:
        # Wildcard: every image in the dataset belongs to one category. Read
        # the "*" entry itself rather than whichever value happens to be first.
        cat = categories["*"]
        cid = CATEGORY_TO_ID[cat]
        for p in find_images(raw_dir):
            yield p, cid, cat
        return

    for folder, cat in categories.items():
        cid = CATEGORY_TO_ID[cat]
        # Exact folder-name matches anywhere in the tree; only directories
        # count, so a stray file with the same name cannot hide the fallback.
        dirs = [d for d in raw_dir.rglob(folder) if d.is_dir()]
        if not dirs:
            # Fallback: case-insensitive substring match on directory names.
            needle = folder.lower()
            dirs = [
                d for d in raw_dir.rglob("*")
                if d.is_dir() and needle in d.name.lower()
            ]
        for d in dirs:
            for p in find_images(d):
                yield p, cid, cat


# Create the YOLO directory layout: images/{train,val} and labels/{train,val}.
for split in ("train", "val"):
    (OUTPUT_DIR / "images" / split).mkdir(parents=True, exist_ok=True)
    (OUTPUT_DIR / "labels" / split).mkdir(parents=True, exist_ok=True)

stats = {cat: 0 for cat in CATEGORY_TO_ID}  # images written so far, per category
errors = 0  # images that could not be read or re-encoded

# NOTE(review): the notebook introduction mentions capping images per
# category, but no cap is applied in this loop -- confirm which is intended.
for ds_name, info in KAGGLE_DATASETS.items():
    raw_dir = DATASETS_DIR / ds_name
    if not raw_dir.exists():
        print(f"Skipping {ds_name} (not downloaded)")
        continue

    processed = 0
    for img_path, class_id, category in tqdm(
        iter_jobs(info, raw_dir), desc=ds_name, unit="img"
    ):
        split = "train" if random.random() < TRAIN_SPLIT else "val"
        # The running per-category count doubles as the file index, so names
        # are unique within a category across all datasets.
        idx = stats[category]
        stem = f"{category}_{idx:05d}"

        dest_img = OUTPUT_DIR / "images" / split / f"{stem}.jpg"
        dest_lbl = OUTPUT_DIR / "labels" / split / f"{stem}.txt"
        # Encode to a temporary name first so a failed save can never leave a
        # truncated .jpg (without a label) behind in the dataset.
        tmp_img = dest_img.with_name(dest_img.name + ".part")

        try:
            with Image.open(img_path) as img:
                if img.mode != "RGB":
                    img = img.convert("RGB")
                img = img.resize((IMG_SIZE, IMG_SIZE), Image.LANCZOS)
                img.save(tmp_img, format="JPEG", quality=85)
            tmp_img.replace(dest_img)
        except Exception:
            # Best effort: count the unreadable image and move on.
            tmp_img.unlink(missing_ok=True)
            errors += 1
            continue

        # The source datasets carry no boxes, so a synthetic centred box
        # covering 85-95% of the frame is written as the label.
        bw = random.uniform(0.85, 0.95)
        bh = random.uniform(0.85, 0.95)
        dest_lbl.write_text(f"{class_id} 0.5000 0.5000 {bw:.4f} {bh:.4f}\n")
        stats[category] += 1
        processed += 1

    # Raw data is no longer needed once converted; free the disk space.
    if raw_dir.exists():
        shutil.rmtree(raw_dir)
    gc.collect()
    print(f"  -> {processed} images converted")

# Per-category summary, largest first; empty categories are omitted.
print(f"\n{'Category':<20} {'Count':>8}")
print("-" * 30)
for cat, count in sorted(stats.items(), key=lambda x: -x[1]):
    if count > 0:
        print(f"{cat:<20} {count:>8}")
print("-" * 30)
print(f"{'Total':<20} {sum(stats.values()):>8}")
if errors:
    print(f"Skipped {errors} corrupt/unreadable images")

## 6. Write dataset.yaml

In [None]:
# Class names ordered by class id, as the `names:` entry expects.
names_list = sorted(CATEGORY_TO_ID, key=CATEGORY_TO_ID.get)

# Assemble dataset.yaml line by line; the trailing empty entry produces the
# final newline.
yaml_lines = [
    f"path: {OUTPUT_DIR.resolve()}",
    "train: images/train",
    "val: images/val",
    "",
    f"nc: {len(CATEGORY_TO_ID)}",
    f"names: {names_list}",
    "",
]
yaml_content = "\n".join(yaml_lines)

yaml_path = OUTPUT_DIR / "dataset.yaml"
yaml_path.write_text(yaml_content)
print(f"Written to {yaml_path}")
print(yaml_content)

## 7. Preview samples

In [None]:
import matplotlib.pyplot as plt
import matplotlib.patches as patches

# Show up to 12 training images in a 2x6 grid with their label box overlaid.
train_imgs = sorted((OUTPUT_DIR / "images" / "train").glob("*.jpg"))[:12]

fig, axes = plt.subplots(2, 6, figsize=(18, 6))
for ax, img_path in zip(axes.flat, train_imgs):
    # Open inside a context manager so the file handle is released as soon
    # as the pixels have been handed to matplotlib.
    with Image.open(img_path) as img:
        ax.imshow(img)
        w, h = img.size

    lbl_path = OUTPUT_DIR / "labels" / "train" / (img_path.stem + ".txt")
    if lbl_path.exists():
        parts = lbl_path.read_text().strip().split()
        cid = int(parts[0])
        cx, cy, bw, bh = [float(x) for x in parts[1:]]
        cat = ID_TO_CATEGORY[cid]
        # Labels store normalised centre/size; convert to a pixel-space
        # top-left corner plus width/height for the rectangle patch.
        x1 = (cx - bw / 2) * w
        y1 = (cy - bh / 2) * h
        rect = patches.Rectangle(
            (x1, y1), bw * w, bh * h,
            linewidth=2, edgecolor="lime", facecolor="none"
        )
        ax.add_patch(rect)
        ax.set_title(cat, fontsize=9)

    ax.axis("off")

# Hide the grid cells left over when fewer than 12 images are available.
for ax in list(axes.flat)[len(train_imgs):]:
    ax.axis("off")

plt.tight_layout()
plt.show()

## 8. Train YOLOv8

In [None]:
import torch
import gc

# Pick the best available accelerator: CUDA GPU, then Apple MPS, then CPU.
# The MPS backend is looked up defensively because older torch builds do not
# ship `torch.backends.mps`.
_mps_backend = getattr(torch.backends, "mps", None)
if torch.cuda.is_available():
    device = "cuda"
elif _mps_backend is not None and _mps_backend.is_available():
    device = "mps"
else:
    device = "cpu"

print(f"Training on: {device}")
if device == "cuda":
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    # The device-properties attribute is `total_memory` (in bytes).
    vram_gb = torch.cuda.get_device_properties(0).total_memory / 1024**3
    print(f"VRAM: {vram_gb:.1f} GB")
    if vram_gb < 10:
        # Halve the configured batch size on small GPUs to avoid running
        # out of memory.
        BATCH_SIZE = 4
        print(f"Low VRAM detected, reducing batch size to {BATCH_SIZE}")
    torch.cuda.empty_cache()
gc.collect()

In [None]:
from ultralytics import YOLO

# Start from the pretrained YOLOv8-nano checkpoint and fine-tune it on the
# converted dataset.
model = YOLO("yolov8n.pt")

# Training settings are gathered in one place so they are easy to review.
train_args = {
    "data": str(OUTPUT_DIR / "dataset.yaml"),
    "epochs": EPOCHS,
    "imgsz": IMG_SIZE,
    "batch": BATCH_SIZE,
    "workers": WORKERS,
    "name": "bisleri_products",
    "project": "runs",
    "patience": 10,
    "save": True,
    "plots": True,
    "device": device,
    "cache": False,  # don't cache images in RAM
}

results = model.train(**train_args)

## 9. Training results

In [None]:
# Locate the directory the training run actually wrote to. Ultralytics
# appends a numeric suffix to the run name when the directory already exists
# (e.g. on a re-run), so prefer the trainer's recorded save_dir and fall back
# to the default path only when no trainer is available in this session.
_trainer = getattr(globals().get("model"), "trainer", None)
results_dir = Path(getattr(_trainer, "save_dir", None) or "runs/bisleri_products")

# Display whichever of the standard training plots were produced.
for plot in ["results.png", "confusion_matrix.png", "val_batch0_pred.png"]:
    p = results_dir / plot
    if p.exists():
        fig, ax = plt.subplots(figsize=(12, 8))
        # Close the image file once matplotlib has taken the pixel data.
        with Image.open(p) as img:
            ax.imshow(img)
        ax.set_title(plot)
        ax.axis("off")
        plt.show()

## 10. Test on validation images

In [None]:
# Load the best checkpoint of the run, falling back to the last one when no
# best.pt was written.
best_weights = results_dir / "weights" / "best.pt"
if not best_weights.exists():
    best_weights = results_dir / "weights" / "last.pt"

trained_model = YOLO(str(best_weights))

val_imgs = sorted((OUTPUT_DIR / "images" / "val").glob("*.jpg"))[:4]

if not val_imgs:
    # plt.subplots() cannot create a grid with zero columns.
    print("No validation images found -- nothing to preview")
else:
    # squeeze=False always returns a 2-D array of axes, so one image needs
    # no special-casing.
    fig, axes = plt.subplots(
        1, len(val_imgs), figsize=(5 * len(val_imgs), 5), squeeze=False
    )

    for ax, img_path in zip(axes.flat, val_imgs):
        preds = trained_model(str(img_path), verbose=False)
        annotated = preds[0].plot()
        # The channel axis is reversed for display (plot() is presumed to
        # return BGR, matplotlib expects RGB -- confirm).
        ax.imshow(annotated[:, :, ::-1])
        ax.axis("off")

    plt.tight_layout()
    plt.show()

## 11. Export & download model

In [None]:
import shutil

# Copy the selected checkpoint to a stable file name in the working directory
# and report its size in MB.
export_path = Path("product_detector.pt")
shutil.copy2(best_weights, export_path)
size_bytes = export_path.stat().st_size
size_mb = size_bytes / 1024 / 1024
print(f"Model saved to {export_path} ({size_mb:.1f} MB)")

# On Colab, trigger a browser download; elsewhere, print where the file
# should be copied instead.
try:
    from google.colab import files
    files.download(str(export_path))
except ImportError:
    manual_hint = f"Copy {export_path} to ai/models/product_detector.pt in your project"
    print(manual_hint)

## 12. Export to ONNX (optional)

For faster CPU inference or edge deployment.

In [None]:
# Export the trained model to ONNX at the training image size.
export_options = {"format": "onnx", "imgsz": IMG_SIZE}
onnx_path = trained_model.export(**export_options)
print(f"ONNX exported to {onnx_path}")

# Offer the exported file as a browser download on Colab; do nothing elsewhere.
try:
    from google.colab import files
    files.download(onnx_path)
except ImportError:
    pass