# Project Notes (Sanitized for Git)

This repository contains a **sanitized** version of the Gracity Insects YOLOv8 Classification notebooks.
All tenant-specific identifiers (bucket names, namespaces, OCIDs, local absolute paths) have been replaced by placeholders.

**Author:** Cristina Varas Menadas  
**Last updated:** 2026-02-19

> To run these notebooks, set the configuration values in the first "Configuration" section of each notebook.


# Gracity Insects — 01. Dataset Preparation, Stratified Split & Upload

This notebook:

1. Reads a **local** image dataset organized as `archive/<ClassName>/*.jpg`.
2. Creates a **stratified** split into `train/` and `test/` (used as validation).
3. Uploads images to Object Storage using **Resource Principals**.
4. Writes `classes.json` and `split_manifest.csv` under the `labels/` prefix.


## Configuration

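Update these variables for your tenancy/project. In the code cells, write each prefix **without** a trailing slash: the code inserts the `/` separators itself when it builds object names.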

- **Bucket**: `<BUCKET_NAME>`
- **Dataset prefix** (images): `<PROJECT_PREFIX>/v1/raw/datasets/insects_kaggle_v1/`
- **Labels prefix** (metadata/manifests): `<PROJECT_PREFIX>/v1/labels/insects_kaggle_v1/`
- **Runs prefix** (artifacts): `<PROJECT_PREFIX>/yolo/runs/insects_kaggle_v1/`

We intentionally keep **`test/` as validation** for this starter project (to match your current bucket structure).

## 1.1 Imports

In [None]:
from __future__ import annotations

import io
import os
import csv
import json
import time
import random
from pathlib import Path
from typing import Dict, List, Tuple

import oci
from oci.object_storage import ObjectStorageClient
from tqdm import tqdm

## 1.2 Configuration

In [None]:
# Local root of the dataset; each immediate sub-directory is treated as one class.
LOCAL_DATASET_DIR: str = "<LOCAL_PATH> Gracity/gracity-insects-yolo-cls/data/archive"  # <-- update if needed

# Split settings: share of every class assigned to train/, and the shuffle seed.
TRAIN_RATIO: float = 0.80
SEED: int = 42

# Object Storage destination: bucket, image prefix and metadata prefix.
BUCKET_NAME: str = "<BUCKET_NAME>"
BASE_PREFIX: str = "<PROJECT_PREFIX>/v1/raw/datasets/insects_kaggle_v1"
LABELS_PREFIX: str = "<PROJECT_PREFIX>/v1/labels/insects_kaggle_v1"

# Content-Type sent with each uploaded image, keyed by lowercase file suffix.
CONTENT_TYPE_BY_EXT: Dict[str, str] = {
    ".jpg": "image/jpeg",
    ".jpeg": "image/jpeg",
    ".png": "image/png",
    ".webp": "image/webp",
}

# Suffixes accepted as images: exactly the ones that have a known Content-Type.
ALLOWED_EXTS: set[str] = set(CONTENT_TYPE_BY_EXT)

## 1.3 OCI client (Resource Principals)

In [None]:
# Authenticate with a Resource Principals signer; the client is given an empty
# config dict and relies on the signer instead.
signer = oci.auth.signers.get_resource_principals_signer()
os_client = ObjectStorageClient(config={}, signer=signer)

# Look up the Object Storage namespace once; every later request reuses it.
namespace_response = os_client.get_namespace()
namespace: str = namespace_response.data
print("Namespace:", namespace)

## 1.4 Discover classes and files

In [None]:
def list_class_dirs(root_dir: str) -> List[Path]:
    """Return the class directories directly under ``root_dir``, sorted by name.

    Every immediate sub-directory whose name does not start with ``.`` is
    treated as one class. Plain files and hidden directories are ignored.

    Args:
        root_dir: Path of the dataset root.

    Returns:
        The class directories ordered case-insensitively by name. Names that
        differ only by case are ordered by their exact spelling so the result
        does not depend on the order the filesystem lists entries in.

    Raises:
        FileNotFoundError: ``root_dir`` does not exist.
        NotADirectoryError: ``root_dir`` exists but is not a directory.
        ValueError: ``root_dir`` contains no class directories.
    """
    root = Path(root_dir)
    if not root.exists():
        raise FileNotFoundError(f"LOCAL_DATASET_DIR does not exist: {root_dir}")
    if not root.is_dir():
        # iterdir() would raise the same exception type, but without naming the setting.
        raise NotADirectoryError(f"LOCAL_DATASET_DIR is not a directory: {root_dir}")
    class_dirs = sorted(
        (p for p in root.iterdir() if p.is_dir() and not p.name.startswith(".")),
        # The exact name is a tie-breaker: the class order must be reproducible
        # because it is what gets written out as the class list.
        key=lambda p: (p.name.lower(), p.name),
    )
    if not class_dirs:
        raise ValueError(f"No class directories found under {root_dir}")
    return class_dirs

def list_images(class_dir: Path) -> List[Path]:
    imgs: List[Path] = []
    for p in class_dir.rglob("*"):
        if p.is_file() and p.suffix.lower() in ALLOWED_EXTS:
            imgs.append(p)
    imgs.sort(key=lambda p: str(p).lower())
    return imgs

# Scan the dataset root; the directory names, in the order returned, are the classes.
class_dirs = list_class_dirs(LOCAL_DATASET_DIR)
classes: List[str] = [class_dir.name for class_dir in class_dirs]
print("Classes:", classes)

# Number of image files found for each class.
per_class_counts: Dict[str, int] = {
    class_dir.name: len(list_images(class_dir)) for class_dir in class_dirs
}

per_class_counts

## 1.5 Stratified split per class (train/test)

We split **within each class** so class proportions are preserved.

In [None]:
def split_train_test(paths: List[Path], train_ratio: float, seed: int) -> Tuple[List[Path], List[Path]]:
    """Shuffle ``paths`` with a private seeded RNG and cut them into two parts.

    The input list is left untouched; a copy is shuffled. The cut position is
    ``len(paths) * train_ratio`` rounded to an integer. When there are at least
    two items, the cut is clamped so that neither part is empty.

    Args:
        paths: Items to split (one class worth of image paths).
        train_ratio: Desired fraction of items in the first (train) part.
        seed: Seed for the dedicated ``random.Random`` instance.

    Returns:
        ``(train, test)`` as two lists covering every input item exactly once.
    """
    pool = list(paths)
    random.Random(seed).shuffle(pool)

    total = len(pool)
    cut = int(round(total * train_ratio))
    if total >= 2:
        # Clamp into [1, total - 1] so both parts receive at least one item.
        cut = min(max(cut, 1), total - 1)

    train_part = pool[:cut]
    test_part = pool[cut:]
    return train_part, test_part

# Seed the module-level RNG as well; the split itself draws from its own
# seeded generator inside split_train_test.
random.seed(SEED)

split_index: List[Tuple[str, Path, str]] = []  # (class, local_path, split)

for d in class_dirs:
    imgs = list_images(d)
    tr, te = split_train_test(imgs, TRAIN_RATIO, SEED)
    # Train entries first, then test entries; 'test' is used as validation.
    for split_name, members in (("train", tr), ("test", te)):
        split_index.extend((d.name, p, split_name) for p in members)

len(split_index)

## 1.6 Upload to Object Storage

Objects will be uploaded as:

- `.../train/<ClassName>/<filename>`
- `.../test/<ClassName>/<filename>`

In [None]:
def object_name(split: str, class_name: str, local_path: Path) -> str:
    return f"{BASE_PREFIX}/{split}/{class_name}/{local_path.name}"

def upload_file(local_path: Path, obj_name: str) -> None:
    """Upload one local file to the configured bucket under ``obj_name``.

    The Content-Type is looked up from the file's lowercase suffix and falls
    back to ``application/octet-stream`` for unknown suffixes. The whole file
    is read into memory before it is sent.

    Args:
        local_path: File to upload.
        obj_name: Destination object name inside the bucket.
    """
    suffix = local_path.suffix.lower()
    mime_type = CONTENT_TYPE_BY_EXT.get(suffix, "application/octet-stream")
    payload = local_path.read_bytes()
    os_client.put_object(
        namespace_name=namespace,
        bucket_name=BUCKET_NAME,
        object_name=obj_name,
        put_object_body=payload,
        content_type=mime_type,
    )

manifest_rows: List[List[str]] = []

# Resolve every destination before uploading anything. Images are collected
# recursively but object names keep only the file name, so two files with the
# same name inside one class and split would overwrite each other and leave a
# manifest that lists both. Fail early instead, while the bucket is untouched.
upload_plan: List[Tuple[str, Path, str, str]] = []
first_source_by_object: Dict[str, Path] = {}
for class_name, p, split in split_index:
    obj = object_name(split, class_name, p)
    if obj in first_source_by_object:
        raise ValueError(
            f"Object name collision: {first_source_by_object[obj]} and {p} "
            f"both map to {obj}"
        )
    first_source_by_object[obj] = p
    upload_plan.append((class_name, p, split, obj))

# perf_counter is monotonic, so the elapsed time is immune to clock changes.
t0 = time.perf_counter()
for class_name, p, split, obj in tqdm(upload_plan, desc="Uploading"):
    upload_file(p, obj)
    # One manifest row per uploaded file: local path, class, split, object name.
    manifest_rows.append([str(p), class_name, split, obj])

print(f"Uploaded {len(manifest_rows)} files in {time.perf_counter()-t0:.1f}s")

## 1.7 Write dataset metadata (classes + manifest)

In [None]:
def put_text(obj_name: str, text: str, content_type: str) -> None:
    """Store ``text`` as a UTF-8 encoded object in the configured bucket.

    Args:
        obj_name: Destination object name inside the bucket.
        text: Content to write.
        content_type: Content-Type recorded for the object.
    """
    encoded = text.encode("utf-8")
    os_client.put_object(
        namespace_name=namespace,
        bucket_name=BUCKET_NAME,
        object_name=obj_name,
        put_object_body=encoded,
        content_type=content_type,
    )

# Drop trailing slashes from the configured prefix so that a value written as
# "a/b/" does not produce object names containing "//".
labels_prefix = LABELS_PREFIX.rstrip("/")

# Dataset description: class list, per-class image counts and split settings.
classes_payload = {
    "dataset": "insects_kaggle_v1",
    "created_utc": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
    "train_ratio": TRAIN_RATIO,
    "seed": SEED,
    "classes": classes,
    "counts": per_class_counts,
    "note": "This starter dataset uses test/ as validation split.",
}

classes_obj = f"{labels_prefix}/classes.json"
put_text(classes_obj, json.dumps(classes_payload, indent=2), "application/json")
print("Wrote:", classes_obj)

# Build the manifest in memory: a header row followed by one row per upload.
csv_buf = io.StringIO()
writer = csv.writer(csv_buf)
writer.writerow(["local_path", "class", "split", "object_name"])
writer.writerows(manifest_rows)

manifest_obj = f"{labels_prefix}/split_manifest.csv"
put_text(manifest_obj, csv_buf.getvalue(), "text/csv")
print("Wrote:", manifest_obj)