In [33]:
# # Cleanup all output folders created by this notebook
from pathlib import Path
import shutil

# # List of folders to remove (add more if needed)
folders_to_remove = [
    "datasets/temp_pipeline_output",
    "datasets/new_val",
    "datasets/preprocessed/fixed_val_preview",
]

for folder in folders_to_remove:
    p = Path(folder)
    if p.exists():
        shutil.rmtree(p)
        print(f"✓ Removed: {folder}")
    else:
        print(f"(not found): {folder}")
        
#todo: place this after defined paths later


(not found): datasets/temp_pipeline_output
✓ Removed: datasets/new_val
✓ Removed: datasets/preprocessed/fixed_val_preview


# Add Labeled Images to Fixed Validation (Option B + Purge)

Optional notebook to inject **already labeled** images into `datasets/ready/fixed_val`, with:
- **Option B**: if filename collides but hash differs, auto-rename with a suffix.
- **Dedup by hash**: if hash is identical, skip to avoid duplicates.
- **Class check**: only `red ball`, `human`, `trashcan`.
- **Human review**: previews generated for visual QA.
- **Purge option**: clean outputs/previews before running.
- **Previews location**: `datasets/preprocessed/fixed_val_preview`.

In [34]:
import pathlib
from pathlib import Path
import shutil
import hashlib
import os
import random
import json
import cv2
import torch
from PIL import Image, ImageOps
from tqdm.notebook import tqdm
import numpy as np
from IPython.display import Markdown, display

# Project detectors
from src.detection import GroundingDINODetector
from src.segmentation import SAMSegmenter
from src.pipeline import img_pipeline

In [51]:
class DisplayPath(Path):
    def display(self):
        display(Markdown(f"[{self}]({self})") if self.exists() else str(self))
Path = DisplayPath

In [36]:
PROJECT_DIR = Path("./datasets/ready/fixed_val")
# PROJECT_DIR = Path("datasets/ready/fixed_val")

PROJECT_DIR.display()

[datasets/ready/fixed_val](datasets/ready/fixed_val)

In [52]:

# Parameters
dirs = [
    PROJECT_DIR := Path("datasets/ready/fixed_val"),
    PREVIEW_DIR := Path("datasets/preprocessed/fixed_val_preview"),
    # RAW_SOURCE_DIR = Path("datasets/raw/IRL_validation_pictures"),
    RAW_SOURCE_DIR := Path("datasets/raw/tom_trashcans"),     
    NEW_IMAGES_DIR := Path("datasets/new_val/images"),      
    NEW_LABELS_DIR := Path("datasets/new_val/labels"),
    TEMP_PIPELINE_DIR := Path("datasets/trashcan_pipeline_output")   
]
for dir in dirs:
    dir.display()

[datasets/ready/fixed_val](datasets/ready/fixed_val)

[datasets/preprocessed/fixed_val_preview](datasets/preprocessed/fixed_val_preview)

[datasets/raw/tom_trashcans](datasets/raw/tom_trashcans)

[datasets/new_val/images](datasets/new_val/images)

[datasets/new_val/labels](datasets/new_val/labels)

[datasets/trashcan_pipeline_output](datasets/trashcan_pipeline_output)

In [38]:
# dont use red ball for trashcan validation there should be none
# ALLOWED_CLASSES = {"red ball": 0, "human": 1} # <-- for irl pictures
ALLOWED_CLASSES = {"human": 1, "trashcan": 2} # <-- for irl trashcans

In [39]:
CLASS_IDS = set(ALLOWED_CLASSES.values())

# Detection / segmentation
BOX_THRESHOLD = 0.42
TEXT_THRESHOLD = 0.2



PROMPT = " . ".join(ALLOWED_CLASSES.keys())
DEVICE = 'cuda:0' if torch.cuda.is_available() else 'cpu'

# Options
OPTION_B_RENAME_ON_NAME_CONFLICT = True   # rename on filename collision with different hash
PURGE_OUTPUTS = True                      # purge previews/output folders before run
SAMPLES_PER_SPLIT = 20                    # previews per split

random.seed(42)
print(f"Device: {DEVICE}")
print("Parameters loaded.")

Device: cuda:0
Parameters loaded.


In [40]:
# Purge existing previews / temp if requested
if PURGE_OUTPUTS:
    for path in [PREVIEW_DIR]:
        if path.exists():
            shutil.rmtree(path)
    PREVIEW_DIR.mkdir(parents=True, exist_ok=True)
    print("✓ Purged preview outputs.")
else:
    PREVIEW_DIR.mkdir(parents=True, exist_ok=True)
    print("Skipping purge (PURGE_OUTPUTS=False)")

✓ Purged preview outputs.


## Auto-label with GroundingDINO + SAM2
This section auto-labels raw images into `NEW_IMAGES_DIR/NEW_LABELS_DIR` before merging into `fixed_val`.

- Classes enforced: red ball, human, trashcan
- Dedup: skip identical hashes; collision on name => rename (option B)
- Outputs stay separate so you can review before copy to fixed val

In [41]:
def mask_to_polygon(mask: np.ndarray) -> list:
    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if not contours:
        return []
    c = max(contours, key=cv2.contourArea)
    epsilon = 0.005 * cv2.arcLength(c, True)
    approx = cv2.approxPolyDP(c, epsilon, True)
    if len(approx) < 3: 
        return []
    h, w = mask.shape
    pts = []
    for p in approx:
        x, y = p[0]
        pts.extend([x / w, y / h])
    return pts


def save_yolo_label(file_path: Path, labels):
    with open(file_path, 'w') as f:
        for cls_id, pts in labels:
            pts_str = " ".join(f"{p:.6f}" for p in pts)
            f.write(f"{cls_id} {pts_str}\n")

In [42]:
# Initialize models
try :
    detector, segmenter #type: ignore
except:
    detector = GroundingDINODetector(
        box_threshold=BOX_THRESHOLD,
        text_threshold=TEXT_THRESHOLD,
        device=DEVICE,
    )
    segmenter = SAMSegmenter()

In [43]:
# Auto-label using src.pipeline.img_pipeline
# This ensures consistency with the main data preparation pipeline (EXIF handling, visualization, etc.)

# 1. Setup Temporary Directory for Pipeline Output
if TEMP_PIPELINE_DIR.exists():
    shutil.rmtree(TEMP_PIPELINE_DIR)

TEMP_DET_DIR = TEMP_PIPELINE_DIR / "detection"
TEMP_SEG_DIR = TEMP_PIPELINE_DIR / "segmentation"
TEMP_LBL_DIR = TEMP_PIPELINE_DIR / "labels"
TEMP_IMG_DIR = TEMP_PIPELINE_DIR / "images"
TEMP_EMPTY_DIR = TEMP_PIPELINE_DIR / "empty"

for d in [TEMP_DET_DIR, TEMP_SEG_DIR, TEMP_LBL_DIR, TEMP_IMG_DIR, TEMP_EMPTY_DIR]:
    d.mkdir(parents=True, exist_ok=True)

print("Using temporary pipeline dir")
TEMP_PIPELINE_DIR.display()

# 2. Run Pipeline on Raw Images
raw_files = [p for p in RAW_SOURCE_DIR.rglob("*") if p.suffix.lower() not in ['.Identifier']]
print(f"Raw files found: {len(raw_files)}")

processed_count = 0
errors = 0

for img_path in tqdm(raw_files, desc="Running Pipeline"):
    if not img_path.is_file():
        continue
    try:
        img_pipeline(
            img_path=img_path,
            detect_fn=lambda p: detector.detect(p, text_prompt=PROMPT, return_all_by_label=True),
            segment_fn=segmenter.segment_bbox,
            det_output_dir=TEMP_DET_DIR,
            seg_output_dir=TEMP_SEG_DIR,
            txt_output_dir=TEMP_LBL_DIR,
            empty_dir=TEMP_EMPTY_DIR,
            images_output_dir=TEMP_IMG_DIR
        )
        processed_count += 1
    except Exception as e:
        print(f"Pipeline error on {img_path.name}: {e}")
        errors += 1

print(f"Pipeline finished. Processed: {processed_count}, Errors: {errors}")

# 3. Post-process: Hash, Merge Labels, and Move to Final Destination
NEW_IMAGES_DIR.mkdir(parents=True, exist_ok=True)
NEW_LABELS_DIR.mkdir(parents=True, exist_ok=True)

processed_hashes = set()
added_final = 0
skipped_dup = 0

# Iterate over the images successfully processed by the pipeline
# Note: img_pipeline copies original images to TEMP_IMG_DIR
pipeline_images = list(TEMP_IMG_DIR.glob("*"))
print(f"Images to post-process: {len(pipeline_images)}")

Using temporary pipeline dir


[datasets/trashcan_pipeline_output](datasets/trashcan_pipeline_output)

Raw files found: 14


Running Pipeline:   0%|          | 0/14 [00:00<?, ?it/s]

Pipeline finished. Processed: 14, Errors: 0
Images to post-process: 13


In [44]:
processed_hashes = set()

for img_path in tqdm(pipeline_images, desc="Merging & Hashing"):

    # Open and Fix EXIF (Critical: Pipeline generated polygons on the fixed image)
    pil_img = Image.open(img_path)
    pil_img = ImageOps.exif_transpose(pil_img)
    if pil_img.mode != 'RGB':
        pil_img = pil_img.convert('RGB')
    
    # Calculate Hash
    hsh = hashlib.md5(pil_img.tobytes()).hexdigest()
    
    if hsh in processed_hashes:
        skipped_dup += 1
        continue
        
    # Collect Labels from subfolders
    # Pipeline outputs: TEMP_LBL_DIR / class_name / filename.txt
    # We need to merge them into one file with correct class IDs
    
    final_labels = []
    has_labels = False
    
    for class_name, class_id in ALLOWED_CLASSES.items():
        # The pipeline uses the class name as folder name
        lbl_file = TEMP_LBL_DIR / class_name / f"{img_path.stem}.txt"
        
        if lbl_file.exists():
            # Read polygons
            content = lbl_file.read_text().strip()
            if content:
                lines = content.split('\n')
                for line in lines:
                    parts = line.strip().split()
                    if len(parts) > 1:
                        # Replace the first '0' with the actual class_id
                        # Pipeline outputs "0 x y ...", we want "class_id x y ..."
                        coords = parts[1:]
                        final_labels.append(f"{class_id} " + " ".join(coords))
                        has_labels = True
    
    if not has_labels:
        # Should not happen if image is in TEMP_IMG_DIR (pipeline puts empty ones in empty_dir)
        # But check just in case
        continue
        
    # Save Final Image (Transposed)
    target_img_name = f"{hsh}.jpg"
    target_lbl_name = f"{hsh}.txt"
    
    pil_img.save(NEW_IMAGES_DIR / target_img_name, quality=95)
    
    with open(NEW_LABELS_DIR / target_lbl_name, 'w') as f:
        f.write("\n".join(final_labels))
        
    processed_hashes.add(hsh)
    added_final += 1
    


print(f"Final Import Stats: Added {added_final}, Skipped Dup {skipped_dup}")

# Optional: Clean up temp
# shutil.rmtree(TEMP_PIPELINE_DIR)

Merging & Hashing:   0%|          | 0/13 [00:00<?, ?it/s]

Final Import Stats: Added 13, Skipped Dup 0


In [45]:
NEW_IMAGES_DIR.display(), "Auto-labeled images"

[datasets/new_val/images](datasets/new_val/images)

(None, 'Auto-labeled images')

In [46]:
def convert_to_jpg(source_folder: Path, output_folder: Path):
    image_extensions = {'.jpg', '.jpeg', '.png', '.webp', '.bmp', '.gif', '.tiff', '.svg', '.heic', '.HEIC'}
    converted = 0
    errors = 0
    for file_path in tqdm(list(source_folder.rglob("*")), desc="Convert to JPG"):
        if ':Zone.Identifier' in str(file_path) or not file_path.is_file():
            continue
        if file_path.suffix.lower() not in image_extensions:
            continue
        rel = file_path.relative_to(source_folder)
        out_path = output_folder / rel.parent / (file_path.stem + '.jpg')
        out_path.parent.mkdir(parents=True, exist_ok=True)
        try:
            with Image.open(file_path) as img:
                # ALIGN WITH PIPELINE.PY: Always call exif_transpose directly
                img = ImageOps.exif_transpose(img)
                
                if img.mode in ('RGBA', 'LA', 'P'):
                    bg = Image.new('RGB', img.size, (255, 255, 255))
                    if img.mode == 'P':
                        img = img.convert('RGBA')
                    bg.paste(img, mask=img.split()[-1] if img.mode in ('RGBA', 'LA') else None)
                    img = bg
                elif img.mode != 'RGB':
                    img = img.convert('RGB')
                img.save(out_path, 'JPEG', quality=95)
                converted += 1
        except Exception as e:
            errors += 1
            print(f"✗ {file_path.name}: {e}")
    print(f"Converted: {converted}, Errors: {errors}, Output: {output_folder}")
    return output_folder

CONVERT = False
CONVERTED_DIR = NEW_IMAGES_DIR.parent / (NEW_IMAGES_DIR.name + "_jpg")

if CONVERT:
    print(f"Converting {NEW_IMAGES_DIR} -> {CONVERTED_DIR}")
    NEW_IMAGES_DIR = convert_to_jpg(NEW_IMAGES_DIR, CONVERTED_DIR)
else:
    print("Skipping conversion (CONVERT=False)")

Skipping conversion (CONVERT=False)


In [47]:
def md5_hash(p: Path) -> str:
    h = hashlib.md5()
    with open(p, 'rb') as f:
        h.update(f.read())
    return h.hexdigest()

# Load existing hashes in fixed_val
existing_hashes = {}
PROJECT_DIR.mkdir(parents=True, exist_ok=True)
(PROJECT_DIR / "images").mkdir(parents=True, exist_ok=True)
(PROJECT_DIR / "labels").mkdir(parents=True, exist_ok=True)

for img_path in (PROJECT_DIR / "images").glob("*"):
    if img_path.suffix.lower() not in ['.jpg', '.jpeg', '.png']:
        continue
    existing_hashes[md5_hash(img_path)] = img_path.name

print(f"Existing fixed_val images: {len(existing_hashes)}")

Existing fixed_val images: 47


In [48]:
# Validate inputs
if not NEW_IMAGES_DIR.exists():
    raise FileNotFoundError(f"NEW_IMAGES_DIR missing: {NEW_IMAGES_DIR}")
if not NEW_LABELS_DIR.exists():
    raise FileNotFoundError(f"NEW_LABELS_DIR missing: {NEW_LABELS_DIR}")

added = 0
skipped_dupe = 0
renamed = 0
errors = 0

for img_path in tqdm(list(NEW_IMAGES_DIR.rglob("*")), desc="Add to fixed_val"):
    if not img_path.is_file():
        continue
    if img_path.suffix.lower() not in ['.jpg', '.jpeg', '.png']:
        continue

    stem = img_path.stem
    lbl_path = NEW_LABELS_DIR / f"{stem}.txt"
    if not lbl_path.exists():
        print(f"✗ Missing label for {img_path}")
        errors += 1
        continue

    # validate label
    valid = True
    lines_out = []
    with open(lbl_path) as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) < 3 or len(parts[1:]) % 2 != 0:
                valid = False
                break
            cls_id = int(float(parts[0]))
            if cls_id not in CLASS_IDS:
                valid = False
                break
            lines_out.append(line.strip())
    if not valid:
        print(f"✗ Invalid label format/classes: {lbl_path}")
        errors += 1
        continue

    file_hash = md5_hash(img_path)
    if file_hash in existing_hashes:
        skipped_dupe += 1
        continue

    target_name = img_path.name
    target_img = PROJECT_DIR / "images" / target_name
    target_lbl = PROJECT_DIR / "labels" / f"{Path(target_name).stem}.txt"

    if target_img.exists() and OPTION_B_RENAME_ON_NAME_CONFLICT:
        # rename with suffix
        suffix = 1
        while True:
            candidate = PROJECT_DIR / "images" / f"{Path(target_name).stem}_v{suffix}{img_path.suffix}"
            if not candidate.exists():
                target_img = candidate
                target_lbl = PROJECT_DIR / "labels" / f"{candidate.stem}.txt"
                renamed += 1
                break
            suffix += 1

    # copy
    shutil.copy(img_path, target_img)
    with open(target_lbl, 'w') as f:
        for line in lines_out:
            f.write(line + "\n")

    existing_hashes[file_hash] = target_img.name
    added += 1

print(f"Added: {added}, Renamed: {renamed}, Skipped dup hash: {skipped_dupe}, Errors: {errors}")
print("to ")

Add to fixed_val:   0%|          | 0/13 [00:00<?, ?it/s]

Added: 0, Renamed: 0, Skipped dup hash: 13, Errors: 0
to 


In [49]:
# Generate previews for human QA
COLORS = [(0, 0, 255), (0, 255, 0), (255, 0, 0)]  # BGR
PREVIEW_DIR.mkdir(parents=True, exist_ok=True)

def draw_yolo_polygons(img_path: Path, lbl_path: Path, out_path: Path):
    if not img_path.exists():
        return False
    
    # Use PIL to ensure EXIF rotation is applied, matching the pipeline's behavior
    try:
        with Image.open(img_path) as pil_img:
            pil_img = ImageOps.exif_transpose(pil_img)
            if pil_img.mode != 'RGB':
                pil_img = pil_img.convert('RGB')
            # Convert to OpenCV format (BGR)
            img = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2BGR)
    except Exception as e:
        print(f"Error reading {img_path}: {e}")
        return False

    h, w = img.shape[:2]
    if lbl_path.exists():
        with open(lbl_path) as f:
            for line in f:
                parts = line.strip().split()
                if len(parts) < 3:
                    continue
                cls_id = int(float(parts[0]))
                coords = [float(p) for p in parts[1:]]
                pts = []
                for i in range(0, len(coords), 2):
                    x = int(coords[i] * w)
                    y = int(coords[i+1] * h)
                    pts.append([x, y])
                if len(pts) < 3:
                    continue
                poly = np.array(pts, np.int32).reshape((-1, 1, 2))
                color = COLORS[cls_id % len(COLORS)]
                cv2.polylines(img, [poly], True, color, 2)
                cv2.putText(img, f"cls {cls_id}", (pts[0][0], pts[0][1]-5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)
    cv2.imwrite(str(out_path), img)
    return True

# Create previews from NEW auto-labeled images (not fixed_val which contains old data)
imgs = list(NEW_IMAGES_DIR.glob("*.jpg")) + list(NEW_IMAGES_DIR.glob("*.png"))
random.shuffle(imgs)
sel = imgs[:min(len(imgs), SAMPLES_PER_SPLIT)]

for img_path in tqdm(sel):
    lbl_path = NEW_LABELS_DIR / f"{img_path.stem}.txt"
    out_path = PREVIEW_DIR / f"new_{img_path.name}"
    draw_yolo_polygons(img_path, lbl_path, out_path)

print(f"Showing {len(sel)} samples from NEW auto-labeled images (not existing fixed_val)")
print("Previews written to")
PREVIEW_DIR.display()

  0%|          | 0/13 [00:00<?, ?it/s]

Showing 13 samples from NEW auto-labeled images (not existing fixed_val)
Previews written to


[datasets/preprocessed/fixed_val_preview](datasets/preprocessed/fixed_val_preview)

In [50]:
# Clean up rejected previews
# Scan PREVIEW_DIR to find which images you manually deleted
# Then remove the corresponding images/labels from fixed_val

# Get all previews currently in PREVIEW_DIR
PREVIEW_DIR.mkdir(parents=True, exist_ok=True)
existing_previews = {p.stem.replace("new_", "") for p in PREVIEW_DIR.glob("*")}

# Get all images in fixed_val
fixed_val_images = set(p.stem for p in (PROJECT_DIR / "images").glob("*.jpg"))
fixed_val_images.update(p.stem for p in (PROJECT_DIR / "images").glob("*.png"))

# Find images that were added but are NOT in previews anymore (= manually deleted by user)
deleted_by_user = fixed_val_images - existing_previews

print(f"Images in fixed_val: {len(fixed_val_images)}")
print(f"Previews in PREVIEW_DIR: {len(existing_previews)}")
print(f"Images to delete from fixed_val (not in previews): {len(deleted_by_user)}")

if deleted_by_user:
    print("\nImages that will be REMOVED from fixed_val:")
    for img_stem in sorted(deleted_by_user):
        print(f"  - {img_stem}")
    
    # Flag to control cleanup
    CLEAN_UP = False  # Set to True to actually delete
    
    if CLEAN_UP:
        cleaned = 0
        for img_stem in deleted_by_user:
            # Find the actual image file (could be .jpg ou .png)
            img_file = None
            for ext in ['.jpg', '.jpeg', '.png']:
                candidate = PROJECT_DIR / "images" / f"{img_stem}{ext}"
                if candidate.exists():
                    img_file = candidate
                    break
            
            if img_file:
                img_file.unlink()
                cleaned += 1
            
            # Remove corresponding label
            lbl_file = PROJECT_DIR / "labels" / f"{img_stem}.txt"
            if lbl_file.exists():
                lbl_file.unlink()
        
        print(f"\n✓ Cleaned up {cleaned} rejected images and their labels from fixed_val")
    else:
        print("\n→ To DELETE these rejected images, set CLEAN_UP = True and rerun this cell")
else:
    print("✓ All previewed images are still in PREVIEW_DIR (no cleanup needed)")

Images in fixed_val: 47
Previews in PREVIEW_DIR: 13
Images to delete from fixed_val (not in previews): 34

Images that will be REMOVED from fixed_val:
  - 042b9e09b3832319fe998ffb4bf44edd
  - 088610ac49bfde8470097a40c8b749d7
  - 0ac39c0cadb518e8bcc8de34579b625f
  - 0af6fd9d3636fd6403c7ada6c8a10530
  - 0d4db7c113776bc0a401d833d556df84
  - 1104093f8577602bb0425f1ccd118de9
  - 11cb69ea752cefa3a8cad413598028de
  - 19d18a004fb8ffcae5740d3bc9f87d78
  - 1ff6f7af4e054a1c879c6485194e44cb
  - 205637f0b04837de28f8d5c031b263bc
  - 256c86658031f676fe03b3516f9a899b
  - 268049ede7cfd5c42204b885785fcfc1
  - 29b6473d55641a2d6a06276b9357090f
  - 2b9a10a9f16323fd28c304e1b18f4787
  - 366acb21b00b40588372736b95776fac
  - 6ea339039c57d22328a8a9097181b4cb
  - 7dbbf64f4372e8af631d5b3261e5d6aa
  - 9666760cde1bedabf437f3dbfc95f891
  - 9d5cda5391d6ae193428fbb451d0c905
  - 9ecb763ade7cb05b25ab6f784d29744f
  - a0efc5e34c785282ee484a5e64e1c8ea
  - a56abe595542342e6049fb8dc0fccf8c
  - a6b631525ff8dc9bc26bb53e9748160