In [14]:
import torch
# 🔹 Parámetros globales para comparación justa entre modelos
PARAMS = {
    # Conjunto de imágenes
    "max_images": 50,          # Número máximo de imágenes a procesar
    # Umbrales y filtros
    "score_thr": 0.2,          # Umbral de score mínimo para considerar detecciones
    "nms_iou": 0.5,            # IoU para supresión de solapamiento (NMS)
    "min_box_side": 12,        # Tamaño mínimo de lado de caja para filtrar detecciones
    "topk_label": 40,          # Top-K detecciones por clase
    # Visualización
    "score_min_draw": 0.2,     # Umbral mínimo para mostrar en plots
    "max_draw": 25,          # Máximo de cajas a dibujar por imagen
}

### DINO

In [15]:
# 🔹 Instalar dependencias (si es necesario)
!pip install -q transformers torch torchvision pycocotools matplotlib

# 🔹 Imports y configuración
import os, json, time, gc
import torch
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from PIL import Image
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval
from torchvision.ops import nms
from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

# ==========================================
# 🔹 COCO: cargar anotaciones y filtrar imágenes
# ==========================================
image_dir = "./COCO/images/val2017"
annotation_file = "./COCO/annotations/instances_val2017.json"
classes = ["car", "bus", "truck", "motorcycle", "bicycle"]

coco_gt = COCO(annotation_file)
coco_categories = coco_gt.loadCats(coco_gt.getCatIds())
name_to_id = {cat["name"]: cat["id"] for cat in coco_categories}
id_to_name = {cat["id"]: cat["name"] for cat in coco_categories}
selected_cat_ids = [name_to_id[c] for c in classes if c in name_to_id]

ann_ids = coco_gt.getAnnIds(catIds=selected_cat_ids)
anns = coco_gt.loadAnns(ann_ids)
filtered_image_ids = sorted(list({a["image_id"] for a in anns}))
filtered_images_info = coco_gt.loadImgs(filtered_image_ids)
filtered_image_paths = [os.path.join(image_dir, img["file_name"]) for img in filtered_images_info]
print(f"Imágenes filtradas: {len(filtered_image_paths)}")

# ==========================================
# 🔹 Cargar modelo Grounding DINO
# ==========================================
model_id = "IDEA-Research/grounding-dino-base"
processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id).to(device).eval()
text_prompt = " . ".join([f"a {c}" for c in classes]) + " ."


# ==========================================
# 🔹 Funciones auxiliares
# ==========================================
def postprocess_outputs(outputs, orig_size_hw):
    target_sizes = torch.tensor([orig_size_hw], device=outputs.logits.device)
    results = processor.post_process_grounded_object_detection(
        outputs=outputs,
        target_sizes=target_sizes
    )[0]
    return results

def filter_by_threshold(results, box_threshold=None):
    if box_threshold is None:
        box_threshold = PARAMS["score_thr"]
    boxes = results["boxes"]
    scores = results["scores"]
    labels = results["labels"]
    keep = [i for i, s in enumerate(scores) if s >= box_threshold]
    return {
        "boxes": boxes[keep],
        "scores": scores[keep],
        "labels": [labels[i] for i in keep]
    }

def apply_nms(results, iou_thr=None):
    if iou_thr is None:
        iou_thr = PARAMS["nms_iou"]
    boxes = results["boxes"]
    scores = results["scores"]
    labels = results["labels"]
    if len(boxes) == 0:
        return results
    keep_indices = []
    unique_labels = sorted(set(labels))
    for lab in unique_labels:
        idxs = [i for i, l in enumerate(labels) if l == lab]
        b = boxes[idxs]
        s = scores[idxs]
        keep = nms(b, s, iou_thr).cpu().numpy().tolist()
        keep_indices.extend([idxs[i] for i in keep])
    keep_indices = sorted(set(keep_indices))
    return {
        "boxes": boxes[keep_indices],
        "scores": scores[keep_indices],
        "labels": [labels[i] for i in keep_indices]
    }

def run_inference(image_paths, max_images=None, box_thr=None, nms_iou=None):
    if max_images is None:
        max_images = PARAMS["max_images"]
    results_all = []
    for i, path in enumerate(image_paths[:max_images]):
        image = Image.open(path).convert("RGB")
        W, H = image.size
        with torch.no_grad():
            inputs = processor(images=image, text=text_prompt, return_tensors="pt").to(device)
            t0 = time.time()
            outputs = model(**inputs)
            t1 = time.time()
        res_raw = postprocess_outputs(outputs, (H, W))
        res_filtered = filter_by_threshold(res_raw, box_threshold=box_thr)
        res_nms = apply_nms(res_filtered, iou_thr=nms_iou)
        results_all.append({
            "image": path,
            "boxes_xyxy": res_nms["boxes"].cpu().numpy().tolist(),
            "labels": res_nms["labels"],
            "scores": res_nms["scores"].cpu().numpy().tolist(),
            "time": t1 - t0
        })
        print(f"[{i+1}/{min(max_images, len(image_paths))}] {os.path.basename(path)} → {len(res_nms['labels'])} dets en {t1 - t0:.2f}s")
        del inputs, outputs
        torch.cuda.empty_cache()
        gc.collect()
    return results_all

def show_result(result, score_min=None):
    if score_min is None:
        score_min = PARAMS["score_min_draw"]
    image = Image.open(result["image"]).convert("RGB")
    fig, ax = plt.subplots(1, figsize=(9, 9))
    ax.imshow(image)
    for box, label, score in zip(result["boxes_xyxy"], result["labels"], result["scores"]):
        if score < score_min:
            continue
        x1, y1, x2, y2 = box
        w, h = x2 - x1, y2 - y1
        rect = patches.Rectangle((x1, y1), w, h, linewidth=2, edgecolor='deepskyblue', facecolor='none')
        ax.add_patch(rect)
        ax.text(x1, max(0, y1 - 5), f"{label} ({score:.2f})", color='black', fontsize=10, backgroundcolor='white')
    plt.axis('off')
    plt.show()

# ==========================================
# 🔹 Ejecutar inferencia
# ==========================================
detections = run_inference(filtered_image_paths)

# Visualizar primeros resultados
if detections and detections[0]["boxes_xyxy"]:
    for i in range(len(detections)):
        #show_result(detections[i])
        pass
else:
    print("No hay detecciones para visualizar.")

# Guardar resultados
with open("grounding_dino_detections.json", "w") as f:
    json.dump(detections, f, indent=2)
print("✅ Resultados guardados en grounding_dino_detections.json")

# ==========================================
# 🔹 Evaluación COCO
# ==========================================
def to_coco_detections(detections, name_to_id, score_min=None):
    if score_min is None:
        score_min = 0.001
    coco_dets = []
    for img_info, det in zip(filtered_images_info, detections):
        img_id = img_info["id"]
        for box, label, score in zip(det["boxes_xyxy"], det["labels"], det["scores"]):
            if score < score_min or label not in name_to_id:
                continue
            x1, y1, x2, y2 = box
            x, y, w, h = float(x1), float(y1), float(x2 - x1), float(y2 - y1)
            coco_dets.append({
                "image_id": img_id,
                "category_id": name_to_id[label],
                "bbox": [x, y, w, h],
                "score": float(score)
            })
    return coco_dets

coco_dets = to_coco_detections(detections, name_to_id)
if coco_dets:
    with open("coco_results_groundingdino.json", "w") as f:
        json.dump(coco_dets, f)
    coco_dt = coco_gt.loadRes("coco_results_groundingdino.json")
    coco_eval = COCOeval(coco_gt, coco_dt, iouType='bbox')
    coco_eval.params.imgIds = filtered_image_ids[:PARAMS["max_images"]]
    coco_eval.evaluate()
    coco_eval.accumulate()
    coco_eval.summarize()
else:
    print("⚠️ No hay predicciones para evaluar.")


Device: cuda
loading annotations into memory...
Done (t=0.48s)
creating index...
index created!
Imágenes filtradas: 870


Fetching 1 files: 100%|██████████| 1/1 [00:00<?, ?it/s]
Fetching 1 files: 100%|██████████| 1/1 [00:00<?, ?it/s]


[1/50] 000000000724.jpg → 1 dets en 0.45s
[2/50] 000000001532.jpg → 5 dets en 0.46s
[3/50] 000000001584.jpg → 6 dets en 0.39s
[4/50] 000000002006.jpg → 1 dets en 0.48s
[5/50] 000000005001.jpg → 2 dets en 0.47s
[6/50] 000000005037.jpg → 3 dets en 0.53s
[7/50] 000000006040.jpg → 3 dets en 0.52s
[8/50] 000000006723.jpg → 6 dets en 0.50s
[9/50] 000000007088.jpg → 1 dets en 0.46s
[10/50] 000000007386.jpg → 4 dets en 0.51s
[11/50] 000000007816.jpg → 1 dets en 0.52s
[12/50] 000000008211.jpg → 3 dets en 0.50s
[13/50] 000000008762.jpg → 1 dets en 0.52s
[14/50] 000000008899.jpg → 3 dets en 0.42s
[15/50] 000000009769.jpg → 1 dets en 0.46s
[16/50] 000000009891.jpg → 2 dets en 0.48s
[17/50] 000000010363.jpg → 6 dets en 0.50s
[18/50] 000000011149.jpg → 3 dets en 0.47s
[19/50] 000000011197.jpg → 6 dets en 0.52s
[20/50] 000000011511.jpg → 2 dets en 0.50s
[21/50] 000000011615.jpg → 4 dets en 0.46s
[22/50] 000000013177.jpg → 4 dets en 0.49s
[23/50] 000000013348.jpg → 2 dets en 0.50s
[24/50] 000000014380

### OMDET

In [None]:
# ===========================================
# 🔹 OmDetTurbo: inferencia + postproceso + COCO eval
# ===========================================
!pip install -q transformers torch torchvision pycocotools matplotlib tqdm

import os, json, gc, time
import torch
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from PIL import Image
from torchvision.ops import nms
from tqdm import tqdm
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval
from transformers import AutoProcessor, OmDetTurboForObjectDetection

# ===============================
# 0. Configuración inicial
# ===============================
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

# dataset COCO
image_dir = "./COCO/images/val2017"
annotation_file = "./COCO/annotations/instances_val2017.json"
coco_gt = COCO(annotation_file)

# clases de interés
classes = ["car", "bus", "truck", "motorcycle", "bicycle"]
text_queries = [f"a {c}" for c in classes]

# mapear nombres a IDs COCO
cats = coco_gt.loadCats(coco_gt.getCatIds())
name_to_id = {c["name"]: c["id"] for c in cats}
selected_cat_ids = [name_to_id[c] for c in classes if c in name_to_id]

# filtrar imágenes relevantes
ann_ids = coco_gt.getAnnIds(catIds=selected_cat_ids)
anns = coco_gt.loadAnns(ann_ids)
filtered_image_ids = sorted(list({a["image_id"] for a in anns}))
filtered_images_info = coco_gt.loadImgs(filtered_image_ids)
filtered_image_paths = [os.path.join(image_dir, img["file_name"]) for img in filtered_images_info]
print(f"Imágenes filtradas: {len(filtered_image_paths)}")

# ===============================
# 1. Cargar modelo OmDetTurbo
# ===============================
processor = AutoProcessor.from_pretrained("omlab/omdet-turbo-swin-tiny-hf")
model = OmDetTurboForObjectDetection.from_pretrained("omlab/omdet-turbo-swin-tiny-hf").to(device).eval()

def clip_boxes_xyxy(boxes, W, H):
    boxes[:, 0] = boxes[:, 0].clamp(0, W - 1)
    boxes[:, 1] = boxes[:, 1].clamp(0, H - 1)
    boxes[:, 2] = boxes[:, 2].clamp(0, W - 1)
    boxes[:, 3] = boxes[:, 3].clamp(0, H - 1)
    return boxes

def filter_small_boxes(boxes, scores, labels, min_side=None):
    if min_side is None:
        min_side = PARAMS["min_box_side"]
    if len(boxes) == 0:
        return boxes, scores, labels
    w = boxes[:, 2] - boxes[:, 0]
    h = boxes[:, 3] - boxes[:, 1]
    keep = (w >= min_side) & (h >= min_side)
    return boxes[keep], scores[keep], [labels[i] for i, k in enumerate(keep.tolist()) if k]

def topk_per_label(boxes, scores, labels, k=None):
    if k is None:
        k = PARAMS["topk_label"]
    if len(boxes) == 0:
        return boxes, scores, labels
    keep_idx = []
    uniq = sorted(set(labels))
    for lab in uniq:
        idxs = [i for i, l in enumerate(labels) if l == lab]
        if not idxs:
            continue
        idxs_sorted = sorted(idxs, key=lambda i: float(scores[i]), reverse=True)[:k]
        keep_idx.extend(idxs_sorted)
    keep_idx = sorted(set(keep_idx))
    return boxes[keep_idx], scores[keep_idx], [labels[i] for i in keep_idx]

def postprocess_omdet(outputs, orig_size_hw):
    H, W = orig_size_hw

    # Obtener logits y coordenadas
    logits = outputs.decoder_class_logits[0].cpu()        # [N, C]
    boxes_cxcywh = outputs.decoder_coord_logits[0].cpu()  # [N, 4] en cxcywh normalizado

    # Calcular scores y labels
    probs = torch.softmax(logits, dim=-1)
    scores, label_idx = probs.max(dim=-1)

    # Filtrar por score mínimo
    keep = scores >= PARAMS["score_thr"]
    scores = scores[keep]
    label_idx = label_idx[keep]
    boxes_cxcywh = boxes_cxcywh[keep]

    # Convertir cxcywh → xyxy en píxeles
    cx, cy, bw, bh = boxes_cxcywh.unbind(-1)
    cx *= W; cy *= H; bw *= W; bh *= H
    x1 = cx - bw / 2
    y1 = cy - bh / 2
    x2 = cx + bw / 2
    y2 = cy + bh / 2
    boxes = torch.stack([x1, y1, x2, y2], dim=-1)
    boxes = clip_boxes_xyxy(boxes, W, H)

    # Mapear índices a texto
    labels = [text_queries[i] if 0 <= i < len(text_queries) else f"Clase_{i}" for i in label_idx.tolist()]

    # Aplicar filtros adicionales
    boxes, scores, labels = filter_small_boxes(boxes, scores, labels)
    boxes, scores, labels = topk_per_label(boxes, scores, labels)

    return {"boxes": boxes, "scores": scores, "labels": labels}



def apply_nms_by_label(results, iou_thr=None):
    if iou_thr is None:
        iou_thr = PARAMS["nms_iou"]
    boxes, scores, labels = results["boxes"], results["scores"], results["labels"]
    if len(boxes) == 0:
        return results
    keep_all = []
    for lab in sorted(set(labels)):
        idxs = [i for i, l in enumerate(labels) if l == lab]
        if not idxs:
            continue
        b = boxes[idxs]
        s = scores[idxs]
        kept = nms(b, s, iou_thr).cpu().numpy().tolist()
        keep_all.extend([idxs[i] for i in kept])
    keep_all = sorted(set(keep_all))
    return {
        "boxes": boxes[keep_all],
        "scores": scores[keep_all],
        "labels": [labels[i] for i in keep_all]
    }

# ===============================
# 3. Inferencia
# ===============================
def run_omdet_inference(image_paths, max_images=None):
    if max_images is None:
        max_images = PARAMS["max_images"]
    results_all = []
    for i, path in enumerate(tqdm(image_paths[:max_images], desc="Inferencia OmDetTurbo")):
        image = Image.open(path).convert("RGB")
        W, H = image.size
        with torch.no_grad():
            inputs = processor(images=image, text=text_queries, return_tensors="pt").to(device)
            t0 = time.time()
            outputs = model(**inputs)
            t1 = time.time()
        res = postprocess_omdet(outputs, (H, W))
        res_nms = apply_nms_by_label(res)
        results_all.append({
            "image": path,
            "boxes_xyxy": res_nms["boxes"].numpy().tolist(),
            "labels": res_nms["labels"],
            "scores": res_nms["scores"].numpy().tolist(),
            "time": t1 - t0
        })
        del inputs, outputs
        torch.cuda.empty_cache(); gc.collect()
    return results_all

# ===============================
# 4. Visualización
# ===============================
def mostrar_10_resultados_omdet(resultados):
    for idx, resultado in enumerate(resultados[:10]):
        image = Image.open(resultado["image"]).convert("RGB")
        fig, ax = plt.subplots(1, figsize=(8, 8))
        ax.imshow(image)
        drawn = 0
        for box, label, score in sorted(zip(resultado["boxes_xyxy"], resultado["labels"], resultado["scores"]),
                                        key=lambda x: x[2], reverse=True):
            if score < PARAMS["score_min_draw"]:
                continue
            x1, y1, x2, y2 = box
            rect = patches.Rectangle((x1, y1), x2 - x1, y2 - y1,
                                     linewidth=2, edgecolor='lime', facecolor='none')
            ax.add_patch(rect)
            ax.text(x1, max(0, y1 - 5),
                    f"{label} ({score:.2f})", color='black',
                    fontsize=9, backgroundcolor='white')
            drawn += 1
            if drawn >= PARAMS["max_draw"]:
                break
        ax.set_title(f"Imagen {idx+1} — {drawn} detecciones (≥ {PARAMS['score_min_draw']})")
        plt.axis('off')
        plt.show()
        plt.close(fig)

# ===============================
# 5. Exportar detecciones a COCO
# ===============================
def to_coco_dets(dets, filtered_images_info, name_to_id, score_min=0.001):
    coco_dets = []
    for img_info, d in zip(filtered_images_info, dets):
        img_id = img_info["id"]
        for (x1, y1, x2, y2), lab, sc in zip(d["boxes_xyxy"], d["labels"], d["scores"]):
            cname = lab.replace("a ", "")
            if sc < score_min or cname not in name_to_id:
                continue
            coco_dets.append({
                "image_id": img_id,
                "category_id": name_to_id[cname],
                "bbox": [float(x1), float(y1), float(x2 - x1), float(y2 - y1)],
                "score": float(sc)
            })
    return coco_dets

# ===============================
# 6. Ejecutar todo
# ===============================
omdet_results = run_omdet_inference(filtered_image_paths)
#mostrar_10_resultados_omdet(omdet_results)

# guardar resultados
with open("omdet_results.json", "w") as f:
    json.dump(omdet_results, f, indent=2)
print("✅ Resultados guardados en omdet_results.json")

# evaluación COCO
coco_dets = to_coco_dets(omdet_results, filtered_images_info[:len(omdet_results)], name_to_id, score_min=0.001)
if coco_dets:
    with open("coco_results_omdet.json", "w") as f:
        json.dump(coco_dets, f)
    coco_dt = coco_gt.loadRes("coco_results_omdet.json")
    coco_eval = COCOeval(coco_gt, coco_dt, iouType='bbox')
    coco_eval.params.imgIds = filtered_image_ids[:len(omdet_results)]
    coco_eval.evaluate(); coco_eval.accumulate(); coco_eval.summarize()
else:
    print("⚠️ No hay predicciones para evaluar.")


### OWLv2

In [9]:
# 🔹 OWLv2: inferencia afinada + visualización + evaluación COCO
!pip install -q transformers torch torchvision pycocotools matplotlib

import os, json, time, gc
import torch
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from PIL import Image
from torchvision.ops import nms
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval
from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection

# ===============================
# 🔹 CONFIGURACIÓN BÁSICA
# ===============================
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
IMAGE_DIR = "./COCO/images/val2017"
ANNOTATION_FILE = "./COCO/annotations/instances_val2017.json"
CLASSES = ["car", "bus", "truck", "motorcycle", "bicycle"]
TEXT_QUERIES = [f"a {c}" for c in CLASSES]
MODEL_ID = "google/owlv2-base-patch16-ensemble"

print("Device:", DEVICE)

# ===============================
# 🔹 COCO: cargar y filtrar imágenes
# ===============================
coco_gt = COCO(ANNOTATION_FILE)
cats = coco_gt.loadCats(coco_gt.getCatIds())
name_to_id = {c["name"]: c["id"] for c in cats}
selected_cat_ids = [name_to_id[c] for c in CLASSES if c in name_to_id]

ann_ids = coco_gt.getAnnIds(catIds=selected_cat_ids)
anns = coco_gt.loadAnns(ann_ids)
filtered_image_ids = sorted(list({a["image_id"] for a in anns}))
filtered_images_info = coco_gt.loadImgs(filtered_image_ids)
filtered_image_paths = [os.path.join(IMAGE_DIR, img["file_name"]) for img in filtered_images_info]
print(f"Imágenes filtradas: {len(filtered_image_paths)}")

# ===============================
# 🔹 Cargar modelo OWLv2
# ===============================
processor = AutoProcessor.from_pretrained(MODEL_ID)
model = AutoModelForZeroShotObjectDetection.from_pretrained(MODEL_ID).to(DEVICE).eval()

# ===============================
# 🔹 Funciones auxiliares
# ===============================
def clip_boxes_xyxy(boxes, W, H):
    boxes[:, 0] = boxes[:, 0].clamp(0, W-1)
    boxes[:, 1] = boxes[:, 1].clamp(0, H-1)
    boxes[:, 2] = boxes[:, 2].clamp(0, W-1)
    boxes[:, 3] = boxes[:, 3].clamp(0, H-1)
    return boxes

def filter_small_boxes(boxes, scores, labels, min_side=None):
    if min_side is None:
        min_side = PARAMS["min_box_side"]
    if len(boxes) == 0:
        return boxes, scores, labels
    w = boxes[:, 2] - boxes[:, 0]
    h = boxes[:, 3] - boxes[:, 1]
    keep = (w >= min_side) & (h >= min_side)
    return boxes[keep], scores[keep], [labels[i] for i, k in enumerate(keep.tolist()) if k]

def topk_per_label(boxes, scores, labels, k=None):
    if k is None:
        k = PARAMS["topk_label"]
    if len(boxes) == 0:
        return boxes, scores, labels
    keep_idx = []
    for lab in sorted(set(labels)):
        idxs = [i for i, l in enumerate(labels) if l == lab]
        if not idxs:
            continue
        idxs_sorted = sorted(idxs, key=lambda i: float(scores[i]), reverse=True)[:k]
        keep_idx.extend(idxs_sorted)
    keep_idx = sorted(set(keep_idx))
    return boxes[keep_idx], scores[keep_idx], [labels[i] for i in keep_idx]

def postprocess_owlv2(outputs, orig_size_hw, score_thr=None, min_box_side=None, topk_label=None):
    if score_thr is None:
        score_thr = PARAMS["score_thr"]
    H, W = orig_size_hw
    target_sizes = torch.tensor([[H, W]], device=outputs.logits.device)
    results = processor.post_process_object_detection(outputs=outputs, target_sizes=target_sizes)[0]
    boxes = results["boxes"].cpu()
    scores = results["scores"].cpu()
    labels_idx = results["labels"].cpu().tolist()
    labels = [TEXT_QUERIES[i] if 0 <= i < len(TEXT_QUERIES) else f"Clase_{i}" for i in labels_idx]

    keep = scores >= score_thr
    boxes = boxes[keep]
    scores = scores[keep]
    labels = [labels[i] for i, k in enumerate(keep.tolist()) if k]

    boxes = clip_boxes_xyxy(boxes, W, H)
    boxes, scores, labels = filter_small_boxes(boxes, scores, labels, min_side=min_box_side)
    boxes, scores, labels = topk_per_label(boxes, scores, labels, k=topk_label)
    return {"boxes": boxes, "scores": scores, "labels": labels}

def apply_nms_by_label(results, iou_thr=None):
    if iou_thr is None:
        iou_thr = PARAMS["nms_iou"]
    boxes, scores, labels = results["boxes"], results["scores"], results["labels"]
    if len(boxes) == 0:
        return results
    keep_all = []
    for lab in sorted(set(labels)):
        idxs = [i for i, l in enumerate(labels) if l == lab]
        if not idxs:
            continue
        b = boxes[idxs]
        s = scores[idxs]
        kept = nms(b, s, iou_thr).cpu().numpy().tolist()
        keep_all.extend([idxs[i] for i in kept])
    keep_all = sorted(set(keep_all))
    return {"boxes": boxes[keep_all], "scores": scores[keep_all], "labels": [labels[i] for i in keep_all]}

# ===============================
# 🔹 Inferencia
# ===============================
def run_owlv2_inference(image_paths):
    results_all = []
    for i, path in enumerate(image_paths[:PARAMS["max_images"]]):
        image = Image.open(path).convert("RGB")
        W, H = image.size
        with torch.no_grad():
            inputs = processor(images=image, text=TEXT_QUERIES, return_tensors="pt").to(DEVICE)
            t0 = time.time()
            outputs = model(**inputs)
            t1 = time.time()
        res = postprocess_owlv2(outputs, (H, W))
        res_nms = apply_nms_by_label(res)
        results_all.append({
            "image": path,
            "boxes_xyxy": res_nms["boxes"].numpy().tolist(),
            "labels": res_nms["labels"],
            "scores": res_nms["scores"].numpy().tolist(),
            "time": t1 - t0
        })
        print(f"[{i+1}/{min(PARAMS['max_images'], len(image_paths))}] {os.path.basename(path)} → {len(res_nms['labels'])} dets en {t1 - t0:.2f}s")
        del inputs, outputs
        torch.cuda.empty_cache(); gc.collect()
    return results_all

# ===============================
# 🔹 Visualización
# ===============================
def mostrar_10_resultados_owlv2(resultados):
    for idx, resultado in enumerate(resultados[:10]):
        image = Image.open(resultado["image"]).convert("RGB")
        fig, ax = plt.subplots(1, figsize=(8, 8))
        ax.imshow(image)
        drawn = 0
        for box, label, score in sorted(zip(resultado["boxes_xyxy"], resultado["labels"], resultado["scores"]),
                                        key=lambda x: x[2], reverse=True):
            if score < PARAMS["score_min_draw"]:
                continue
            x1, y1, x2, y2 = box
            rect = patches.Rectangle((x1, y1), x2 - x1, y2 - y1, linewidth=2, edgecolor='lime', facecolor='none')
            ax.add_patch(rect)
            ax.text(x1, max(0, y1 - 5), f"{label} ({score:.2f})", color='black', fontsize=9, backgroundcolor='white')
            drawn += 1
            if drawn >= PARAMS["max_draw"]:
                break
        ax.set_title(f"Imagen {idx+1} — {drawn} dets (≥ {PARAMS['score_min_draw']})")
        plt.axis('off')
        plt.show()
        plt.close(fig)

# ===============================
# 🔹 COCO detections
# ===============================
def to_coco_dets(dets, filtered_images_info, name_to_id, score_min=0.001):
    coco_dets = []
    for img_info, d in zip(filtered_images_info, dets):
        img_id = img_info["id"]
        for (x1, y1, x2, y2), lab, sc in zip(d["boxes_xyxy"], d["labels"], d["scores"]):
            cname = lab.replace("a ", "")
            if sc < score_min or cname not in name_to_id:
                continue
            coco_dets.append({
                "image_id": img_id,
                "category_id": name_to_id[cname],
                "bbox": [float(x1), float(y1), float(x2 - x1), float(y2 - y1)],
                "score": float(sc)
            })
    return coco_dets

# ===============================
# 🔹 Ejecutar inferencia y visualización
# ===============================
owlv2_results = run_owlv2_inference(filtered_image_paths)
#mostrar_10_resultados_owlv2(owlv2_results)

# Guardar resultados
with open("owlv2_results.json", "w") as f:
    json.dump(owlv2_results, f, indent=2)
print("✅ Resultados guardados en owlv2_results.json")

# Evaluar mAP COCO
coco_dets = to_coco_dets(owlv2_results, filtered_images_info[:len(owlv2_results)], name_to_id)
if coco_dets:
    with open("coco_results_owlv2.json", "w") as f:
        json.dump(coco_dets, f)
    coco_dt = coco_gt.loadRes("coco_results_owlv2.json")
    coco_eval = COCOeval(coco_gt, coco_dt, iouType='bbox')
    coco_eval.params.imgIds = filtered_image_ids[:len(owlv2_results)]
    coco_eval.evaluate(); coco_eval.accumulate(); coco_eval.summarize()
else:
    print("⚠️ No hay predicciones para evaluar.")


KeyboardInterrupt: 