
# Лого Т‑банка — детекция (OWLv2 + GroundingDINO + ансамбли) ✅

В этом ноутбуке:
- аккуратные утилиты для чтения **YOLO**‑разметки и рисования боксов;
- единая функция `evaluate_on_dataset()` с метриками **Precision / Recall / F1 / mAP@50 / mAP@50‑95**;
- предикторы **OWLv2** и **Grounding DINO** (базовый + режимы *precise / recall / compromise*);
- ансамбли **consensus / union / WBF**;
- сохранение превью с боксами в подпапки `ans_*` (GT — зелёный, предикты — красный).


In [1]:

# ==== CONFIG (paths and thresholds) ====

# Dataset locations: validation images and the folder with their YOLO label files
PATH_IMAGES = r"datasets/sirius/T-bank_val/val/images"
PATH_LABELS = r"datasets/sirius/T-bank_val/val/labels"

# The only ground-truth class that is evaluated (shield with the letter T)
GT_CLASS_ID = 0

# Shared thresholds
CONF = 0.30   # score threshold for OWLv2
IOU  = 0.50   # IoU threshold used when matching predictions to GT for precision/recall

# Grounding DINO — baseline thresholds
BOX_THR_G  = 0.30
TEXT_THR_G = 0.25

# Text prompts passed to OWLv2
PROMPTS_OWL = [
    "a stylized geometric letter t inside a shield emblem",
    "a minimalist t logo inside a angular shield shape",
    "a modern t letter inside a geometric shield outline",
    "a T-Bank logo with letter t in shield",
    "t letter in white shield"
]

# Text prompts passed to Grounding DINO
# NOTE(review): the second prompt reads "yellowshield" (missing space) — looks like a typo;
# left untouched because changing a prompt changes model output. Confirm and re-evaluate.
PROMPTS_GDINO = (
    "logo with a bold black letter T inside a white shield-like shape",
    "white or yellowshield emblem with a large black T in the center",
    "white badge shaped like a shield with a black capital T",
    "minimalist logo with a bold T inside a shield icon",
    "flat design emblem shaped like a shield containing a yellow or black or white T letter",
    "logo featuring a strong black T on a yellow or white shield background",
    "simplified yellow or black or white shield logo with a single bold yellow or black or white letter T"
)

# Which experiments to run
RUN_OWLV2_COLOR = True      # baseline OWLv2
RUN_GDINO_BASE  = True      # GDINO baseline
RUN_GDINO_PRECISE    = True # GDINO: precision-oriented mode
RUN_GDINO_RECALL     = True # GDINO: recall-oriented (greedy) mode
RUN_GDINO_COMPROMISE = True # GDINO: compromise between the two
RUN_ENSEMBLE_CONSENSUS = True
RUN_ENSEMBLE_UNION     = True
RUN_ENSEMBLE_WBF       = True


In [2]:

# ==== IMPORTS ====
import os, glob, torch, gc
import numpy as np
import pandas as pd
from typing import List, Tuple, Callable
from PIL import Image, ImageDraw, ImageFont
from tqdm.auto import tqdm

from torchvision.ops import box_iou, nms
from torchmetrics.detection.mean_ap import MeanAveragePrecision

from transformers import AutoProcessor, Owlv2ForObjectDetection
from transformers import AutoModelForZeroShotObjectDetection as GDINOModel, AutoProcessor as GDINOProcessor


# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)


  from .autonotebook import tqdm as notebook_tqdm


Device: cuda


In [3]:

# ==== UTILITIES ====

# Image file extensions (lower-case, with the leading dot) that list_images() collects
SUPPORTED_EXTS = {".jpg", ".jpeg", ".png", ".bmp", ".tif", ".tiff", ".webp"}


def gpu_cleanup(sync: bool = False):
    """Run the garbage collector and release cached CUDA memory.

    Args:
        sync: when True, wait for pending CUDA work before emptying the cache.

    Does nothing beyond ``gc.collect()`` when CUDA is not available.
    """
    gc.collect()
    if not torch.cuda.is_available():
        return
    if sync:
        torch.cuda.synchronize()
    torch.cuda.empty_cache()
    # Best effort only: ipc_collect() is not usable on every CUDA build.
    try:
        torch.cuda.ipc_collect()
    except Exception:
        pass


def list_images(root: str, exts=None) -> List[str]:
    """Recursively collect image file paths under ``root``.

    The extension check is case-insensitive, so ``photo.JPG`` is found on
    case-sensitive filesystems as well. Only regular files are returned; a
    directory whose name happens to end in an image extension is ignored.

    Args:
        root: directory to scan (sub-directories included).
        exts: optional iterable of extensions with the leading dot
            (e.g. ``{".jpg", ".png"}``). Defaults to ``SUPPORTED_EXTS``.

    Returns:
        Sorted list of matching paths.
    """
    wanted = {e.lower() for e in (SUPPORTED_EXTS if exts is None else exts)}
    # One traversal of the tree instead of one glob per extension.
    candidates = glob.glob(os.path.join(root, "**", "*"), recursive=True)
    return sorted(
        path
        for path in candidates
        if os.path.splitext(path)[1].lower() in wanted and os.path.isfile(path)
    )

    
def yolo_path_for(img_path: str, labels_root: str) -> str:
    """Return the label file path for an image: ``<labels_root>/<image stem>.txt``."""
    file_name = os.path.basename(img_path)
    base, _ext = os.path.splitext(file_name)
    return os.path.join(labels_root, base + ".txt")


def load_yolo_labels(lbl_path: str, img_wh: Tuple[int, int]):
    """Read a YOLO label file and convert it to absolute corner boxes.

    Each valid line has the form ``cls cx cy w h`` with coordinates
    normalised to [0, 1]. Lines that are blank, do not have exactly five
    fields, or contain non-numeric / non-finite values are skipped, so a
    single corrupt line cannot abort a whole evaluation run.

    Args:
        lbl_path: path to the ``.txt`` label file; a missing file yields
            empty arrays.
        img_wh: image size as ``(width, height)`` in pixels.

    Returns:
        ``(boxes, labels)`` where ``boxes`` is float32 of shape (N, 4) in
        ``x1, y1, x2, y2`` pixel coordinates clamped to the image, and
        ``labels`` is int64 of shape (N,).
    """
    w, h = img_wh
    boxes, labels = [], []
    if os.path.isfile(lbl_path):
        with open(lbl_path, "r", encoding="utf-8") as f:
            for line in f:
                parts = line.split()
                if len(parts) != 5:
                    continue
                try:
                    cls = int(float(parts[0]))
                    xc, yc, bw, bh = map(float, parts[1:])
                except (ValueError, OverflowError):
                    # Same policy as for lines with a wrong field count: skip.
                    continue
                if not np.isfinite([xc, yc, bw, bh]).all():
                    continue
                px, py, pw, ph = xc * w, yc * h, bw * w, bh * h
                # Clamp every corner to the image on both sides.
                x1 = min(max(0.0, px - pw / 2), w)
                y1 = min(max(0.0, py - ph / 2), h)
                x2 = max(min(w, px + pw / 2), 0.0)
                y2 = max(min(h, py + ph / 2), 0.0)
                boxes.append([x1, y1, x2, y2])
                labels.append(cls)
    if not boxes:
        return np.zeros((0, 4), np.float32), np.zeros((0,), np.int64)
    return np.asarray(boxes, np.float32), np.asarray(labels, np.int64)


def measure_text(draw, text, font):
    """Return ``(width, height)`` of ``text``, tolerating different Pillow versions.

    Tries ``draw.textbbox`` first, then ``font.getsize``, and finally falls
    back to a rough estimate of 7 px per character and 10 px height.
    """
    try:
        left, top, right, bottom = draw.textbbox((0, 0), text, font=font)
        return (right - left), (bottom - top)
    except Exception:
        pass
    try:
        return font.getsize(text)
    except Exception:
        return (len(text) * 7, 10)


def pr_counts_torch(pred_boxes, pred_scores, gt_boxes, iou_thr=0.5):
    """Count TP / FP / FN for one image using greedy IoU matching.

    Predictions are visited in order of decreasing score; each one claims the
    still-unmatched ground-truth box with the highest IoU and counts as a true
    positive when that IoU is at least ``iou_thr``, otherwise as a false
    positive. Ground-truth boxes left unmatched are false negatives.

    Returns:
        ``(tp, fp, fn)`` as integers.
    """
    preds = torch.as_tensor(pred_boxes, dtype=torch.float32)
    gts = torch.as_tensor(gt_boxes, dtype=torch.float32)
    no_preds = preds.numel() == 0
    no_gts = gts.numel() == 0
    if no_preds and no_gts:
        return 0, 0, 0
    if no_preds:
        return 0, 0, gts.shape[0]
    if no_gts:
        return 0, preds.shape[0], 0

    ranking = torch.argsort(torch.as_tensor(pred_scores), descending=True)
    iou_matrix = box_iou(preds[ranking], gts)
    taken = torch.zeros(gts.shape[0], dtype=torch.bool)
    true_pos = 0
    false_pos = 0
    for candidate in iou_matrix:
        masked = candidate.clone()
        masked[taken] = -1  # already matched GT boxes can never win
        top_iou, top_idx = masked.max(0)
        if top_iou >= iou_thr:
            taken[top_idx] = True
            true_pos += 1
        else:
            false_pos += 1
    false_neg = int((~taken).sum().item())
    return int(true_pos), int(false_pos), false_neg

    
def draw_and_save(img: Image.Image, gt_boxes: np.ndarray, pred_boxes: np.ndarray, pred_scores: np.ndarray, save_path: str):
    """Draw ground-truth and predicted boxes on a copy of ``img`` and save it.

    Ground-truth boxes are green, enlarged by 5 px and drawn with a thicker
    outline so they stay visible under overlapping predictions; they carry a
    "GT" tag. Predicted boxes are red and tagged with their score on a yellow
    background.

    Args:
        img: source image; it is not modified.
        gt_boxes: iterable of ``x1, y1, x2, y2`` ground-truth boxes.
        pred_boxes: iterable of ``x1, y1, x2, y2`` predicted boxes.
        pred_scores: one score per predicted box.
        save_path: destination file; its parent directory is created when
            the path has one.
    """
    out = img.copy()
    try:
        W, H = out.size
        lw_pred = max(2, int(round(0.004 * 0.5 * (W + H))))  # base line width scales with image size
        lw_gt = lw_pred + 4                                  # thicker, so GT is not hidden by predictions
        PAD = 5                                              # GT boxes are enlarged by this many pixels

        def clamp_box(x1, y1, x2, y2):
            """Round to integer pixels inside the image and keep the box non-degenerate."""
            x1 = max(0, min(W - 1, int(round(x1))))
            y1 = max(0, min(H - 1, int(round(y1))))
            x2 = max(0, min(W - 1, int(round(x2))))
            y2 = max(0, min(H - 1, int(round(y2))))
            if x2 <= x1:
                x2 = min(W - 1, x1 + 1)
            if y2 <= y1:
                y2 = min(H - 1, y1 + 1)
            return x1, y1, x2, y2

        draw = ImageDraw.Draw(out)
        font = ImageFont.load_default()

        def draw_box(box, pad, outline, width, tag, tag_fill):
            """Draw one rectangle plus a filled tag just above its top-left corner."""
            x1, y1, x2, y2 = [float(v) for v in box]
            x1, y1, x2, y2 = clamp_box(x1 - pad, y1 - pad, x2 + pad, y2 + pad)
            draw.rectangle([(x1, y1), (x2, y2)], outline=outline, width=width)
            tw, th = measure_text(draw, tag, font)
            tx, ty = x1, max(0, y1 - th - 2)
            draw.rectangle([(tx, ty), (tx + tw + 2, ty + th + 2)], fill=tag_fill)
            draw.text((tx + 1, ty + 1), tag, fill=(0, 0, 0), font=font)

        # Ground truth (green)
        for gb in gt_boxes:
            draw_box(gb, PAD, (0, 255, 0), lw_gt, "GT", (0, 255, 0))

        # Predictions (red outline, yellow score tag)
        for b, s in zip(pred_boxes, pred_scores):
            draw_box(b, 0, (255, 0, 0), lw_pred, f"{s:.2f}", (255, 255, 0))

        # os.makedirs("") raises, so only create a directory when there is one.
        target_dir = os.path.dirname(save_path)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)
        out.save(save_path)
    finally:
        out.close()


In [4]:

# ==== DATASET ====
# Collect every image under PATH_IMAGES once; evaluate_on_dataset() iterates this list.
img_paths = list_images(PATH_IMAGES)
# Fail fast when the folder is empty or the path is wrong.
assert img_paths, f"Нет изображений по пути: {PATH_IMAGES}"
len(img_paths)

146

In [5]:

# ==== EVALUATION (Precision / Recall / F1 / mAP) ====

def evaluate_on_dataset(predict_fn: Callable, run_name: str, out_dir: str, save_images: bool=True):
    """Run a predictor over ``img_paths``, compute metrics and save previews.

    Args:
        predict_fn: callable taking a PIL RGB image and returning
            ``(boxes, scores)`` with boxes as ``x1, y1, x2, y2`` pixels.
        run_name: label used for the progress bar, the printed report and
            the preview file names.
        out_dir: folder that receives the preview images.
        save_images: when True, a preview with GT and predicted boxes is
            written for every evaluated image.

    Returns:
        Dict with the run name, number of evaluated images, the thresholds
        and precision / recall / F1 / mAP@50 / mAP@50-95.

    Images that cannot be opened are reported and excluded from both the
    metrics and the ``images`` count.
    """
    map_metric = MeanAveragePrecision(iou_type="bbox")
    TP = FP = FN = 0
    skipped = []

    for img_path in tqdm(img_paths, desc=run_name, unit="img"):
        try:
            with Image.open(img_path) as src:
                img = src.convert("RGB")
        except Exception as exc:
            # Keep going, but never hide that part of the dataset was dropped.
            skipped.append(img_path)
            print(f"[{run_name}] skipped unreadable image {img_path}: {exc}")
            continue

        try:
            W, H = img.size

            # Ground truth: keep only the target class
            gt_boxes, gt_labels = load_yolo_labels(yolo_path_for(img_path, PATH_LABELS), (W, H))
            keep = (gt_labels == GT_CLASS_ID)
            gt_boxes = gt_boxes[keep]; gt_labels = gt_labels[keep]

            # Prediction; normalise shape/dtype so an empty or float64 result is accepted
            boxes, scores = predict_fn(img)
            boxes = np.ascontiguousarray(np.asarray(boxes, dtype=np.float32).reshape(-1, 4))
            scores = np.ascontiguousarray(np.asarray(scores, dtype=np.float32).reshape(-1))
            labels = np.zeros((boxes.shape[0],), dtype=np.int64)  # single class

            # mAP (torchmetrics expects tensors)
            map_metric.update(
                [{ "boxes": torch.from_numpy(boxes),
                   "scores": torch.from_numpy(scores),
                   "labels": torch.from_numpy(labels)}],
                [{ "boxes": torch.from_numpy(gt_boxes),
                   "labels": torch.from_numpy(gt_labels)}]
            )

            # Precision / recall counters
            tp, fp, fn = pr_counts_torch(boxes, scores, gt_boxes, IOU)
            TP += tp; FP += fp; FN += fn

            # Preview
            if save_images:
                stem, ext = os.path.splitext(os.path.basename(img_path))
                save_path = os.path.join(out_dir, f"{run_name.lower()}_{stem}{ext}")
                draw_and_save(img, gt_boxes, boxes, scores, save_path)
        finally:
            # Release the image even when the predictor or a metric raises.
            img.close()

    if skipped:
        print(f"[{run_name}] {len(skipped)} of {len(img_paths)} images were skipped")

    # Summary
    res_map = map_metric.compute()
    precision = TP / max(1, TP + FP)
    recall    = TP / max(1, TP + FN)
    f1        = 0.0 if (precision + recall) == 0 else (2 * precision * recall) / (precision + recall)

    summary = {
        "run": run_name,
        "images": len(img_paths) - len(skipped),
        "conf": CONF,
        "iou_eval": IOU,
        "precision": float(precision),
        "recall": float(recall),
        "f1": float(f1),
        "mAP@50": float(res_map["map_50"]),
        "mAP@50-95": float(res_map["map"]),
    }

    print(f"\n===== {run_name} =====")
    print(f"Images: {summary['images']}  Conf>= {CONF}  IoU>= {IOU}")
    print(f"Precision:  {summary['precision']:.4f}")
    print(f"Recall:     {summary['recall']:.4f}")
    print(f"F1@{IOU}:   {summary['f1']:.4f}")
    print(f"mAP@50:     {summary['mAP@50']:.4f}")
    print(f"mAP@50-95:  {summary['mAP@50-95']:.4f}")
    return summary


In [6]:

# ==== OWLv2 MODEL ====
# Load the OWLv2 processor and detector once; the model is moved to `device` in eval mode.
processor_owl = AutoProcessor.from_pretrained("google/owlv2-large-patch14-ensemble")
model_owl = Owlv2ForObjectDetection.from_pretrained("google/owlv2-large-patch14-ensemble").to(device).eval()

@torch.inference_mode()
def owlv2_predict_color(img: Image.Image):
    """Detect the logo with OWLv2 using PROMPTS_OWL and the CONF score threshold.

    Returns:
        ``(boxes, scores)`` as float32 numpy arrays; boxes are rescaled by the
        processor to the ``(height, width)`` of ``img``.
    """
    width, height = img.size
    batch = processor_owl(text=[PROMPTS_OWL], images=img, return_tensors="pt").to(device)
    raw = model_owl(**batch)
    sizes = torch.tensor([(height, width)], device=device)
    detections = processor_owl.post_process_object_detection(
        outputs=raw, threshold=CONF, target_sizes=sizes
    )[0]

    def to_numpy(tensor):
        return tensor.detach().cpu().numpy().astype(np.float32)

    return to_numpy(detections["boxes"]), to_numpy(detections["scores"])


Fetching 1 files: 100%|████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 1000.31it/s]
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Fetching 1 files: 100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:00<?, ?it/s]


In [7]:

# ==== Grounding DINO MODEL (baseline) ====
# Load the processor and the detector once; the model is moved to `device` in eval mode.
gdino_model_id = "IDEA-Research/grounding-dino-base"
processor_g = GDINOProcessor.from_pretrained(gdino_model_id)
model_g     = GDINOModel.from_pretrained(gdino_model_id).to(device).eval()

@torch.inference_mode()
def gdino_predict(img: Image.Image):
    """Baseline Grounding DINO predictor using BOX_THR_G / TEXT_THR_G.

    Returns:
        ``(boxes, scores)`` as float32 numpy arrays for the given image.
    """
    width, height = img.size
    batch = processor_g(images=img, text=[PROMPTS_GDINO], return_tensors="pt").to(device)
    raw = model_g(**batch)
    detections = processor_g.post_process_grounded_object_detection(
        outputs=raw,
        input_ids=batch.input_ids,
        threshold=BOX_THR_G,
        text_threshold=TEXT_THR_G,
        target_sizes=[(height, width)],
    )[0]
    # Missing keys are treated as "no detections".
    found_boxes = detections.get("boxes", torch.empty((0,4)))
    found_scores = detections.get("scores", torch.empty((0,)))
    boxes = found_boxes.detach().cpu().numpy().astype(np.float32)
    scores = found_scores.detach().cpu().numpy().astype(np.float32)
    return boxes, scores


Fetching 1 files: 100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:00<?, ?it/s]
Fetching 1 files: 100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:00<?, ?it/s]


In [8]:

# ==== Grounding DINO — extended modes (precise / recall / compromise) ====
# Idea:
#  - precise: higher thresholds + strict shape filter + NMS → higher precision
#  - recall:  soft thresholds (2 passes, union) + loose shape filter + NMS → higher recall
#  - compromise: union of precise and recall, followed by NMS for balance


# Threshold parameters
BOX_THR_G_P1, TEXT_THR_G_P1 = 0.35, 0.25  # precise, first pass
BOX_THR_G_P2, TEXT_THR_G_P2 = 0.30, 0.22  # precise, fallback pass (used when the first pass finds nothing)

BOX_THR_G_R1, TEXT_THR_G_R1 = 0.30, 0.25  # recall, first pass
BOX_THR_G_R2, TEXT_THR_G_R2 = 0.25, 0.20  # recall, second pass (even softer)

# Shape filters: allowed width/height ratio and allowed box area as a fraction of the image area
ASPECT_MIN_P, ASPECT_MAX_P   = 0.75, 1.35
AREA_MIN_FR_P, AREA_MAX_FR_P = 0.00025, 0.14

ASPECT_MIN_R, ASPECT_MAX_R   = 0.65, 1.45
AREA_MIN_FR_R, AREA_MAX_FR_R = 0.00015, 0.20

# NMS IoU threshold and cap on the number of detections per image
NMS_IOU_PRE, MAX_DETS_PRE = 0.55, 50
NMS_IOU_REC, MAX_DETS_REC = 0.60, 80

def _gdino_pass(img, box_thr, text_thr):
    """Run Grounding DINO once on ``img`` with the given box/text thresholds.

    Returns:
        ``(boxes, scores)`` as float32 numpy arrays.
    """
    width, height = img.size
    batch = processor_g(images=img, text=[PROMPTS_GDINO], return_tensors="pt").to(device)
    with torch.inference_mode():
        raw = model_g(**batch)
    detections = processor_g.post_process_grounded_object_detection(
        outputs=raw,
        input_ids=batch.input_ids,
        threshold=box_thr,
        text_threshold=text_thr,
        target_sizes=[(height, width)],
    )[0]
    # Missing keys are treated as "no detections".
    found_boxes = detections.get("boxes", torch.empty((0,4)))
    found_scores = detections.get("scores", torch.empty((0,)))
    boxes = found_boxes.detach().cpu().numpy().astype(np.float32)
    scores = found_scores.detach().cpu().numpy().astype(np.float32)
    return boxes, scores

def _shape_filter(boxes, scores, W, H, a_min, a_max, fr_min, fr_max):
    if len(boxes) == 0: return boxes, scores
    img_area = float(W * H)
    keep = []
    for i, b in enumerate(boxes):
        x1, y1, x2, y2 = map(float, b)
        w = max(1.0, x2 - x1); h = max(1.0, y2 - y1)
        ar = w / h
        fr = (w * h) / img_area
        if a_min <= ar <= a_max and fr_min <= fr <= fr_max:
            keep.append(i)
    if not keep:
        return np.zeros((0,4), np.float32), np.zeros((0,), np.float32)
    keep = np.asarray(keep, int)
    return boxes[keep], scores[keep]

@torch.inference_mode()
def gdino_predict_precise(img: Image.Image):
    """Precision-oriented Grounding DINO predictor.

    Runs the strict pass; only when it finds nothing, runs the fallback pass.
    The result goes through the strict shape filter, NMS (NMS_IOU_PRE) and is
    capped at MAX_DETS_PRE detections.
    """
    width, height = img.size
    cand_boxes, cand_scores = _gdino_pass(img, BOX_THR_G_P1, TEXT_THR_G_P1)
    if len(cand_boxes) == 0:
        cand_boxes, cand_scores = _gdino_pass(img, BOX_THR_G_P2, TEXT_THR_G_P2)

    cand_boxes, cand_scores = _shape_filter(
        cand_boxes, cand_scores, width, height,
        ASPECT_MIN_P, ASPECT_MAX_P, AREA_MIN_FR_P, AREA_MAX_FR_P,
    )
    if len(cand_boxes) == 0:
        return cand_boxes, cand_scores

    survivors = nms(
        torch.from_numpy(cand_boxes), torch.from_numpy(cand_scores), iou_threshold=NMS_IOU_PRE
    ).numpy()
    cand_boxes, cand_scores = cand_boxes[survivors], cand_scores[survivors]
    if len(cand_boxes) > MAX_DETS_PRE:
        top = np.argsort(-cand_scores)[:MAX_DETS_PRE]
        cand_boxes, cand_scores = cand_boxes[top], cand_scores[top]
    return cand_boxes, cand_scores

@torch.inference_mode()
def gdino_predict_recall(img: Image.Image):
    """Recall-oriented Grounding DINO predictor.

    Runs two passes with soft thresholds, merges their detections, applies
    the loose shape filter, NMS (NMS_IOU_REC) and caps the result at
    MAX_DETS_REC detections.
    """
    # NOTE(review): both passes see the same image and prompts and differ only in
    # thresholds, so the second pass presumably returns a superset of the first;
    # if confirmed, the first pass could be dropped to halve inference time.
    width, height = img.size
    passes = [
        _gdino_pass(img, BOX_THR_G_R1, TEXT_THR_G_R1),
        _gdino_pass(img, BOX_THR_G_R2, TEXT_THR_G_R2),
    ]
    non_empty = [(bx, sc) for bx, sc in passes if len(bx) != 0]
    if not non_empty:
        merged_boxes, merged_scores = passes[0]
    elif len(non_empty) == 1:
        merged_boxes, merged_scores = non_empty[0]
    else:
        merged_boxes = np.vstack([bx for bx, _ in non_empty])
        merged_scores = np.hstack([sc for _, sc in non_empty])

    merged_boxes, merged_scores = _shape_filter(
        merged_boxes, merged_scores, width, height,
        ASPECT_MIN_R, ASPECT_MAX_R, AREA_MIN_FR_R, AREA_MAX_FR_R,
    )
    if len(merged_boxes) == 0:
        return merged_boxes, merged_scores

    survivors = nms(
        torch.from_numpy(merged_boxes), torch.from_numpy(merged_scores), iou_threshold=NMS_IOU_REC
    ).numpy()
    merged_boxes, merged_scores = merged_boxes[survivors], merged_scores[survivors]
    if len(merged_boxes) > MAX_DETS_REC:
        top = np.argsort(-merged_scores)[:MAX_DETS_REC]
        merged_boxes, merged_scores = merged_boxes[top], merged_scores[top]
    return merged_boxes, merged_scores

@torch.inference_mode()
def gdino_predict_compromise(img: Image.Image):
    """Union of the precise and recall predictors followed by NMS at IoU 0.55."""
    precise_boxes, precise_scores = gdino_predict_precise(img)
    recall_boxes, recall_scores = gdino_predict_recall(img)

    has_precise = len(precise_boxes) != 0
    has_recall = len(recall_boxes) != 0
    if not has_precise and not has_recall:
        return precise_boxes, precise_scores

    if has_precise and has_recall:
        merged_boxes = np.vstack([precise_boxes, recall_boxes])
        merged_scores = np.hstack([precise_scores, recall_scores])
    elif has_recall:
        merged_boxes, merged_scores = recall_boxes, recall_scores
    else:
        merged_boxes, merged_scores = precise_boxes, precise_scores

    if len(merged_boxes):
        survivors = nms(
            torch.from_numpy(merged_boxes), torch.from_numpy(merged_scores), iou_threshold=0.55
        ).numpy()
        merged_boxes, merged_scores = merged_boxes[survivors], merged_scores[survivors]
    return merged_boxes, merged_scores


In [9]:

# ==== OWLv2 + GDINO ensembles (consensus / union / WBF) ====

# Ensemble knobs
ENSEMBLE_IOU      = 0.50   # IoU at which boxes from the two models count as the same object (consensus/WBF)
NMS_IOU_FINAL     = 0.50   # IoU threshold of the final NMS
MAX_DETS_ENSEMBLE = 80     # cap on detections per image after the final NMS

# Model weights for WBF (OWL is trusted slightly more for precision)
W_OWL, W_GDINO = 1.00, 0.85

# Which GDINO predictor the ensembles use: 'base' / 'precise' / 'recall' / 'compromise'
ENSEMBLE_GDINO_FN = 'recall'

def _get_gdino_for_ensemble():
    """Return the GDINO predictor selected by ENSEMBLE_GDINO_FN (baseline for any other value)."""
    by_mode = {
        'precise': gdino_predict_precise,
        'recall': gdino_predict_recall,
        'compromise': gdino_predict_compromise,
    }
    return by_mode.get(ENSEMBLE_GDINO_FN, gdino_predict)

def _final_nms(B, S):
    """Apply NMS at NMS_IOU_FINAL and keep at most MAX_DETS_ENSEMBLE top-scoring boxes.

    Empty input is returned unchanged.
    """
    if len(B) == 0:
        return B, S
    survivors = nms(torch.from_numpy(B), torch.from_numpy(S), iou_threshold=NMS_IOU_FINAL).numpy()
    kept_boxes, kept_scores = B[survivors], S[survivors]
    if len(kept_boxes) <= MAX_DETS_ENSEMBLE:
        return kept_boxes, kept_scores
    top = np.argsort(-kept_scores)[:MAX_DETS_ENSEMBLE]
    return kept_boxes[top], kept_scores[top]

def _consensus_ensemble(img: Image.Image):
    """Keep only the OWL boxes that GDINO confirms with IoU >= ENSEMBLE_IOU.

    Each kept box is the OWL box; its score is the larger of the OWL score
    and the score of its best-overlapping GDINO box. The result goes through
    the final NMS.
    """
    empty = (np.zeros((0,4), np.float32), np.zeros((0,), np.float32))
    b_owl, s_owl = owlv2_predict_color(img)
    b_g, s_g = _get_gdino_for_ensemble()(img)
    if len(b_owl) == 0 or len(b_g) == 0:
        return empty

    overlap = box_iou(torch.tensor(b_owl), torch.tensor(b_g)).numpy()
    best_partner = overlap.argmax(axis=1)
    confirmed = [
        (row, int(col))
        for row, col in enumerate(best_partner)
        if overlap[row, col] >= ENSEMBLE_IOU
    ]
    if not confirmed:
        return empty

    rows = np.asarray([row for row, _ in confirmed], int)
    fused_scores = np.asarray(
        [float(max(s_owl[row], s_g[col])) for row, col in confirmed], np.float32
    )
    return _final_nms(b_owl[rows], fused_scores)

def _union_ensemble(img: Image.Image):
    """Pool the detections of OWL and GDINO and run the final NMS on them."""
    b_owl, s_owl = owlv2_predict_color(img)
    b_g, s_g = _get_gdino_for_ensemble()(img)

    owl_empty = len(b_owl) == 0
    gdino_empty = len(b_g) == 0
    if owl_empty and gdino_empty:
        return b_owl, s_owl
    if owl_empty:
        return _final_nms(b_g, s_g)
    if gdino_empty:
        return _final_nms(b_owl, s_owl)

    pooled_boxes = np.vstack([b_owl, b_g])
    pooled_scores = np.hstack([s_owl, s_g])
    return _final_nms(pooled_boxes, pooled_scores)

def _wbf_ensemble(img: Image.Image):
    """Простой Weighted Boxes Fusion по IoU≥ENSEMBLE_IOU."""
    b_owl, s_owl = owlv2_predict_color(img)
    b_g,   s_g   = _get_gdino_for_ensemble()(img)

    if len(b_owl)==0 and len(b_g)==0:
        return np.zeros((0,4), np.float32), np.zeros((0,), np.float32)

    B = []; S = []; Wt = []
    if len(b_owl):
        B.append(b_owl); S.append(s_owl); Wt.append(np.full_like(s_owl, W_OWL, dtype=np.float32))
    if len(b_g):
        B.append(b_g);   S.append(s_g);   Wt.append(np.full_like(s_g,   W_GDINO, dtype=np.float32))
    B = np.vstack(B)
    S = np.hstack(S).astype(np.float32)
    Wt = np.hstack(Wt).astype(np.float32)

    clusters = []  # элементы: {'box': np.array(4), 'score': float, 'weight': float}
    for i in np.argsort(-S):
        bi, si, wi = B[i], S[i], Wt[i]
        matched = False
        for c in clusters:
            xA = max(c['box'][0], bi[0]); yA = max(c['box'][1], bi[1])
            xB = min(c['box'][2], bi[2]); yB = min(c['box'][3], bi[3])
            inter = max(0.0, xB - xA) * max(0.0, yB - yA)
            if inter <= 0: 
                continue
            area_c = max(1.0, (c['box'][2]-c['box'][0])*(c['box'][3]-c['box'][1]))
            area_i = max(1.0, (bi[2]-bi[0])*(bi[3]-bi[1]))
            iou = inter / (area_c + area_i - inter)
            if iou >= ENSEMBLE_IOU:
                total_w  = c['weight'] + si*wi
                c['box']   = (c['box']*c['weight'] + bi*(si*wi)) / total_w
                c['score'] = (c['score']*c['weight'] + si*wi) / total_w
                c['weight']= total_w
                matched = True
                break
        if not matched:
            clusters.append({'box': bi.copy(), 'score': si*wi, 'weight': si*wi})

    if not clusters:
        return np.zeros((0,4), np.float32), np.zeros((0,), np.float32)

    Bf = np.vstack([c['box'] for c in clusters]).astype(np.float32)
    Sf = np.array([c['score']/max(1e-6, c['weight']) for c in clusters], dtype=np.float32)

    return _final_nms(Bf, Sf)


In [10]:

# ==== RUN ====
results = []

# One row per single-model experiment: (enabled flag, preview sub-folder, predictor, run name).
# Previews are written next to the images folder, in ans_* sub-folders.
_single_model_runs = [
    (RUN_OWLV2_COLOR,      "ans_owlv2_color",      owlv2_predict_color,      "OWLv2_COLOR"),
    (RUN_GDINO_BASE,       "ans_gdino_base",       gdino_predict,            "GroundingDINO_BASE"),
    (RUN_GDINO_PRECISE,    "ans_gdino_precise",    gdino_predict_precise,    "GroundingDINO_PRECISE"),
    (RUN_GDINO_RECALL,     "ans_gdino_recall",     gdino_predict_recall,     "GroundingDINO_RECALL"),
    (RUN_GDINO_COMPROMISE, "ans_gdino_compromise", gdino_predict_compromise, "GroundingDINO_COMPROMISE"),
]

for _enabled, _folder, _predictor, _name in _single_model_runs:
    if not _enabled:
        continue
    out_dir = os.path.join(os.path.dirname(PATH_IMAGES), _folder)
    os.makedirs(out_dir, exist_ok=True)
    results.append(evaluate_on_dataset(_predictor, _name, out_dir, save_images=True))
    gpu_cleanup()  # free cached GPU memory between experiments


OWLv2_COLOR: 100%|██████████████████████████████████████████████████████████████████| 146/146 [02:08<00:00,  1.14img/s]



===== OWLv2_COLOR =====
Images: 146  Conf>= 0.3  IoU>= 0.5
Precision:  0.7324
Recall:     0.4333
F1@0.5:   0.5445
mAP@50:     0.3742
mAP@50-95:  0.3443


GroundingDINO_BASE: 100%|███████████████████████████████████████████████████████████| 146/146 [00:38<00:00,  3.80img/s]



===== GroundingDINO_BASE =====
Images: 146  Conf>= 0.3  IoU>= 0.5
Precision:  0.1399
Recall:     0.8917
F1@0.5:   0.2418
mAP@50:     0.2369
mAP@50-95:  0.1956


GroundingDINO_PRECISE: 100%|████████████████████████████████████████████████████████| 146/146 [00:42<00:00,  3.43img/s]



===== GroundingDINO_PRECISE =====
Images: 146  Conf>= 0.3  IoU>= 0.5
Precision:  0.4134
Recall:     0.6167
F1@0.5:   0.4950
mAP@50:     0.2918
mAP@50-95:  0.2429


GroundingDINO_RECALL: 100%|█████████████████████████████████████████████████████████| 146/146 [01:18<00:00,  1.85img/s]



===== GroundingDINO_RECALL =====
Images: 146  Conf>= 0.3  IoU>= 0.5
Precision:  0.2367
Recall:     0.8500
F1@0.5:   0.3702
mAP@50:     0.3533
mAP@50-95:  0.2872


GroundingDINO_COMPROMISE: 100%|█████████████████████████████████████████████████████| 146/146 [02:06<00:00,  1.15img/s]


===== GroundingDINO_COMPROMISE =====
Images: 146  Conf>= 0.3  IoU>= 0.5
Precision:  0.2383
Recall:     0.8500
F1@0.5:   0.3723
mAP@50:     0.3533
mAP@50-95:  0.2872





In [11]:

# ==== ENSEMBLES: runs ====
# One row per ensemble experiment: (enabled flag, preview sub-folder, predictor, run name).
_ensemble_runs = [
    (RUN_ENSEMBLE_CONSENSUS, "ans_ensemble_consensus", _consensus_ensemble, "ENSEMBLE_CONSENSUS"),
    (RUN_ENSEMBLE_UNION,     "ans_ensemble_union",     _union_ensemble,     "ENSEMBLE_UNION"),
    (RUN_ENSEMBLE_WBF,       "ans_ensemble_wbf",       _wbf_ensemble,       "ENSEMBLE_WBF"),
]

for _enabled, _folder, _predictor, _name in _ensemble_runs:
    if not _enabled:
        continue
    out_dir = os.path.join(os.path.dirname(PATH_IMAGES), _folder)
    os.makedirs(out_dir, exist_ok=True)
    results.append(evaluate_on_dataset(_predictor, _name, out_dir, save_images=True))
    gpu_cleanup()  # free cached GPU memory between experiments

results


ENSEMBLE_CONSENSUS: 100%|███████████████████████████████████████████████████████████| 146/146 [03:29<00:00,  1.43s/img]



===== ENSEMBLE_CONSENSUS =====
Images: 146  Conf>= 0.3  IoU>= 0.5
Precision:  0.8197
Recall:     0.4167
F1@0.5:   0.5525
mAP@50:     0.3652
mAP@50-95:  0.3367


ENSEMBLE_UNION: 100%|███████████████████████████████████████████████████████████████| 146/146 [03:29<00:00,  1.44s/img]



===== ENSEMBLE_UNION =====
Images: 146  Conf>= 0.3  IoU>= 0.5
Precision:  0.2402
Recall:     0.8667
F1@0.5:   0.3761
mAP@50:     0.4038
mAP@50-95:  0.3457


ENSEMBLE_WBF: 100%|█████████████████████████████████████████████████████████████████| 146/146 [03:29<00:00,  1.43s/img]


===== ENSEMBLE_WBF =====
Images: 146  Conf>= 0.3  IoU>= 0.5
Precision:  0.2402
Recall:     0.8667
F1@0.5:   0.3761
mAP@50:     0.2686
mAP@50-95:  0.2128





[{'run': 'OWLv2_COLOR',
  'images': 146,
  'conf': 0.3,
  'iou_eval': 0.5,
  'precision': 0.7323943661971831,
  'recall': 0.43333333333333335,
  'f1': 0.5445026178010471,
  'mAP@50': 0.37418845295906067,
  'mAP@50-95': 0.34431517124176025},
 {'run': 'GroundingDINO_BASE',
  'images': 146,
  'conf': 0.3,
  'iou_eval': 0.5,
  'precision': 0.13986928104575164,
  'recall': 0.8916666666666667,
  'f1': 0.24180790960451978,
  'mAP@50': 0.2368956059217453,
  'mAP@50-95': 0.19561053812503815},
 {'run': 'GroundingDINO_PRECISE',
  'images': 146,
  'conf': 0.3,
  'iou_eval': 0.5,
  'precision': 0.4134078212290503,
  'recall': 0.6166666666666667,
  'f1': 0.49498327759197325,
  'mAP@50': 0.29181620478630066,
  'mAP@50-95': 0.24289314448833466},
 {'run': 'GroundingDINO_RECALL',
  'images': 146,
  'conf': 0.3,
  'iou_eval': 0.5,
  'precision': 0.23665893271461716,
  'recall': 0.85,
  'f1': 0.3702359346642468,
  'mAP@50': 0.3533213138580322,
  'mAP@50-95': 0.2871873676776886},
 {'run': 'GroundingDINO_CO

In [12]:

# ==== SUMMARY ====
if not results:
    print("NO DATA")
else:
    df = pd.DataFrame(results)
    # Show the runs ranked by F1, then mAP@50-95, then mAP@50 (best first)
    shown_columns = ["run","precision","recall","f1","mAP@50","mAP@50-95","images","conf","iou_eval"]
    ranked = df[shown_columns].sort_values(["f1","mAP@50-95","mAP@50"], ascending=False)
    display(ranked)
    # The CSV keeps the original (unsorted) run order
    out_csv = os.path.join(os.path.dirname(PATH_IMAGES), "evaluation_summary_with_f1_and_ensembles.csv")
    df.to_csv(out_csv, index=False)
    print(f"Saved summary to: {out_csv}")


Unnamed: 0,run,precision,recall,f1,mAP@50,mAP@50-95,images,conf,iou_eval
5,ENSEMBLE_CONSENSUS,0.819672,0.416667,0.552486,0.3652,0.336693,146,0.3,0.5
0,OWLv2_COLOR,0.732394,0.433333,0.544503,0.374188,0.344315,146,0.3,0.5
2,GroundingDINO_PRECISE,0.413408,0.616667,0.494983,0.291816,0.242893,146,0.3,0.5
6,ENSEMBLE_UNION,0.240185,0.866667,0.37613,0.403787,0.345705,146,0.3,0.5
7,ENSEMBLE_WBF,0.240185,0.866667,0.37613,0.268609,0.212821,146,0.3,0.5
4,GroundingDINO_COMPROMISE,0.238318,0.85,0.372263,0.353321,0.287187,146,0.3,0.5
3,GroundingDINO_RECALL,0.236659,0.85,0.370236,0.353321,0.287187,146,0.3,0.5
1,GroundingDINO_BASE,0.139869,0.891667,0.241808,0.236896,0.195611,146,0.3,0.5


Saved summary to: datasets/sirius/T-bank_val/val\evaluation_summary_with_f1_and_ensembles.csv
