In [23]:
import torch
from PIL import Image, ImageDraw, ImageFont

from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection

In [24]:
# ---- Config ----
MODEL_ID = "IDEA-Research/grounding-dino-base"  # common baseline checkpoint

# Important: prompts are typically lowercase and end with a period.
# Multiple phrases can be provided, separated by periods.
TEXT_PROMPT = "plastic pen cap."

BOX_THRESHOLD = 0.25   # raise to reduce false positives
TEXT_THRESHOLD = 0.25  # raise to be stricter about matching words

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [33]:
# Input files, listed in the order they are processed and reported.
images_paths_in_order = [
    'images/1.jpeg',
    'images/2.jpeg',
    'images/3.jpeg',
    'images/4.jpeg',
    'images/5.jpeg',
]
# Open every file and convert it to RGB mode, keeping the same order.
images_in_order = [
    Image.open(path).convert("RGB")
    for path in images_paths_in_order
]

In [26]:
# ---- Load model + processor ----
# Both objects come from the same checkpoint id.
processor = AutoProcessor.from_pretrained(MODEL_ID)
model = AutoModelForZeroShotObjectDetection.from_pretrained(MODEL_ID)
# Move the model to the device selected in the config cell.
model = model.to(device)

In [31]:
def _get_text_size(draw, text, font):
    try:
        bbox = draw.textbbox((0, 0), text, font=font)
        return bbox[2] - bbox[0], bbox[3] - bbox[1]
    except Exception:
        pass
    try:
        bbox = font.getbbox(text)
        return bbox[2] - bbox[0], bbox[3] - bbox[1]
    except Exception:
        pass
    try:
        return font.getsize(text)
    except Exception:
        pass
    try:
        mask = font.getmask(text)
        return mask.size
    except Exception:
        pass
    return (len(text) * 6, 11)

In [27]:
def get_object_bounding_box(images, text_prompt, box_threshold=None, text_threshold=None):
    """Detect ``text_prompt`` in each image and keep only the best-scoring box.

    Relies on the notebook-level ``processor``, ``model`` and ``device``.

    Args:
        images: Iterable of images. Each one is passed to the processor and
            must expose ``.size`` as ``(width, height)``.
        text_prompt: Text query handed to the processor.
        box_threshold: Value passed as ``threshold`` to the post-processing
            step. Defaults to the notebook-level ``BOX_THRESHOLD``.
        text_threshold: Value passed as ``text_threshold`` to the
            post-processing step. Defaults to the notebook-level
            ``TEXT_THRESHOLD``.

    Returns:
        A list with one entry per input image, in input order. An entry is
        ``None`` when post-processing returned no boxes; otherwise it is a
        dict with keys ``'label'``, ``'score'`` (float) and ``'box'``
        (the selected box converted with ``.tolist()``).
    """
    if box_threshold is None:
        box_threshold = BOX_THRESHOLD
    if text_threshold is None:
        text_threshold = TEXT_THRESHOLD

    top_detections = []
    for image in images:
        inputs = processor(images=image, text=text_prompt, return_tensors="pt").to(device)

        # Inference only: no gradients are needed.
        with torch.no_grad():
            outputs = model(**inputs)

        results = processor.post_process_grounded_object_detection(
            outputs=outputs,
            input_ids=inputs["input_ids"],
            threshold=box_threshold,
            text_threshold=text_threshold,
            target_sizes=[image.size[::-1]],  # (height, width)
        )

        detections = results[0]
        boxes = detections["boxes"]      # (N, 4) in xyxy
        scores = detections["scores"]    # (N,)
        # Prefer the "text_labels" key when the result provides it and fall
        # back to "labels" otherwise. NOTE(review): newer transformers
        # releases deprecate string labels under "labels" - confirm against
        # the installed version.
        label_key = "text_labels" if "text_labels" in detections else "labels"
        labels = detections[label_key]

        # Keep only the single highest-scoring detection (None if there are none).
        if len(boxes) == 0:
            top_detections.append(None)
            continue

        scores_list = [float(s) for s in scores]
        # max() returns the first index on ties.
        max_idx = max(range(len(scores_list)), key=scores_list.__getitem__)
        top_detections.append({
            'label': labels[max_idx],
            'score': scores_list[max_idx],
            'box': boxes[max_idx].tolist(),
        })

    return top_detections

In [34]:
# One entry per image: the top-scoring detection dict, or None if nothing was found.
detections = get_object_bounding_box(
    images=images_in_order,
    text_prompt=TEXT_PROMPT,
)

In [35]:
# The default font is the same for every image, so load it once.
font = ImageFont.load_default()

for idx, det in enumerate(detections):
    # Draw on a copy so the entries of images_in_order stay unmodified:
    # re-running this cell then does not stack overlays, and re-running
    # detection does not see previously drawn boxes.
    annotated = images_in_order[idx].copy()
    draw = ImageDraw.Draw(annotated)

    if det is None:
        print(f"Image {idx + 1}: Found 0 candidate boxes")
    else:
        print(f"Image {idx + 1}: Found 1 candidate box")
        print(det['label'], det['score'], det['box'])
        x1, y1, x2, y2 = det['box']
        draw.rectangle([x1, y1, x2, y2], outline="red", width=3)

        text = f"{det['label']} {det['score']:.2f}"
        text_w, text_h = _get_text_size(draw, text, font)

        # Label background sits directly above the box. When there is not
        # enough room above (box touches the top edge), place it just inside
        # the top of the box instead of producing a squashed or inverted
        # rectangle.
        label_h = text_h + 4
        label_top = y1 - label_h
        if label_top < 0:
            label_top = max(0, y1)
        draw.rectangle(
            [x1, label_top, x1 + text_w + 4, label_top + label_h],
            fill="red",
        )
        draw.text((x1 + 2, label_top + 2), text, fill="white", font=font)

    out_path = f"out_detected_{idx+1}.jpg"
    annotated.save(out_path)
    print("Saved:", out_path)
    try:
        annotated.show()
    except Exception:
        # `show()` may fail in headless environments; the file has already
        # been saved, so the preview is deliberately best-effort.
        pass

Image 1: Found 1 candidate box
plastic pen cap 0.6114258170127869 [644.2440185546875, 569.2691040039062, 767.0145263671875, 1077.2977294921875]
Saved: out_detected_1.jpg
Image 2: Found 1 candidate box
plastic pen cap 0.3952738642692566 [699.7681884765625, 953.6226196289062, 726.6387329101562, 1062.0035400390625]
Saved: out_detected_2.jpg
Image 3: Found 1 candidate box
plastic pen cap 0.44769805669784546 [679.7200317382812, 1296.2269287109375, 708.9035034179688, 1422.017578125]
Saved: out_detected_3.jpg
Image 4: Found 1 candidate box
plastic pen cap 0.35261139273643494 [716.9485473632812, 780.4949951171875, 733.974609375, 831.747314453125]
Saved: out_detected_4.jpg
Image 5: Found 1 candidate box
plastic pen cap 0.6344977617263794 [548.505615234375, 790.5347900390625, 673.2413330078125, 1317.199951171875]
Saved: out_detected_5.jpg
