In [None]:
import json
from pathlib import Path
import sys
import torch
import cv2
import numpy as np
from PIL import Image, ImageDraw, ImageFont
from IPython.display import display

# Put the parent of the current working directory at the front of sys.path.
# The `utils.*` imports that follow are presumably resolved from there
# (assumes the notebook is launched from a direct subdirectory of the
# project root — TODO confirm).
project_root = Path(".").resolve().parent
sys.path.insert(0, str(project_root))

from utils.clip import load_clip, classify_image
from utils.owl import load_owl, detect_with_owl
from utils.box_mask import enlarge_box, draw_annotated
from utils.gpt_function_calling import GPTInterfaceFC
from utils.template import get_template_env


In [None]:
# Detection / drawing configuration shared by the cells below.
owl_thresh  = 0.18   # score threshold handed to detect_with_owl; also the fallback per-label threshold
box_scale   = 1.1    # scale factor handed to enlarge_box for the boxes that survive filtering

# Font used for the labels drawn on the annotated images.
# The preferred font is Arial at its macOS location. On machines where that
# file cannot be loaded, ImageFont.truetype raises OSError; instead of
# aborting the whole notebook we fall back to Pillow's built-in default font.
font_path = "/Library/fonts/Arial.ttf"
font_size = 50
try:
    font = ImageFont.truetype(font_path, size=font_size)
except OSError:
    print(f"Font {font_path!r} could not be loaded; using Pillow's default font instead.")
    try:
        # Recent Pillow releases can scale the default font to a given size.
        font = ImageFont.load_default(size=font_size)
    except TypeError:
        # Older Pillow releases do not accept a size argument.
        font = ImageFont.load_default()

In [None]:
# Load everything the later cells need: the CLIP and OWL processor/model
# pairs, the template environment, and the GPT function-calling interface.
processor_clip, model_clip = load_clip()
processor_owl,  model_owl  = load_owl()
env = get_template_env()
# NOTE(review): temperature=0.7 presumably makes the GPT keep/drop decision
# vary between runs — confirm this is intended for a filtering step.
gpt_fc = GPTInterfaceFC(env, model="gpt-4.1", temperature=0.7)
print("Models loaded")

In [None]:
# Image processed by this run. The path is relative to the current working
# directory, so the notebook has to be started from the expected folder.
image_path = Path("../pipeline_optimization_dataset/1SIka2FSC_tE6_94GW3GsRvb-Gi5CA8wa__Küche_Wohnung1.jpg")
# Normalise to RGB mode regardless of how the file is stored.
image = Image.open(image_path).convert("RGB")

In [None]:
# Text queries for the detector. Each is a list containing a single inner
# list of prompts; one of the two is picked after scene classification.

# Used when the scene is NOT classified as "an indoor scene".
text_labels_outside = [[
    "a house number", "a license plate", "person", "a face",
    "a religious symbol", "a political symbol", "a cat", "a dog",
]]
# Used when the scene is classified as "an indoor scene".
# "a mirror", "a window" and "a television" are queried here but are always
# removed again by the post-filter cell; television/window boxes are also
# used there to suppress detections that overlap them.
text_labels_inside = [[
    "a calendar", "a license plate", "a paper", "person",
    "a framed picture", "a picture", "a poster board",
    "a name", "a face", "a religious symbol", "a political symbol",
    "a sex toy", "a nude image", "a cat", "a dog",
    "a mirror", "a window", "a television"
]]
# Minimum score a detection must reach, per label, to survive the score
# filter that runs after detection. Keys must match the prompt strings above
# exactly, because the lookup is done by label text.
per_label_thresh = {
    "a calendar": 0.20, "a paper": 0.20, "a house number": 0.21,
    "a license plate": 0.19, "person": 0.20, "a framed picture": 0.22,
    "a picture": 0.22, "a poster board": 0.30, "a name": 0.20,
    "a face": 0.20, "a religious symbol": 0.24, "a political symbol": 0.20,
    "a sex toy": 0.23, "a nude image": 0.30, "a cat": 0.28, "a dog": 0.28,
    "a mirror": 0.30, "a window": 0.30, "a television": 0.50
}
# Threshold applied to any label that has no entry in per_label_thresh.
default_thresh = owl_thresh

In [None]:
# Image width and height in pixels; passed to enlarge_box later on.
w, h  = image.size
# Classify the scene with CLIP. The result is compared against the exact
# string "an indoor scene"; any other value selects the outdoor label set.
inout = classify_image(image, processor_clip, model_clip)
labs  = text_labels_inside if inout == "an indoor scene" else text_labels_outside
print("Scene classification:", inout)

In [None]:
# Run the OWL detector with the label set chosen for this scene, using the
# global threshold owl_thresh.
boxes_p, scores, labels = detect_with_owl(image, labs, processor_owl, model_owl, threshold=owl_thresh)
# Convert each box to a plain Python list via .tolist().
raw_boxes = [b.tolist() for b in boxes_p]

# Preview of the raw detections, before any per-label filtering. Drawn on a
# copy so the original image is left untouched.
annotated_pre = draw_annotated(image.copy(), raw_boxes, [float(s.item()) for s in scores], labels, font=font)
display(annotated_pre)

In [None]:
# --- Stage 1: per-label score filter -----------------------------------
# A detection survives only if its score reaches the threshold configured
# for its label; labels without an entry use default_thresh.
boxes_f, scores_f, labels_f = [], [], []
for cand_box, cand_score, cand_label in zip(raw_boxes, scores, labels):
    if cand_score.item() >= per_label_thresh.get(cand_label, default_thresh):
        boxes_f.append(cand_box)
        scores_f.append(cand_score)
        labels_f.append(cand_label)

if not boxes_f:
    print("No boxes after score filter.")

# --- Stage 2: television / window / mirror handling --------------------
if boxes_f:
    exempt_labels = {"person", "a nude image"}       # never suppressed by overlap
    always_drop = {"a television", "a window", "a mirror"}
    # Only televisions and windows suppress overlapping detections;
    # mirrors are dropped but do not suppress anything.
    suppressor_labels = ("a television", "a window")
    overlap_limit = 0.20

    # The always-dropped labels are removed unconditionally.
    remove_idx = {idx for idx, name in enumerate(labels_f) if name in always_drop}

    for src_idx, (src_box, src_label) in enumerate(zip(boxes_f, labels_f)):
        if src_label not in suppressor_labels:
            continue
        sx0, sy0, sx1, sy1 = src_box
        src_area = max(0, sx1 - sx0) * max(0, sy1 - sy0)
        if src_area == 0:
            continue

        for other_idx, (other_box, other_label) in enumerate(zip(boxes_f, labels_f)):
            if other_idx == src_idx:
                continue
            ox0, oy0, ox1, oy1 = other_box
            inter_w = max(0, min(sx1, ox1) - max(sx0, ox0))
            inter_h = max(0, min(sy1, oy1) - max(sy0, oy0))
            inter_area = inter_w * inter_h
            if inter_area == 0 or other_label in exempt_labels:
                continue
            # The ratio is measured against the television/window area,
            # not against the area of the overlapping detection.
            if inter_area / src_area >= overlap_limit:
                remove_idx.add(other_idx)

    # Keep everything that was not marked for removal, preserving order.
    survivors = [k for k in range(len(boxes_f)) if k not in remove_idx]
    boxes_f = [boxes_f[k] for k in survivors]
    scores_f = [scores_f[k] for k in survivors]
    labels_f = [labels_f[k] for k in survivors]

# --- Stage 3: enlarge the surviving boxes and show them ----------------
post_enl = [
    enlarge_box(box=kept_box, scale=box_scale, img_w=w, img_h=h)
    for kept_box in boxes_f
]

annotated_post = draw_annotated(
    image.copy(),
    post_enl,
    [kept_score.item() for kept_score in scores_f],
    labels_f,
    font=font,
)
display(annotated_post)

In [None]:
# One record per surviving detection, using the enlarged box and a plain
# Python float for the score.
kept_labels = []
for enlarged_box, det_label, det_score in zip(post_enl, labels_f, scores_f):
    kept_labels.append({"label": det_label, "box": enlarged_box, "score": det_score.item()})

# Picture-like detections in indoor scenes get a second opinion from GPT;
# every other detection is accepted as-is.
gpt_review_labels = {"a picture", "a framed picture"}

final_dets = []
for det in kept_labels:
    lbl = det["label"]
    box = det["box"]
    score = det["score"]

    if not (inout == "an indoor scene" and lbl in gpt_review_labels):
        final_dets.append(det)
        continue

    # Show the crop that is sent to GPT so the decision can be inspected.
    crop_img = image.crop(box)
    display(crop_img)

    result, usage = gpt_fc.query_inside_fc(
        image=crop_img,
        label=lbl,
        score=score,
        box=box,
    )
    print(
        f"GPT keep={result['keep']}  | "
        f"prompt={usage['prompt_tokens']}  "
        f"completion={usage['completion_tokens']}  "
        f"total={usage['total_tokens']}"
    )
    if result["keep"]:
        final_dets.append(det)

# Split the accepted records back into parallel lists for drawing
# (each list is simply empty when nothing was accepted).
boxes_gpt = [d["box"] for d in final_dets]
labels_gpt = [d["label"] for d in final_dets]
scores_gpt = [d["score"] for d in final_dets]

In [None]:
# Final result: draw the detections that passed every filter (including the
# GPT review) on a fresh copy of the input image and show it.
image_final = image.copy()
annotated_final = draw_annotated(image_final, boxes_gpt, scores_gpt, labels_gpt, font=font)
display(annotated_final)