
# Hugging Face `pipeline`: Object Detection, Depth Estimation, and Descriptions (Zero-Shot where possible)



**Goal:** Use pretrained Hugging Face pipelines to (1) detect objects (zero-shot), (2) estimate relative distance, and (3) describe each object's distance and left/front/right position.



## 0) Setup


In [None]:

# %pip install -U transformers torch torchvision accelerate pillow matplotlib numpy

import os, random
import numpy as np
import torch
from PIL import Image, ImageDraw
import matplotlib.pyplot as plt
from transformers import pipeline

# Default seed shared by every random number generator seeded in set_seed().
SEED = 42


def set_seed(seed=SEED):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Value handed unchanged to each seeding function; defaults to
            ``SEED``.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(seed)

# Seed the random number generators before any model is created or run.
set_seed()
# Device index handed to every `pipeline(..., device=device)` call below:
# 0 when CUDA is available, otherwise -1.
device = 0 if torch.cuda.is_available() else -1
# Print the environment so the run can be reproduced and debugged later.
print("Torch:", torch.__version__, "| CUDA available:", torch.cuda.is_available(), "| pipeline(device) =", device)



## 1) Load an Image


In [None]:

IMAGE_PATH = "example.jpg"  # Replace with your file path
# Validate with an explicit exception: `assert` statements are removed when
# Python runs with -O, which would let a missing file slip through.
if not os.path.exists(IMAGE_PATH):
    raise FileNotFoundError(f"Image not found: {IMAGE_PATH}")
# Open inside a context manager so the underlying file is closed as soon as
# convert() has produced the in-memory RGB image used by the rest of the
# notebook.
with Image.open(IMAGE_PATH) as src:
    image = src.convert("RGB")
display(image)



## 2) Zero-Shot Object Detection


In [None]:

# Free-text class names the zero-shot detector is asked to look for.
CANDIDATE_LABELS = ["person", "car", "bicycle", "dog", "cat", "traffic light", "chair", "bottle"]
detector = pipeline("zero-shot-object-detection", model="google/owlvit-base-patch32", device=device)
# The cells below read each detection as a dict with "score", "label" and a
# "box" dict holding xmin/ymin/xmax/ymax.
detections = detector(image, candidate_labels=CANDIDATE_LABELS)
print("Num detections:", len(detections))
# Trailing expression: the notebook shows the first three detections.
detections[:3]



### Visualize detections


In [None]:

def draw_bboxes(img, dets, score_thresh=0.25):
    """Return a copy of ``img`` with a red box and caption per detection.

    Args:
        img: PIL image to annotate. It is copied; the original is untouched.
        dets: Iterable of detection dicts with ``"label"``, ``"score"`` and a
            ``"box"`` dict holding ``xmin``/``ymin``/``xmax``/``ymax``.
        score_thresh: Detections scoring below this value are not drawn. A
            detection without a ``"score"`` key is treated as scoring 0.

    Returns:
        The annotated copy of ``img``.
    """
    annotated = img.copy()
    draw = ImageDraw.Draw(annotated)
    width, height = annotated.size
    text_h = 12  # caption height in pixels assumed for the default font
    for det in dets:
        score = det.get("score", 0)
        if score < score_thresh:
            continue
        box = det["box"]
        # Clamp the box to the image so drawing never leaves the canvas.
        xmin = max(0, int(box["xmin"]))
        ymin = max(0, int(box["ymin"]))
        xmax = min(width - 1, int(box["xmax"]))
        ymax = min(height - 1, int(box["ymax"]))
        if xmax < xmin or ymax < ymin:
            # The box lies entirely outside the image; after clamping its
            # corners are inverted, which ImageDraw.rectangle may reject.
            continue
        draw.rectangle([(xmin, ymin), (xmax, ymax)], outline="red", width=3)
        text = f"{det['label']} ({score:.2f})"
        text_w = draw.textlength(text)
        # Put the caption above the box; when the box touches the top edge
        # there is no room there, so place it just inside the box instead of
        # drawing it off-image where it would be invisible.
        label_top = ymin - text_h - 4
        if label_top < 0:
            label_top = ymin
        draw.rectangle(
            [(xmin, label_top), (xmin + text_w + 6, label_top + text_h + 4)],
            fill="red",
        )
        draw.text((xmin + 3, label_top + 2), text, fill="white")
    return annotated

# Render the detections that score at least 0.25 and show the annotated copy.
viz = draw_bboxes(image, detections, score_thresh=0.25)
display(viz)



## 3) Depth Estimation


In [None]:

depth_pipe = pipeline("depth-estimation", model="Intel/dpt-large", device=device)
depth_out = depth_pipe(image)
# "depth" is the visualisation displayed below; "predicted_depth" is the
# tensor later passed to compute_box_depths().
depth_map_img = depth_out["depth"]
pred_depth = depth_out["predicted_depth"]
# Print the tensor shape so it can be compared against the image size.
display(depth_map_img); print(pred_depth.shape)



### Per-object distances (relative)


In [None]:

import numpy as np

def _clip_box(box, width, height):
    """Clamp a box dict to integer pixel coordinates inside a width x height grid."""
    return (
        max(0, int(box["xmin"])),
        max(0, int(box["ymin"])),
        min(width - 1, int(box["xmax"])),
        min(height - 1, int(box["ymax"])),
    )


def compute_box_depths(dets, depth_tensor, score_thresh=0.25, near_q=0.33, far_q=0.66, inverse_depth=True):
    """Attach a median depth and a near/medium/far label to each detection.

    Box coordinates are used directly as row/column indices into the depth
    map, so the depth map must have the same resolution as the image the
    detections were computed on.

    Args:
        dets: Iterable of detection dicts with ``"label"``, ``"score"`` and a
            ``"box"`` dict holding ``xmin``/``ymin``/``xmax``/``ymax``.
        depth_tensor: Depth map as a torch tensor or array-like. It must be
            2-D ``(H, W)`` or carry only size-1 leading dimensions, e.g.
            ``(1, H, W)``.
        score_thresh: Detections scoring below this value are dropped.
        near_q: Quantile of the per-object distances at or below which an
            object is labelled ``"near"``.
        far_q: Quantile of the per-object distances at or above which an
            object is labelled ``"far"``.
        inverse_depth: When True (default) larger depth values mean *closer*.
            DPT/MiDaS-style models such as the ``Intel/dpt-large`` checkpoint
            used in this notebook predict relative inverse depth, so binning
            the raw values would swap "near" and "far". Pass False for maps
            where larger values mean farther away.

    Returns:
        A list of dicts with ``label``, ``score``, the clipped integer
        ``box``, ``median_depth`` (the raw median of the depth map inside
        the box) and ``distance_bin``. The list is empty when no detection
        survives filtering.

    Raises:
        ValueError: If the depth map cannot be reduced to two dimensions.
    """
    depth = depth_tensor
    if hasattr(depth, "detach"):
        # torch tensor: move to host memory before converting to NumPy.
        depth = depth.detach().cpu().numpy()
    depth = np.asarray(depth)
    if depth.ndim > 2:
        # Drop size-1 leading dimensions; reshape raises ValueError when a
        # real batch dimension is present.
        depth = depth.reshape(depth.shape[-2:])
    height, width = depth.shape

    # Single pass: filter, clip and measure every detection exactly once.
    measured = []
    for item in dets:
        if item.get("score", 0) < score_thresh:
            continue
        xmin, ymin, xmax, ymax = _clip_box(item["box"], width, height)
        if xmax <= xmin or ymax <= ymin:
            continue  # empty or fully out-of-bounds box
        median = float(np.median(depth[ymin:ymax, xmin:xmax]))
        measured.append((item, (xmin, ymin, xmax, ymax), median))
    if not measured:
        return []

    # Turn raw values into "distance-like" scores (smaller == closer) so the
    # quantile comparison below works for both depth conventions.
    sign = -1.0 if inverse_depth else 1.0
    distances = [sign * median for _, _, median in measured]
    near_cut = float(np.quantile(distances, near_q))
    far_cut = float(np.quantile(distances, far_q))

    results = []
    for (item, (xmin, ymin, xmax, ymax), median), distance in zip(measured, distances):
        if distance <= near_cut:
            dist_bin = "near"
        elif distance >= far_cut:
            dist_bin = "far"
        else:
            dist_bin = "medium"
        results.append({
            "label": item["label"],
            "score": item["score"],
            "box": {"xmin": xmin, "ymin": ymin, "xmax": xmax, "ymax": ymax},
            "median_depth": median,
            "distance_bin": dist_bin,
        })
    return results

# Detection boxes are in image pixel coordinates, while `predicted_depth` may
# come back at the depth model's own resolution (this depends on the installed
# transformers version). Resample the map to the image size when the shapes
# differ so that each box indexes the pixels it actually covers.
depth_for_boxes = pred_depth
if tuple(depth_for_boxes.shape[-2:]) != image.size[::-1]:  # PIL size is (W, H)
    depth_for_boxes = torch.nn.functional.interpolate(
        depth_for_boxes.reshape(1, 1, *depth_for_boxes.shape[-2:]),
        size=image.size[::-1],
        mode="bicubic",
        align_corners=False,
    ).squeeze(1)  # back to (1, H, W)

box_depths = compute_box_depths(detections, depth_for_boxes, score_thresh=0.25)
# Trailing expression: the notebook shows the first three results.
box_depths[:3]



## 4) Left / Front / Right classification


In [None]:

def classify_lr_front(box, img_width, center_slice_ratio=0.2):
    """Classify a box as "left", "front" or "right" by its horizontal center.

    Args:
        box: Dict with numeric ``"xmin"`` and ``"xmax"`` entries.
        img_width: Width of the image the box belongs to.
        center_slice_ratio: Fraction of ``img_width`` covered by the central
            "front" band, which is centered on the image midline.

    Returns:
        ``"left"`` when the box center is strictly left of the band,
        ``"right"`` when it is strictly right of it, otherwise ``"front"``
        (band edges included).
    """
    box_center_x = 0.5 * (box["xmin"] + box["xmax"])
    midline = img_width * 0.5
    half_band = (img_width * center_slice_ratio) * 0.5
    if box_center_x < midline - half_band:
        return "left"
    if box_center_x > midline + half_band:
        return "right"
    return "front"

# PIL reports the image size as (width, height).
W, H = image.size
# Tag every object in place with its horizontal position label, using a
# central "front" band that spans 20% of the image width.
for obj in box_depths:
    obj["position_lr"] = classify_lr_front(obj["box"], W, center_slice_ratio=0.2)

# Trailing expression: the notebook shows the first three tagged objects.
box_depths[:3]



### Visualization with distance & position


In [None]:

from PIL import ImageDraw

def draw_bboxes_with_extras(img, objs, center_slice_ratio=0.2):
    """Return a copy of ``img`` showing boxes, captions and the "front" band.

    Args:
        img: PIL image to annotate. It is copied; the original is untouched.
        objs: Iterable of dicts with ``"label"``, ``"distance_bin"``,
            ``"position_lr"`` and a ``"box"`` dict holding
            ``xmin``/``ymin``/``xmax``/``ymax``.
        center_slice_ratio: Fraction of the image width covered by the
            central band whose edges are drawn as blue guide lines. The
            default of 0.2 reproduces the previously hard-coded offset of
            10% of the width on each side of the center.

    Returns:
        The annotated copy of ``img``.
    """
    annotated = img.copy()
    draw = ImageDraw.Draw(annotated)
    width, height = annotated.size

    # Blue guide lines on both edges of the central band.
    center = width * 0.5
    half_slice = (width * center_slice_ratio) * 0.5
    for edge_x in (int(center - half_slice), int(center + half_slice)):
        draw.line([(edge_x, 0), (edge_x, height)], fill="blue", width=2)

    text_h = 12  # caption height in pixels assumed for the default font
    for obj in objs:
        box = obj["box"]
        xmin, ymin, xmax, ymax = box["xmin"], box["ymin"], box["xmax"], box["ymax"]
        draw.rectangle([(xmin, ymin), (xmax, ymax)], outline="red", width=3)
        text = f"{obj['label']} | {obj['distance_bin']} | {obj['position_lr']}"
        text_w = draw.textlength(text)
        # Put the caption above the box; when the box touches the top edge
        # there is no room there, so place it just inside the box instead of
        # drawing it off-image where it would be invisible.
        label_top = ymin - text_h - 4
        if label_top < 0:
            label_top = ymin
        draw.rectangle(
            [(xmin, label_top), (xmin + text_w + 6, label_top + text_h + 4)],
            fill="red",
        )
        draw.text((xmin + 3, label_top + 2), text, fill="white")
    return annotated

# Render boxes with their distance and position captions, then show the result.
viz2 = draw_bboxes_with_extras(image, box_depths)
display(viz2)



## 5) Textual Description (rule-based)


In [None]:

def describe_objects(objects):
    """Build a one-sentence English summary of the given objects.

    Args:
        objects: Sequence of dicts with ``"label"``, ``"distance_bin"``,
            ``"position_lr"`` and ``"median_depth"`` entries.

    Returns:
        ``"No confident detections."`` when ``objects`` is empty; otherwise
        a sentence starting with ``"I see"`` that lists the objects in
        ascending order of ``median_depth``.
    """
    if not objects:
        return "No confident detections."

    ordered = sorted(objects, key=lambda o: o["median_depth"])
    phrases = []
    for obj in ordered:
        phrases.append(
            f"a {obj['label']} that is {obj['distance_bin']} and to the {obj['position_lr']}"
        )

    # Split off the final phrase so it can be joined with ", and ".
    *leading, last = phrases
    if not leading:
        return "I see " + last + "."
    return "I see " + ", ".join(leading) + ", and " + last + "."

# Turn the per-object results into a single sentence and print it.
summary = describe_objects(box_depths)
print(summary)



## 6) Notes
- Zero-shot detection uses `google/owlvit-base-patch32`; expand `CANDIDATE_LABELS` for broader coverage.
- Depth is **relative** (unitless), not metric: values are only comparable within a single image and do not correspond to meters.
- "Front" means the box center falls within a vertical band around the image center (20% of the image width by default); adjust `center_slice_ratio` to widen or narrow the band.
