In [1]:
#PART 1: SCANNING A PERSONAL OBJECT AND SAVING THE CAPTURED VIEWS IN THE MULTI_VIEW FOLDER
import cv2
import numpy as np
import os
from sam2 import load_model
from sam2.sam2_image_predictor import SAM2ImagePredictor


# --- SAM2 predictor ---------------------------------------------------------
# The "tiny" checkpoint is loaded on the CPU device and wrapped in an image
# predictor that the capture loop prompts with single points.
model = load_model(variant="tiny",
                   ckpt_path="models/sam2/checkpoints/sam2_hiera_tiny.pt",
                   device="cpu")
predictor = SAM2ImagePredictor(model)
print("SAMv2 loaded on CPU")

# --- Camera and preview window ----------------------------------------------
# Device index 0 is opened; every frame is shown in one named window.
window_name = "SAMv2 + Tracker + Multi-View Capture"
cap = cv2.VideoCapture(0)
cv2.namedWindow(window_name)

# --- Capture destination ----------------------------------------------------
# Saved views are numbered from capture_count, which starts at zero.
output_folder = "multi_view"
capture_count = 0
os.makedirs(output_folder, exist_ok=True)

# --- Tracker state ----------------------------------------------------------
# No tracker exists until the capture loop creates one on a SPACE press.
tracker, tracker_initialized = None, False

# Interactive capture loop.
#   * Before a tracker exists: the live feed is shown with a red dot at the
#     frame centre. SPACE segments whatever lies under the dot with SAM and
#     starts a CSRT tracker on the bounding box of the resulting mask.
#   * While tracking: SPACE saves one view (a debug overlay plus a crop of the
#     object on a white background). ESC quits in either state.
#
# All UI drawing (dot, help text, tracker rectangle) goes onto `display`, a
# copy of the camera frame. SAM, the tracker and the saved crop only ever see
# the untouched `frame`, so the overlays cannot leak into the stored views.
KEY_SPACE, KEY_ESC = 32, 27

try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        h, w, _ = frame.shape
        center_x, center_y = w // 2, h // 2

        # UI layer: red centre point (the user aims the object at it) + hint.
        display = frame.copy()
        cv2.circle(display, (center_x, center_y), 5, (0, 0, 255), -1)
        cv2.putText(display, "Press SPACE to start/capture", (10, 30),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.8, (255, 255, 255), 2)

        # ---------------------------------------------------------------
        # No tracker yet: wait for SPACE, then segment and start tracking
        # ---------------------------------------------------------------
        if not tracker_initialized:
            cv2.imshow(window_name, display)
            key = cv2.waitKey(1) & 0xFF

            if key == KEY_SPACE:
                print("Initializing tracker and SAM segmentation...")

                predictor.set_image(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
                masks, scores, _ = predictor.predict(
                    point_coords=np.array([[center_x, center_y]]),
                    point_labels=np.array([1]),
                    multimask_output=True
                )
                # multimask_output=True yields several candidates; keep the
                # one the model scores highest instead of blindly taking [0].
                mask_bool = masks[int(np.argmax(scores))].astype(bool)

                y_idx, x_idx = np.where(mask_bool)
                if x_idx.size == 0:
                    # .min()/.max() would raise on an empty mask.
                    print("SAM returned an empty mask. Press SPACE to try again.")
                    continue

                # Inclusive pixel extent -> (x, y, width, height) in plain ints.
                x_min, x_max = int(x_idx.min()), int(x_idx.max())
                y_min, y_max = int(y_idx.min()), int(y_idx.max())
                bbox = (x_min, y_min, x_max - x_min + 1, y_max - y_min + 1)

                tracker = cv2.TrackerCSRT_create()
                tracker.init(frame, bbox)
                tracker_initialized = True
            elif key == KEY_ESC:
                break
            continue

        # ---------------------------------------------------------------
        # Tracker running: update it on the clean frame
        # ---------------------------------------------------------------
        success, bbox = tracker.update(frame)
        if not success:
            tracker_initialized = False
            print("Tracker lost object. Press SPACE to reinitialize.")
            continue

        x, y, w_box, h_box = [int(v) for v in bbox]
        cv2.rectangle(display, (x, y), (x + w_box, y + h_box), (255, 0, 0), 2)

        cv2.imshow(window_name, display)
        key = cv2.waitKey(1) & 0xFF

        if key == KEY_ESC:
            break
        if key != KEY_SPACE:
            continue

        # ---------------------------------------------------------------
        # SPACE while tracking: save one view
        # ---------------------------------------------------------------
        # Clamp the tracked box to the image; a negative start index would
        # otherwise be interpreted by numpy as "count from the end".
        bx0, by0 = max(x, 0), max(y, 0)
        bx1, by1 = min(x + w_box, w), min(y + h_box, h)
        if bx1 <= bx0 or by1 <= by0:
            print("Tracked box lies outside the frame. Capture skipped.")
            continue

        # SAM segmentation on the clean full frame, prompted at the box centre.
        predictor.set_image(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        masks_full, scores_full, _ = predictor.predict(
            point_coords=np.array([[(bx0 + bx1) // 2, (by0 + by1) // 2]]),
            point_labels=np.array([1]),
            multimask_output=True
        )
        mask_full_object = masks_full[int(np.argmax(scores_full))].astype(bool)

        y_idx, x_idx = np.where(mask_full_object)
        if x_idx.size == 0:
            print("SAM returned an empty mask. Capture skipped.")
            continue

        # Slice ends are exclusive, so +1 keeps the last mask row/column.
        x0, x1 = int(x_idx.min()), int(x_idx.max()) + 1
        y0, y1 = int(y_idx.min()), int(y_idx.max()) + 1

        # Object crop with every non-object pixel painted white.
        obj_crop_white_bg = frame[y0:y1, x0:x1].copy()
        obj_crop_white_bg[~mask_full_object[y0:y1, x0:x1]] = 255

        # Debug overlay: the displayed frame with the tracked box tinted green.
        rgb_mask_overlay = display.copy()
        box_region = rgb_mask_overlay[by0:by1, bx0:bx1]
        rgb_mask_overlay[by0:by1, bx0:bx1] = (
            box_region * 0.5 + np.array([0, 255, 0]) * 0.5
        ).astype(np.uint8)

        full_ok = cv2.imwrite(
            os.path.join(output_folder, f"view_{capture_count:03d}_full.png"),
            rgb_mask_overlay)
        crop_ok = cv2.imwrite(
            os.path.join(output_folder, f"view_{capture_count:03d}_crop.png"),
            obj_crop_white_bg)
        if not (full_ok and crop_ok):
            # imwrite reports failure through its return value only.
            print(f"Warning: could not write capture {capture_count:03d} to {output_folder}")
            continue

        print(f"Saved multi-view capture {capture_count:03d}")
        capture_count += 1
finally:
    # Always free the camera and close the window, including when the kernel
    # is interrupted or a model call raises.
    cap.release()
    cv2.destroyAllWindows()




SAMv2 loaded on CPU
Initializing tracker and SAM segmentation...
Saved multi-view capture 000
Saved multi-view capture 001
Saved multi-view capture 002


In [2]:
#PART 2: USING DINO TO EXTRACT FEATURES OF SCANNED OBJECT
import torch
from transformers import AutoImageProcessor, AutoModel
from PIL import Image
import cv2
import os

# DINOv2 backbone used to embed the captured crops.
# 'dinov2-small' is very fast; 'dinov2-base' or 'dinov2-giant' are more accurate.
# NOTE(review): this rebinds the notebook-global name `model`, which the first
# cell used for the SAM model - presumably harmless because `predictor` was
# already built from it; confirm before reordering cells.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
processor = AutoImageProcessor.from_pretrained("facebook/dinov2-small")
model = AutoModel.from_pretrained("facebook/dinov2-small")
model = model.to(device)

def extract_features(folder_path):
    """Embed every captured object view in ``folder_path`` with DINOv2.

    Regular files whose name contains ``"full"`` (the debug overlays written
    by the capture step) are ignored. Every remaining file is read with
    OpenCV, converted BGR -> RGB and passed through the notebook-global
    ``processor`` and ``model`` on ``device``. Files OpenCV cannot decode are
    reported and skipped.

    Args:
        folder_path: Directory that holds the captured views.

    Returns:
        A list with one numpy array per readable image, in sorted filename
        order. Each array is the L2-normalised token at index 0 of
        ``last_hidden_state`` (the class token of the DINOv2 model), with a
        leading batch axis of size 1.

    Raises:
        OSError: propagated from ``os.listdir`` when ``folder_path`` cannot
            be listed (for example, it does not exist).
    """
    # Sort so the library order is reproducible (os.listdir order is
    # arbitrary), and count only the files that will actually be embedded so
    # the progress message is an exact integer.
    crop_files = sorted(
        name for name in os.listdir(folder_path)
        if "full" not in name
        and os.path.isfile(os.path.join(folder_path, name))
    )

    print(f"Extracting DINOv2 features for {len(crop_files)} potential perspectives...")

    instance_library = []
    for filename in crop_files:
        full_path = os.path.join(folder_path, filename)

        # cv2.imread signals an unreadable / non-image file by returning None.
        img_bgr = cv2.imread(full_path)
        if img_bgr is None:
            print(f"Warning: Could not read {filename}. Skipping.")
            continue

        # OpenCV loads BGR; the processor is fed an RGB PIL image.
        pil_img = Image.fromarray(cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB))
        inputs = processor(images=pil_img, return_tensors="pt").to(device)

        with torch.no_grad():
            outputs = model(**inputs)

        embeddings = outputs.last_hidden_state[:, 0, :]
        # Unit length, so a dot product between two embeddings is their
        # cosine similarity.
        embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=-1)
        instance_library.append(embeddings.cpu().numpy())

    return instance_library

# Build the reference library from the folder the capture step wrote to.
multi_view = "multi_view"
instance_features = extract_features(multi_view)
print(f"Library created with {len(instance_features)} feature vectors.")

# Quick visual check: the leading 10 components of each stored embedding.
print("\n--- Feature Preview (First 10 values per perspective) ---")
for i in range(len(instance_features)):
    # Collapse to 1-D before slicing so a leading batch axis does not matter.
    preview = instance_features[i].ravel()[:10]
    print(f"Perspective {i}: {preview}")

  from .autonotebook import tqdm as notebook_tqdm
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Extracting DINOv2 features for 3.0 potential perspectives...
Library created with 3 feature vectors.

--- Feature Preview (First 10 values per perspective) ---
Perspective 0: [ 0.00556454  0.01795515 -0.0441304   0.0353582   0.04004984 -0.00626491
  0.02437494 -0.03971501  0.00890234  0.03981451]
Perspective 1: [ 0.06236326  0.0010755  -0.08177099  0.0025537  -0.04416817  0.02222248
 -0.00332737 -0.03287883 -0.08411137  0.08426923]
Perspective 2: [ 0.00268321  0.0295081  -0.09203594 -0.05245358  0.05578673  0.04401093
 -0.07666223 -0.02480725 -0.08794063  0.08172967]


In [3]:
#PART 3.1: IDENTIFYING OBJECT IN ROOM USING FASTSAM + DINO + TRACKING, MODIFIED VERSION
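#Modification: while the object is not being tracked, the re-identification with FastSAM + DINO is retried at most every 2 seconds (REID_INTERVAL).
#NOTE: re-identification is not triggered while the tracker reports success, so potential tracking drift is not yet corrected by this loop.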
import cv2
import numpy as np
import torch
import torch.nn.functional as F
import time
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoImageProcessor, AutoModel
from PIL import Image
from ultralytics import FastSAM

# ===============================
# SETUP
# ===============================

# ---- Tunables ----
SIM_THRESHOLD = 0.6   # a candidate must score above this to start tracking
REID_INTERVAL = 2.0   # seconds between re-ID attempts if tracking lost

# ---- Compute device ----
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Using device:", device)

# ---- FastSAM (mask proposals) ----
fastsam = FastSAM("FastSAM-s.pt")   # change path if needed

# ---- DINOv2 (embeddings), switched to eval mode ----
processor = AutoImageProcessor.from_pretrained("facebook/dinov2-small")
dino_model = AutoModel.from_pretrained("facebook/dinov2-small").to(device).eval()

# ---- Webcam ----
cap = cv2.VideoCapture(0)

# ---- Tracking state ----
tracker, tracked_box = None, None
tracking_active = False
last_reid_time = 0

# ===============================
# MAIN LOOP
# ===============================
# Each iteration either follows the target with the CSRT tracker or, while no
# tracker is active, runs FastSAM + DINO re-identification at most once every
# REID_INTERVAL seconds.
# NOTE(review): re-identification never runs while the tracker reports
# success, so tracker drift is not corrected here - confirm whether periodic
# re-ID during tracking was intended.
WINDOW_TITLE = "FastSAM + DINO → TRACKING"
KEY_ESC = 27
MIN_BOX_SIDE = 30   # candidates narrower or shorter than this (pixels) are ignored


def _find_best_match(frame, masks_tensor, reference_embs):
    """Return the FastSAM mask that best matches the reference library.

    Every mask is turned into a white-background crop of ``frame``, embedded
    with the notebook-global ``processor`` / ``dino_model`` and compared with
    every row of ``reference_embs`` by cosine similarity.

    Args:
        frame: BGR image the masks were computed on. The masks are indexed
            with the frame's pixel grid, so they are assumed to share its
            height and width (requested via ``retina_masks=True``).
        masks_tensor: tensor whose first axis enumerates the masks.
        reference_embs: 2-D array with one reference embedding per row.

    Returns:
        ``{"score": best similarity, "bbox": (x, y, w, h)}``. ``bbox`` is
        ``None`` and ``score`` is ``0`` when no mask scored above zero.
    """
    best = {"score": 0, "bbox": None}

    for i in range(masks_tensor.shape[0]):
        mask = masks_tensor[i].detach().cpu().numpy().astype(bool)
        ys, xs = np.where(mask)
        if xs.size == 0:
            continue

        # Inclusive pixel extent of the mask, as plain Python ints.
        x1, x2 = int(xs.min()), int(xs.max())
        y1, y2 = int(ys.min()), int(ys.max())
        w, h = x2 - x1 + 1, y2 - y1 + 1
        if w < MIN_BOX_SIDE or h < MIN_BOX_SIDE:
            continue

        # Crop first, then whiten the background: only the crop is touched
        # instead of building a full-frame composite for every mask.
        crop = frame[y1:y2 + 1, x1:x2 + 1].copy()
        crop[~mask[y1:y2 + 1, x1:x2 + 1]] = 255

        # ---- DINO embedding ----
        t_dino = time.time()
        img_pil = Image.fromarray(cv2.cvtColor(crop, cv2.COLOR_BGR2RGB))
        inputs = processor(images=img_pil, return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = dino_model(**inputs)

        feat = F.normalize(outputs.last_hidden_state[:, 0, :], dim=1)
        candidate_emb = feat.cpu().numpy()
        print(f"DINO embedding time: {time.time() - t_dino:.3f}s")

        # ---- Similarity against all references in one call ----
        max_sim = float(cosine_similarity(candidate_emb, reference_embs).max())

        if max_sim > best["score"]:
            best = {"score": max_sim, "bbox": (x1, y1, w, h)}

    return best


try:
    # Fail early and clearly when the feature-extraction cell has produced
    # nothing; otherwise the first candidate would crash inside the loop.
    if len(instance_features) == 0:
        raise ValueError(
            "instance_features is empty - run the feature extraction cell "
            "on a folder with captured views first."
        )
    reference_embs = np.vstack(
        [np.asarray(ref).reshape(1, -1) for ref in instance_features]
    )

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        current_time = time.time()

        # =========================================================
        # TRACKING ACTIVE -> just track
        # =========================================================
        if tracking_active:
            success, tracked_box = tracker.update(frame)

            if success:
                x, y, w, h = map(int, tracked_box)
                cv2.rectangle(frame, (x, y), (x + w, y + h), (0, 255, 0), 3)
                cv2.putText(frame, "TRACKING", (x, y - 10),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 0), 2)
            else:
                print("Tracking lost.")
                tracking_active = False  # object lost -> allow re-ID

        # =========================================================
        # NOT TRACKING -> re-run FastSAM + DINO every REID_INTERVAL
        # =========================================================
        # Nothing has been drawn on `frame` when this branch runs (drawing
        # only happens on a successful tracker update), so the models see the
        # raw camera image.
        if not tracking_active and (current_time - last_reid_time > REID_INTERVAL):
            last_reid_time = current_time
            print("\nRunning FastSAM + DINO identification")
            total_start = time.time()

            # ---- FastSAM ----
            t_fastsam = time.time()
            results = fastsam(
                frame,
                device=device,
                imgsz=640,
                conf=0.4,
                iou=0.9,
                retina_masks=True
            )
            print(f"FastSAM proposal time: {time.time() - t_fastsam:.3f}s")

            masks_obj = results[0].masks
            if masks_obj is None:
                # Fall through to the shared display / ESC handling below.
                print("No masks found.")
            else:
                masks_tensor = masks_obj.data
                print(f"FastSAM masks found: {masks_tensor.shape[0]}")

                best_match = _find_best_match(frame, masks_tensor, reference_embs)

                print(f"TOTAL IDENTIFICATION TIME: {time.time() - total_start:.3f}s")

                # ---- Initialize tracker if confident ----
                if best_match["score"] > SIM_THRESHOLD:
                    print(f"TARGET IDENTIFIED (score={best_match['score']:.3f})")
                    tracker = cv2.legacy.TrackerCSRT_create()
                    tracker.init(frame, best_match["bbox"])
                    tracked_box = best_match["bbox"]
                    tracking_active = True
                else:
                    print(f"Target not found (best score={best_match['score']:.3f})")

        # =========================================================
        # SHOW FRAME
        # =========================================================
        cv2.imshow(WINDOW_TITLE, frame)
        key = cv2.waitKey(10) & 0xFF
        if key == KEY_ESC:
            break
finally:
    # Release the camera and close the window even if the kernel is
    # interrupted or a model call raises.
    cap.release()
    cv2.destroyAllWindows()


Using device: cpu

Running FastSAM + DINO identification

0: 384x640 3 objects, 54.4ms
Speed: 1.4ms preprocess, 54.4ms inference, 4.1ms postprocess per image at shape (1, 3, 384, 640)
FastSAM proposal time: 0.090s
FastSAM masks found: 3
DINO embedding time: 0.028s
DINO embedding time: 0.023s
DINO embedding time: 0.029s
TOTAL IDENTIFICATION TIME: 0.199s
Target not found (best score=0.248)

Running FastSAM + DINO identification

0: 384x640 18 objects, 71.7ms
Speed: 3.3ms preprocess, 71.7ms inference, 26.6ms postprocess per image at shape (1, 3, 384, 640)
FastSAM proposal time: 0.104s
FastSAM masks found: 18
DINO embedding time: 0.023s
DINO embedding time: 0.022s
DINO embedding time: 0.022s
DINO embedding time: 0.029s
DINO embedding time: 0.023s
DINO embedding time: 0.023s
DINO embedding time: 0.024s
DINO embedding time: 0.023s
DINO embedding time: 0.023s
DINO embedding time: 0.024s
DINO embedding time: 0.022s
DINO embedding time: 0.023s
DINO embedding time: 0.023s
DINO embedding time: 0.