In [61]:
import torch
from transformers import AutoModelForZeroShotObjectDetection, AutoProcessor
from transformers.image_utils import load_image
import torch
import cv2
import numpy as np
import os
import random
from transformers import AutoModelForZeroShotObjectDetection, AutoProcessor
from PIL import Image
# Deep SORT imports
from deep_sort.deep_sort.tracker import Tracker as DeepSortTracker
from deep_sort.tools import generate_detections as gdet
from deep_sort.deep_sort import nn_matching
from deep_sort.deep_sort.detection import Detection

In [None]:



# Checkpoint to load and the device it runs on
model_id = "iSEE-Laboratory/llmdet_tiny"
device = "cuda" if torch.cuda.is_available() else "cpu"
processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id)
model = model.to(device)

# One image plus one list of text queries for that image
image_url = "/home/nhinguyen/Desktop/OmniLRS/OmniLRS/scripts/nhi/rover_data_20250804_114916/photo_0038.jpg"
image = load_image(image_url)
text_labels = [["rover"]]
inputs = processor(images=image, text=text_labels, return_tensors="pt")
inputs = inputs.to(device)

# Forward pass only; gradients are not needed for inference
with torch.no_grad():
    outputs = model(**inputs)

# Convert raw outputs to boxes in the pixel coordinates of the input image
results = processor.post_process_grounded_object_detection(
    outputs, threshold=0.4, target_sizes=[(image.height, image.width)]
)

# A single image was passed in, so only the first entry is inspected
result = results[0]
for box, score, labels in zip(result["boxes"], result["scores"], result["labels"]):
    box = [round(coordinate, 2) for coordinate in box.tolist()]
    print(f"Detected {labels} with confidence {round(score.item(), 3)} at location {box}")

Detected rover with confidence 0.733 at location [249.08, 542.25, 421.49, 648.52]
Detected rover with confidence 0.732 at location [1042.32, 500.53, 1172.6, 577.09]


In [102]:


# Load the zero-shot detector
model_id = "iSEE-Laboratory/llmdet_tiny"
device = "cuda" if torch.cuda.is_available() else "cpu"
processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id).to(device)

# Video paths and settings
input_video = "/home/nhinguyen/Downloads/lunar_videos/1.mp4"
output_video = "/home/nhinguyen/Downloads/lunar_videos/1_detect.mp4"
text_labels = [["rover", "robot", "occluded rover", "moving rover"]]
threshold = 0.4

# Open the input and fail loudly if it cannot be read; otherwise the loop
# below would exit on the first read and leave an empty output file behind.
cap = cv2.VideoCapture(input_video)
if not cap.isOpened():
    cap.release()
    raise IOError(f"Could not open input video: {input_video}")

# Keep the frame rate as a float: truncating e.g. 29.97 to 29 changes the
# playback speed of the written video. Fall back to 30 if none is reported.
fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

out = None
try:
    out = cv2.VideoWriter(output_video, cv2.VideoWriter_fourcc(*'mp4v'), fps, (width, height))

    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # OpenCV frames are BGR; the processor is fed an RGB PIL image
        pil_img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        inputs = processor(images=pil_img, text=text_labels, return_tensors="pt").to(device)

        with torch.no_grad():
            outputs = model(**inputs)

        results = processor.post_process_grounded_object_detection(
            outputs, threshold=threshold, target_sizes=[(pil_img.height, pil_img.width)]
        )[0]

        # Draw boxes and "<label>: <score>" captions on the BGR frame
        for box, score, label in zip(results["boxes"], results["scores"], results["labels"]):
            x1, y1, x2, y2 = [int(x) for x in box.tolist()]
            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
            # Clamp the caption baseline so it stays inside the frame for boxes
            # that touch the top edge.
            cv2.putText(frame, f"{label}: {float(score):.3f}", (x1, max(y1 - 10, 10)),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

        out.write(frame)
finally:
    # Always release the handles, even if a frame raised, so the output
    # container is finalised and the input file is not left open.
    cap.release()
    if out is not None:
        out.release()

### __DeepSORT__

In [None]:
class Track:
    """
    Plain record pairing a track identifier with its bounding box.

    Attributes:
        track_id: Identifier of the track, stored as given.
        bbox: Bounding box of the track, stored as given.
    """
    def __init__(self, track_id, bbox):
        self.track_id, self.bbox = track_id, bbox


class DeepSortWrapper:
    """
    Thin facade over the Deep SORT tracker and its appearance encoder.

    After each call to ``update`` the attribute ``tracks`` holds one ``Track``
    per confirmed tracker track that was updated within the last frame.
    """
    def __init__(self, model_filename='model_data/mars-small128.pb', max_cosine_distance=0.4, nn_budget=None):
        """
        Args:
            model_filename (str): Path handed to ``gdet.create_box_encoder``.
            max_cosine_distance (float): Matching threshold of the cosine metric.
            nn_budget (int | None): Budget handed to the nearest-neighbour metric.
        """
        self.encoder = gdet.create_box_encoder(model_filename, batch_size=1)
        self.tracker = DeepSortTracker(
            nn_matching.NearestNeighborDistanceMetric("cosine", max_cosine_distance, nn_budget)
        )
        self.tracks = []

    def update(self, frame, detections):
        """
        Advance the tracker by one frame and refresh ``self.tracks``.

        Args:
            frame (numpy.ndarray): Current video frame (BGR).
            detections (list): List of detections in [x1, y1, x2, y2, score] format.
        """
        wrapped = []

        if len(detections) != 0:
            # Corner format [x1, y1, x2, y2] -> [x, y, w, h]
            tlwh = np.array([det[:4] for det in detections])
            confidences = [det[4] for det in detections]
            tlwh[:, 2:] -= tlwh[:, :2]

            # One appearance feature per box
            embeddings = self.encoder(frame, tlwh)

            wrapped = [
                Detection(box, confidences[k], embeddings[k])
                for k, box in enumerate(tlwh)
            ]

        # With no detections the tracker still predicts and is updated with an
        # empty list.
        self.tracker.predict()
        self.tracker.update(wrapped)
        self._update_tracks()

    def _update_tracks(self):
        """
        Rebuild ``self.tracks`` from the tracker, keeping only confirmed tracks
        whose ``time_since_update`` is at most 1.
        """
        self.tracks = [
            # to_tlbr() returns [x1, y1, x2, y2]
            Track(bbox=candidate.to_tlbr(), track_id=candidate.track_id)
            for candidate in self.tracker.tracks
            if candidate.is_confirmed() and candidate.time_since_update <= 1
        ]

### __LLMDet + DeepSORT__

In [103]:
# Load the zero-shot detector
model_id = "iSEE-Laboratory/llmdet_tiny"
device = "cuda" if torch.cuda.is_available() else "cpu"
processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id).to(device)

# Video paths and settings
input_video = "/home/nhinguyen/Downloads/lunar_videos/1.mp4"
output_video = "/home/nhinguyen/Downloads/lunar_videos/1_detect_tracking.mp4"
text_labels = [["rover"]]
threshold = 0.4

# output_dir = "/results/"

# Open the input video. A failed open is only reported, not raised, so the
# detector settings defined above remain usable without the video.
cap = cv2.VideoCapture(input_video)
if not cap.isOpened():
    print(f"WARNING: could not open input video: {input_video}")

# Keep the frame rate as a float: truncating e.g. 29.97 to 29 changes the
# playback speed of the written video. Fall back to 30 if none is reported.
fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
out = cv2.VideoWriter(output_video, cv2.VideoWriter_fourcc(*'mp4v'), fps, (width, height))

# NOTE: `cap` and `out` are left open by this cell; call cap.release() and
# out.release() once they are no longer needed.


In [105]:
from natsort import natsorted
import glob
# Folder holding the captured frames; the trailing slash is relied on below
image_dir = "C:/Users/ongng/OneDrive/Desktop/rover_data_20250804_112504/"

# Results go to a sub-folder named after the capture folder
run_name = image_dir.split('/')[-2]
output_dir = os.path.join(
    "C:/Users/ongng/OneDrive/Desktop/SEA-RAFT/nhi_results_draft/",
    run_name + '/',
)

# Frames are files starting with `pattern`, listed in natural order
pattern = "photo"
images = natsorted(glob.glob(image_dir + pattern + "*.jpg"))

os.makedirs(output_dir, exist_ok=True)
print(output_dir)


C:/Users/ongng/OneDrive/Desktop/SEA-RAFT/nhi_results_draft/rover_data_20250804_112504/


In [107]:
# Tracker with the appearance-encoder weights. Every backslash in the Windows
# path is escaped; the previous "\m" was an invalid escape sequence.
deepsort = DeepSortWrapper(
    model_filename="C:\\Users\\ongng\\Downloads\\Sample_Detection_Tracking_Yolov8_DeepSORT_Robotics-main\\Sample_Detection_Tracking_Yolov8_DeepSORT_Robotics-main\\deep_sort\\deep_sort\\mars-small128.pb",
    max_cosine_distance=0.2,
    nn_budget=None
)

# Fixed palette so a given track ID gets the same colour on every run
color_rng = random.Random(0)
colors = [(color_rng.randint(0, 255), color_rng.randint(0, 255), color_rng.randint(0, 255)) for _ in range(100)]

# Index of the first image to process; earlier images are skipped
START_INDEX = 800

for i in range(START_INDEX, len(images)):
    img_ = cv2.imread(images[i])  # BGR, also used as the drawing canvas
    if img_ is None:
        # cv2.imread returns None for unreadable files; skip instead of crashing
        print(f"WARNING: could not read {images[i]}; skipping")
        continue

    # The detector is fed an RGB PIL image
    img = cv2.cvtColor(img_, cv2.COLOR_BGR2RGB)
    pil_img = Image.fromarray(img)
    inputs = processor(images=pil_img, text=text_labels, return_tensors="pt").to(device)

    with torch.no_grad():
        outputs = model(**inputs)

    results = processor.post_process_grounded_object_detection(
        outputs, threshold=threshold, target_sizes=[(pil_img.height, pil_img.width)]
    )[0]

    # One [x1, y1, x2, y2, score] row per detection, as plain Python floats
    detections = [
        box.tolist() + [score.item()]
        for box, score in zip(results['boxes'], results['scores'])
    ]

    # DeepSortWrapper.update documents its frame argument as BGR, so pass the
    # BGR image (before anything is drawn on it), not the RGB copy.
    deepsort.update(img_, detections)

    for track in deepsort.tracks:
        x1, y1, x2, y2 = track.bbox
        track_id = track.track_id

        color = colors[track_id % len(colors)]
        cv2.rectangle(img_, (int(x1), int(y1)), (int(x2), int(y2)), color, 1)
        cv2.putText(img_, f"ID: {track_id}", (int(x1), int(y1) - 10),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.9, color, 2)

    # Output file is named after the zero-padded image index
    out_img_path = os.path.join(output_dir, f"{i:04}.jpg")
    if not cv2.imwrite(out_img_path, img_):
        # cv2.imwrite signals failure through its return value only
        print(f"WARNING: failed to write {out_img_path}")

    if i%30 == 0:
        print(f'Detected {i}/{len(images)} images')

# Release resources


# print(f"Video saved as {output_file}")
        

Detected 810/1177 images
Detected 840/1177 images
Detected 870/1177 images
Detected 900/1177 images
Detected 930/1177 images
Detected 960/1177 images
Detected 990/1177 images
Detected 1020/1177 images
Detected 1050/1177 images
Detected 1080/1177 images
Detected 1110/1177 images
Detected 1140/1177 images
Detected 1170/1177 images


In [None]:
# glob.glob(f"{image_dir}{pattern}*.jpg")

['C:/Users/ongng/OneDrive/Desktop/rover_data_20250804_112504\\photo_0000.jpg',
 'C:/Users/ongng/OneDrive/Desktop/rover_data_20250804_112504\\photo_0001.jpg',
 'C:/Users/ongng/OneDrive/Desktop/rover_data_20250804_112504\\photo_0002.jpg',
 'C:/Users/ongng/OneDrive/Desktop/rover_data_20250804_112504\\photo_0003.jpg',
 'C:/Users/ongng/OneDrive/Desktop/rover_data_20250804_112504\\photo_0004.jpg',
 'C:/Users/ongng/OneDrive/Desktop/rover_data_20250804_112504\\photo_0005.jpg',
 'C:/Users/ongng/OneDrive/Desktop/rover_data_20250804_112504\\photo_0006.jpg',
 'C:/Users/ongng/OneDrive/Desktop/rover_data_20250804_112504\\photo_0007.jpg',
 'C:/Users/ongng/OneDrive/Desktop/rover_data_20250804_112504\\photo_0008.jpg',
 'C:/Users/ongng/OneDrive/Desktop/rover_data_20250804_112504\\photo_0009.jpg',
 'C:/Users/ongng/OneDrive/Desktop/rover_data_20250804_112504\\photo_0010.jpg',
 'C:/Users/ongng/OneDrive/Desktop/rover_data_20250804_112504\\photo_0011.jpg',
 'C:/Users/ongng/OneDrive/Desktop/rover_data_2025080