In [1]:
import cv2
import numpy as np
import supervision as sv
from ultralytics import YOLO

In [2]:
road_detect_model_path = "models/yolo-road_object_detect_model/best.pt"
road_detect_model = YOLO(road_detect_model_path)
print("Road Object Detection Model Loaded ... ")

Road Object Detection Model Loaded ... 


In [3]:
from segment_anything import SamPredictor, sam_model_registry
model_type = "vit_h"
device = "mps"
sam = sam_model_registry[model_type](checkpoint="models/sam_segment_model/sam_vit_h_4b8939.pth")
sam.to(device=device)
predictor = SamPredictor(sam)
print("Sam model loaded")

Sam model loaded


In [4]:
def road_object_detection(frame, model):
    results = model(frame, device="mps", conf=0.5)[0]
    results = results.boxes.data.tolist()
    bounding_box = []
    if len(results) > 0:
        for i in results:
            x1, y1, x2, y2, _, class_id = i
            bounding_box.append([x1, y1, x2, y2])
            cv2.rectangle(frame, (int(x1), int(y1)), (int(x2), int(y2)), (0, 255, 0), 2)
    return bounding_box, frame

In [5]:
def sam_model(image, road_bounding_box):
    for i in range(len(road_bounding_box)):
        temp = road_bounding_box[i]
        box = np.array([temp[0], temp[1], temp[2], temp[3]])
        predictor.set_image(image)   
        masks, scores, logits = predictor.predict(
            box=box,
            multimask_output=True
        )
        box_annotator = sv.BoxAnnotator(color=sv.Color.green())
        mask_annotator = sv.MaskAnnotator(color=sv.Color.green(),color_lookup=sv.ColorLookup.INDEX)
        detections = sv.Detections(
            xyxy=sv.mask_to_xyxy(masks=masks),
            mask=masks
        )
        segmented_image = mask_annotator.annotate(scene=image, detections=detections)
        return segmented_image

In [7]:
def extract_and_process(input_video_key, output_video_key):
    cap = cv2.VideoCapture(input_video_key)
    if not cap.isOpened():
        print("Error: Could not open video.")
        return

    frame_width = int(cap.get(3))
    frame_height = int(cap.get(4))
    frame_rate = int(cap.get(cv2.CAP_PROP_FPS))
    
    out = cv2.VideoWriter(output_video_key, cv2.VideoWriter_fourcc(*'MP4V'), frame_rate, (frame_width, frame_height), isColor=True)
    
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        
        road_bounding_box, image = road_object_detection(frame, road_detect_model)
        if len(road_bounding_box):
            segmented_image = sam_model(frame, road_bounding_box)
        else:
            segmented_image = frame
        
        out.write(segmented_image)
    
    cap.release()
    out.release()

In [9]:
if __name__ == "__main__":
    input_video_key = 'input/sample.mp4'
    output_video_key = 'output/processed_output.mp4'
    extract_and_process(input_video_key, output_video_key)

OpenCV: FFMPEG: tag 0x5634504d/'MP4V' is not supported with codec id 12 and format 'mp4 / MP4 (MPEG-4 Part 14)'
OpenCV: FFMPEG: fallback to use tag 0x7634706d/'mp4v'



0: 384x640 1 road, 250.2ms
Speed: 10.4ms preprocess, 250.2ms inference, 120.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 road, 33.6ms
Speed: 1.8ms preprocess, 33.6ms inference, 8.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 road, 20.2ms
Speed: 2.0ms preprocess, 20.2ms inference, 7.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 road, 19.8ms
Speed: 2.3ms preprocess, 19.8ms inference, 6.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 road, 19.6ms
Speed: 2.3ms preprocess, 19.6ms inference, 6.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 road, 19.6ms
Speed: 2.0ms preprocess, 19.6ms inference, 6.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 road, 24.7ms
Speed: 2.4ms preprocess, 24.7ms inference, 8.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 road, 18.8ms
Speed: 1.8ms preprocess, 18.8ms inference, 6.3ms postprocess per image at shape (1, 3, 384, 640)

0: