In [6]:
import os
# Working directory of the kernel; the CFG video paths are built from it.
HOME: str = os.getcwd()
print(HOME)

from IPython.display import clear_output

/content


In [None]:
!pip install transformers==4.46.1
!pip install tokenizers==0.20.3

In [None]:
%%time

!pip install -q inference-gpu[yolo-world]==0.35.0

clear_output()

In [None]:
%%time

!pip install -q supervision==0.25.1

clear_output()

In [None]:
import cv2
import supervision as sv

from tqdm import tqdm
from inference.models import YOLOWorld

In [7]:
class CFG:
    """Central configuration for the notebook: model id, detection settings and video paths."""

    # --- model ---
    MODEL_ID = "yolo_world/l"

    # --- detections ---
    # Class prompts handed to the model.
    CLASSES_TO_DETECT = ["knife", "scissor", "cutter"]
    # Confidence value for detections.
    CONFIDENCE = 0.15

    # --- input video (resolved against HOME) ---
    SOURCE_VIDEO_PATH = f"{HOME}/video.mp4"

    # --- output video (resolved against HOME) ---
    TARGET_VIDEO_PATH = f"{HOME}/video-output.mp4"

In [None]:
def get_video_properties(video_path):
    """Read basic metadata of a video through ``cv2.VideoCapture``.

    Args:
        video_path: Location of the video, passed straight to
            ``cv2.VideoCapture``.

    Returns:
        dict: Integer-valued entries ``fps``, ``frame_count``,
        ``duration_seconds``, ``width``, ``height`` and ``codec`` (the
        FOURCC property converted to ``int``). ``duration_seconds`` is ``0``
        when the reported frame rate is not positive.

    Raises:
        ValueError: If the capture could not be opened.
    """
    cap = cv2.VideoCapture(video_path)

    # try/finally guarantees the capture handle is released even when opening
    # fails or a property read raises.
    try:
        if not cap.isOpened():
            raise ValueError("Could not open video file")

        fps = cap.get(cv2.CAP_PROP_FPS)
        frame_count = cap.get(cv2.CAP_PROP_FRAME_COUNT)

        # A reported frame rate of 0 would otherwise raise ZeroDivisionError
        # here; fall back to a duration of 0 in that case.
        duration_seconds = int(frame_count / fps) if fps > 0 else 0

        return {
            "fps": int(fps),
            "frame_count": int(frame_count),
            "duration_seconds": duration_seconds,
            "width": int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)),
            "height": int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)),
            "codec": int(cap.get(cv2.CAP_PROP_FOURCC)),
        }
    finally:
        cap.release()

In [None]:
# Inspect the metadata of the source video; the bare name on the last line
# makes the notebook display the resulting dict.
video_properties = get_video_properties(CFG.SOURCE_VIDEO_PATH)
video_properties

{'fps': 30,
 'frame_count': 1264,
 'duration_seconds': 42,
 'width': 1920,
 'height': 1080,
 'codec': 875967080}

In [None]:
# Instantiate YOLOWorld with the configured model id, then set the classes
# it should detect from the configured list.
model = YOLOWorld(model_id=CFG.MODEL_ID)
model.set_classes(CFG.CLASSES_TO_DETECT)

Creating inference sessions




CLIP model loaded in 0.95 seconds


In [None]:
# Annotators reused for every frame.
# sv.BoundingBoxAnnotator is deprecated in the pinned supervision release in
# favour of sv.BoxAnnotator; the variable name is kept so later cells that
# reference it keep working.
BOUNDING_BOX_ANNOTATOR = sv.BoxAnnotator(thickness=2)
LABEL_ANNOTATOR = sv.LabelAnnotator(text_thickness=2, text_scale=1, text_color=sv.Color.BLACK)



In [None]:
# Stream frames from the source video, run detection on each one and write the
# annotated frame to the target video.
frame_generator = sv.get_video_frames_generator(CFG.SOURCE_VIDEO_PATH)
video_info = sv.VideoInfo.from_video_path(CFG.SOURCE_VIDEO_PATH)

with sv.VideoSink(target_path=CFG.TARGET_VIDEO_PATH, video_info=video_info) as sink:
    for frame in tqdm(frame_generator, total=video_info.total_frames):
        # Take the threshold from CFG instead of a hard-coded literal so that
        # editing the configuration actually changes the inference.
        results = model.infer(frame, confidence=CFG.CONFIDENCE)
        detections = sv.Detections.from_inference(results)

        # Draw on a copy so the frame yielded by the generator stays untouched.
        annotated_frame = frame.copy()
        annotated_frame = BOUNDING_BOX_ANNOTATOR.annotate(annotated_frame, detections)
        annotated_frame = LABEL_ANNOTATOR.annotate(annotated_frame, detections)
        sink.write_frame(annotated_frame)

100%|██████████| 1264/1264 [01:39<00:00, 12.69it/s]


In [None]:
!pip install ipywidgets

In [3]:
import ipywidgets as widgets
from IPython.display import Video

In [None]:
# Load the annotated output video into an ipywidgets Video widget and show it.
# NOTE(review): `display` is not imported in the visible cells; presumably it
# relies on the name the IPython kernel provides by default — confirm.
video_widget = widgets.Video.from_file(CFG.TARGET_VIDEO_PATH)
display(video_widget)