In [1]:
import transformers


model = transformers.CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = transformers.CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

In [2]:
import cv2
import numpy as np
import os
vidcap = cv2.VideoCapture('5837591-uhd_3840_2160_24fps.mp4')
frames = []
success,image = vidcap.read()
count = 0
frame_interval = 15
while success:
    if (count % frame_interval == 0):
        frames.append(image)
        print("Captured frame ",frames)
    success,image = vidcap.read()
    count += 1
vidcap.release()
print(f"No. of frames captured: {len(frames)}")
  

Captured frame  [array([[[223, 232, 218],
        [223, 232, 218],
        [223, 232, 218],
        ...,
        [ 67,  60,  57],
        [ 67,  60,  57],
        [ 67,  60,  57]],

       [[223, 232, 218],
        [223, 232, 218],
        [223, 232, 218],
        ...,
        [ 67,  60,  57],
        [ 67,  60,  57],
        [ 67,  60,  57]],

       [[223, 232, 218],
        [223, 232, 218],
        [223, 232, 218],
        ...,
        [ 67,  60,  57],
        [ 67,  60,  57],
        [ 67,  60,  57]],

       ...,

       [[145, 155, 154],
        [144, 154, 153],
        [145, 155, 154],
        ...,
        [124, 134, 138],
        [124, 134, 138],
        [124, 134, 138]],

       [[147, 157, 156],
        [145, 155, 154],
        [145, 155, 154],
        ...,
        [124, 134, 138],
        [125, 135, 139],
        [125, 135, 139]],

       [[145, 155, 154],
        [145, 155, 154],
        [147, 157, 156],
        ...,
        [124, 134, 138],
        [124, 134, 138],
       

In [3]:

for i, f in enumerate(frames):
    print(f"Frame {i}: Type = {type(f)}, Shape = {getattr(f, 'shape', 'No shape')}")

    # If it's not a NumPy array, print an error
    if not isinstance(f, np.ndarray):
        print(f"Frame {i} is not a valid NumPy array.")


Frame 0: Type = <class 'numpy.ndarray'>, Shape = (1440, 2560, 3)
Frame 1: Type = <class 'numpy.ndarray'>, Shape = (1440, 2560, 3)
Frame 2: Type = <class 'numpy.ndarray'>, Shape = (1440, 2560, 3)
Frame 3: Type = <class 'numpy.ndarray'>, Shape = (1440, 2560, 3)
Frame 4: Type = <class 'numpy.ndarray'>, Shape = (1440, 2560, 3)
Frame 5: Type = <class 'numpy.ndarray'>, Shape = (1440, 2560, 3)
Frame 6: Type = <class 'numpy.ndarray'>, Shape = (1440, 2560, 3)
Frame 7: Type = <class 'numpy.ndarray'>, Shape = (1440, 2560, 3)
Frame 8: Type = <class 'numpy.ndarray'>, Shape = (1440, 2560, 3)
Frame 9: Type = <class 'numpy.ndarray'>, Shape = (1440, 2560, 3)
Frame 10: Type = <class 'numpy.ndarray'>, Shape = (1440, 2560, 3)
Frame 11: Type = <class 'numpy.ndarray'>, Shape = (1440, 2560, 3)
Frame 12: Type = <class 'numpy.ndarray'>, Shape = (1440, 2560, 3)
Frame 13: Type = <class 'numpy.ndarray'>, Shape = (1440, 2560, 3)
Frame 14: Type = <class 'numpy.ndarray'>, Shape = (1440, 2560, 3)


In [4]:
frame = np.array(frames)
import torch

from PIL import Image
def preprocess_frame(frame):
    frame_rgb = cv2.cvtColor(frame,cv2.COLOR_BGR2RGB)
    pil_image = Image.fromarray(frame_rgb)
    pil_image = pil_image.resize((224, 224))

    return pil_image

processed_frames = [preprocess_frame(i) for i in frame]
inputs = processor(images=processed_frames, return_tensors="pt")
with torch.no_grad():
    image_embeddings = model.get_image_features(**inputs)

In [5]:
pip install ultralytics

Note: you may need to restart the kernel to use updated packages.


In [19]:

import torch.nn.functional as F


user_prompt = input("Enter a keyword or phrase: ")


text_inputs = processor(text=[user_prompt], return_tensors="pt")
with torch.no_grad():
    text_embeddings = model.get_text_features(**text_inputs)

similarities = F.cosine_similarity(image_embeddings, text_embeddings)

most_relevant_index = similarities.argmax().item()
most_relevant_frame = frames[most_relevant_index]


screen_width = 960  # Set your screen width here
screen_height = 1280 # Set your screen height here

# Get the dimensions of the original frame
height, width, _ = most_relevant_frame.shape

# Calculate the scaling factor to fit the image within the screen
scaling_factor = min(screen_width / width, screen_height / height)

# Resize the frame based on the scaling factor
new_width = int(width * scaling_factor)
new_height = int(height * scaling_factor)
resized_frame = cv2.resize(most_relevant_frame, (new_width, new_height))

# Display the resized frame
cv2.imshow("Most Relevant Frame", resized_frame)
cv2.waitKey(0)
cv2.destroyAllWindows()


Enter a keyword or phrase:  football


In [20]:
import torch
import cv2
import pandas
import torch.nn.functional as F
from PIL import Image
from ultralytics import YOLO  # YOLOv10 is supported by the Ultralytics library


yolo_model = YOLO('yolov10n.pt')  

results = yolo_model(most_relevant_frame)

result = results[0] 

boxes = result.boxes  

highest_similarity = -1
best_box = None
best_confidence = None


for box in boxes:
    # Get coordinates and confidence
    xmin, ymin, xmax, ymax = box.xyxy[0].tolist()[:4]  # xyxy format
    confidence = box.conf.tolist()[0]

    # Crop the detected object for CLIP processing
    cropped_object = most_relevant_frame[int(ymin):int(ymax), int(xmin):int(xmax)]
    cropped_pil = Image.fromarray(cv2.cvtColor(cropped_object, cv2.COLOR_BGR2RGB))

    # Preprocess for CLIP and get embeddings
    object_input = processor(images=cropped_pil, return_tensors="pt")
    with torch.no_grad():
        object_embedding = model.get_image_features(**object_input)

    # Compute cosine similarity with the prompt embedding
    similarity = F.cosine_similarity(object_embedding, text_embeddings).item()
    if similarity > highest_similarity:
        highest_similarity = similarity
        best_box = (int(xmin), int(ymin), int(xmax), int(ymax))
        best_confidence = confidence


if best_box:
    xmin, ymin, xmax, ymax = best_box
    cv2.rectangle(most_relevant_frame, (xmin, ymin), (xmax, ymax), (0, 255, 0), thickness=3)
    font_scale = 3.8
    font_thickness = 5
    text = f"{user_prompt} ({highest_similarity:.2f} / {best_confidence:.2f})"
    cv2.putText(most_relevant_frame, text, (xmin, ymin - 10), cv2.FONT_HERSHEY_SIMPLEX, 
                font_scale, (0, 255, 0), font_thickness, cv2.LINE_AA)

scaling_factor = min(screen_width / most_relevant_frame.shape[1], screen_height / most_relevant_frame.shape[0])
display_frame = cv2.resize(most_relevant_frame, (int(most_relevant_frame.shape[1] * scaling_factor),
                                                 int(most_relevant_frame.shape[0] * scaling_factor)))

cv2.imshow("Most Relevant Frame with Bounding Box", display_frame)
cv2.waitKey(0)
cv2.destroyAllWindows()



0: 384x640 3 persons, 1 sports ball, 286.6ms
Speed: 6.7ms preprocess, 286.6ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)
