In [1]:
import transformers


model = transformers.CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = transformers.CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

In [2]:
import cv2
import numpy as np
import os
vidcap = cv2.VideoCapture('dog.mp4')
frames = []
success,image = vidcap.read()
count = 0
frame_interval = 15
while success:
    if (count % frame_interval == 0):
        frames.append(image)
        print("Captured frame ",frames)
    success,image = vidcap.read()
    count += 1
vidcap.release()
print(f"No. of frames captured: {len(frames)}")
  
    

Captured frame  [array([[[250, 244, 239],
        [250, 244, 239],
        [250, 244, 239],
        ...,
        [148, 197, 226],
        [149, 198, 227],
        [148, 197, 226]],

       [[250, 244, 239],
        [250, 244, 239],
        [250, 244, 239],
        ...,
        [147, 196, 225],
        [148, 197, 226],
        [147, 196, 225]],

       [[250, 244, 239],
        [250, 244, 239],
        [250, 244, 239],
        ...,
        [146, 195, 224],
        [147, 196, 225],
        [147, 196, 225]],

       ...,

       [[181, 181, 172],
        [181, 181, 172],
        [181, 181, 172],
        ...,
        [203, 200, 194],
        [203, 200, 194],
        [203, 200, 194]],

       [[181, 181, 172],
        [181, 181, 172],
        [181, 181, 172],
        ...,
        [200, 197, 191],
        [200, 197, 191],
        [200, 197, 191]],

       [[181, 181, 172],
        [181, 181, 172],
        [181, 181, 172],
        ...,
        [199, 196, 190],
        [199, 196, 190],
       

In [3]:

for i, f in enumerate(frames):
    print(f"Frame {i}: Type = {type(f)}, Shape = {getattr(f, 'shape', 'No shape')}")

    # If it's not a NumPy array, print an error
    if not isinstance(f, np.ndarray):
        print(f"Frame {i} is not a valid NumPy array.")


Frame 0: Type = <class 'numpy.ndarray'>, Shape = (2160, 3840, 3)
Frame 1: Type = <class 'numpy.ndarray'>, Shape = (2160, 3840, 3)
Frame 2: Type = <class 'numpy.ndarray'>, Shape = (2160, 3840, 3)
Frame 3: Type = <class 'numpy.ndarray'>, Shape = (2160, 3840, 3)
Frame 4: Type = <class 'numpy.ndarray'>, Shape = (2160, 3840, 3)
Frame 5: Type = <class 'numpy.ndarray'>, Shape = (2160, 3840, 3)
Frame 6: Type = <class 'numpy.ndarray'>, Shape = (2160, 3840, 3)
Frame 7: Type = <class 'numpy.ndarray'>, Shape = (2160, 3840, 3)
Frame 8: Type = <class 'numpy.ndarray'>, Shape = (2160, 3840, 3)
Frame 9: Type = <class 'numpy.ndarray'>, Shape = (2160, 3840, 3)
Frame 10: Type = <class 'numpy.ndarray'>, Shape = (2160, 3840, 3)
Frame 11: Type = <class 'numpy.ndarray'>, Shape = (2160, 3840, 3)
Frame 12: Type = <class 'numpy.ndarray'>, Shape = (2160, 3840, 3)
Frame 13: Type = <class 'numpy.ndarray'>, Shape = (2160, 3840, 3)
Frame 14: Type = <class 'numpy.ndarray'>, Shape = (2160, 3840, 3)
Frame 15: Type = <cl

In [4]:
frame = np.array(frames)
import torch

from PIL import Image
def preprocess_frame(frame):
    frame_rgb = cv2.cvtColor(frame,cv2.COLOR_BGR2RGB)
    pil_image = Image.fromarray(frame_rgb)
    pil_image = pil_image.resize((224, 224))

    return pil_image

processed_frames = [preprocess_frame(i) for i in frame]
inputs = processor(images=processed_frames, return_tensors="pt")
with torch.no_grad():
    image_embeddings = model.get_image_features(**inputs)

In [None]:

import torch.nn.functional as F


user_prompt = input("Enter a keyword or phrase: ")


text_inputs = processor(text=[user_prompt], return_tensors="pt")
with torch.no_grad():
    text_embeddings = model.get_text_features(**text_inputs)

similarities = F.cosine_similarity(image_embeddings, text_embeddings)

most_relevant_index = similarities.argmax().item()
most_relevant_frame = frames[most_relevant_index]


screen_width = 960  # Set your screen width here
screen_height = 1280 # Set your screen height here

# Get the dimensions of the original frame
height, width, _ = most_relevant_frame.shape

# Calculate the scaling factor to fit the image within the screen
scaling_factor = min(screen_width / width, screen_height / height)

# Resize the frame based on the scaling factor
new_width = int(width * scaling_factor)
new_height = int(height * scaling_factor)
resized_frame = cv2.resize(most_relevant_frame, (new_width, new_height))

# Display the resized frame
cv2.imshow("Most Relevant Frame", resized_frame)
cv2.waitKey(0)
cv2.destroyAllWindows()


Enter a keyword or phrase:  dog
