In [1]:
import os
import numpy as np
from PIL import Image
import cv2
import re

import torch

from transformers import CLIPTokenizer, CLIPTextModelWithProjection, CLIPTextModel,AutoTokenizer, CLIPVisionModelWithProjection, CLIPVisionModel
from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize, InterpolationMode

# !pip install torchvision==0.14
# !pip install opencv-python # opencv-python-4.9.0.80

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Working directory of the notebook; passed as `cache_dir` to the
# `from_pretrained` calls so downloaded checkpoint files are stored next to it.
current_directory = os.getcwd()

In [3]:
# Load the text encoder and the matching tokenizer of the CLIP4Clip checkpoint,
# caching the downloaded files under `current_directory`.
# Alternative text model class that was tried here:
# model = CLIPTextModelWithProjection.from_pretrained("Searchium-ai/clip4clip-webvid150k",cache_dir=current_directory)
model, tokenizer = (
    loader.from_pretrained("Searchium-ai/clip4clip-webvid150k", cache_dir=current_directory)
    for loader in (CLIPTextModel, CLIPTokenizer)
)

In [4]:
# Paragraph that will be matched against the video one sentence at a time.
search_sentence = "Finland and Denmark have consistently topped the World Happiness Report as the happiest countries in the world. The annual happiness ranking, which began in 2012, measures happiness through various factors including relationships, income, and life satisfaction. Finland was named the happiest country in 2019, with Denmark coming in second place. The Nordic countries, including Norway, Iceland, and Sweden, also consistently rank high in the report."

# Split on a whitespace character that directly follows "." or "?".
# The two negative look-behinds refuse to split after dotted abbreviations
# (word-char, dot, word-char, any char) and after a capital + lowercase + dot
# sequence such as "Mr.".
sentences = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s').split(search_sentence)

In [5]:
sentences

['Finland and Denmark have consistently topped the World Happiness Report as the happiest countries in the world.',
 'The annual happiness ranking, which began in 2012, measures happiness through various factors including relationships, income, and life satisfaction.',
 'Finland was named the happiest country in 2019, with Denmark coming in second place.',
 'The Nordic countries, including Norway, Iceland, and Sweden, also consistently rank high in the report.']

In [6]:
# Encode every sentence into one L2-normalised embedding row.
texts_tensor = []
# Inference only: without no_grad each stored embedding would keep its whole
# autograd graph alive for as long as `texts_tensor` exists.
with torch.no_grad():
    for text in sentences:
        # Truncate to the encoder's positional limit so an over-long sentence
        # is clipped instead of failing inside the position embeddings.
        inputs = tokenizer(
            text=text,
            truncation=True,
            max_length=model.config.max_position_embeddings,
            return_tensors="pt",
        )
        outputs = model(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"])
        # NOTE(review): with CLIPTextModel, index 1 is presumably the pooled
        # output *before* any projection head, while the video side of this
        # notebook uses projected `image_embeds` — confirm both live in the
        # same embedding space (the WithProjection class exposes `text_embeds`).
        pooled = outputs[1]
        final_output = pooled / pooled.norm(dim=-1, keepdim=True)  # final_output.shape : torch.Size([1, 512])
        texts_tensor.append(final_output)

In [7]:
# Video file to search.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
example = '/root/Youtube-Short-Generator/videos/test.mp4'

In [8]:
def video2image(video_path, frame_rate=1.0, size=224):
    """Sample frames from a video file and return them as a preprocessed tensor.

    Frames are picked every ``fps / frame_rate`` frame positions, converted
    from BGR to RGB, resized (bicubic) and centre-cropped to ``size`` pixels,
    converted to a tensor and normalised with the mean/std constants below.

    Args:
        video_path: Path of the video file, opened with OpenCV's FFMPEG backend.
        frame_rate: Number of frames to sample per ``fps`` source frames
            (1.0 means roughly one frame per second). Must be positive.
        size: Side length in pixels of the square output frames.

    Returns:
        A float32 ``torch.Tensor`` of shape ``(num_frames, 3, size, size)``.
        When the file cannot be opened or reports no usable fps / frame count,
        an error line is printed and an empty ``(0, 3, size, size)`` tensor is
        returned, so the result is always 4-D. Decoding stops at the first
        frame that cannot be read; frames read before that are still returned.

    Raises:
        ValueError: If ``frame_rate`` is not positive.
    """
    if frame_rate <= 0:
        raise ValueError(f"frame_rate must be positive, got {frame_rate}")

    # Built once and reused for every frame instead of once per frame.
    transform = Compose([
        Resize(size, interpolation=InterpolationMode.BICUBIC),
        CenterCrop(size),
        lambda image: image.convert("RGB"),
        ToTensor(),
        Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
    ])

    # Open a single capture handle; the original opened the file twice and
    # never released the first handle.
    cap = cv2.VideoCapture(video_path, cv2.CAP_FFMPEG)
    try:
        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        raw_fps = cap.get(cv2.CAP_PROP_FPS)

        # Test the raw fps value: after the "+ 1" below the old `fps < 1`
        # check could never fire. `not (x >= 1)` also rejects NaN.
        if not cap.isOpened() or not (raw_fps >= 1) or frame_count < 1:
            print("ERROR: problem reading video file: ", video_path)
            return torch.zeros((0, 3, size, size), dtype=torch.float32)

        # NOTE(review): int(fps) + 1 overshoots integer frame rates (30 -> 31),
        # so samples drift slightly from whole seconds. Kept unchanged so the
        # sampled frame indices stay the same as before.
        fps = int(raw_fps) + 1
        total_duration = (frame_count + fps - 1) // fps  # ceil(frame_count / fps)
        interval = fps / frame_rate
        frames_idx = np.floor(np.arange(0, total_duration * fps, interval))
        images = np.zeros([len(frames_idx), 3, size, size], dtype=np.float32)

        # Number of frames actually decoded; stays 0 if the first read fails,
        # which previously raised UnboundLocalError on `last_frame`.
        frames_read = 0
        for i, idx in enumerate(frames_idx):
            cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
            ret, frame = cap.read()
            if not ret:
                break
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            images[i] = transform(Image.fromarray(frame)).numpy()
            frames_read = i + 1
    finally:
        # Release the decoder even when preprocessing raises.
        cap.release()

    # Drop the preallocated slots that were never filled.
    return torch.tensor(images[:frames_read])
    
# video = video2image(example)

In [9]:
# Decode the example video into preprocessed frames using the function's
# default sampling rate (frame_rate=1.0) and frame size (224).
video = video2image(example)

In [10]:
video.shape

torch.Size([1544, 3, 224, 224])

In [11]:
# Vision side of the same checkpoint; this class returns `image_embeds`.
# NOTE: this rebinds `model`, which the text-embedding cell also uses —
# re-run the text-model cell before encoding sentences again.
model = CLIPVisionModelWithProjection.from_pretrained("Searchium-ai/clip4clip-webvid150k",cache_dir=current_directory)
# model = CLIPVisionModel.from_pretrained("Searchium-ai/clip4clip-webvid150k",cache_dir=current_directory)
model = model.eval()

# Inference only: no_grad avoids building an autograd graph over every frame,
# and encoding in batches of 64 frames bounds peak memory instead of pushing
# the whole video through a single forward pass.
with torch.no_grad():
    visual_output = torch.cat(
        [model(frame_batch)["image_embeds"] for frame_batch in video.split(64)],
        dim=0,
    )

# L2-normalise each frame embedding so dot products act as cosine similarity.
visual_output = visual_output / visual_output.norm(dim=-1, keepdim=True)

In [12]:
visual_output.shape

torch.Size([1544, 512])

In [13]:
# For every sentence embedding, keep the indices of its best-matching frames.
top_idx = []
for st in texts_tensor:
    # Similarity of this sentence against every frame embedding.
    dot_product_result = torch.matmul(st, visual_output.t())
    # torch.topk raises when k exceeds the number of candidates, so cap the
    # requested 20 at the number of available frames.
    top_indices = torch.topk(
        dot_product_result, k=min(20, dot_product_result.shape[-1])
    ).indices
    top_idx.append(top_indices)

In [14]:
top_idx

[tensor([[1246,  108,  382, 1245,  529,  790,  458, 1243,  107, 1261,  417, 1259,
            88,   74,  121,   53,   52,  418, 1425,  776]]),
 tensor([[1020,  152,  157,  217,  155,  156,  162, 1425,  158, 1245, 1019,  223,
           227,  225,   69,  229,  909,  907,  228, 1219]]),
 tensor([[1261, 1246,  830, 1247, 1196,  767,  529,   90,   52, 1238,  567,  776,
          1243,  458,  568,  768, 1235,  108,  928, 1258]]),
 tensor([[ 121,  382, 1261,   75,  186, 1478,   69, 1477,  875, 1247,   81,  914,
            80,   73,   76,  119,   74,  229,  226,  225]])]

In [15]:
# Stack the per-sentence index tensors along dim 0, then collapse to 1-D.
flat_list = torch.cat(top_idx, dim=0).reshape(-1)

In [16]:
# Ascending order of all selected frame indices (duplicates are kept).
sorted_tensor = flat_list.sort().values

In [17]:
sorted_tensor

tensor([  52,   52,   53,   69,   69,   73,   74,   74,   75,   76,   80,   81,
          88,   90,  107,  108,  108,  119,  121,  121,  152,  155,  156,  157,
         158,  162,  186,  217,  223,  225,  225,  226,  227,  228,  229,  229,
         382,  382,  417,  418,  458,  458,  529,  529,  567,  568,  767,  768,
         776,  776,  790,  830,  875,  907,  909,  914,  928, 1019, 1020, 1196,
        1219, 1235, 1238, 1243, 1243, 1245, 1245, 1246, 1246, 1247, 1247, 1258,
        1259, 1261, 1261, 1261, 1425, 1425, 1477, 1478])

In [18]:
def process_and_output(tensor, lookback=10, window=60):
    """Group integer values into fixed-width windows.

    Repeatedly takes the first remaining value ``v`` and opens the inclusive
    window ``[v - lookback, v - lookback + window]``. Every remaining value
    inside that window is collected (deduplicated, ascending) into one group
    and removed; the process repeats until nothing is left.

    Args:
        tensor: Integer values to group. Anything exposing ``.tolist()``
            (e.g. a 1-D tensor or array) or any plain iterable of ints.
        lookback: How far below the first remaining value the window starts.
        window: Width of the window, measured from its start.

    Returns:
        A list of groups; each group is a sorted list of distinct ints.
        Empty input yields an empty list.

    Raises:
        ValueError: If ``lookback`` is negative or ``window`` is smaller than
            ``lookback``; the anchoring value would then fall outside its own
            window and the loop could never make progress.
    """
    if lookback < 0 or window < lookback:
        raise ValueError("require 0 <= lookback <= window")

    remaining = tensor.tolist() if hasattr(tensor, "tolist") else list(tensor)
    result = []

    while remaining:
        start_range = remaining[0] - lookback
        end_range = start_range + window

        # Direct range comparison replaces probing the list once for every
        # integer in the window.
        result.append(sorted({num for num in remaining if start_range <= num <= end_range}))
        remaining = [num for num in remaining if not (start_range <= num <= end_range)]

    return result


# Group the sorted frame indices into windows and print the resulting groups.
output = process_and_output(sorted_tensor)
print(output)


[[52, 53, 69, 73, 74, 75, 76, 80, 81, 88, 90], [107, 108, 119, 121, 152, 155, 156, 157], [158, 162, 186], [217, 223, 225, 226, 227, 228, 229], [382, 417, 418], [458], [529, 567, 568], [767, 768, 776, 790], [830, 875], [907, 909, 914, 928], [1019, 1020], [1196, 1219, 1235, 1238, 1243, 1245, 1246], [1247, 1258, 1259, 1261], [1425], [1477, 1478]]


In [19]:
sentences

['Finland and Denmark have consistently topped the World Happiness Report as the happiest countries in the world.',
 'The annual happiness ranking, which began in 2012, measures happiness through various factors including relationships, income, and life satisfaction.',
 'Finland was named the happiest country in 2019, with Denmark coming in second place.',
 'The Nordic countries, including Norway, Iceland, and Sweden, also consistently rank high in the report.']

In [20]:
def convert_seconds_to_minutes_and_seconds(seconds):
    """Split a duration in seconds into whole minutes and leftover seconds.

    Args:
        seconds: Duration in seconds.

    Returns:
        A ``(minutes, remaining_seconds)`` tuple, where ``minutes`` is
        ``seconds // 60`` and ``remaining_seconds`` is ``seconds % 60``.
    """
    return seconds // 60, seconds % 60