In [1]:
# from pytube import YouTube
import cv2
from PIL import Image
import clip
import torch
import math
import numpy as np
import pickle

In [2]:
# Load the open CLIP model
# Pick the compute backend in order of preference: Apple MPS, then CUDA, then CPU.
# NVIDIA GPUs are addressed as "cuda" in PyTorch; "gpu" is not a valid device type
# and makes clip.load fail. `torch.backends.mps.is_available()` is used instead of
# the deprecated `torch.has_mps` flag; the getattr guard keeps older torch builds
# (without the mps backend module) working.
_mps_backend = getattr(torch.backends, "mps", None)
if _mps_backend is not None and _mps_backend.is_available():
  device = "mps"
elif torch.cuda.is_available():
  device = "cuda"
else:
  device = "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

In [3]:
def download_yt_video(url, filename="video.mp4"):
  """Download the 360p video-only MP4 stream of a YouTube video.

  Args:
    url: URL of the YouTube video.
    filename: Name of the file to write. Defaults to "video.mp4", the name
      that was previously hard-coded.

  Raises:
    ValueError: if the video has no adaptive, video-only 360p MP4 stream.
  """
  # The notebook-level pytube import is commented out, so import lazily here;
  # pytube is then only required when a download is actually requested.
  from pytube import YouTube

  # Choose a video stream with resolution of 360p
  streams = YouTube(url).streams.filter(adaptive=True, subtype="mp4", resolution="360p", only_video=True)

  # Check if there is a valid stream. Raising a plain string is itself a
  # TypeError in Python 3, so raise a real exception carrying the message.
  if len(streams) == 0:
    raise ValueError("No suitable stream found for this YouTube video!")

  # Download the video
  print("Downloading...")
  streams[0].download(filename=filename)
  print("Download completed.")

In [4]:
def get_video_frames(video_name):
  """Sample roughly one frame per second from a video file.

  Args:
    video_name: Path of the video file to open with OpenCV.

  Returns:
    A tuple `(video_frames, fps)`: the sampled frames as PIL images (channel
    order reversed from what OpenCV returns) and the rounded frame rate
    reported by the container. If the file cannot be opened, the list is
    empty.
  """
  # The frame images will be stored in video_frames
  video_frames = []

  # Open the video file
  capture = cv2.VideoCapture(video_name)
  try:
    fps = round(capture.get(cv2.CAP_PROP_FPS))

    # Seek forward by one second's worth of frames after every read. A
    # missing frame rate is reported as 0, which would re-read the same
    # frame forever, so always advance by at least one frame.
    step = max(fps, 1)

    current_frame = 0
    while capture.isOpened():
      # Read the current frame
      ret, frame = capture.read()
      if not ret:
        break

      # Convert it to a PIL image (required for CLIP) and store it;
      # the slice reverses the channel axis (BGR -> RGB).
      video_frames.append(Image.fromarray(frame[:, :, ::-1]))

      # Skip ahead to the next sample position
      current_frame += step
      capture.set(cv2.CAP_PROP_POS_FRAMES, current_frame)
  finally:
    # Always free the decoder / file handle, even if a read fails.
    capture.release()

  # Print some statistics
  print(f"Frames extracted: {len(video_frames)}, fps: {fps}")
  return video_frames, fps

In [5]:

def get_video_features(video_frames, batch_size=256):
  """Encode frames with the CLIP image encoder and L2-normalize each row.

  Args:
    video_frames: Sequence of images accepted by the module-level `preprocess`.
    batch_size: Number of frames encoded per forward pass. You can try tuning
      it for very large videos, but the default should usually be OK.

  Returns:
    A tensor with one normalized feature row per frame, in input order, with
    the dtype produced by the model. For empty input, an empty `[0, 512]`
    float16 tensor on `device` (the value the previous implementation returned).

  Raises:
    ValueError: if `batch_size` is not positive.
  """
  if batch_size <= 0:
    raise ValueError("batch_size must be a positive integer")

  # Nothing to encode: return the same empty placeholder as before.
  if len(video_frames) == 0:
    return torch.empty([0, 512], dtype=torch.float16).to(device)

  batches = math.ceil(len(video_frames) / batch_size)

  # Collect the per-batch results and concatenate once at the end. Growing a
  # tensor with torch.cat inside the loop re-copies everything each time and
  # tied the result to a hard-coded width/dtype of the seed tensor.
  encoded_batches = []

  # Process each batch
  for i in range(batches):
    print(f"Processing batch {i+1}/{batches}")

    # Get the relevant frames
    batch_frames = video_frames[i*batch_size : (i+1)*batch_size]

    # Preprocess the images for the batch
    batch_preprocessed = torch.stack([preprocess(frame) for frame in batch_frames]).to(device)

    # Encode with CLIP and normalize
    with torch.no_grad():
      batch_features = model.encode_image(batch_preprocessed)
      batch_features /= batch_features.norm(dim=-1, keepdim=True)

    encoded_batches.append(batch_features)

  return torch.cat(encoded_batches)

In [7]:
# import plotly.express as px
import datetime
from IPython.core.display import HTML

def return_similarities(search_query, video_features, display_heatmap=True, display_results_count=3):
  """Score every frame feature row against a text query.

  Args:
    search_query: Text passed to `clip.tokenize` (truncated if too long).
    video_features: Frame features; multiplied on the right by the
      transposed, normalized text embedding.
    display_heatmap: Accepted for compatibility; not used by the live code.
    display_results_count: Accepted for compatibility; not used by the live code.

  Returns:
    `100.0 * video_features @ text_embedding.T`.
  """
  # Tokenize the query and move the tokens to the model's device
  query_tokens = clip.tokenize(search_query, truncate=True).to(device)

  # Embed the query with CLIP and scale it to unit length (in place)
  with torch.no_grad():
    query_embedding = model.encode_text(query_tokens)
    query_embedding.div_(query_embedding.norm(dim=-1, keepdim=True))

  # Scaled dot product between each frame row and the query embedding
  scaled_frames = 100.0 * video_features
  return scaled_frames @ query_embedding.T
  # values, best_photo_idx = similarities.topk(display_results_count, dim=0)

  # # Display the heatmap
  # if display_heatmap:
  #   print("Search query heatmap over the frames of the video:")
  #   fig = px.imshow(similarities.T.cpu().numpy(), height=50, aspect='auto', color_continuous_scale='viridis')
  #   fig.update_layout(coloraxis_showscale=False)
  #   fig.update_xaxes(showticklabels=False)
  #   fig.update_yaxes(showticklabels=False)
  #   fig.update_layout(margin=dict(l=0, r=0, b=0, t=0))
  #   fig.show()
  #   print()

  # # Display the top 3 frames
  # for frame_id in best_photo_idx:
  #   display(video_frames[frame_id])

  #   # Find the timestamp in the video and display it
  #   seconds = round(frame_id.cpu().numpy()[0] * N / fps)
  #   display(HTML(f"Found at {str(datetime.timedelta(seconds=seconds))} (<a target=\"_blank\" href=\"{video_url}&t={seconds}\">link</a>)"))

In [28]:
import pandas as pd
# Load the comment table. Each row is one comment attached to a video `url`
# and identified by `id`; later cells read the `url`, `comment` and `id` columns.
df = pd.read_csv('../data/with_aug/all_with_uuid.csv')

In [29]:
# Sort by URL so that all comments of the same video are adjacent; the
# embedding loop only reloads a video when the URL changes between rows.
df = df.sort_values('url')

In [30]:
# Preview the last 1000 rows of the URL-sorted table.
df.tail(1000)

Unnamed: 0.1,Unnamed: 0,url,category,comment,hate_towards_whom,label,id
9293,9293,https://www.youtube.com/watch?v=sdSRL3gHgZQ,fashion & beauty,good Lord. you're a badass for showing your bi...,,no,dab7b134-674d-4f8e-83ba-790154f03e67
9292,9292,https://www.youtube.com/watch?v=sdSRL3gHgZQ,fashion & beauty,having a woman can be so expensive sighing,,no,27ba891e-b3c7-4edb-b1b0-ec1ef58dcbd5
8851,8851,https://www.youtube.com/watch?v=sdSRL3gHgZQ,fashion & beauty,send a link to charles james,Individual,yes,7b88361b-9bf1-443f-8974-89985e60c41d
9290,9290,https://www.youtube.com/watch?v=sdSRL3gHgZQ,fashion & beauty,the cost of being a woman can be so expensive,,no,176f9039-7e5c-4e1e-a020-593a4718a26a
9289,9289,https://www.youtube.com/watch?v=sdSRL3gHgZQ,fashion & beauty,the cost of being a woman can be so high,,no,c16aa42f-8251-4f99-a258-a71236193f47
...,...,...,...,...,...,...,...
5769,5769,https://www.youtube.com/watch?v=zpBD7QGR_xU,news & politics,this is still a serious threat because the vir...,,no,29568423-49f1-4fb9-bf67-b5d440993ca0
5768,5768,https://www.youtube.com/watch?v=zpBD7QGR_xU,news & politics,Virus adapts continously so new variants would...,,no,610e7d37-4d41-41fb-b0be-00148744d4fa
5778,5778,https://www.youtube.com/watch?v=zpBD7QGR_xU,news & politics,there shouldn't be a vote when it's necessary,,no,53890d39-8371-43fd-80f6-b29a39ea03c5
5773,5773,https://www.youtube.com/watch?v=zpBD7QGR_xU,news & politics,The virus is constantly adapting to make new v...,,no,24a3cd53-2f48-4897-a92c-d03c46cecaff


In [31]:
# Sanity check: (distinct ids, total rows, distinct video URLs).
# Equal first two values mean every comment row has its own id.
len(set(df['id'])), len(df), len(set(df['url']))

(9762, 9762, 402)

In [35]:
import os
# List the files already present in the embeddings output directory; the
# embedding loop uses these names to skip comments that were processed before.
clip_embeddings_done = os.listdir('../data/CLIP_embeddings/')

In [36]:
# Keep only the part of each file name before '.pickle' so the entries can be
# compared directly against comment ids; then show how many there are.
clip_embeddings_done = [file_name.partition('.pickle')[0] for file_name in clip_embeddings_done]
len(clip_embeddings_done)

9763

In [37]:
def _video_id_from_url(url):
  """Return the video id used as the local file name for `url`.

  BitChute links carry the id as the last path segment; other links are
  treated as YouTube watch URLs with the id in the `v` query parameter.
  Parsing the URL (rather than splitting on fixed substrings) keeps extra
  query parameters out of the id and tolerates a missing trailing slash.

  Raises:
    KeyError: if a non-BitChute URL has no `v` query parameter.
  """
  from urllib.parse import parse_qs, urlparse  # stdlib; local so this cell is self-contained

  parsed = urlparse(url)
  if 'bitchute' in url:
    return parsed.path.rstrip('/').split('/')[-1]
  return parse_qs(parsed.query)['v'][0]


# Compute and store, for every comment without an output file yet, the CLIP
# similarity between the comment text and the sampled frames of its video.
# Rows are expected to be grouped by URL so each video is decoded only once.

# Set lookup keeps the "already done?" check O(1) per row.
done_ids = set(clip_embeddings_done)

prev_url = ''
count = 0        # videos loaded in this run
comm_done = 0    # comments skipped as done + comments written in this run
video_features = None
for idx, row in df.iterrows():
  comment_id = str(row['id'])
  if comment_id in done_ids:
    comm_done += 1
    continue

  if prev_url != row['url']:
    # download_yt_video(row['url'])
    video_id = _video_id_from_url(row['url'])

    video_frames, fps = get_video_frames(f'../data/videos/{video_id}.mp4')
    count += 1
    print('Video ID:', video_id, 'Count:', count, 'Comments Done:', comm_done)

    if video_frames:
      video_features = get_video_features(video_frames)
    else:
      # Missing or unreadable video: remember that, so none of its comments
      # gets an empty result persisted and marked as done.
      video_features = None
      print('No frames extracted, skipping comments of video:', video_id)
    # Record the URL even on failure so the video is not re-opened per comment.
    prev_url = row['url']

  if video_features is None:
    continue

  similarities = return_similarities(row['comment'], video_features)
  # NOTE(review): the tensors are pickled on whatever device they live on;
  # loading them on a machine without that device may fail — confirm whether
  # consumers expect CPU tensors.
  data = {'similarities': similarities, 'video_features': video_features}
  # print(len(video_frames))
  comm_done += 1

  # Write to a temporary name and rename afterwards, so an interrupted run
  # never leaves a truncated '<id>.pickle' that a later run would treat as
  # done. The '.tmp' name contains no '.pickle', so a leftover temp file
  # cannot match a comment id in the resume check.
  final_path = f'../data/CLIP_embeddings/{comment_id}.pickle'
  tmp_path = f'../data/CLIP_embeddings/{comment_id}.tmp'
  with open(tmp_path, 'wb') as f:
    pickle.dump(data, f)
  os.replace(tmp_path, final_path)
  