In [1]:
import os
import cv2
import torch.cuda
import yt_dlp as youtube_dl
import numpy as np
import pandas as pd
import torch
from transformers import VivitConfig, VivitModel
from sklearn.cluster import KMeans
import whisper

In [2]:
# Load the dataset
dataset = pd.read_csv("./dataset/youtube.csv")

In [3]:
# Function to download video using yt-dlp
def download_video(video_link, output_path):
    ydl_opts = {"format": "best/bestvideo+bestaudio", "outtmpl": output_path}
    try:
        with youtube_dl.YoutubeDL(ydl_opts) as ydl:
            ydl.download([video_link])
    except youtube_dl.DownloadError as e:
        print(f"DownloadError: {e}")


# Function to extract frames from a video
def extract_frames(video_path, output_folder, frame_rate=1):
    os.makedirs(output_folder, exist_ok=True)
    cap = cv2.VideoCapture(video_path, cv2.CAP_FFMPEG)

    count = 0
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # Save frame at the specified rate
        if count % frame_rate == 0:
            frame_filename = os.path.join(output_folder, f"frame_{count}.jpg")
            cv2.imwrite(frame_filename, frame)

        count += 1

    cap.release()


# Function to preprocess frames
def preprocess_frame(frame, size=(224, 224)):
    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    frame = cv2.resize(frame, size)
    frame = frame / 255.0
    frame = np.transpose(frame, (2, 0, 1))
    frame = np.expand_dims(frame, axis=0)
    return frame


# Function to preprocess all frames
def preprocess_all_frames(output_folder, size=(224, 224)):
    preprocessed_frames = []
    for frame_file in sorted(os.listdir(output_folder)):
        frame_path = os.path.join(output_folder, frame_file)
        frame = cv2.imread(frame_path)
        preprocessed_frame = preprocess_frame(frame, size)
        preprocessed_frames.append(preprocessed_frame)
    preprocessed_frames = np.vstack(preprocessed_frames)
    return preprocessed_frames


# Function to transcribe audio from video using Whisper
def transcribe_audio(video_path):
    model = whisper.load_model("base")  # You can use different sizes: "tiny", "base", "small", "medium", "large"
    result = model.transcribe(video_path)
    transcription = result["text"]
    return transcription

In [4]:
# Example usage:
video_link = dataset["link"][299]
video_path = "downloaded_video.mkv"
output_folder = "video_frames"

In [5]:
# Step 1: Download the video
download_video(f"https://www.youtube.com/watch?v={video_link}", video_path)

[youtube] Extracting URL: https://www.youtube.com/watch?v=M9WtLebogS4?si=wHdcV1cYpMJaqwen
[youtube] M9WtLebogS4: Downloading webpage
[youtube] M9WtLebogS4: Downloading ios player API JSON
[youtube] M9WtLebogS4: Downloading tv player API JSON


         Install PhantomJS to workaround the issue. Please download it from https://phantomjs.org/download.html
         n = hr4D2QQmCgOFGzgb0 ; player = https://www.youtube.com/s/player/3c3d6e4f/player_ias.vflset/en_US/base.js
         Install PhantomJS to workaround the issue. Please download it from https://phantomjs.org/download.html
         n = QmVo1fnB0FK0XxfbG ; player = https://www.youtube.com/s/player/3c3d6e4f/player_ias.vflset/en_US/base.js


[youtube] M9WtLebogS4: Downloading m3u8 information
[info] M9WtLebogS4: Downloading 1 format(s): 313+140
[download] Resuming download at byte 116560387
[download] Destination: downloaded_video.mp4.f313.webm
[download] 100% of    2.93GiB in 00:04:30 at 11.10MiB/s     
[download] Destination: downloaded_video.mp4.f140.m4a
[download] 100% of   35.27MiB in 00:00:11 at 3.19MiB/s   
[Merger] Merging formats into "downloaded_video.mp4.mkv"
Deleting original file downloaded_video.mp4.f140.m4a (pass -k to keep)
Deleting original file downloaded_video.mp4.f313.webm (pass -k to keep)


In [5]:
# Step 2: Extract frames from the video
extract_frames(video_path, output_folder, frame_rate=30)

In [5]:
# Step 3: Preprocess all frames
preprocessed_frames = preprocess_all_frames(output_folder)
print("Preprocessed frames shape:", preprocessed_frames.shape)

Preprocessed frames shape: (1905, 3, 224, 224)


In [6]:
# Step 4: Feature Extraction with ViViT
torch.cuda.empty_cache()
configuration = VivitConfig(image_size=224, num_frames=3)
model = VivitModel(configuration)

In [7]:
model.eval()

VivitModel(
  (embeddings): VivitEmbeddings(
    (patch_embeddings): VivitTubeletEmbeddings(
      (projection): Conv3d(3, 768, kernel_size=(2, 16, 16), stride=(2, 16, 16))
    )
    (dropout): Dropout(p=0.0, inplace=False)
  )
  (encoder): VivitEncoder(
    (layer): ModuleList(
      (0-11): 12 x VivitLayer(
        (attention): VivitAttention(
          (attention): VivitSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (output): VivitSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
        )
        (intermediate): VivitIntermediate(
          (dense): Linear(in_features=768, out_features=3072, bias=True)
          (dropout): D

In [10]:
frames_tensor = torch.tensor(preprocessed_frames, dtype=torch.float32)
num_frames_per_segment = 3
batch_size = frames_tensor.shape[0] // num_frames_per_segment
frames_tensor = frames_tensor[: batch_size * num_frames_per_segment].reshape(batch_size, num_frames_per_segment, 3, 224, 224)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

frames_tensor = frames_tensor.to(device)
model = model.to(device)

with torch.no_grad():
    outputs = model(frames_tensor)

video_features = outputs.last_hidden_state
print("Video features shape:", video_features.shape)

Device: cuda
Video features shape: torch.Size([635, 197, 768])


In [11]:
# Step 5: Key Frame Selection
num_frames, feature_dim = video_features.shape[1], video_features.shape[-1]
video_features_reshaped = video_features.view(num_frames, -1).cpu().numpy()

k = 20
kmeans = KMeans(n_clusters=k, random_state=0).fit(video_features_reshaped)

key_frame_indices = []
for i in range(k):
    cluster_center = kmeans.cluster_centers_[i]
    distances = np.linalg.norm(video_features_reshaped - cluster_center, axis=1)
    key_frame_idx = np.argmin(distances)
    key_frame_indices.append(key_frame_idx)

key_frames = [preprocessed_frames[idx] for idx in key_frame_indices]

for i, key_frame in enumerate(key_frames):
    if key_frame.shape[0] == 1:
        key_frame_img = key_frame.squeeze(0)
    else:
        key_frame_img = key_frame

    if key_frame_img.shape[0] == 3:
        key_frame_img = np.transpose(key_frame_img, (1, 2, 0))

    cv2.imwrite(f"key_frames/key_frame_{i}.jpg", key_frame_img * 255)



In [12]:
# Step 6: Generate Descriptions for Key Frames using BLIP
from transformers import BlipForConditionalGeneration, BlipProcessor

blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to("cuda")

descriptions = []
for i, key_frame in enumerate(key_frames):
    key_frame_img_uint8 = (key_frame * 255).astype(np.uint8)
    inputs = blip_processor(images=key_frame_img_uint8, return_tensors="pt").to("cuda")
    outputs = blip_model.generate(**inputs, max_new_tokens=150, num_beams=7)
    description = blip_processor.decode(outputs[0], skip_special_tokens=True)
    descriptions.append(f"Key Frame {i+1}: {description}")
    print(f"Description for Key Frame {i+1}: {description}")

Description for Key Frame 1: a man in a blue and white shirt standing in front of a door
Description for Key Frame 2: a man standing in front of a wall with graffiti on it
Description for Key Frame 3: a man working out on a machine in a gym
Description for Key Frame 4: the man is wearing a white t - shirt
Description for Key Frame 5: a man standing in front of a mirror
Description for Key Frame 6: a man standing in front of a mirror
Description for Key Frame 7: the floor is white in color
Description for Key Frame 8: a man standing in front of a wall with a cell in his hand
Description for Key Frame 9: the floor is white in color
Description for Key Frame 10: a man standing in front of a wall with graffiti on it
Description for Key Frame 11: a man standing in front of a mirror in a gym
Description for Key Frame 12: a man standing in front of a wall with a logo on it
Description for Key Frame 13: the man is wearing a black shirt
Description for Key Frame 14: a man standing in front of a

In [13]:
# Step 7: Combine Key Frame Descriptions with Video Transcript
video_transcript = transcribe_audio(video_path)
combined_text = f"{video_transcript} {' '.join(descriptions)}"

In [18]:
# Step 8: Summarize Combined Content
from transformers import pipeline

summarizer = pipeline("summarization", model="facebook/mbart-large-50", device=0 if torch.cuda.is_available() else -1)
input_length = len(combined_text.split())
max_length = min(1500, input_length)
summary = summarizer(combined_text, max_length=max_length, min_length=int(max_length / 2), do_sample=False)

print("Generated Summary:")
print(summary[0]["summary_text"])

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
