In [4]:
import cv2
import numpy as np
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration, BlipModel
from moviepy.editor import VideoFileClip

# Pick the compute device first so each model can be placed on it right after loading.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The processor and both models are loaded from the same BLIP captioning checkpoint.
blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")

# Caption generator (used for per-frame captions), moved to the selected device.
blip_caption_model = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-base"
).to(device)

# Text-feature model (used for caption/description similarity), moved to the selected device.
# NOTE(review): the load log for this checkpoint reports that BlipModel's text_model
# weights are "newly initialized", so text features from blip_model may not be
# meaningful -- TODO confirm and consider a checkpoint that ships those weights.
blip_model = BlipModel.from_pretrained("Salesforce/blip-image-captioning-base").to(device)

def generate_captions(video_path):
    """
    DESC: Generate a caption for each frame in the video.

    PARAMETER:
        video_path (str): Path to the input video.

    RETURNS:
        list: Generated caption strings, one per frame read, in frame order.
              Empty when no frame could be read from video_path.
    """
    video_capture = cv2.VideoCapture(video_path)
    captions = []

    # Release the capture handle even if preprocessing or generation raises.
    try:
        while True:
            ret, frame = video_capture.read()
            if not ret:
                # No more frames (or the video could not be read).
                break

            # Convert the frame from BGR to RGB channel order before handing it to BLIP.
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            inputs = blip_processor(images=frame_rgb, return_tensors="pt").to(device)
            outputs = blip_caption_model.generate(**inputs)
            # Only the first generated sequence is decoded into the frame's caption.
            caption = blip_processor.decode(outputs[0], skip_special_tokens=True)
            captions.append(caption)
    finally:
        video_capture.release()

    return captions

def add_captions_to_video(video_path, captions, output_path):
    """
    DESC: Draw a caption on each video frame and save the result as a new video.

    PARAMETERS:
        video_path (str): Path to the input video.
        captions (list): Captions to draw, indexed by frame number. Frames past
                         the end of this list are written with an empty caption.
        output_path (str): Path to save the captioned video.

    RETURNS: NOTHING
    """
    video_capture = cv2.VideoCapture(video_path)
    video_writer = None

    # Release both the reader and the writer even if drawing or writing raises.
    try:
        # The output keeps the frame rate and frame size reported by the input.
        fps = video_capture.get(cv2.CAP_PROP_FPS)
        width = int(video_capture.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(video_capture.get(cv2.CAP_PROP_FRAME_HEIGHT))

        fourcc = cv2.VideoWriter_fourcc(*'XVID')
        video_writer = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

        frame_count = 0
        while True:
            ret, frame = video_capture.read()
            if not ret:
                break

            # Fall back to an empty caption instead of raising IndexError when
            # there are fewer captions than frames.
            caption = captions[frame_count] if frame_count < len(captions) else ""

            # White filled band across the bottom 50 px, with black text on top of it.
            cv2.rectangle(frame, (0, height - 50), (width, height), (255, 255, 255), -1)
            cv2.putText(frame, caption, (10, height - 20), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 2)
            video_writer.write(frame)
            frame_count += 1
    finally:
        video_capture.release()
        if video_writer is not None:
            video_writer.release()

def calculate_similarity(captions, description, threshold=0.8):
    """
    DESC: Find the captions whose text embedding is similar to a provided description.

    PARAMETERS:
        captions (list): List of captions.
        description (str): Description of the activity.
        threshold (float, optional): A caption matches when its cosine similarity
                                     to the description is strictly greater than
                                     this value. Defaults to 0.8.

    RETURNS:
        list: Indices of captions similar to the description, in ascending order.
    """
    def _embed(text):
        # Text features of a single string from the BLIP text-feature model.
        inputs = blip_processor(text=[text], return_tensors="pt").to(device)
        return blip_model.get_text_features(**inputs)

    matching_indices = []
    # caption text -> whether it passed the threshold; identical captions
    # (common on consecutive frames) are embedded only once.
    is_match_by_caption = {}

    # Only the similarity values are needed, so skip autograd bookkeeping.
    with torch.no_grad():
        description_embedding = _embed(description)

        for i, caption in enumerate(captions):
            if caption not in is_match_by_caption:
                cos_sim = torch.nn.functional.cosine_similarity(
                    description_embedding, _embed(caption), dim=-1
                )
                is_match_by_caption[caption] = cos_sim.item() > threshold
            if is_match_by_caption[caption]:
                matching_indices.append(i)

    return matching_indices

def clip_video(video_path, output_path, frame_indices, fps=30):
    """
    DESC: Cut the span between the lowest and highest frame index out of the video and save it.

    PARAMETERS:
        video_path (str): Path to the input video.
        output_path (str): Path to save the clipped video.
        frame_indices (list): Frame indices to cover. The clip runs from the
                              smallest index through the largest one, including
                              any frames in between.
        fps (int, optional): Frames per second used to convert frame indices to
                             seconds. Defaults to 30.

    RETURNS: NOTHING

    RAISES:
        ValueError: If frame_indices is empty or fps is not positive.
    """
    if len(frame_indices) == 0:
        raise ValueError("frame_indices must contain at least one frame index")
    if fps <= 0:
        raise ValueError("fps must be a positive number")

    # min/max rather than first/last so the bounds do not depend on the list being sorted.
    start_time = min(frame_indices) / fps
    # +1 so the last selected frame itself is included in the clip.
    end_time = (max(frame_indices) + 1) / fps

    source = VideoFileClip(video_path)
    # Close the clip even if writing fails.
    try:
        # Keep the end time within the source so the writer never reads past the last frame.
        if source.duration is not None:
            end_time = min(end_time, source.duration)
        source.subclip(start_time, end_time).write_videofile(output_path, codec='libx264')
    finally:
        source.close()

def process_video():
    """
    DESC: Interactively caption a video and optionally clip it by activity description.

    Prompts for a video path, generates one caption per frame, and writes a
    captioned copy named "Captioned_<file name>" in the current directory.
    If the user answers "yes", prompts for an activity description and writes
    the span covering the matching frames to "Clipped_Captioned_<file name>".

    RETURNS: NOTHING
    """
    import os  # local import so this function does not depend on the file's import block

    video_path = input("Enter the video path: ").strip()
    # basename handles the platform's path separators, not just '/'.
    output_path = f"Captioned_{os.path.basename(video_path)}"

    print("Generating captions...")
    captions = generate_captions(video_path)
    print("Adding captions to video...")
    add_captions_to_video(video_path, captions, output_path)

    response = input("Do you want to clip the video? (yes/no): ")
    if response.strip().lower() != 'yes':
        return

    activity_description = input("Enter the activity description: ")
    print("Calculating similarities...")
    matching_indices = calculate_similarity(captions, activity_description)
    if not matching_indices:
        print("No frames matched the activity description; nothing to clip.")
        return

    # Convert frame indices to seconds with the source's own frame rate rather
    # than clip_video's default of 30, which is wrong for other frame rates.
    capture = cv2.VideoCapture(video_path)
    try:
        source_fps = capture.get(cv2.CAP_PROP_FPS)
    finally:
        capture.release()
    if not source_fps > 0:
        # Frame rate unavailable; fall back to clip_video's default.
        source_fps = 30

    clip_path = f"Clipped_{output_path}"
    print("Clipping video...")
    # NOTE(review): the clip is cut from the original video_path, not from the
    # captioned output, even though the file name starts with "Clipped_Captioned_"
    # -- confirm this is intended.
    clip_video(video_path, clip_path, matching_indices, fps=source_fps)
    print(f"Clipped video saved to {clip_path}")

# Start the interactive pipeline only when this module is run as the main
# program, not when it is imported.
if __name__ == "__main__":
    process_video()


Some weights of BlipModel were not initialized from the model checkpoint at Salesforce/blip-image-captioning-base and are newly initialized: ['logit_scale', 'text_model.embeddings.LayerNorm.bias', 'text_model.embeddings.LayerNorm.weight', 'text_model.embeddings.position_embeddings.weight', 'text_model.embeddings.word_embeddings.weight', 'text_model.encoder.layer.0.attention.output.LayerNorm.bias', 'text_model.encoder.layer.0.attention.output.LayerNorm.weight', 'text_model.encoder.layer.0.attention.output.dense.bias', 'text_model.encoder.layer.0.attention.output.dense.weight', 'text_model.encoder.layer.0.attention.self.key.bias', 'text_model.encoder.layer.0.attention.self.key.weight', 'text_model.encoder.layer.0.attention.self.query.bias', 'text_model.encoder.layer.0.attention.self.query.weight', 'text_model.encoder.layer.0.attention.self.value.bias', 'text_model.encoder.layer.0.attention.self.value.weight', 'text_model.encoder.layer.0.crossattention.output.LayerNorm.bias', 'text_model.

Enter the video path: cat.mp4
Generating captions...
Adding captions to video...
Do you want to clip the video? (yes/no): yes
Enter the activity description: cat playing with a ball
Calculating similarities...
Clipping video...
Moviepy - Building video Clipped_Captioned_cat.mp4.
MoviePy - Writing audio in Clipped_Captioned_catTEMP_MPY_wvf_snd.mp3




MoviePy - Done.
Moviepy - Writing video Clipped_Captioned_cat.mp4





Moviepy - Done !
Moviepy - video ready Clipped_Captioned_cat.mp4
Clipped video saved to Clipped_Captioned_cat.mp4
