In [1]:
import os
from collections import Counter

import cv2
import pandas as pd
import torch
from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor, AutoTokenizer
from tqdm import tqdm
from moviepy.config import change_settings
from moviepy.editor import VideoFileClip, TextClip, CompositeVideoClip
import imageio_ffmpeg as ffmpeg

In [2]:
# Location of the ImageMagick executable that moviepy's TextClip shells out to.
IMAGEMAGICK_BINARY = r'C:\Program Files\ImageMagick-7.0.11-Q16-HDRI\convert.exe'

# Kept for any child process / library that reads the variable later.
os.environ['IMAGEMAGICK_BINARY'] = IMAGEMAGICK_BINARY

# moviepy reads IMAGEMAGICK_BINARY from the environment only once, at import
# time, and moviepy.editor was already imported in the imports cell. Setting
# os.environ here is therefore too late to influence TextClip, so the setting
# is also pushed into moviepy's live configuration. The isfile() guard keeps
# moviepy's own auto-detection in place when the hard-coded path is absent.
if os.path.isfile(IMAGEMAGICK_BINARY):
    change_settings({"IMAGEMAGICK_BINARY": IMAGEMAGICK_BINARY})

In [3]:
# Read the video_0000 CSV into a DataFrame
METADATA_CSV = r'C:\Users\AT\Downloads\MovieData\video_0000.csv'
df = pd.read_csv(METADATA_CSV)

In [4]:
# Function to extract frames from video
def extract_frames(video_path, num_frames=16):
    """Sample up to ``num_frames`` evenly spaced frames from a video file.

    Each returned frame is converted from OpenCV's BGR order to RGB and
    resized to 224x224.

    Args:
        video_path: Path of the video file to read.
        num_frames: Maximum number of frames to return. Must be at least 1.

    Returns:
        A list of at most ``num_frames`` frames. The list is empty when the
        reported frame count is not positive (for example when the file
        cannot be opened), and may be shorter than requested when individual
        frames fail to decode.

    Raises:
        ValueError: If ``num_frames`` is smaller than 1.
    """
    if num_frames < 1:
        raise ValueError(f"num_frames must be at least 1, got {num_frames}")

    cap = cv2.VideoCapture(video_path)
    try:
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        # Never request more frames than the container reports; a non-positive
        # count makes the loop below a no-op and yields an empty list.
        wanted = min(num_frames, total_frames)

        frames = []
        for k in range(wanted):
            # Spread the sample positions over the whole clip. A plain
            # floor-division stride would only cover the beginning of the
            # video whenever total_frames is not a multiple of num_frames.
            frame_index = k * total_frames // wanted
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_index)
            ret, frame = cap.read()
            if not ret:
                # Skip frames that fail to decode instead of aborting.
                continue
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frames.append(cv2.resize(frame, (224, 224)))
        return frames
    finally:
        # Release the capture handle even if decoding raised.
        cap.release()

In [5]:
# Load the pre-trained captioning model together with its feature extractor
# and tokenizer, all from the same checkpoint
CAPTION_CHECKPOINT = "nlpconnect/vit-gpt2-image-captioning"
model = VisionEncoderDecoderModel.from_pretrained(CAPTION_CHECKPOINT)
feature_extractor = ViTFeatureExtractor.from_pretrained(CAPTION_CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(CAPTION_CHECKPOINT)

In [6]:
# Function to generate captions
def generate_caption(model, feature_extractor, tokenizer, frames):
    """Produce a single caption that summarises a sequence of frames.

    Every frame is captioned by the model; the caption that occurs most often
    across the frames is returned (ties go to the caption seen first).
    Previously only the first frame's caption was decoded, so the remaining
    frames were processed but never influenced the result.

    Args:
        model: Captioning model exposing ``generate`` and ``device``.
        feature_extractor: Callable that turns ``frames`` into model inputs
            with a ``pixel_values`` attribute.
        tokenizer: Tokenizer used to decode the generated token ids.
        frames: Non-empty sequence of image frames.

    Returns:
        The most frequent caption, with surrounding whitespace removed.

    Raises:
        ValueError: If ``frames`` is empty.
    """
    if len(frames) == 0:
        raise ValueError("No frames were provided; cannot generate a caption.")

    inputs = feature_extractor(images=frames, return_tensors="pt")
    # Keep the inputs on the same device as the model so this also works
    # when the model has been moved to a GPU.
    pixel_values = inputs.pixel_values.to(model.device)
    output_ids = model.generate(pixel_values)

    # One decoded caption per frame; strip the trailing whitespace the
    # decoder tends to leave behind.
    captions = [
        tokenizer.decode(ids, skip_special_tokens=True).strip()
        for ids in output_ids
    ]
    return Counter(captions).most_common(1)[0][0]

In [7]:
# Function to generate caption for a specific video
def generate_caption_for_video(video_path, model, feature_extractor, tokenizer):
    """Caption the video at ``video_path``.

    Frames are pulled from the file with ``extract_frames`` and handed to
    ``generate_caption`` along with the supplied model, feature extractor
    and tokenizer; whatever ``generate_caption`` returns is passed back.
    """
    sampled_frames = extract_frames(video_path)
    return generate_caption(model, feature_extractor, tokenizer, sampled_frames)

In [12]:
# Function to add text descriptions to video
def add_text_descriptions_to_video(video_path, description, output_path):
    """Overlay ``description`` along the bottom of a video and save the result.

    The text is rendered white on a black 50-pixel-high strip spanning the
    full width of the video, shown for the whole duration of the clip. The
    output is encoded with libx264 video and AAC audio.

    Args:
        video_path: Path of the source video.
        description: Text to overlay.
        output_path: Path the annotated video is written to.

    Raises:
        ValueError: If the source clip reports no duration.
    """
    video = VideoFileClip(video_path)
    text_clip = None
    result = None
    try:
        # Debug: Print the duration of the video
        print(f"Video Duration: {video.duration}")

        duration = video.duration
        if duration is None:
            raise ValueError("The duration of the video clip is None. Check the video file.")

        # Text strip as wide as the video, pinned to the bottom for the full clip.
        text_clip = TextClip(description, fontsize=24, color='white', bg_color='black', size=(video.size[0], 50))
        text_clip = text_clip.set_position(('center', 'bottom')).set_duration(duration)

        # Debug: Print the duration of the text clip
        print(f"TextClip Duration: {text_clip.duration}")

        # Composite the video with the text overlay and write it out.
        result = CompositeVideoClip([video, text_clip])
        result.write_videofile(output_path, codec='libx264', audio_codec='aac')
    finally:
        # Release ffmpeg readers / file handles even when rendering fails;
        # otherwise the source file can stay locked (notably on Windows).
        for clip in (result, text_clip, video):
            if clip is not None:
                clip.close()

In [13]:
# Path of the clip to caption
video_path = r'C:\Users\AT\Downloads\MovieData\video_0000\1007770414.mp4'

# Run the captioning pipeline on that clip and show the result
description = generate_caption_for_video(
    video_path=video_path,
    model=model,
    feature_extractor=feature_extractor,
    tokenizer=tokenizer,
)
print(f"Generated Caption: {description}")

Generated Caption: a traffic light with a street sign on it 


In [14]:
# Write a copy of the clip with the generated caption overlaid on it
output_path = r'C:\Users\AT\Downloads\MovieData\video_0000\1007770414_withalign.mp4'
add_text_descriptions_to_video(
    video_path=video_path,
    description=description,
    output_path=output_path,
)

Video Duration: 9.27
TextClip Duration: 9.27
Moviepy - Building video C:\Users\AT\Downloads\MovieData\video_0000\1007770414_withalign.mp4.
Moviepy - Writing video C:\Users\AT\Downloads\MovieData\video_0000\1007770414_withalign.mp4



                                                               

Moviepy - Done !
Moviepy - video ready C:\Users\AT\Downloads\MovieData\video_0000\1007770414_withalign.mp4
