In [1]:
import cv2
import os

def extract_frames(video_path, output_folder):
    """Extract every frame of a video file and save each one as a JPEG image.

    Frames are written to ``output_folder`` as ``frame_0000.jpg``,
    ``frame_0001.jpg``, ... in the order they are read from the video.

    Args:
        video_path: Path to the video file.
        output_folder: Folder where frames will be saved; created if missing.

    Returns:
        The number of frames read from the video and handed to ``cv2.imwrite``.
        This is 0 when the capture could not be opened or yielded no frames,
        which lets callers detect a bad ``video_path`` instead of silently
        getting an empty folder.
    """
    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs(output_folder, exist_ok=True)

    cap = cv2.VideoCapture(video_path)
    count = 0
    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                # read() reports failure at end of stream (or on a decode error).
                break
            frame_path = os.path.join(output_folder, f"frame_{count:04d}.jpg")
            cv2.imwrite(frame_path, frame)
            count += 1
    finally:
        # Always free the capture handle, even if writing a frame raised.
        cap.release()

    return count

# Extract all frames of one video into a sibling "<video>_frames" folder.
# NOTE(review): both paths are hard-coded absolute paths under /content/drive;
# presumably this needs Google Drive mounted before the cell runs — confirm.
video_path = '/content/drive/MyDrive/Movie Data/video_0000/10046243.mp4'
output_folder = '/content/drive/MyDrive/Movie Data/video_0000_frames'
extract_frames(video_path, output_folder)


In [2]:
# Use the %pip magic (not !pip) so packages are installed into the environment
# of the running kernel rather than whatever `pip` the subshell resolves.
# TODO: pin versions (e.g. transformers==X.Y.Z) so the notebook stays reproducible.
%pip install transformers torch torchvision

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
Collectin

In [6]:
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import os

# Define function to generate text descriptions
def _natural_sort_key(name):
    """Sort key that orders embedded digit runs numerically (frame_9999 < frame_10000)."""
    import re  # local import so this block does not depend on the notebook's import cell

    # re.split with a capturing group alternates text, digits, text, ... so odd
    # positions are always digit runs and keys stay type-aligned when compared.
    tokens = re.split(r"(\d+)", name)
    return [int(tok) if idx % 2 else tok for idx, tok in enumerate(tokens)]


def generate_text_descriptions(frames_folder, max_new_tokens=50):
    """Generate one caption per image file in a folder with the BLIP base model.

    Args:
        frames_folder: Folder containing the frame images to caption.
        max_new_tokens: Upper bound on generated tokens, forwarded to
            ``model.generate``.

    Returns:
        A list of caption strings, one per regular file in ``frames_folder``,
        ordered by file name with digit runs compared numerically so captions
        line up with frame order. Sub-directories are skipped.
    """
    # Load the processor and model
    processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
    model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

    descriptions = []

    # os.listdir returns entries in arbitrary order; sort so that the i-th
    # caption corresponds to the i-th frame.
    frame_names = sorted(os.listdir(frames_folder), key=_natural_sort_key)

    for frame_name in frame_names:
        frame_path = os.path.join(frames_folder, frame_name)
        if not os.path.isfile(frame_path):
            # Skip sub-directories: Image.open cannot read them.
            continue

        # The context manager closes the underlying file handle; convert()
        # returns an independent in-memory copy that outlives the block.
        with Image.open(frame_path) as raw_image:
            image = raw_image.convert("RGB")

        # Process the image and generate the description
        inputs = processor(images=image, return_tensors="pt")
        outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
        description = processor.decode(outputs[0], skip_special_tokens=True)

        descriptions.append(description)

    return descriptions

# Caption every frame in the folder and print the resulting list of strings.
# This is the same path the frame-extraction cell uses as its output_folder.
# NOTE(review): print() dumps one caption per frame in a single line of output;
# consider showing only a slice (e.g. descriptions[:10]) to keep the output small.
frames_folder = '/content/drive/MyDrive/Movie Data/video_0000_frames'
descriptions = generate_text_descriptions(frames_folder)
print(descriptions)

['ants are eating on the ground', 'ants are eating on the ground', 'ants are eating on the ground', 'ants crawling on the ground', 'ants are eating on the ground', 'ants are eating on the ground', 'ants are eating on the ground', 'ants in the nest', 'ants are eating on the ground', 'ants are eating on the ground', 'ants crawling on the ground', 'ants are eating on the ground', 'ants are eating on the ground', 'ants are eating on the ground', 'ants are eating on the ground', 'ants are eating on the ground', 'a group of ants in the sand', 'a group of ants with long legs and long legs', 'ants are eating on the ground', 'ants are eating on the ground', 'ants are eating on the ground', 'ants are eating on the ground', 'ants are eating on the ground', 'ants are eating on the ground', 'ants are eating on the ground', 'a group of ants crawling on a rock', 'a group of ants are gathered together', 'a group of ants in the ground', 'a group of ants in the ground', 'a group of ants in the ground', 

In [8]:
%%shell
# Export this notebook to a standalone HTML file next to the .ipynb.
jupyter nbconvert --to html /content/Text_description.ipynb

[NbConvertApp] Converting notebook /content/Text_description.ipynb to html
[NbConvertApp] Writing 602814 bytes to /content/Text_description.html


