In [2]:
!pip install opencv-python transformers torch pillow


# Import libraries
from google.colab import files
import cv2
import os
import json
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

# Upload video file
uploaded = files.upload()
if not uploaded:
    # Indexing an empty upload result would otherwise fail with an opaque
    # IndexError; stop here with a message that names the actual problem.
    raise RuntimeError("No video file was uploaded.")
# Only the first uploaded file is processed; any others are ignored.
video_path = next(iter(uploaded))

# Load the pre-trained BLIP processor and captioning model from one checkpoint
BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-base"
processor = BlipProcessor.from_pretrained(BLIP_CHECKPOINT)
model = BlipForConditionalGeneration.from_pretrained(BLIP_CHECKPOINT)

# Function to generate a caption for a given image
def generate_caption(image):
    """Return a text caption for ``image`` using the module-level BLIP model.

    The image is run through the module-level ``processor`` to build the
    model inputs, ``model.generate`` produces output token ids, and the first
    generated sequence is decoded back to text with special tokens removed.

    Args:
        image: The image to caption (the caller in this file passes an RGB
            PIL image).

    Returns:
        The decoded caption string.
    """
    encoded = processor(images=image, return_tensors="pt")
    # Only the first generated sequence is used as the caption.
    first_sequence = model.generate(**encoded)[0]
    return processor.decode(first_sequence, skip_special_tokens=True)

# Function to extract frames from a video
def extract_frames(video_path, output_folder='frames', frame_interval=30):
    """Save every ``frame_interval``-th frame of a video as a JPEG file.

    Frames are written to ``{output_folder}/frame_{k}.jpg`` where ``k`` counts
    the saved frames from 0, so the files are numbered contiguously and the
    return value tells the caller exactly which indices exist.

    Args:
        video_path: Path of the video file to read.
        output_folder: Directory that receives the JPEG files; created if it
            does not exist.
        frame_interval: Keep one frame out of every ``frame_interval`` frames
            read, starting with the first. Must be positive.

    Returns:
        The number of frames written. This is 0 when no frame could be read
        from ``video_path``.

    Raises:
        ValueError: If ``frame_interval`` is not positive.
        OSError: If a selected frame could not be written to disk.
    """
    # Validate before touching the filesystem or opening the video; a zero
    # interval would otherwise surface as a ZeroDivisionError mid-loop.
    if frame_interval <= 0:
        raise ValueError(
            f"frame_interval must be a positive number, got {frame_interval!r}"
        )

    os.makedirs(output_folder, exist_ok=True)
    cap = cv2.VideoCapture(video_path)
    frame_count = 0
    saved_frame_count = 0

    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                # No further frame could be read: end of the loop.
                break

            if frame_count % frame_interval == 0:
                frame_path = f'{output_folder}/frame_{saved_frame_count}.jpg'
                # cv2.imwrite reports failure through its return value, so
                # check it rather than counting a frame that was not saved.
                if not cv2.imwrite(frame_path, frame):
                    raise OSError(f"Could not write frame to {frame_path}")
                saved_frame_count += 1

            frame_count += 1
    finally:
        # Release the capture even if reading or writing raised.
        cap.release()

    return saved_frame_count

# Split the uploaded video into JPEG frames (default folder and interval)
frame_count = extract_frames(video_path)
print(f"Extracted {frame_count} frames.")

# Caption every extracted frame, keyed by the frame's file path
captions = {}
frame_paths = [f'frames/frame_{idx}.jpg' for idx in range(frame_count)]
for idx, frame_path in enumerate(frame_paths):
    rgb_frame = Image.open(frame_path).convert("RGB")
    captions[frame_path] = generate_caption(rgb_frame)
    print(f"Frame {idx}: {captions[frame_path]}")

# Write the path -> caption mapping to captions.json
with open('captions.json', 'w') as json_file:
    json_file.write(json.dumps(captions))

print("Captions saved to captions.json.")

# Hand the JSON file to the browser as a download
files.download('captions.json')

# Optionally, bundle the extracted frames into frames.zip and download it
import shutil
shutil.make_archive(base_name='frames', format='zip', root_dir='frames')
files.download('frames.zip')




Saving video.mp4 to video.mp4


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

Extracted 8 frames.




Frame 0: hot air balloons in the sky
Frame 1: hot air balloons in cappada
Frame 2: hot air balloons in cappada
Frame 3: a group of people standing on top of a mountain
Frame 4: a group of hot air balloons flying over a mountain
Frame 5: a group of people standing on a hill
Frame 6: a group of people standing on a hill
Frame 7: hot air balloons in the sky
Captions saved to captions.json.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>