In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive; the video directory used later in this
# notebook lives underneath this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import os

import cv2
import torch
from PIL import Image
from transformers import BitsAndBytesConfig, Blip2ForConditionalGeneration, Blip2Processor

In [3]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'

In [4]:
# One constant for the checkpoint so the processor and the model are always
# loaded from the same repository.
MODEL_ID = 'Salesforce/blip2-opt-2.7b'

processor = Blip2Processor.from_pretrained(MODEL_ID)

# Passing `load_in_8bit=True` straight to from_pretrained is deprecated (the
# library emits a warning about it on load). The 8-bit request is expressed
# through a BitsAndBytesConfig handed to `quantization_config` instead.
model = Blip2ForConditionalGeneration.from_pretrained(
    MODEL_ID,
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map='auto',
    torch_dtype=torch.float16,
)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json:   0%|          | 0.00/122k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/10.0G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

In [5]:
def extract_middle_frame(video_path):
  """Return the frame at the midpoint of a video as an RGB PIL image.

  Args:
    video_path: Path to a video file that OpenCV should open.

  Returns:
    A PIL image built from the decoded frame after BGR -> RGB conversion.

  Raises:
    RuntimeError: If the video cannot be opened, or if no frame could be
      read at the requested position.
  """
  cap = cv2.VideoCapture(video_path)
  try:
    # Report an unopened capture explicitly so a bad path is distinguishable
    # from a decode failure further down.
    if not cap.isOpened():
      raise RuntimeError(f'Failed to open video: {video_path}')
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    # Clamp so a zero or negative reported frame count never produces a
    # negative seek index; in that case the first frame is requested.
    cap.set(cv2.CAP_PROP_POS_FRAMES, max(total_frames, 0) // 2)
    ret, frame = cap.read()
  finally:
    # Always hand the capture back, including on the error paths above.
    cap.release()
  if not ret or frame is None:
    raise RuntimeError('Failed to extract middle frame')
  # OpenCV delivers frames in BGR channel order; PIL expects RGB.
  rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
  return Image.fromarray(rgb)

In [6]:
def generate_description(image):
  """Produce a text caption for one image using the notebook's model.

  Relies on the module-level `processor`, `model` and `device`.

  Args:
    image: The image to caption, passed to the processor as `images`.

  Returns:
    The first decoded sequence with special tokens skipped and surrounding
    whitespace stripped.
  """
  # Preprocess, then move the batch to `device` (float16 requested in the
  # same call).
  model_inputs = processor(images=image, return_tensors='pt')
  model_inputs = model_inputs.to(device, torch.float16)
  output_ids = model.generate(**model_inputs)
  decoded = processor.batch_decode(output_ids, skip_special_tokens=True)
  return decoded[0].strip()

In [9]:
# Directory that is listed for .avi clips to caption; it sits under the
# /content/drive mount point.
video_dir = '/content/drive/MyDrive/Akai_Space_Videos'

In [11]:
import re
def extract_video_number(filename):
    """Pull the numeric index out of a 'Video - <n>:' style filename.

    Args:
        filename: Name of the video file to inspect.

    Returns:
        The captured number as an int, or float('inf') when the name does not
        contain the 'Video - <n>:' pattern.
    """
    found = re.search(r'Video\s*-\s*(\d+):', filename)
    if found is None:
        return float('inf')
    return int(found.group(1))

# Collect the .avi clips (extension matched case-insensitively) and order them
# by the number embedded in the name. The filename itself is a secondary key:
# names without a number all key as infinity, and os.listdir() returns entries
# in arbitrary order, so the tie-break is what makes the ordering reproducible.
video_files = sorted(
    (f for f in os.listdir(video_dir) if f.lower().endswith('.avi')),
    key=lambda f: (extract_video_number(f), f),
)

In [13]:
# Caption every clip, continuing past individual failures. Results are kept in
# `captions` (filename -> caption) and errors in `failed_videos`
# (filename -> error text) so they can be inspected or saved after the loop
# instead of existing only in the printed output.
captions = {}
failed_videos = {}
for video_file in video_files:
    video_path = os.path.join(video_dir, video_file)
    try:
        image = extract_middle_frame(video_path)
        caption = generate_description(image)
        captions[video_file] = caption
        print(f"{video_file} ➜ {caption}")
    except Exception as e:
        # Best-effort batch: record and report the failure, then move on.
        failed_videos[video_file] = str(e)
        print(f"Failed to process {video_file}: {e}")

Video - 1: ApplyEyeMakeup.avi ➜ a woman is standing in front of a mirror
Video - 2: CuttingInKitchen.avi ➜ a picture of a man standing in front of a building
Video - 3: HandStandPushups.avi ➜ a red and white striped rug on the floor
Video - 4: PizzaTossing.avi ➜ a man is standing in front of a building with a camera
Video - 5: SoccerPenalty.avi ➜ a picture of a soccer field with a ball
Video - 6: Typing.avi ➜ a man is using a computer to play a game
Video - 7: UnevenBars.avi ➜ a video of a man walking down a street
Video - 8: WalkingWithDog.avi ➜ a computer screen showing a photo of a person
Video - 9: WritingOnBoard.avi ➜ a man standing in front of a window with a sign that says "the window is open"
Video - 10: YoYo.avi ➜ a man is standing in front of a tree
