In [1]:
import cv2
import torch
import os
import csv
from transformers import BlipProcessor, BlipForConditionalGeneration
from concurrent.futures import ThreadPoolExecutor
import glob
from datetime import datetime


  from .autonotebook import tqdm as notebook_tqdm


In [2]:

# Load the pretrained BLIP captioning checkpoint once at module level; the
# resulting `processor` and `model` globals are used by
# downsample_and_caption_video for every sampled frame.
BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-base"
processor = BlipProcessor.from_pretrained(BLIP_CHECKPOINT)
model = BlipForConditionalGeneration.from_pretrained(BLIP_CHECKPOINT)

def downsample_and_caption_video(video_path, output_frame_rate=3, base_img_folder='img', base_csv_folder='output_csv'):
    """Sample frames from a video, caption each one with BLIP, and log the results.

    Every ``frame_interval``-th frame (derived from the video's reported FPS and
    ``output_frame_rate``) is saved as a JPEG under
    ``<base_img_folder>/<video_name>_<timestamp>/`` and its caption is appended
    to ``<base_csv_folder>/<video_name>_<timestamp>.csv`` and printed.

    Args:
        video_path: Path of the video file to read with ``cv2.VideoCapture``.
        output_frame_rate: Target number of sampled frames per second of video.
            Must be positive.
        base_img_folder: Parent folder for the per-video image folder.
        base_csv_folder: Folder that receives the per-video CSV file.

    Returns:
        None. Results are written to disk and printed.

    Raises:
        ValueError: If ``output_frame_rate`` is not positive.

    Relies on the module-level ``processor`` and ``model`` globals.
    """
    if output_frame_rate <= 0:
        raise ValueError(f"output_frame_rate must be positive, got {output_frame_rate!r}")

    # splitext only strips the final extension, so "clip.v2.mp4" stays
    # "clip.v2" instead of collapsing to "clip" (which made distinct videos
    # share one CSV / image folder).
    video_name = os.path.splitext(os.path.basename(video_path))[0]
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    img_folder_video = os.path.join(base_img_folder, f"{video_name}_{timestamp}")
    output_csv_video = os.path.join(base_csv_folder, f"{video_name}_{timestamp}.csv")

    # exist_ok avoids the check-then-create race when several worker threads
    # try to create the shared output folder at the same time.
    os.makedirs(img_folder_video, exist_ok=True)
    os.makedirs(base_csv_folder, exist_ok=True)

    with open(output_csv_video, mode='w', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['Video Name', 'Frame Number', 'Caption'])

        cap = cv2.VideoCapture(video_path)
        try:
            # Retrieve the original frame rate of the video
            original_fps = cap.get(cv2.CAP_PROP_FPS)
            # Clamp to 1 so a source slower than the requested rate samples
            # every frame instead of dividing by zero in the modulo below.
            frame_interval = max(1, int(original_fps / output_frame_rate))

            frame_count = 0

            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    break

                if frame_count % frame_interval == 0:
                    # Convert the frame to RGB (OpenCV uses BGR by default)
                    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

                    inputs = processor(images=frame_rgb, return_tensors="pt")

                    with torch.no_grad():
                        caption_ids = model.generate(**inputs)
                        caption = processor.decode(caption_ids[0], skip_special_tokens=True)

                    # imwrite expects BGR, so the untouched frame is saved.
                    img_filename = os.path.join(img_folder_video, f'frame_{frame_count}.jpg')
                    cv2.imwrite(img_filename, frame)

                    writer.writerow([video_name, frame_count, caption])

                    print(f"{video_name} - Frame {frame_count}: {caption}")

                frame_count += 1
        finally:
            # Release the capture even if captioning or writing fails.
            # cv2.destroyAllWindows() is intentionally not called: this
            # function never opens a window, and the call raises on headless
            # OpenCV builds.
            cap.release()

def process_videos_in_parallel(folder_path):
    """Caption every ``*.mp4`` file directly inside ``folder_path`` using a thread pool.

    Each video is handed to ``downsample_and_caption_video`` with its default
    settings. A failure in one video does not stop the others; it is reported
    on stdout once the worker finishes.

    Args:
        folder_path: Folder that is searched (non-recursively) for ``.mp4`` files.

    Returns:
        None.
    """
    video_files = glob.glob(os.path.join(folder_path, '*.mp4'))
    with ThreadPoolExecutor() as executor:
        submitted = [
            (path, executor.submit(downsample_and_caption_video, path))
            for path in video_files
        ]
        # executor.map() only re-raises worker exceptions when its result
        # iterator is consumed, so the previous fire-and-forget call silently
        # dropped every failure. Collect each result explicitly instead.
        for path, future in submitted:
            try:
                future.result()
            except Exception as exc:
                print(f"Failed to process {path}: {exc!r}")

# Caption every .mp4 found in the 'stimuli' folder (path relative to the
# notebook's working directory).
STIMULI_FOLDER = 'stimuli'
process_videos_in_parallel(STIMULI_FOLDER)




test5 - Frame 0: a body of water
seg8 - Frame 0: a cat with a blue collar
seg9 - Frame 0: a man with a black shirt
seg14 - Frame 0: a horse standing in a field
seg4 - Frame 0: a fish swimming in the ocean
seg15 - Frame 0: a woman in a white coat
seg5 - Frame 0: a deer is standing in the grass
seg16 - Frame 0: a woman is using a laptop computer
test1 - Frame 0: a bear is swimming in the water
seg6 - Frame 0: a woman is holding a cell phone
test2 - Frame 0: a person skiing down a snowy slope
test3 - Frame 0: a person holding a puppy in their hands
seg2 - Frame 0: a group of people standing around hot air balloons
test4 - Frame 0: a group of soldiers standing outside of a building
seg18 - Frame 0: a person riding a motorcycle down a mountain road
seg17 - Frame 0: a woman is holding a baby's hand
seg1 - Frame 0: a woman is walking with a bear on the street
seg3 - Frame 0: a woman is standing in front of a large elephant
test5 - Frame 10: a body of water
seg11 - Frame 0: a man and woman are