In [1]:
# Mount Google Drive at /content/drive; the data paths used by later cells live under it.

from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [3]:
!pip install torch torchvision transformers datasets tqdm
!pip install torch torchvision transformers opencv-python tqdm


Collecting datasets
  Downloading datasets-2.19.2-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.1/542.1 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0

In [4]:
import pandas as pd
import os
import cv2
from tqdm import tqdm

# Locations on the mounted Drive: the source videos and the folder that receives extracted frames.
video_folder = "/content/drive/MyDrive/AIP/MovieData/videos_1000"
frame_folder = "/content/drive/MyDrive/AIP/MovieData/frame"

# Make sure the frame folder exists; an already-existing folder is not an error.
os.makedirs(frame_folder, exist_ok=True)

# Load the CSV that lists the videos (its 'video_id' column drives the extraction loop).
video_list_csv = "/content/drive/MyDrive/AIP/MovieData/fixed_downloaded_videos.csv"
df = pd.read_csv(video_list_csv)

# Extract frames from videos
def extract_frames(video_path, frame_rate=1):
    """Decode a video and return every ``frame_rate``-th frame.

    Args:
        video_path: Path of the video file handed to ``cv2.VideoCapture``.
        frame_rate: Sampling stride in frames. A frame is kept when its zero-based
            index is a multiple of this value, so ``1`` keeps every frame.

    Returns:
        list: The kept frames in decode order. Empty when the capture does not
        open or yields no frames.

    Raises:
        ValueError: If ``frame_rate`` is zero or negative.
    """
    # A zero stride would otherwise surface as ZeroDivisionError in the middle of
    # decoding, after the capture has already been opened.
    if frame_rate <= 0:
        raise ValueError(f"frame_rate must be a positive number of frames, got {frame_rate!r}")

    video_cap = cv2.VideoCapture(video_path)
    frames = []
    try:
        count = 0
        while video_cap.isOpened():
            ret, frame = video_cap.read()
            if not ret:
                # End of stream (or a decode failure): stop reading.
                break
            if count % frame_rate == 0:
                frames.append(frame)
            count += 1
    finally:
        # Release the capture even if reading raised.
        video_cap.release()
    return frames

# Extract frames for every video listed in the CSV and write them to frame_folder as
# "<video_id>_frame_<i>.jpg". Videos without a matching .mp4 and frames that fail to
# write are counted and reported instead of being skipped silently.
missing_video_ids = []
failed_frame_writes = 0

for _, row in tqdm(df.iterrows(), total=df.shape[0]):
    video_id = row['video_id']
    video_path = os.path.join(video_folder, f"{video_id}.mp4")
    if not os.path.exists(video_path):
        missing_video_ids.append(video_id)
        continue
    for i, frame in enumerate(extract_frames(video_path)):
        frame_path = os.path.join(frame_folder, f"{video_id}_frame_{i}.jpg")
        # cv2.imwrite reports an unsuccessful save through its boolean return value.
        if not cv2.imwrite(frame_path, frame):
            failed_frame_writes += 1

if missing_video_ids:
    print(
        f"Skipped {len(missing_video_ids)} of {df.shape[0]} videos: no .mp4 found in "
        f"{video_folder} (first missing ids: {missing_video_ids[:5]})"
    )
if failed_frame_writes:
    print(f"{failed_frame_writes} frames could not be written to {frame_folder}")


100%|██████████| 1000/1000 [00:00<00:00, 11426.41it/s]


In [5]:
from transformers import BlipProcessor, BlipForConditionalGeneration
import torch
from PIL import Image

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the BLIP image-captioning processor and model from the same checkpoint,
# then move the model to the selected device.
blip_checkpoint = "Salesforce/blip-image-captioning-base"
processor = BlipProcessor.from_pretrained(blip_checkpoint)
model = BlipForConditionalGeneration.from_pretrained(blip_checkpoint)
model.to(device)

def generate_caption(image_path):
    """Return the generated caption for the image stored at ``image_path``.

    The image is opened with PIL and converted to RGB, passed through the
    module-level ``processor`` (tensors moved to ``device``), and fed to
    ``model.generate``. The first generated sequence is decoded with special
    tokens removed and returned as a string.
    """
    rgb_image = Image.open(image_path).convert('RGB')
    model_inputs = processor(rgb_image, return_tensors="pt").to(device)
    generated_ids = model.generate(**model_inputs)
    return processor.decode(generated_ids[0], skip_special_tokens=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

In [6]:
# Caption every extracted frame and build one paragraph per video.
#
# os.listdir returns names in arbitrary order, so the frames are first ordered by
# (video id, numeric frame index). Without this the joined paragraph does not follow
# the video's timeline, and a plain string sort would place frame_10 before frame_2.
ordered_frames = []
for frame_file in os.listdir(frame_folder):
    if not frame_file.endswith('.jpg'):
        continue
    # Names look like "<video_id>_frame_<i>.jpg"; split on the LAST separator so a
    # video id that itself contains "_frame_" stays intact.
    video_id, separator, tail = frame_file.rpartition('_frame_')
    frame_number = os.path.splitext(tail)[0]
    if separator and frame_number.isdecimal():
        ordered_frames.append((video_id, int(frame_number), frame_file))
    else:
        # Name does not follow the pattern: fall back to the text before the first
        # "_frame_" (or the whole name) as the id, and sort it ahead of numbered frames.
        ordered_frames.append((frame_file.split('_frame_')[0], -1, frame_file))
ordered_frames.sort()

frame_captions = {}
for video_id, _, frame_file in tqdm(ordered_frames):
    frame_path = os.path.join(frame_folder, frame_file)
    frame_captions.setdefault(video_id, []).append(generate_caption(frame_path))

# Combine captions for each video into a single paragraph
captions = {vid: ' '.join(parts) for vid, parts in frame_captions.items()}

if not captions:
    print(f"No .jpg frames found in {frame_folder}; the CSV written below will have no rows.")

# Save the captions to a new CSV file (relative path: written to the current working directory)
output_df = pd.DataFrame(list(captions.items()), columns=['video_id', 'generated_caption'])
output_df.to_csv('generated_captions.csv', index=False)


0it [00:00, ?it/s]


In [7]:
def extract_frames_from_video(video_path, frame_rate=1):
    """Return every ``frame_rate``-th frame of the video at ``video_path``.

    This name is kept for the cells that call it, but the work is delegated to
    ``extract_frames`` (defined in the earlier frame-extraction cell), whose body
    this function previously duplicated line for line.

    Args:
        video_path: Path of the video file to decode.
        frame_rate: Sampling stride in frames; ``1`` keeps every frame.

    Returns:
        list: The kept frames in decode order, as returned by ``extract_frames``.
    """
    return extract_frames(video_path, frame_rate)

new_video_folder = '/content/drive/MyDrive/AIP/MovieData/test video'
# NOTE(review): this is the same directory as `frame_folder` used for the CSV-listed
# videos, so frames from both sets end up side by side and are captioned together
# by any cell that lists this folder — confirm that is intended.
new_frame_folder = '/content/drive/MyDrive/AIP/MovieData/frame'
os.makedirs(new_frame_folder, exist_ok=True)

# Write every frame of each .mp4 in new_video_folder as "<video name>_frame_<i>.jpg".
failed_frame_writes = 0
# sorted() makes the processing order reproducible; os.listdir order is arbitrary.
for video_file in sorted(os.listdir(new_video_folder)):
    if not video_file.endswith('.mp4'):
        continue
    video_stem = os.path.splitext(video_file)[0]
    video_path = os.path.join(new_video_folder, video_file)
    for i, frame in enumerate(extract_frames_from_video(video_path)):
        frame_path = os.path.join(new_frame_folder, f"{video_stem}_frame_{i}.jpg")
        # cv2.imwrite reports an unsuccessful save through its boolean return value.
        if not cv2.imwrite(frame_path, frame):
            failed_frame_writes += 1

if failed_frame_writes:
    print(f"{failed_frame_writes} frames could not be written to {new_frame_folder}")


In [8]:
# Caption every frame in new_frame_folder and build one paragraph per video.
#
# os.listdir returns names in arbitrary order, so the frames are first ordered by
# (video id, numeric frame index). Without this the joined paragraph does not follow
# the video's timeline, and a plain string sort would place frame_10 before frame_2.
new_ordered_frames = []
for frame_file in os.listdir(new_frame_folder):
    if not frame_file.endswith('.jpg'):
        continue
    # Names look like "<video_id>_frame_<i>.jpg"; split on the LAST separator so a
    # video id that itself contains "_frame_" stays intact.
    video_id, separator, tail = frame_file.rpartition('_frame_')
    frame_number = os.path.splitext(tail)[0]
    if separator and frame_number.isdecimal():
        new_ordered_frames.append((video_id, int(frame_number), frame_file))
    else:
        # Name does not follow the pattern: fall back to the text before the first
        # "_frame_" (or the whole name) as the id, and sort it ahead of numbered frames.
        new_ordered_frames.append((frame_file.split('_frame_')[0], -1, frame_file))
new_ordered_frames.sort()

new_frame_captions = {}
for video_id, _, frame_file in tqdm(new_ordered_frames):
    frame_path = os.path.join(new_frame_folder, frame_file)
    new_frame_captions.setdefault(video_id, []).append(generate_caption(frame_path))

# Combine captions for each video into a single paragraph
new_captions = {vid: ' '.join(parts) for vid, parts in new_frame_captions.items()}


100%|██████████| 1323/1323 [2:22:35<00:00,  6.47s/it]


In [9]:
 # Tabulate the per-video paragraphs: one row per video id.
caption_rows = list(new_captions.items())
output_new_captions_df = pd.DataFrame(caption_rows, columns=['video_id', 'generated_caption'])
# CSV export is currently disabled:
#output_new_captions_df.to_csv('new_generated_captions.csv', index=False)

# Show the first rows as a quick sanity check
print(output_new_captions_df.head())


                    video_id  \
0                     173710   
1                     330874   
2  855289-hd_1920_1080_25fps   

                                   generated_caption  
0  a river in the middle of a forest a river in t...  
1  autumn leaves in the forest autumn leaves in t...  
2  a squirrel is eating a nut in the leaves a squ...  
