In [9]:
from google.colab import drive
drive.mount('/content/drive')
import os
import sys

%cd /content/drive/MyDrive/CSC2508/

if 'google.colab' in sys.modules:
    if not os.path.isdir("/content/drive/MyDrive/CSC2508/CSC2508_final_project"):
        !git clone --recurse-submodule https://github.com/BenAgro314/CSC2508_final_project.git
        %cd /content/drive/MyDrive/CSC2508/CSC2508_final_project
    else:
        %cd /content/drive/MyDrive/CSC2508/CSC2508_final_project
        !git pull  --recurse-submodules

    # install and clone requirments
    !pip3 install transformers==4.16.0 timm==0.4.12 fairscale==0.4.4
    !pip install rank_bm25

from primitives.document import Document
from primitives.corpus import Corpus
from primitives.caption import Caption
from utils.video_to_images import load_video_into_images
import torch
from pathlib import Path
import time
import tqdm
import cv2
from rank_bm25 import BM25Okapi
import numpy as np
# Make the BLIP checkout importable. Guarded so that re-running this cell does
# not keep appending duplicate entries to sys.path.
BLIP_DIR = "/content/drive/MyDrive/CSC2508/CSC2508_final_project/BLIP"
if BLIP_DIR not in sys.path:
    sys.path.append(BLIP_DIR)

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/CSC2508
/content/drive/MyDrive/CSC2508/CSC2508_final_project
Fetching submodule BLIP
Already up to date.


In [2]:
# Build the pretrained BLIP caption decoder used to describe video frames.
from models.blip import blip_decoder

# Image size handed to the model here and to the frame loader later on.
image_size = 384

# Checkpoint that blip_decoder loads the pretrained weights from.
model_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth'

# The med_config path is relative, so the working directory must be the
# project root when this cell runs.
model = blip_decoder(
    pretrained=model_url,
    image_size=image_size,
    vit='base',
    med_config="BLIP/configs/med_config.json",
)

# Inference only: place the model on the selected device in eval mode.
model = model.to(device).eval()


reshape position embedding from 196 to 576
load checkpoint from https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth


In [7]:
# Preprocess videos into text: caption every video that does not yet have a
# saved document, then load all saved documents into a corpus.

video_path = "/content/drive/MyDrive/CSC2508/videos/"
doc_path = "/content/drive/MyDrive/CSC2508/documents/"

# Map file stem -> file name so videos and documents can be matched by name.
videos = {Path(v).stem: v for v in os.listdir(video_path)}
docs = {Path(d).stem: d for d in os.listdir(doc_path)}

# Every saved document must belong to a video. Comparing the name sets (rather
# than only the counts) also catches a stray document when there happen to be
# at least as many videos as documents.
orphan_docs = set(docs.keys()).difference(set(videos.keys()))
assert not orphan_docs, f"Documents {orphan_docs} do not have corresponding videos"
unprocessed_vids = set(videos.keys()).difference(set(docs.keys()))
print(f"Unprocessed videos: {unprocessed_vids}")

for vid_name in unprocessed_vids:
    vid_path = os.path.join(video_path, videos[vid_name])
    print(f"Processing {vid_name}")

    # The capture is only used to read metadata; the frames themselves come
    # from load_video_into_images below. try/finally guarantees the handle is
    # released even when the file cannot be opened.
    cap = cv2.VideoCapture(vid_path)
    try:
        if not cap.isOpened():
            print(f"Could not open {vid_path}, skipping")
            continue
        fps = cap.get(cv2.CAP_PROP_FPS)
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    finally:
        cap.release()
    print(f"Video fps: {fps}, total frames: {total_frames}")

    # We want to read in videos at 2 fps, i.e. keep one frame every fps/2
    # frames. Clamp to 1 so a very low (or unreported) fps never yields a
    # step of 0.
    subsample_rate = max(1, int(round(fps * 0.5)))

    document = Document(name=vid_name, video_path=vid_path, fps=fps)

    # Inference only, so gradient tracking is disabled.
    with torch.no_grad():
        # The third item yielded by the loader is not needed here.
        for frame, img, _ in tqdm.tqdm(load_video_into_images(vid_path, image_size, device, subsample_rate)):
            # Beam search (3 beams, no sampling); keep the first returned caption.
            caption = model.generate(img, sample=False, num_beams=3, max_length=20, min_length=5)
            document.add_caption(frame=frame, caption=Caption(caption[0]))

    print(document)
    document.save(doc_path)

corpus = Corpus(doc_path)
print(corpus)

Unprocessed videos: {'People Swimming with Whale Shark'}
Processing People Swimming with Whale Shark
Video fps: 29.97002997002997, total frames: 461


31it [00:44,  1.43s/it]

0.000               a whale swimming in the ocean
0.501               a whale swimming in the ocean
1.001               a whale swimming in the ocean
1.502               a whale swimming in the ocean
2.002               a whale swimming in the ocean
2.502               two people swimming with a whale
3.003               a whale and a man swimming in the ocean
3.504               two people swimming with a whale
4.004               a whale swimming in the ocean
4.505               a whale and its baby swimming in the ocean
5.005               a whale swimming in the ocean
5.506               a whale swimming in the ocean
6.006               a whale swimming in the ocean
6.506               a whale and a man swimming in the ocean
7.007               a whale swimming in the ocean
7.508               a whale and a whale in the ocean
8.008               a whale and a whale in the ocean
8.508               a whale and a penguin swimming in the ocean
9.009               a whale swimming in t




In [12]:
# Two-stage BM25 retrieval: pick the best-matching document for the query,
# then the best-matching caption inside that document.
query = "frog"

# Whitespace tokenisation. split() without an argument drops the empty tokens
# that split(" ") would produce for repeated, leading or trailing spaces.
tokenized_query = query.split()

# Stage 1: rank whole documents.
corpus_bm25 = BM25Okapi(corpus.tokenize_documents())
doc_scores = corpus_bm25.get_scores(tokenized_query)
if not np.any(doc_scores):
    # argmax over all-zero scores returns index 0, so say so explicitly.
    print(f"All document scores are zero for query {query!r}; defaulting to the first document")
selected_doc = corpus[np.argmax(doc_scores)]

# Stage 2: rank the captions of the selected document.
doc_bm25 = BM25Okapi(selected_doc.tokenize_captions())
caption_scores = doc_bm25.get_scores(tokenized_query)
if not np.any(caption_scores):
    print(f"All caption scores are zero for query {query!r}; defaulting to the first caption")
selected_caption = selected_doc[np.argmax(caption_scores)]

# selected_caption[0] is the caption text; selected_caption[1] is the value
# that the preview cell divides by fps to get a start time.
print(selected_doc.name, "|", selected_caption[0], "|", selected_caption[1])

costa_rica_cropped | a green tree frog | 1095


In [None]:
from IPython.display import HTML
from base64 import b64encode

# Embed the whole video file in the page as a base64 data URL. The `with`
# block closes the file handle as soon as the bytes have been read.
# Note: the entire file is held in memory and in the cell output.
with open(selected_doc.video_path, 'rb') as video_file:
    mp4 = video_file.read()
data_url = "data:video/mp4;base64," + b64encode(mp4).decode()

# Convert the selected caption's position to seconds using the document's fps
# and preview a 5-second window starting there (media fragment `#t=start,end`).
start_time = selected_caption[1] / selected_doc.fps
end_time = start_time + 5
HTML(f"""
<video width=400 controls autoplay>
      <source src="{data_url}#t={start_time},{end_time}" type="video/mp4">
</video>
""")