## Download the ViCLIP weights and place the .pth file at models/ViCLIP-L_InternVid-FLT-10M.pth (the path loaded below).


In [48]:
import os
from pathlib import Path

import cv2
import numpy as np
import polars as pl
import torch
from IPython.display import HTML, Video, display, display_html
from ipywidgets import GridspecLayout, Output

from viclip import (
    _frame_from_video,
    frames2tensor,
    get_text_feat_dict,
    get_viclip,
    get_vid_feat,
)

In [60]:
# Load the ViCLIP model once from the local checkpoint file; the functions
# defined in later cells use it as their default ``model`` argument.
# NOTE(review): "l" presumably selects the large variant — confirm against
# viclip.get_viclip.
MODEL = get_viclip("l", "models/ViCLIP-L_InternVid-FLT-10M.pth")

# Candidate text descriptions; these are passed as the ``labels`` that each
# video is scored against when the results grid is built.
GOALS = [
    "A stick model of a dog actively running in grass.",
    "A stick model of a dog trying to move in grass but failing.",
    "A stick model of a dog on a grey background actively running.",
    "A stick model of a dog on a grey background trying to move but failing.",
    "A stick model of a dog doing weird things in grass.",
    "A stick model of a pair of legs, attempting to walk.",
]

In [64]:
from viclip import retrieve_text


def get_label_probs(video_path, labels, model=MODEL):
    """Score ``labels`` against a video and format each with its score.

    Args:
        video_path: Path of the video file, handed to ``cv2.VideoCapture``.
        labels: Candidate text descriptions to score against the video.
        model: Model bundle forwarded unchanged to ``retrieve_text``;
            defaults to the module-level ``MODEL``.

    Returns:
        A list of strings of the form ``"[0.42]: <label text>"``, one per
        (text, probability) pair returned by ``retrieve_text``, in the order
        ``retrieve_text`` returns them.
    """
    video = cv2.VideoCapture(video_path)
    try:
        frames = list(_frame_from_video(video))
    finally:
        # Always give the capture handle back; this function is called once
        # per video, so leaked handles would otherwise pile up.
        video.release()

    # topk=len(labels) requests a score for every label, not just the best.
    texts, probs = retrieve_text(frames, labels, model, topk=len(labels))
    return [f"[{p:.2f}]: {t}" for t, p in zip(texts, probs)]


def get_similarity_score(video_path, labels, model=MODEL):
    """Return the cosine similarity between a video and each text label.

    The video is decoded and embedded once; only the text embedding is
    computed per label.

    Args:
        video_path: Path of the video file, handed to ``cv2.VideoCapture``.
        labels: Iterable of text descriptions to compare with the video.
        model: Mapping with a ``"viclip"`` model and a ``"tokenizer"``;
            defaults to the module-level ``MODEL``.

    Returns:
        A list of floats, one per label and in the same order as ``labels``.
        Empty when ``labels`` is empty (no video or GPU work is done then).
    """
    labels = list(labels)
    if not labels:
        return []

    # NOTE(review): the device is hard-coded to CUDA, as in the original
    # cell; this will fail on a machine without a GPU.
    device = torch.device("cuda")

    video = cv2.VideoCapture(video_path)
    try:
        frames = list(_frame_from_video(video))
    finally:
        # Release the capture handle even if frame extraction raises.
        video.release()

    clip, tokenizer = model["viclip"], model["tokenizer"]
    clip = clip.to(device)

    # The video embedding does not depend on the label, so compute it once
    # instead of re-decoding and re-encoding the video for every label.
    video_feat = get_vid_feat(frames2tensor(frames, device=device), clip)

    scores = []
    for label in labels:
        text_feat = get_text_feat_dict([label], clip, tokenizer)[label]
        similarity = torch.nn.functional.cosine_similarity(video_feat, text_feat)
        scores.append(similarity.item())
    return scores

In [65]:
# One row per .mp4 file found directly under ./videos. The paths are sorted
# because glob yields them in a filesystem-dependent order; sorting keeps the
# row order (and the grid built from it) reproducible across runs.
data = pl.DataFrame(
    {"path": sorted(str(p) for p in Path("videos").glob("*.mp4"))}
)
# Disabled: adds a "sim" column holding the per-label similarity scores for
# each video. Left commented out as in the original notebook.
# data = data.with_columns(
#     sim=pl.col("path").map_elements(
#         lambda p: get_similarity_score(video_path=p, labels=GOALS)
#     )
# )

In [66]:
# Two-column widget grid with one row per video:
# column 0 shows a small looping preview, column 1 the formatted label scores.
grid = GridspecLayout(len(data), 2)

for row_idx, record in enumerate(data.iter_rows(named=True)):
    video_path = record["path"]

    # Left cell: muted, auto-playing, looping video preview.
    preview_cell = Output()
    with preview_cell:
        display(
            Video(video_path, width=150, html_attributes="loop autoplay muted")
        )
    grid[row_idx, 0] = preview_cell

    # Scores are computed outside the Output context so anything the model
    # call emits is not captured into the widget.
    scores_html = "<br>".join(get_label_probs(video_path, GOALS))

    # Right cell: one line of HTML per label.
    scores_cell = Output()
    with scores_cell:
        display_html(scores_html, raw=True)
    grid[row_idx, 1] = scores_cell

# Last expression of the cell, so the notebook renders the grid.
grid

GridspecLayout(children=(Output(layout=Layout(grid_area='widget001')), Output(layout=Layout(grid_area='widget0…

In [107]:
import gc

import torch

# Free memory after the run: first collect unreachable Python objects, then
# ask PyTorch to release its unused cached CUDA memory.
gc.collect()
torch.cuda.empty_cache()