## Download the ViCLIP weights and put the .pth file in the viclip folder (the code below loads it from `models/ViCLIP-L_InternVid-FLT-10M.pth`).


In [97]:
from pathlib import Path

import cv2
import polars as pl
import torch
from ipywidgets import HTML, GridspecLayout, Label, VBox, Video

from viclip import (
    _frame_from_video,
    frames2tensor,
    get_text_feat_dict,
    get_viclip,
    get_vid_feat,
    retrieve_text,
)

In [98]:
# Load the ViCLIP model (size key "l") from the local checkpoint file.
MODEL = get_viclip("l", "models/ViCLIP-L_InternVid-FLT-10M.pth")

# One row per .mp4 file found directly inside the "videos" directory.
data = pl.DataFrame({"path": list(map(str, Path("videos").glob("*.mp4")))})

In [213]:
def get_frames(video_path):
    """Decode a video file into a list of frames.

    Args:
        video_path: Path of the video file, handed to ``cv2.VideoCapture``.

    Returns:
        A list holding every item yielded by ``_frame_from_video`` for the
        opened capture.
    """
    video = cv2.VideoCapture(video_path)
    try:
        # Materialise all frames while the capture is still open.
        return list(_frame_from_video(video))
    finally:
        # Always free the underlying decoder/file handle, even if decoding
        # raises part-way through.
        video.release()


def probs(video_path, model=MODEL):
    """Describe how strongly each candidate caption matches a video.

    Args:
        video_path: Path of the video to evaluate.
        model: Model object forwarded unchanged to ``retrieve_text``.

    Returns:
        A list of strings of the form ``"[<prob>]: <short name>"``, one per
        caption, in the order ``retrieve_text`` returns them.
    """
    # Full caption -> short display name. Captions appear with and without a
    # trailing period so both spellings are scored.
    captions = {
        "A stick model of a dog actively running in grass": "running",
        "A stick model of a dog actively running in grass.": "running.",
        # "A simulated stick model of a dog actively running towards the right in grass": "running-right",
        # "A simulated stick model of a dog actively running towards the left in grass": "running-left",
        # "A simulated stick model of a dog standing still in grass": "standing",
        "A stick model of a dog doing weird things in grass": "weird things",
        "A stick model of a dog doing weird things in grass.": "weird things.",
        "A stick model of a dog trying to move in grass but failing": "failing",
        "A stick model of a dog trying to move in grass but failing.": "failing.",
    }

    clip_frames = get_frames(video_path)
    # topk equals the caption count, so every caption is requested.
    matched_texts, matched_probs = retrieve_text(
        clip_frames, list(captions), model, topk=len(captions)
    )

    return [
        f"[{score:.2f}]: {captions[text]}"
        for text, score in zip(matched_texts, matched_probs)
    ]


def similarity_score(video_path, model=MODEL):
    """Rank candidate captions by cosine similarity to a video's embedding.

    Args:
        video_path: Path of the video to evaluate.
        model: Mapping that provides the ``"viclip"`` model and its
            ``"tokenizer"``.

    Returns:
        A list of strings of the form ``"[sim <value>]: <short name>"``,
        sorted from the highest similarity to the lowest.
    """
    # Full caption -> short display name. Captions appear with and without a
    # trailing period so both spellings are scored.
    labels = {
        "A stick model of a dog actively running in grass": "running",
        "A stick model of a dog actively running in grass.": "running.",
        # "A simulated stick model of a dog actively running towards the right in grass": "running-right",
        # "A simulated stick model of a dog actively running towards the left in grass": "running-left",
        # "A simulated stick model of a dog standing still in grass": "standing",
        "A stick model of a dog doing weird things in grass": "weird things",
        "A stick model of a dog doing weird things in grass.": "weird things.",
        "A stick model of a dog trying to move in grass but failing": "failing",
        "A stick model of a dog trying to move in grass but failing.": "failing.",
    }

    device = torch.device("cuda")
    frames = get_frames(video_path)

    clip, tokenizer = model["viclip"], model["tokenizer"]
    clip = clip.to(device)

    # The video embedding does not depend on the caption, so compute it once
    # instead of once per label.
    v = get_vid_feat(frames2tensor(frames, device=device), clip)

    results = []
    for label, desc in labels.items():
        t = get_text_feat_dict([label], clip, tokenizer)[label]
        val = torch.nn.functional.cosine_similarity(v, t).item()
        results.append((val, desc))

    # Tuples sort by similarity first; ties fall back to the short name.
    return [f"[sim {val:.2f}]: {desc}" for val, desc in sorted(results, reverse=True)]


def projection_score(
    video_path,
    model=MODEL,
    reference_path="videos/4059f863-279e-41dc-8b34-48422b64c832.mp4",
):
    """Project a video's embedding onto a "still -> reference" direction.

    The direction is the unit vector from the embedding of a static clip
    (the video's first frame repeated 8 times) to the embedding of the
    reference video. The score is the dot product of the video's own
    embedding with that direction.

    Args:
        video_path: Path of the video to evaluate.
        model: Mapping that provides the ``"viclip"`` model (the tokenizer is
            not needed because no text is embedded here).
        reference_path: Path of the video whose embedding marks the far end
            of the direction. Defaults to the clip previously hard-coded in
            this function.

    Returns:
        A single-element list containing ``"running dir: <score>"``.
    """
    device = torch.device("cuda")
    frames = get_frames(video_path)

    clip = model["viclip"].to(device)

    # Static baseline: the first frame repeated 8 times.
    v_0 = get_vid_feat(frames2tensor(frames[:1] * 8, device=device), clip)
    # Reference end-point of the direction.
    v_f = get_vid_feat(
        frames2tensor(get_frames(reference_path), device=device),
        clip,
    )

    v = get_vid_feat(frames2tensor(frames, device=device), clip)

    # Normalise so the projection is measured along a unit vector.
    direction = v_f - v_0
    direction = direction / direction.norm()
    return [f"running dir: {(v @ direction.T).item():.4f}"]

In [214]:
def video_widget(path):
    """Wrap a video file in an ``HTML`` widget.

    Args:
        path: Location of the video, inserted verbatim as the ``src`` of the
            ``<source>`` element.

    Returns:
        An ``HTML`` widget containing a 180-wide ``<video>`` tag with the
        ``autoplay`` and ``muted`` attributes set.
    """
    return HTML(
        f"""
    <video width="180" autoplay muted>
      <source src="{path}" type="video/mp4">
    </video>
    """
    )


def display_table(data, funcs):
    """Lay out videos and their scores in a grid, one row per video.

    Args:
        data: Table whose rows expose a ``"path"`` field (iterated with
            ``iter_rows(named=True)``).
        funcs: Callables that take a video path and return an iterable of
            strings; each one fills one column.

    Returns:
        A ``GridspecLayout`` with the video in column 0 and one column of
        stacked labels per function.
    """
    table = GridspecLayout(len(data), len(funcs) + 1)

    for row_idx, record in enumerate(data.iter_rows(named=True)):
        video_path = record["path"]
        table[row_idx, 0] = video_widget(video_path)

        # Columns 1..len(funcs) hold each function's output lines.
        for col_idx, score_fn in enumerate(funcs, start=1):
            # NOTE(review): confirm that `width=400` actually affects Label
            # rendering; it may need to go through `layout` instead.
            lines = [Label(text, width=400) for text in score_fn(video_path)]
            table[row_idx, col_idx] = VBox(lines)
    return table

In [212]:
# Render the grid: one row per video, one column per scoring function.
display_table(data, [probs, similarity_score, projection_score])



GridspecLayout(children=(HTML(value='\n    <video width="180" autoplay muted>\n      <source src="videos/1eb2b…