## Video Embeddings

In [1]:
# Importing useful dependencies
import io
import os
import boto3
import chromadb
import torch
from transformers import CLIPVisionModelWithProjection, CLIPImageProcessor
from PIL import Image
import imageio
import numpy as np


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.1.3 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "C:\Users\SakuraSnow\AppData\Local\Programs\Python\Python311\Lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "C:\Users\SakuraSnow\AppData\Local\Programs\Python\Python311\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "C:\Users\SakuraSnow\AppData\Local\Programs\Python\Python311\Lib\site-packages\ipykernel\

AttributeError: _ARRAY_API not found

In [2]:
# Setup S3 client for MinIO (MinIO implements Amazon S3 API)
s3 = boto3.client(
    "s3",
    endpoint_url="http://127.0.0.1:9000", # MinIO API endpoint
    aws_access_key_id="minioadmin", # User name
    aws_secret_access_key="minioadmin", # Password
)

In [3]:
# Connect to the server (Docker Container)
client = chromadb.HttpClient(host="localhost", port=8000)
# Although we set a path for persistent directory when defining the Docker Container
# It actually stores the embeddings inside the container

# We can use the following line to remove all the stored data in a collection
#client.delete_collection(name="texts")

# Create or get the collection named "texts"
collection = client.create_collection(name="videos", get_or_create=True, embedding_function=None)

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_name = "Searchium-ai/clip4clip-webvid150k"
tokenizer = CLIPImageProcessor.from_pretrained(model_name)
model = CLIPVisionModelWithProjection.from_pretrained(model_name)
model.to(device)
model.eval()

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json: 0.00B [00:00, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

CLIPVisionModelWithProjection(
  (vision_model): CLIPVisionTransformer(
    (embeddings): CLIPVisionEmbeddings(
      (patch_embedding): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
      (position_embedding): Embedding(50, 768)
    )
    (pre_layrnorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=768, out_features=3072, bias=

In [5]:
temp_file = "temp_video_in.mp4"
def get_video(bucket, key, max_frames = 16):
    resp = s3.get_object(Bucket=bucket, Key=key)
    body = resp["Body"].read()
    with open(temp_file, "wb") as f:
        f.write(body)
    frames = []
    reader = imageio.get_reader(temp_file, format="ffmpeg")
    total_frames = reader.count_frames()
    if total_frames and total_frames > 0:
        step = max(1, total_frames // max_frames)
        #print(total_frames, step, max_frames)
        idxs = list(range(0, total_frames, step))[:max_frames]
        for i in idxs:
            try:
                frame = reader.get_data(i)
                frames.append(Image.fromarray(frame))
            except Exception:
                continue
    else:
        # fallback: iterate and collect up to max_frames
        for i, frame in enumerate(reader):
            frames.append(Image.fromarray(frame))
            if len(frames) >= max_frames:
                break
    reader.close()
    return frames

In [6]:
def videos_to_embeddings(bucket, prefix, collection):
    id_counter = 0

    paginator = s3.get_paginator("list_objects_v2")
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        paths = []
        embeddings = []
        ids = []

        for obj in page.get("Contents", []):
            key = obj["Key"]

            if obj['Size'] == 0 and key.endswith("/"):
                continue

            id_counter += 1

            frames = get_video(bucket=bucket, key=key)
            if not frames:
                continue

            inputs = tokenizer(images=frames, return_tensors="pt")
            pixel_values = inputs["pixel_values"].to(device)  # shape (num_frames, 3, H, W)

            with torch.no_grad():
                outputs = model(pixel_values=pixel_values)
                # prefer pooler_output if available, else mean over last_hidden_state
                emb_frames = getattr(outputs, "pooler_output", None)
                if emb_frames is None:
                    emb_frames = outputs.last_hidden_state.mean(dim=1)
                # average frame embeddings to make video embedding
                video_emb = emb_frames.mean(dim=0).cpu().numpy()
                # normalize
                norm = np.linalg.norm(video_emb)
                if norm > 0:
                    video_emb = video_emb / norm

            paths.append(f"{bucket}/{key}")
            embeddings.append(video_emb.tolist())
            ids.append(f"vid_{id_counter}")

            print(f"Created embedding for {key} ({len(embeddings)} items in current batch).")

        # upsert into chroma collection
        collection.add(
            ids=ids,
            documents=paths,
            embeddings=embeddings
        )
        
        print(f"All embeddings in the current batch are store successfully in the collection {collection.name}.")
    os.remove(temp_file)

In [7]:
videos_to_embeddings(bucket = "trusted-zone", prefix = "videos/", collection = collection)

3582 223 16
Created embedding for videos/video_1760786503181.mp4 (1 items in current batch).
6059 378 16
Created embedding for videos/video_1760786503789.mp4 (2 items in current batch).
4800 300 16
Created embedding for videos/video_1760786504973.mp4 (3 items in current batch).
1517 94 16
Created embedding for videos/video_1760786506567.mp4 (4 items in current batch).
1649 103 16
Created embedding for videos/video_1760786507389.mp4 (5 items in current batch).
3224 201 16
Created embedding for videos/video_1760786507749.mp4 (6 items in current batch).
5845 365 16
Created embedding for videos/video_1760786509055.mp4 (7 items in current batch).
1772 110 16
Created embedding for videos/video_1760786510257.mp4 (8 items in current batch).
1800 112 16
Created embedding for videos/video_1760786511011.mp4 (9 items in current batch).
2285 142 16
Created embedding for videos/video_1760786511692.mp4 (10 items in current batch).
1980 123 16
Created embedding for videos/video_1760786512108.mp4 (11 i

In [8]:
def print_stored_embeddings(collection, x=None): # x is the maximum number of files to print
    results = collection.get(include=["documents", "embeddings"])
    for i in range(len(results["documents"])):
        print("ID:", results['ids'][i])
        print("Document:", results["documents"][i])
        print("Embedding (first 5 dims):", results["embeddings"][i][:5])
        print("---")
        if x and (x-1) == i:
            break
        
# We can use this function to print the embeddings stored in chromaDB
print_stored_embeddings(collection, x = 10)

ID: vid_1
Document: trusted-zone/videos/video_1760786503181.mp4
Embedding (first 5 dims): [ 0.00699654  0.03213104 -0.01002311 -0.04518802  0.0115661 ]
---
ID: vid_2
Document: trusted-zone/videos/video_1760786503789.mp4
Embedding (first 5 dims): [ 0.01091814  0.04181465 -0.02120226 -0.04388113 -0.00216616]
---
ID: vid_3
Document: trusted-zone/videos/video_1760786504973.mp4
Embedding (first 5 dims): [ 0.01047543  0.03237706 -0.01970521 -0.04911201  0.00637116]
---
ID: vid_4
Document: trusted-zone/videos/video_1760786506567.mp4
Embedding (first 5 dims): [ 0.00626602  0.03580121 -0.01199441 -0.04718189  0.01287883]
---
ID: vid_5
Document: trusted-zone/videos/video_1760786507389.mp4
Embedding (first 5 dims): [ 0.00306915  0.02716548 -0.02542129 -0.04034228  0.0046057 ]
---
ID: vid_6
Document: trusted-zone/videos/video_1760786507749.mp4
Embedding (first 5 dims): [ 0.00736225  0.03909471 -0.00081623 -0.04899862  0.00935894]
---
ID: vid_7
Document: trusted-zone/videos/video_1760786509055.mp4
