In [34]:
# Importing useful dependencies
import io
import os
import boto3
import chromadb
import torch
from transformers import CLIPVisionModelWithProjection, CLIPImageProcessor
from PIL import Image
import imageio
import numpy as np

In [35]:
# Setup S3 client for MinIO (MinIO implements Amazon S3 API)
s3 = boto3.client(
    "s3",
    endpoint_url="http://127.0.0.1:9000", # MinIO API endpoint
    aws_access_key_id="minioadmin", # User name
    aws_secret_access_key="minioadmin", # Password
)

In [36]:
# Connect to the server (Docker Container)
client = chromadb.HttpClient(host="localhost", port=8000)
# Although we set a path for persistent directory when defining the Docker Container
# It actually stores the embeddings inside the container

# We can use the following line to remove all the stored data in a collection
#client.delete_collection(name="texts")

# Create or get the collection named "texts"
collection = client.create_collection(name="videos", get_or_create=True, embedding_function=None)

In [37]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_name = "Searchium-ai/clip4clip-webvid150k"
tokenizer = CLIPImageProcessor.from_pretrained(model_name)
model = CLIPVisionModelWithProjection.from_pretrained(model_name)
model.to(device)
model.eval()

CLIPVisionModelWithProjection(
  (vision_model): CLIPVisionTransformer(
    (embeddings): CLIPVisionEmbeddings(
      (patch_embedding): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
      (position_embedding): Embedding(50, 768)
    )
    (pre_layrnorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=768, out_features=3072, bias=

In [38]:

temp_file = "temp_video_in.mp4"
def get_video(bucket, key, max_frames = 16):
    resp = s3.get_object(Bucket=bucket, Key=key)
    body = resp["Body"].read()
    with open(temp_file, "wb") as f:
        f.write(body)
    frames = []
    reader = imageio.get_reader(temp_file, format="ffmpeg")
    total_frames = reader.count_frames()
    if total_frames and total_frames > 0:
        step = max(1, total_frames // max_frames)
        print(total_frames, step, max_frames)
        idxs = list(range(0, total_frames, step))[:max_frames]
        for i in idxs:
            try:
                frame = reader.get_data(i)
                frames.append(Image.fromarray(frame))
            except Exception:
                continue
    else:
        # fallback: iterate and collect up to max_frames
        for i, frame in enumerate(reader):
            frames.append(Image.fromarray(frame))
            if len(frames) >= max_frames:
                break
    reader.close()
    return frames

In [39]:
def videos_to_embeddings(bucket, prefix, collection):
    id_counter = 0

    paginator = s3.get_paginator("list_objects_v2")
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        paths = []
        embeddings = []
        ids = []

        for obj in page.get("Contents", []):
            key = obj["Key"]

            if obj['Size'] == 0 and key.endswith("/"):
                continue

            id_counter += 1

            frames = get_video(bucket=bucket, key=key)
            if not frames:
                continue

            inputs = tokenizer(images=frames, return_tensors="pt")
            pixel_values = inputs["pixel_values"].to(device)  # shape (num_frames, 3, H, W)

            with torch.no_grad():
                outputs = model(pixel_values=pixel_values)
                # prefer pooler_output if available, else mean over last_hidden_state
                emb_frames = getattr(outputs, "pooler_output", None)
                if emb_frames is None:
                    emb_frames = outputs.last_hidden_state.mean(dim=1)
                # average frame embeddings to make video embedding
                video_emb = emb_frames.mean(dim=0).cpu().numpy()
                # normalize
                norm = np.linalg.norm(video_emb)
                if norm > 0:
                    video_emb = video_emb / norm

            paths.append(f"{bucket}/{key}")
            embeddings.append(video_emb.tolist())
            ids.append(f"vid_{id_counter}")

            print(f"Created embedding for {key} ({len(embeddings)} items in current batch).")

        # upsert into chroma collection
        collection.add(
            ids=ids,
            documents=paths,
            embeddings=embeddings
        )
        
        print(f"All embeddings in the current batch are store successfully in the collection {collection.name}.")
    os.remove(temp_file)

In [40]:
videos_to_embeddings(bucket = "trusted-zone", prefix = "videos/", collection = collection)

3582 223 16
Created embedding for videos/video_1758900500520.mp4 (1 items in current batch).
1980 123 16
Created embedding for videos/video_1758900500636.mp4 (2 items in current batch).
1886 117 16
Created embedding for videos/video_1758900500710.mp4 (3 items in current batch).
All embeddings in the current batch are store successfully in the collection videos.


In [41]:
def print_stored_embeddings(collection, x=None): # x is the maximum number of files to print
    results = collection.get(include=["documents", "embeddings"])
    for i in range(len(results["documents"])):
        print("ID:", results['ids'][i])
        print("Document:", results["documents"][i])
        print("Embedding (first 5 dims):", results["embeddings"][i][:5])
        print("---")
        if x and (x-1) == i:
            break
        
# We can use this function to print the embeddings stored in chromaDB
print_stored_embeddings(collection, x = 10)

ID: vid_1
Document: trusted-zone/videos/video_1758900500520.mp4
Embedding (first 5 dims): [ 0.00670266  0.03183165 -0.01030848 -0.04513294  0.01144601]
---
ID: vid_2
Document: trusted-zone/videos/video_1758900500636.mp4
Embedding (first 5 dims): [-0.00259666  0.03752612 -0.03215834 -0.03951677  0.01091534]
---
ID: vid_3
Document: trusted-zone/videos/video_1758900500710.mp4
Embedding (first 5 dims): [ 0.00654727  0.03116178 -0.0128628  -0.04546758  0.00775671]
---
