## Combining Text Embeddings, Image Embeddings and Video Embeddings

The objective is the same as the one stated in the notebook "text_image_embeddings.ipynb".

In this case, we will use a model that creates embeddings for texts, images and videos.

In [60]:
# Importing useful dependencies
import io
import torch
import boto3
import imageio
import chromadb
import open_clip
import numpy as np
from PIL import Image
import torch.nn as nn
from io import BytesIO
import ipywidgets as widgets
from transformers import CLIPVisionModelWithProjection, CLIPImageProcessor, AutoTokenizer, CLIPTextModelWithProjection

In [2]:
# S3 client pointed at the local MinIO server (MinIO implements the Amazon S3 API)
_minio_settings = {
    "endpoint_url": "http://127.0.0.1:9000",  # MinIO API endpoint
    "aws_access_key_id": "minioadmin",  # User name
    "aws_secret_access_key": "minioadmin",  # Password
}
s3 = boto3.client("s3", **_minio_settings)

In [67]:
# Open an HTTP connection to the ChromaDB server (Docker container)
client = chromadb.HttpClient(port=8000, host="localhost")

# Fetch the "texts_images_videos" collection (created on first use); it holds the
# embeddings of texts, images and videos together.
# embedding_function=None: no embedding function is attached to the collection;
# the cells below pass precomputed vectors to it.
collection_texts_images_videos = client.create_collection(
    name="texts_images_videos",
    embedding_function=None,
    get_or_create=True,
)

In [68]:
# Run on the GPU when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Author's note: by default this model creates embeddings of size 512 for texts
# and images, while the video embeddings have size 768, so the dimensions must
# be homogenized further below.
model_name = "Searchium-ai/clip4clip-webvid150k"

# === Text encoder ===
text_tokenizer = AutoTokenizer.from_pretrained(model_name)
text_model = CLIPTextModelWithProjection.from_pretrained(model_name)

# === Image / video frame encoder ===
image_processor = CLIPImageProcessor.from_pretrained(model_name)
vision_model = CLIPVisionModelWithProjection.from_pretrained(model_name)

# Move both encoders to the selected device. The vision model goes last so that
# the notebook still displays its module summary as the cell output.
text_model.to(device)
vision_model.to(device)

CLIPVisionModelWithProjection(
  (vision_model): CLIPVisionTransformer(
    (embeddings): CLIPVisionEmbeddings(
      (patch_embedding): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
      (position_embedding): Embedding(50, 768)
    )
    (pre_layrnorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=768, out_features=3072, bias=

In [61]:
# Layer to project 512-d embeddings to 768-d.
#
# NOTE: this layer is never trained; it is a fixed random linear map. Its
# parameters are drawn from a dedicated, seeded generator so that the same map
# is rebuilt on every run. With an unseeded layer, vectors already stored in
# ChromaDB would not be comparable with vectors embedded after a kernel restart.
# (Reproducibility is per PyTorch build; persisting the weights to disk would be
# the stricter option.)
projection = nn.Linear(512, 768)

_projection_rng = torch.Generator().manual_seed(0)
# Same uniform range that nn.Linear's default initialisation uses: 1/sqrt(in_features)
_projection_bound = 512 ** -0.5
with torch.no_grad():
    projection.weight.uniform_(-_projection_bound, _projection_bound, generator=_projection_rng)
    projection.bias.uniform_(-_projection_bound, _projection_bound, generator=_projection_rng)

# The layer is only used for inference, so its parameters are frozen
projection.requires_grad_(False)
projection = projection.to(device)

### Creating Text Embeddings

In [69]:
# Helper to retrieve a text file from our bucket
def get_text(bucket, key):
    """Download object `key` from `bucket` and return its body decoded as UTF-8."""
    raw_bytes = s3.get_object(Bucket=bucket, Key=key)["Body"].read()
    return raw_bytes.decode("utf-8")

In [70]:
@torch.no_grad()
def embed_text(model, text: str, tokenizer=text_tokenizer):
    """Return the embedding of `text` as a NumPy vector.

    The text is tokenized (truncated to 77 tokens, CLIP's max token length),
    encoded with `model`, L2-normalized, passed through the global `projection`
    layer (512-d -> 768-d) and L2-normalized again. The first row of the
    resulting batch is returned.
    """
    def _unit(t):
        # Scale every row to unit L2 norm
        return t / t.norm(dim=-1, keepdim=True)

    tokens = tokenizer(
        text,
        padding=True,
        truncation=True,
        max_length=77,
        return_tensors="pt",
    ).to(device)

    embedding_512 = _unit(model(**tokens).text_embeds)

    # Project to 768-d
    embedding_768 = _unit(projection(embedding_512))
    return embedding_768.cpu().numpy()[0]

In [71]:
# The next function embeds the texts stored under a bucket prefix and stores the embeddings in the given ChromaDB collection
def texts_to_embeddings(src_bucket, collection, model, src_prefix=""):
    """Embed every text object under `src_prefix` and add it to `collection`.

    Args:
        src_bucket: Name of the bucket to read the text files from.
        collection: ChromaDB collection that receives the embeddings.
        model: Text encoder forwarded to `embed_text`.
        src_prefix: Only keys starting with this prefix are processed.

    Objects are listed page by page; each page is written to the collection in
    a single `add` call. Ids have the form ``text_<n>`` with `n` counting from 1
    across all pages, and the stored document is ``<bucket>/<key>``.
    """
    # Incremental id assigned to each embedding
    id_counter = 0

    paginator = s3.get_paginator("list_objects_v2")  # It returns objects in pages and not all at once.
    for page in paginator.paginate(Bucket=src_bucket, Prefix=src_prefix):

        # List of paths (meta_data)
        file_paths = []
        # List of embeddings
        embeddings = []
        # List of unique IDs for each embedding
        ids = []

        for obj in page.get("Contents", []):

            key = obj["Key"]

            if obj["Size"] == 0 and key.endswith("/"):  # skip the folder itself
                continue
            id_counter += 1

            # Fetch and decode the text file
            body = get_text(src_bucket, key)

            # Compute embedding
            vector = embed_text(model, body)  # A numerical vector of size 768 (after projection)

            print(f"Created embedding for {key} ({len(embeddings)} items in current batch).")

            # Storing data
            file_paths.append(f"{src_bucket}/{key}")
            embeddings.append(vector)
            ids.append(f"text_{id_counter}")

        # A page can yield nothing (empty prefix, or only the folder marker);
        # ChromaDB rejects an empty id list, so skip the write in that case.
        if not ids:
            continue

        # Store the texts of a page at once
        collection.add(
            ids=ids,
            documents=file_paths,
            embeddings=embeddings,
        )

        print(f"All embeddings in the current batch are store successfully in the collection {collection.name}.")

In [72]:
# Embed every text under "texts/" in the Trusted Zone and store it in the shared collection
texts_to_embeddings(
    src_bucket="trusted-zone",
    collection=collection_texts_images_videos,
    model=text_model,
    src_prefix="texts/",
)

Created embedding for texts/text_1760786400687.txt (0 items in current batch).
Created embedding for texts/text_1760786400752.txt (1 items in current batch).
Created embedding for texts/text_1760786400827.txt (2 items in current batch).
Created embedding for texts/text_1760786400902.txt (3 items in current batch).
Created embedding for texts/text_1760786400988.txt (4 items in current batch).
Created embedding for texts/text_1760786401058.txt (5 items in current batch).
Created embedding for texts/text_1760786401122.txt (6 items in current batch).
Created embedding for texts/text_1760786401209.txt (7 items in current batch).
Created embedding for texts/text_1760786401285.txt (8 items in current batch).
Created embedding for texts/text_1760786401393.txt (9 items in current batch).
Created embedding for texts/text_1760786401463.txt (10 items in current batch).
Created embedding for texts/text_1760786401534.txt (11 items in current batch).
Created embedding for texts/text_1760786401623.txt

### Creating Image Embeddings

In [73]:
# Helper to retrieve an image from our bucket in PIL Image format
def get_image(bucket, key):
    """Download object `key` from `bucket` and open it as a PIL Image."""
    raw_bytes = s3.get_object(Bucket=bucket, Key=key)["Body"].read()
    return Image.open(io.BytesIO(raw_bytes))

In [77]:
# Compute the embedding of a PIL Image
@torch.no_grad()
def embed_image(processor, model, pil_img):
    """Return the embedding of `pil_img` as a NumPy array.

    The image is preprocessed with `processor`, encoded with `model`,
    L2-normalized, passed through the global `projection` layer
    (512-d -> 768-d) and L2-normalized again. Singleton dimensions are
    squeezed out of the result.
    """
    def _unit(t):
        # Scale every row to unit L2 norm
        return t / t.norm(dim=-1, keepdim=True)

    pixel_inputs = processor(images=pil_img, return_tensors="pt").to(device)
    embedding_512 = _unit(model(**pixel_inputs).image_embeds)

    # Project to 768-d
    embedding_768 = _unit(projection(embedding_512))

    return embedding_768.cpu().numpy().squeeze()

In [78]:
# The next function embeds the images stored under a bucket prefix and stores the embeddings in the given ChromaDB collection
def images_to_embeddings(src_bucket, collection, preprocess, model, src_prefix=""):
    """Embed every image object under `src_prefix` and add it to `collection`.

    Args:
        src_bucket: Name of the bucket to read the images from.
        collection: ChromaDB collection that receives the embeddings.
        preprocess: Image processor forwarded to `embed_image`.
        model: Vision encoder forwarded to `embed_image`.
        src_prefix: Only keys starting with this prefix are processed.

    Objects are listed page by page; each page is written to the collection in
    a single `add` call. Ids have the form ``img_<n>`` with `n` counting from 1
    across all pages, and the stored document is ``<bucket>/<key>``.
    """
    # Incremental id assigned to each image embedding
    id_counter = 0

    paginator = s3.get_paginator("list_objects_v2")  # It returns objects in pages and not all at once.
    for page in paginator.paginate(Bucket=src_bucket, Prefix=src_prefix):

        # List of paths (meta_data)
        image_paths = []
        # List of embeddings
        embeddings = []
        # List of unique IDs for each embedding
        ids = []

        for obj in page.get("Contents", []):

            key = obj["Key"]

            if obj["Size"] == 0 and key.endswith("/"):  # skip the folder itself
                continue

            id_counter += 1

            # Download the image
            img = get_image(src_bucket, key)

            # Compute embedding
            vector = embed_image(preprocess, model, img)  # A numerical vector of size 768 (after projection)

            print(f"Created embedding for {key} ({len(embeddings)} items in current batch).")

            # Storing data
            image_paths.append(f"{src_bucket}/{key}")
            embeddings.append(vector)
            ids.append(f"img_{id_counter}")

        # A page can yield nothing (empty prefix, or only the folder marker);
        # ChromaDB rejects an empty id list, so skip the write in that case.
        if not ids:
            continue

        # Store the images of a page at once
        collection.add(
            ids=ids,
            documents=image_paths,
            embeddings=embeddings,
        )

        print(f"All embeddings in the current batch are store successfully in the collection {collection.name}.")

In [79]:
# Embed every image under "images/" in the Trusted Zone and store it in the shared collection
images_to_embeddings(
    src_bucket="trusted-zone",
    collection=collection_texts_images_videos,
    preprocess=image_processor,
    model=vision_model,
    src_prefix="images/",
)

Created embedding for images/image_1760786279860.png (0 items in current batch).
Created embedding for images/image_1760786279932.png (1 items in current batch).
Created embedding for images/image_1760786279989.png (2 items in current batch).
Created embedding for images/image_1760786280056.png (3 items in current batch).
Created embedding for images/image_1760786280131.png (4 items in current batch).
Created embedding for images/image_1760786280218.png (5 items in current batch).
Created embedding for images/image_1760786280278.png (6 items in current batch).
Created embedding for images/image_1760786280334.png (7 items in current batch).
Created embedding for images/image_1760786280394.png (8 items in current batch).
Created embedding for images/image_1760786280458.png (9 items in current batch).
Created embedding for images/image_1760786280522.png (10 items in current batch).
Created embedding for images/image_1760786280581.png (11 items in current batch).
Created embedding for imag

### Creating Video Embeddings

The video embeddings stored in the collection "videos" were generated with this same model, so we only need to copy them into this collection.

In [81]:
# Create or get the collection named "videos"
collection_videos = client.create_collection(name="videos", get_or_create=True, embedding_function=None)

# Fetch all embeddings of videos together with their documents
videos_data = collection_videos.get(include=["embeddings", "documents"])

# Copy them to our new collection. The "videos" collection may be empty (for
# instance when get_or_create has just created it), and ChromaDB rejects an
# empty id list, so only write when there is something to copy.
if len(videos_data["ids"]) > 0:
    collection_texts_images_videos.add(
        ids=videos_data["ids"],
        embeddings=videos_data["embeddings"],
        documents=videos_data["documents"],
    )
else:
    print('The collection "videos" is empty; nothing was copied.')