## Text Embeddings

In [1]:
# Importing useful dependencies
import boto3
import torch
import chromadb
import numpy as np
import torch.nn.functional as F
import open_clip

# Set a seed for reproducibility
np.random.seed(10721)
torch.manual_seed(10721)
torch.cuda.manual_seed_all(10721)


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.1.3 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "C:\Users\SakuraSnow\AppData\Local\Programs\Python\Python311\Lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "C:\Users\SakuraSnow\AppData\Local\Programs\Python\Python311\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "C:\Users\SakuraSnow\AppData\Local\Programs\Python\Python311\Lib\site-packages\ipykernel\

AttributeError: _ARRAY_API not found

In [2]:
# Setup S3 client for MinIO (MinIO implements Amazon S3 API)
s3 = boto3.client(
    "s3",
    endpoint_url="http://127.0.0.1:9000", # MinIO API endpoint
    aws_access_key_id="minioadmin", # User name
    aws_secret_access_key="minioadmin", # Password
)

In [3]:
# Connect to the server (Docker Container)
client = chromadb.HttpClient(host="localhost", port=8000)
# Although we set a path for persistent directory when defining the Docker Container
# It actually stores the embeddings inside the container

# We can use the following line to remove all the stored data in a collection
#client.delete_collection(name="texts")

# Create or get the collection named "texts"
collection = client.create_collection(name="texts", get_or_create=True, embedding_function=None)

In [4]:
# Function that prints the embeddings stored in a collection
def print_stored_embeddings(collection, x=None): # x is the maximum number of files to print
    results = collection.get(include=["documents", "embeddings"])
    for i in range(len(results["documents"])):
        print("ID:", results['ids'][i])
        print("Document:", results["documents"][i])
        print("Embedding (first 5 dims):", results["embeddings"][i][:5])
        print("---")
        if x and (x-1) == i:
            break

# We can use this function to print the embeddings stored in chromaDB
print_stored_embeddings(collection, x = 10)

ID: text_1
Document: trusted-zone/texts/text_1762966926656.txt
Embedding (first 5 dims): [0.00077022 0.01614387 0.00257997 0.07546198 0.01531463]
---
ID: text_2
Document: trusted-zone/texts/text_1762966926816.txt
Embedding (first 5 dims): [ 0.00948664  0.05548258 -0.03116568  0.07714096  0.00099774]
---
ID: text_3
Document: trusted-zone/texts/text_1762966926955.txt
Embedding (first 5 dims): [-0.00191807  0.1547042  -0.06087398  0.09676054  0.02398978]
---
ID: text_4
Document: trusted-zone/texts/text_1762966927095.txt
Embedding (first 5 dims): [-0.05385948  0.05721775 -0.01375611  0.10736154  0.01041584]
---
ID: text_5
Document: trusted-zone/texts/text_1762966927196.txt
Embedding (first 5 dims): [-0.04029422  0.05703993 -0.03652878  0.01447886  0.02742019]
---
ID: text_6
Document: trusted-zone/texts/text_1762966927290.txt
Embedding (first 5 dims): [-0.06599008  0.06625921 -0.01418489  0.09016113 -0.0051327 ]
---
ID: text_7
Document: trusted-zone/texts/text_1762966927370.txt
Embedding (f

**Creating Embeddings**

In [5]:
# Just in case our device has gpu
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load model
model, _, _ = open_clip.create_model_and_transforms("hf-hub:laion/CLIP-ViT-L-14-laion2B-s32B-b82K")
tokenizer = open_clip.get_tokenizer("hf-hub:laion/CLIP-ViT-L-14-laion2B-s32B-b82K") # Tokenizer for texts
model.to(device)

CLIP(
  (visual): VisionTransformer(
    (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14), bias=False)
    (patch_dropout): Identity()
    (ln_pre): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): ModuleList(
        (0-23): 24 x ResidualAttentionBlock(
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ls_1): Identity()
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): GELU(approximate='none')
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ls_2): Identity()
        )
      )
    )
    (ln_post): LayerNorm((1024,), eps=1e-05, elementwi

In [7]:
# ---- Show parameter counts ----
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"Model: CLIP-ViT-L-14")
print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")

Model: CLIP-ViT-L-14
Total parameters: 427,616,513
Trainable parameters: 427,616,513


In [10]:
# We can use this function to retrieve an text from our bucket
def get_text(bucket, key):
    resp = s3.get_object(Bucket=bucket, Key=key)
    body = resp["Body"].read()
    text = body.decode("utf-8")
    return text
@torch.no_grad()
# The next function returns the embedding of the given text
def embed_text(tokenizer, model, text):
    paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()]
    tokens = tokenizer(paragraphs).to(device)

    with torch.no_grad():
        feats = model.encode_text(tokens)
        feats = F.normalize(feats, dim=-1)

    all_embs = feats.detach().cpu().numpy()
    if len(all_embs) == 0:
        return np.zeros(model.config.hidden_size)
    elif len(all_embs) == 1:
        return all_embs[0]
    else:
        full_emb = np.mean(np.stack(all_embs), axis=0)
        full_emb = full_emb / np.linalg.norm(full_emb)
        return full_emb

In [11]:
# The next function stores the embeddings of the texts stored in the Trusted Zone and store them in the collection named 'texts' of our ChromaDB
def texts_to_embeddings(src_bucket, collection, model, tokenizer, src_prefix=""):

    # Incremental id assigned to each embedding
    id_counter = 0
    
    paginator = s3.get_paginator("list_objects_v2") # It returns objects in pages and not all at once.
    for page in paginator.paginate(Bucket=src_bucket, Prefix=src_prefix):

        # List of paths (meta_data)
        file_paths = []
        # List of embeddings
        embeddings = []
        # List of unique IDs for each embedding
        ids = []
        
        for obj in page.get("Contents", []):

            key = obj["Key"]

            if obj['Size'] == 0 and key.endswith("/"): # skip the folder itself
                continue

            id_counter += 1

            # Fetch and open the text file
            response = s3.get_object(Bucket=src_bucket, Key=key)
            body = response["Body"].read().decode("utf-8")
            
            # Compute embedding
            vector = embed_text(tokenizer, model, body) # A numerical vector of size 768

            print(f"Created embedding for {key} ({len(embeddings)} items in current batch).")

            # Storing data
            file_paths.append(f"{src_bucket}/{key}")
            embeddings.append(vector)
            ids.append(f"text_{id_counter}")

        # Store the images of a page at once
        collection.add(
                ids=ids,
                documents=file_paths,
                embeddings=embeddings
        )

        print(f"All embeddings in the current batch are store successfully in the collection {collection.name}.")

In [10]:
texts_to_embeddings(src_bucket = "trusted-zone", src_prefix = "texts/", collection = collection, model = model, tokenizer=tokenizer)

Created embedding for texts/text_1762966926656.txt (0 items in current batch).
Created embedding for texts/text_1762966926816.txt (1 items in current batch).
Created embedding for texts/text_1762966926955.txt (2 items in current batch).
Created embedding for texts/text_1762966927095.txt (3 items in current batch).
Created embedding for texts/text_1762966927196.txt (4 items in current batch).
Created embedding for texts/text_1762966927290.txt (5 items in current batch).
Created embedding for texts/text_1762966927370.txt (6 items in current batch).
Created embedding for texts/text_1762966927449.txt (7 items in current batch).
Created embedding for texts/text_1762966927532.txt (8 items in current batch).
Created embedding for texts/text_1762966927611.txt (9 items in current batch).
Created embedding for texts/text_1762966927710.txt (10 items in current batch).
Created embedding for texts/text_1762966927842.txt (11 items in current batch).
Created embedding for texts/text_1762966928008.txt

In [12]:
# Check the embeddings stored in chromaDB
print_stored_embeddings(collection)

ID: text_1
Document: trusted-zone/texts/text_1762966926656.txt
Embedding (first 5 dims): [0.00077022 0.01614387 0.00257997 0.07546198 0.01531463]
---
ID: text_2
Document: trusted-zone/texts/text_1762966926816.txt
Embedding (first 5 dims): [ 0.00948664  0.05548258 -0.03116568  0.07714096  0.00099774]
---
ID: text_3
Document: trusted-zone/texts/text_1762966926955.txt
Embedding (first 5 dims): [-0.00191807  0.1547042  -0.06087398  0.09676054  0.02398978]
---
ID: text_4
Document: trusted-zone/texts/text_1762966927095.txt
Embedding (first 5 dims): [-0.05385948  0.05721775 -0.01375611  0.10736154  0.01041584]
---
ID: text_5
Document: trusted-zone/texts/text_1762966927196.txt
Embedding (first 5 dims): [-0.04029422  0.05703993 -0.03652878  0.01447886  0.02742019]
---
ID: text_6
Document: trusted-zone/texts/text_1762966927290.txt
Embedding (first 5 dims): [-0.06599008  0.06625921 -0.01418489  0.09016113 -0.0051327 ]
---
ID: text_7
Document: trusted-zone/texts/text_1762966927370.txt
Embedding (f

### A Discarded Approach (Get text embeddings using a different model from the one used for images)

In [5]:
# Create or get the collection named "texts_another"
collection_another = client.create_collection(name="texts_another", get_or_create=True, embedding_function=None)

In [14]:
# Just in case our device has gpu
device = "cuda" if torch.cuda.is_available() else "cpu"

# In case we want to use a different model for generating text embeddings
model_another, _, _ = open_clip.create_model_and_transforms("ViT-B-32", pretrained='openai')
tokenizer_another = open_clip.get_tokenizer("ViT-B-32")
model_another.to(device)

CLIP(
  (visual): VisionTransformer(
    (conv1): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
    (patch_dropout): Identity()
    (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): ModuleList(
        (0-11): 12 x ResidualAttentionBlock(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ls_1): Identity()
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): GELU(approximate='none')
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ls_2): Identity()
        )
      )
    )
    (ln_post): LayerNorm((768,), eps=1e-05, elementwise_affine

In [9]:
# ---- Show parameter counts ----
total_params = sum(p.numel() for p in model_another.parameters())
trainable_params = sum(p.numel() for p in model_another.parameters() if p.requires_grad)

print(f"Model: ViT-B-32")
print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")

Model: ViT-B-32
Total parameters: 151,277,313
Trainable parameters: 151,277,313


In [15]:
texts_to_embeddings(src_bucket = "trusted-zone", src_prefix = "texts/", collection = collection_another, model = model_another, tokenizer=tokenizer_another)

Created embedding for texts/text_1762966926656.txt (0 items in current batch).
Created embedding for texts/text_1762966926816.txt (1 items in current batch).
Created embedding for texts/text_1762966926955.txt (2 items in current batch).
Created embedding for texts/text_1762966927095.txt (3 items in current batch).
Created embedding for texts/text_1762966927196.txt (4 items in current batch).
Created embedding for texts/text_1762966927290.txt (5 items in current batch).
Created embedding for texts/text_1762966927370.txt (6 items in current batch).
Created embedding for texts/text_1762966927449.txt (7 items in current batch).
Created embedding for texts/text_1762966927532.txt (8 items in current batch).
Created embedding for texts/text_1762966927611.txt (9 items in current batch).
Created embedding for texts/text_1762966927710.txt (10 items in current batch).
Created embedding for texts/text_1762966927842.txt (11 items in current batch).
Created embedding for texts/text_1762966928008.txt