In [1]:
# Importing useful dependencies
import io
import os
import boto3
import torch
import chromadb
import torch.nn as nn
from transformers import BertTokenizer, BertModel
import numpy as np
from chromadb.config import Settings
import torch.nn.functional as F

In [2]:
# Setup S3 client for MinIO (MinIO implements Amazon S3 API)
s3 = boto3.client(
    "s3",
    endpoint_url="http://127.0.0.1:9000", # MinIO API endpoint
    aws_access_key_id="minioadmin", # User name
    aws_secret_access_key="minioadmin", # Password
)

In [4]:
# Connect to the server (Docker Container)
client = chromadb.HttpClient(host="localhost", port=8000)
# Although we set a path for persistent directory when defining the Docker Container
# It actually stores the embeddings inside the container

# We can use the following line to remove all the stored data in a collection
#client.delete_collection(name="texts")

# Create or get the collection named "texts"
collection = client.create_collection(name="texts", get_or_create=True, embedding_function=None)

In [5]:
# Just in case our device has gpu
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load model
tokenizer = BertTokenizer.from_pretrained('bert-large-cased')
model = BertModel.from_pretrained("bert-large-cased")
model.to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 1024, padding_idx=0)
    (position_embeddings): Embedding(512, 1024)
    (token_type_embeddings): Embedding(2, 1024)
    (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-23): 24 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=1024, out_features=1024, bias=True)
            (key): Linear(in_features=1024, out_features=1024, bias=True)
            (value): Linear(in_features=1024, out_features=1024, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=1024, out_features=1024, bias=True)
            (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, 

In [12]:
# We can use this function to retrieve an text from our bucket
def get_text(bucket, key):
    resp = s3.get_object(Bucket=bucket, Key=key)
    body = resp["Body"].read()
    text = body.decode("utf-8")
    return text
# The next function returns the embedding of the given text
def embed_text(tokenizer, model, text):
    paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()]
    all_embs = []
    for para in paragraphs:
        encoded_input = tokenizer(text, return_tensors='pt',truncation=True,max_length=512,padding='max_length').to(device)
        with torch.no_grad():
            output = model(**encoded_input)
            feats = output.pooler_output
            feats = feats / feats.norm(dim=-1, keepdim=True)
            feats_np = feats.cpu().numpy().squeeze()
            all_embs.append(feats_np)
    if len(all_embs) == 0:
        return np.zeros(model.config.hidden_size)
    elif len(all_embs) == 1:
        return all_embs[0]
    else:
        full_emb = np.mean(np.stack(all_embs), axis=0)
        full_emb = full_emb / np.linalg.norm(full_emb)
        return full_emb

In [8]:
def texts_to_embeddings(src_bucket, collection,tokenizer , model, src_prefix=""):

    # Incremental id assigned to each image embedding
    id_counter = 0
    
    paginator = s3.get_paginator("list_objects_v2") # It returns objects in pages and not all at once.
    for page in paginator.paginate(Bucket=src_bucket, Prefix=src_prefix):

        # List of paths (meta_data)
        texts_paths = []
        # List of embeddings
        embeddings = []
        # List of unique IDs for each embedding
        ids = []
        
        for obj in page.get("Contents", []):

            key = obj["Key"]

            if obj['Size'] == 0 and key.endswith("/"): # skip the folder itself
                continue

            id_counter += 1

            # Download the image
            text = get_text(src_bucket, key)
            
            # Compute embedding
            vector = embed_text(tokenizer, model, text) # A numerical vector of size 1024

            print(f"Created embedding for {key} ({len(embeddings)} items in current batch).")

            # Storing data
            texts_paths.append(f"{src_bucket}/{key}")
            embeddings.append(vector)
            ids.append(f"text_{id_counter}")

        # Store the images of a page at once
        collection.add(
                ids=ids,
                documents=texts_paths,
                embeddings=embeddings
        )

        print(f"All embeddings in the current batch are store successfully in the collection {collection.name}.")


In [13]:
texts_to_embeddings(src_bucket = "trusted-zone", src_prefix = "texts/", collection = collection, tokenizer = tokenizer, model = model)

Created embedding for texts/text_1760137269318.txt (0 items in current batch).
Created embedding for texts/text_1760137269387.txt (1 items in current batch).
Created embedding for texts/text_1760137269455.txt (2 items in current batch).
Created embedding for texts/text_1760137269523.txt (3 items in current batch).
Created embedding for texts/text_1760137269594.txt (4 items in current batch).
Created embedding for texts/text_1760137269679.txt (5 items in current batch).
Created embedding for texts/text_1760137269780.txt (6 items in current batch).
Created embedding for texts/text_1760137269913.txt (7 items in current batch).
Created embedding for texts/text_1760137269996.txt (8 items in current batch).
Created embedding for texts/text_1760137270063.txt (9 items in current batch).
Created embedding for texts/text_1760137270123.txt (10 items in current batch).
Created embedding for texts/text_1760137270192.txt (11 items in current batch).
Created embedding for texts/text_1760137270260.txt

KeyboardInterrupt: 

In [29]:
# Function that prints the embeddings stored in a collection
def print_stored_embeddings(collection, x=None): # x is the maximum number of files to print
    results = collection.get(include=["documents", "embeddings"])
    for i in range(len(results["documents"])):
        print("ID:", results['ids'][i])
        print("Document:", results["documents"][i])
        print("Embedding (first 5 dims):", results["embeddings"][i][:5])
        print("---")
        if x and (x-1) == i:
            break

# We can use this function to print the embeddings stored in chromaDB
print_stored_embeddings(collection, x = 10)

ID: text_1
Document: trusted-zone/texts/text_1759415298446.txt
Embedding (first 5 dims): [ 0.0317086   0.03406123  0.03387513 -0.02629157  0.03390692]
---
ID: text_2
Document: trusted-zone/texts/text_1759415298575.txt
Embedding (first 5 dims): [ 0.02476478  0.0335832   0.03354634 -0.03127217  0.03354959]
---
ID: text_3
Document: trusted-zone/texts/text_1759415298698.txt
Embedding (first 5 dims): [ 0.0304194   0.03397059  0.03395893 -0.02664963  0.03391798]
---
ID: text_4
Document: trusted-zone/texts/text_1759415298821.txt
Embedding (first 5 dims): [ 0.03304858  0.03422263  0.034059   -0.01954914  0.03402339]
---
ID: text_5
Document: trusted-zone/texts/text_1759415298945.txt
Embedding (first 5 dims): [ 0.02911022  0.03400113  0.03394191 -0.02647135  0.03387284]
---
ID: text_6
Document: trusted-zone/texts/text_1759415299068.txt
Embedding (first 5 dims): [-0.02995777  0.03230383  0.03228472 -0.03216354  0.0322841 ]
---
ID: text_7
Document: trusted-zone/texts/text_1759415299210.txt
Embeddi

In [32]:
# We can now perform a similarity search to test it

# The following function searches the top k most similar images in ChromaDB using the embeddings of an text
def find_similar_texts(collection, query_emb: np.ndarray, top_k: int = 5):
    # Chroma expects list-of-lists for query_embeddings
    query_vector = query_emb.tolist()

    results = collection.query(
        query_embeddings=[query_vector],
        n_results=top_k,
        include=["documents", "distances"]
    )

    # Extract first query results
    ids = results.get("ids", [[]])[0]
    docs = results.get("documents", [[]])[0]
    dists = results.get("distances", [[]])[0]

    print(f"Top {top_k} similar texts:")
    for rank, (doc_id, doc, dist) in enumerate(zip(ids, docs, dists), start=1):
        similarity = 1 - dist  
        print(f"{rank}. id={doc_id}, distance={dist:.4f} (similarity={similarity:.4f})")
        print(f"   text: {doc[:200]}{'...' if len(doc) > 200 else ''}")

    return results

In [47]:
# Sample query
emb = embed_text(tokenizer, model, "A game similar to Nier: Automata")

# Search for similar texts in ChromaDB
results = find_similar_texts(collection, emb, top_k=10) # The first one is always the target texts itself (if we are using a txt file from MinIO)

Top 10 similar texts:
1. id=text_284, distance=0.0117 (similarity=0.9883)
   text: trusted-zone/texts/text_1759415340124.txt
2. id=text_421, distance=0.0170 (similarity=0.9830)
   text: trusted-zone/texts/text_1759415363550.txt
3. id=text_74, distance=0.0172 (similarity=0.9828)
   text: trusted-zone/texts/text_1759415308880.txt
4. id=text_4, distance=0.0201 (similarity=0.9799)
   text: trusted-zone/texts/text_1759415298821.txt
5. id=text_214, distance=0.0204 (similarity=0.9796)
   text: trusted-zone/texts/text_1759415329415.txt
6. id=text_394, distance=0.0222 (similarity=0.9778)
   text: trusted-zone/texts/text_1759415356864.txt
7. id=text_426, distance=0.0225 (similarity=0.9775)
   text: trusted-zone/texts/text_1759415364349.txt
8. id=text_509, distance=0.0231 (similarity=0.9769)
   text: trusted-zone/texts/text_1759415376729.txt
9. id=text_524, distance=0.0254 (similarity=0.9746)
   text: trusted-zone/texts/text_1759415378903.txt
10. id=text_76, distance=0.0260 (similarity=0.9740)
  