In [49]:
import chromadb
from chromadb.utils import embedding_functions

chroma_client = chromadb.Client() # Original note: "we use cache" — presumably an in-memory client; TODO confirm nothing is persisted

# Embedding function object handed to the collection when it is created
default_ef = embedding_functions.DefaultEmbeddingFunction()

In [56]:
# Idempotent: reuse "my_collection" when it already exists, so re-running this
# cell does not fail the way a second create_collection() call would.
collection = chroma_client.get_or_create_collection(
    name="my_collection", embedding_function=default_ef
)
# To start again from an empty collection, delete it first and re-run this cell:
# chroma_client.delete_collection(name="my_collection")

In [52]:
#collection.add(
    #documents=["This is a document", "This is another document"],
    #metadatas=[{"source": "my_source"}, {"source": "my_source"}],
    #ids=["id1", "id2"]
#) # Agregamos datos, por defecto ocupa el modelo de embedding definido

In [53]:
#results = collection.query(
    #query_texts=["This is a query document"],
    #n_results=2
#) # Búsqueda por similitud

In [78]:
class DatabaseBenchmark:
    """Small benchmarking harness around a vector-database collection.

    The wrapped collection only needs two methods, called with keyword
    arguments: ``query(query_texts=..., n_results=...)`` and
    ``add(documents=..., metadatas=..., ids=...)``.

    Args:
        db_collection: Collection object to benchmark.
        cost_per_hour: Hourly cost of running the database, used by
            ``measure_qp_dollar``. Leave as ``False`` when it is unknown.
    """

    def __init__(self, db_collection, cost_per_hour = False):
        self.db_collection = db_collection
        self.cost_per_hour = cost_per_hour

    @staticmethod
    def _elapsed(operation):
        """Call ``operation()`` once and return the seconds it took."""
        import timeit

        start_time = timeit.default_timer()
        operation()
        return timeit.default_timer() - start_time

    def query(self, query_string, n_results):
        """Run a similarity query and return whatever the collection returns.

        Args:
            query_string: List of query strings.
            n_results: Number of results requested per query.

        Returns:
            The collection's query result, unmodified.
        """
        return self.db_collection.query(
            query_texts=query_string,
            n_results=n_results,
        )

    def build_index(self, documents, metadatas, ids):
        """Add documents to the collection.

        Args:
            documents: List of document strings.
            metadatas: List of metadata dictionaries, one per document.
            ids: List of ids, one per document.
        """
        self.db_collection.add(
            documents=documents,
            metadatas=metadatas,
            ids=ids,
        )

    def measure_qps(self, num_queries=5):
        """Return queries per second over ``num_queries`` sequential queries.

        Args:
            num_queries: How many single-result queries to issue back to back.

        Returns:
            ``num_queries`` divided by the total elapsed seconds.
        """
        def run_queries():
            for _ in range(num_queries):
                self.query(query_string=["Esto es un documento"], n_results=1)

        return num_queries / self._elapsed(run_queries)

    def measure_qp_dollar(self, qps):
        """Convert a queries-per-second figure into queries per dollar.

        Args:
            qps: Queries per second, e.g. the value from ``measure_qps``.

        Returns:
            ``(qps / cost_per_hour) * 3600``, or the string
            ``"No hay costo por hora"`` when no usable hourly cost was set.
        """
        # Treat a missing *or zero* cost as "unknown": dividing by 0 would
        # raise, and None cannot be divided at all.
        if not self.cost_per_hour:
            return "No hay costo por hora"
        return (qps / self.cost_per_hour) * 3600

    def measure_latency(self):
        """Return the seconds taken by one single-result query."""
        return self._elapsed(lambda: self.query(["Esto es un documento"], 1))

    def measure_index_building_time(self, documents=None, metadatas=None, ids=None):
        """Return the seconds taken to add documents to the collection.

        With no arguments a single sample document with id ``"id1"`` is added.
        Pass your own values to benchmark a larger batch or to use fresh ids.

        Args:
            documents: List of document strings. Defaults to one sample document.
            metadatas: List of metadata dictionaries. Defaults to
                ``{"source": "my_source"}`` for every document.
            ids: List of ids. Defaults to ``"id1"``, ``"id2"``, ... in order.

        Returns:
            Elapsed seconds for the ``build_index`` call.
        """
        # NOTE(review): ids already present in the collection may be rejected
        # or skipped by the backend, which would distort the timing — verify
        # before calling this repeatedly with the default ids.
        if documents is None:
            documents = ["Esto es un documento"]
        if metadatas is None:
            # One dict per document, matching build_index's list-of-dicts contract.
            metadatas = [{"source": "my_source"} for _ in documents]
        if ids is None:
            ids = [f"id{position}" for position in range(1, len(documents) + 1)]

        return self._elapsed(lambda: self.build_index(documents, metadatas, ids))

In [79]:
# Wrap the collection in the benchmark harness; cost_per_hour is left at its default (False)
Benchmark = DatabaseBenchmark(db_collection=collection)

In [66]:
Benchmark.measure_index_building_time() # Seconds

0.12283339999976306

In [80]:
Benchmark.measure_qps() # Queries per second

11.803291536697659

In [81]:
Benchmark.measure_latency() # Latency of a single query, in seconds

0.09563619999971706