In [5]:
import chromadb
from chromadb.utils import embedding_functions
import pandas as pd

# Podemos ocupar culaquier tipo de Cliente
chroma_client = chromadb.Client() # Ocupamos caché

default_ef = embedding_functions.DefaultEmbeddingFunction() # all-MiniLM-L6-v2 model

In [10]:
collection = chroma_client.create_collection(name="my_collection", embedding_function=default_ef) # Creamos colección
# collection = chroma_client.get_collection(name="my_collection") # Recuperamos colección
# chroma_client.delete_collection(name="my_collection") # Delete colección # Eliminamos colección

In [52]:
#collection.add(
    #documents=["This is a document", "This is another document"],
    #metadatas=[{"source": "my_source"}, {"source": "my_source"}],
    #ids=["id1", "id2"]
#) # Agregamos datos, por defecto ocupa el modelo de embedding all-MiniLM-L6-v2

In [53]:
#results = collection.query(
    #query_texts=["This is a query document"],
    #n_results=2
#) # Búsqueda por similitud

In [20]:
class DatabaseBenchmark:
    def __init__(self, db_collection, cost_per_hour = False):
        self.db_collection = db_collection
        self.cost_per_hour = cost_per_hour

    def query(self, query_string, n_results):
        # Squared L2
        results = self.db_collection.query(
            query_texts=query_string, # Lista de strings
            n_results=n_results # Número de resultados
        )
        pass

    def build_index(self, documents, metadatas, ids):
        self.db_collection.add(
            documents=documents, # Lista Documentos
            metadatas=metadatas, # Lista de Diccionarios
            ids=ids # Lista de Ids
        )
        pass

    def measure_qps(self, num_queries=5):
        import timeit
        
        start_time = timeit.default_timer()
        for _ in range(num_queries):
            self.query(query_string=["¿Quién es Quijote de la Mancha?"], n_results=1)
        end_time = timeit.default_timer()
        
        qps = num_queries / (end_time - start_time)
        
        return qps
    
    def measure_qp_dollar(self, qps):
        if self.cost_per_hour is False:
            return "No hay costo por hora"
        qp_dollar = (qps / self.cost_per_hour) * 3600
        return qp_dollar
    
    def measure_latency(self):
        import timeit
        
        start_time = timeit.default_timer()
        self.query(["¿Quién es Quijote de la Mancha?"], 1)
        latency = timeit.default_timer() - start_time
        
        return latency
    
    def measure_index_building_time(self, documents, metadatas, ids):
        import timeit
        
        start_time = timeit.default_timer()
        self.build_index(documents, metadatas, ids)
        index_building_time = timeit.default_timer() - start_time
        
        return index_building_time

## Preparación Data

In [42]:
data = pd.read_csv("../data/textos.csv")
data.head()

Unnamed: 0,AUTOR,TITULO,TEXTO
0,Miguel de Cervantes Saavedra,Quijote - Primera Parte,El ingenioso hidalgo don Quijote de la Mancha\...
1,Miguel de Cervantes Saavedra,Quijote - Segunda Parte,Segunda parte del ingenioso caballero don Quij...
2,Garcilaso de la Vega,ÉGLOGA PRIMERA,"El dulce lamentar de dos pastores,\nSalicio ju..."
3,Garcilaso de la Vega,ÉGLOGA SEGUNDA,ALBANIO\n\nEn medio del invierno está templada...
4,Garcilaso de la Vega,ÉGLOGA TERCERA,"Aquella voluntad honesta y pura,\nilustre y he..."


In [33]:
textos = data['TEXTO'].tolist() # Lista de textos
metadatas = data[['TITULO', 'AUTOR']].to_dict('records') # Lista de diccionarios con metadatos
ids = [str(x) for x in data.index] # Lista de ids

## Ejemplo Benchmark

In [34]:
Benchmark = DatabaseBenchmark(db_collection=collection, cost_per_hour = False)

In [35]:
Benchmark.measure_index_building_time(textos, metadatas, ids) # Segundos

101.38702780000017

In [37]:
Benchmark.measure_qps(num_queries = 20) # Queries per seconds

9.802661638293758

In [41]:
Benchmark.measure_latency() # Latencia query

0.13359560000003512