In [1]:
import sys
import os
import numpy as np
import time
import pickle
import chromadb

sys.path.append(os.path.abspath(".."))

from mindb.mindb import minDB

In [3]:
# Load the pickled FiQA evaluation bundle from ../eval/data.
# NOTE(review): pickle.load can execute arbitrary code while deserialising, so this
# should only ever be pointed at a file that you produced yourself or otherwise trust.
data_path = os.path.abspath(os.path.join('..', 'eval/data/fiqa_data.pickle'))
with open(data_path, "rb") as f:
    data = pickle.load(f)

# Pull out the four entries that the remaining cells rely on.
corpus_embeddings, query_embeddings, ground_truths, text = (
    data[key]
    for key in ("corpus_embeddings", "query_embeddings", "ground_truths", "text")
)

### Create the minDB object

The data is added as a list of tuples containing (vector, metadata)

In [4]:
# Open the minDB database named "fiqa_eval_test" in "load" mode.
db = minDB("fiqa_eval_test", create_or_load="load")

# Pair every corpus embedding with its passage text as (vector, metadata) tuples.
data = [
    (vector, {"text": text[idx]})
    for idx, vector in enumerate(corpus_embeddings)
]
# The add call stays disabled, exactly as in the original cell.
# NOTE(review): presumably the vectors were already added in an earlier session,
# since the database is opened with create_or_load="load" -- confirm before re-enabling.
#db.add(data)

### Train the minDB object

We set the parameters to be a good compromise between performance and compression. More detail about each parameter can be found [here](https://github.com/D-Star-AI/minDB/wiki/Tunable-parameters).

In [None]:
# Training settings, forwarded unchanged to db.train below
# (see the tunable-parameters wiki page linked above for what each one controls).
pca_dimension, opq_dimension = 256, 128
compressed_vector_bytes = 32
omit_opq = False

db.train(
    pca_dimension=pca_dimension,
    opq_dimension=opq_dimension,
    compressed_vector_bytes=compressed_vector_bytes,
    omit_opq=omit_opq,
)

### Define the functions to evaluate the retrieval performance

Recall is defined as the fraction of the `top_k` retrieved vectors that also appear in the ground truth `top_k`.
For example, if you retrieve 10 vectors, and 9 of them exist in the ground truth top 10, recall would be 0.9.

Latency is defined as the average time taken per query, in milliseconds.

In [9]:
def evaluate(db, queries: np.ndarray, ground_truths: np.ndarray, preliminary_top_k: int, final_top_k: int) -> tuple[float, float]:
    """Measure recall@k and mean per-query latency for a minDB-style database.

    Args:
        db: Object exposing ``query(vector, preliminary_top_k, final_top_k)`` that
            returns a mapping whose ``"ids"`` entry holds the retrieved ids.
        queries: Array with one query vector per index along axis 0.
        ground_truths: 2-D array; row ``i`` holds the reference ids for
            ``queries[i]``. Only the first ``final_top_k`` columns are used.
        preliminary_top_k: Forwarded unchanged to ``db.query``.
        final_top_k: Forwarded to ``db.query``; also the ``k`` used for recall.

    Returns:
        ``(recall, latency)``. ``recall`` is the number of retrieved ids found in
        the matching ground-truth row, summed over all queries and divided by
        ``number_of_queries * final_top_k``. ``latency`` is the mean time spent
        inside ``db.query`` per query, in milliseconds. ``(0.0, 0.0)`` is returned
        when ``queries`` is empty.
    """
    num_queries = queries.shape[0]
    if num_queries == 0:
        # Nothing to evaluate; avoid dividing by zero below.
        return 0.0, 0.0

    hits = 0
    query_seconds = 0.0
    for i in range(num_queries):
        # Time only the database call so the recall bookkeeping below does not
        # inflate the reported latency. perf_counter is monotonic, unlike time.time.
        started = time.perf_counter()
        results = db.query(queries[i], preliminary_top_k, final_top_k)
        query_seconds += time.perf_counter() - started

        retrieved = np.asarray(results["ids"][:final_top_k])
        expected = ground_truths[i, :final_top_k]
        # Count retrieved ids that are present in the ground-truth top k.
        hits += int(np.count_nonzero(np.isin(retrieved, expected)))

    # Normalise by the number of queries actually evaluated (not by the number of
    # ground-truth rows, which may differ).
    recall = hits / (num_queries * final_top_k)
    latency = query_seconds * 1000 / num_queries  # latency per query in ms

    return recall, latency


def evaluate_chroma(collection, queries: np.ndarray, ground_truths: np.ndarray, top_k: int) -> tuple[float, float]:
    """Measure recall@k and mean per-query latency for a Chroma collection.

    Args:
        collection: Object exposing ``query(query_embeddings=..., n_results=...,
            include=...)`` that returns a mapping whose ``"ids"`` entry is a list
            with one list of string ids per query.
        queries: Array with one query vector per index along axis 0.
        ground_truths: 2-D array; row ``i`` holds the reference ids for
            ``queries[i]``. Only the first ``top_k`` columns are used.
        top_k: Number of results requested per query; also the ``k`` used for recall.

    Returns:
        ``(recall, latency)``. ``recall`` is the number of retrieved ids found in
        the matching ground-truth row, summed over all queries and divided by
        ``number_of_queries * top_k``. ``latency`` is the mean time spent inside
        ``collection.query`` per query, in milliseconds. ``(0.0, 0.0)`` is returned
        when ``queries`` is empty.
    """
    num_queries = queries.shape[0]
    if num_queries == 0:
        # Nothing to evaluate; avoid dividing by zero below.
        return 0.0, 0.0

    hits = 0
    query_seconds = 0.0
    for i in range(num_queries):
        # Time only the collection call so the id conversion and recall bookkeeping
        # do not inflate the reported latency. perf_counter is monotonic.
        started = time.perf_counter()
        results = collection.query(
            query_embeddings=[queries[i].tolist()],
            n_results=top_k,
            include=["metadatas", "distances"] # Matches what is returned by minDB
        )
        query_seconds += time.perf_counter() - started

        # Convert each id to an integer (They are required to be strings in Chroma)
        retrieved = np.asarray([int(x) for x in results["ids"][0][:top_k]])
        expected = ground_truths[i, :top_k]
        # Count retrieved ids that are present in the ground-truth top k.
        hits += int(np.count_nonzero(np.isin(retrieved, expected)))

    # Normalise by the number of queries actually evaluated (not by the number of
    # ground-truth rows, which may differ).
    recall = hits / (num_queries * top_k)
    latency = query_seconds * 1000 / num_queries  # latency per query in ms

    return recall, latency

### Evaluate the retrieval performance

We are going to use a `preliminary_top_k` of 200, and a `final_top_k` of 20. `preliminary_top_k` is the number of results returned from the search over the compressed faiss index. Then a brute force search is run on those vectors (the full, uncompressed vectors this time) to get the `final_top_k` vectors.

In [6]:
# Size of the candidate pool and number of final results requested per query.
preliminary_top_k, final_top_k = 200, 20

# Run the minDB benchmark over every query embedding.
recall, latency = evaluate(
    db,
    queries=query_embeddings,
    ground_truths=ground_truths,
    preliminary_top_k=preliminary_top_k,
    final_top_k=final_top_k,
)
print("recall: ", recall)
print("latency: ", latency)

recall:  0.9945987654320988
latency:  5.17103554290018


### Create the chromaDB client

In [7]:
# The persistent Chroma store lives under ../eval/chromadb/, resolved to an absolute path.
path = os.path.abspath(os.path.join(os.pardir, 'eval/chromadb/'))
client = chromadb.PersistentClient(path=path)

# Fetch the collection by name. The create_collection call is kept disabled,
# exactly as in the original cell.
#collection = client.create_collection("fiqa_eval_test", metadata={"hnsw:space": "cosine"})
collection = client.get_collection("fiqa_eval_test")

09-19 10:46:09 chromadb.telemetry.product.posthog:20 in __init__() INFO     Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


In [None]:
# ChromaDB has a limit on the number of vectors per add call, so insert the corpus
# in batches of 10,000 vectors.
chunk_size = 10000
for i in range(0, len(corpus_embeddings), chunk_size):
    chunk = corpus_embeddings[i:i+chunk_size]
    # Corpus positions covered by this batch; the last batch may be shorter.
    positions = range(i, i + len(chunk))
    # Ids are the stringified corpus positions so they can be mapped back to indices.
    chunk_ids = [str(j) for j in positions]
    # One metadata record per vector, carrying its passage text.
    metadata = [{"text": text[j]} for j in positions]
    collection.add(
        embeddings=chunk,
        metadatas=metadata,
        ids=chunk_ids,
    )


### Evaluate the retrieval performance

We are using the same recall and latency measures as with minDB

In [10]:
# Run the Chroma benchmark over every query embedding, requesting 20 results per query.
recall, latency = evaluate_chroma(
    collection,
    queries=query_embeddings,
    ground_truths=ground_truths,
    top_k=20,
)
print("recall: ", recall)
print("latency: ", latency)

recall:  0.9218364197530848
latency:  4.14560643243201
