In [10]:
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors
import time
from scipy.stats import rankdata
from hnsw_models import HNSW, HNSW_V2, HNSWNode
from tqdm import tqdm


In [2]:
# ------------------------ #
#    Helper Functions
# ------------------------ #

def generate_random_embeddings(num_embeddings, dim):
    """Generate random embeddings."""
    return np.random.rand(num_embeddings, dim).astype(np.float32)

def mean_reciprocal_rank(y_true, y_pred):
    """Calculate Mean Reciprocal Rank (MRR)."""
    mrrs = []
    for true_labels, pred_scores in zip(y_true, y_pred):
        ranks = rankdata(-pred_scores, method='ordinal')
        true_rank = ranks[np.argmax(true_labels)]
        mrrs.append(1.0 / true_rank)
    return np.mean(mrrs)

def recall_at_k(actual, predicted, k):
    """Calculate Recall@K."""
    actual_set = set(actual[:k])
    predicted_set = set(predicted[:k])
    return len(actual_set & predicted_set) / len(actual_set)

def precision_at_k(actual, predicted, k):
    """Calculate Precision@K."""
    actual_set = set(actual[:k])
    predicted_set = set(predicted[:k])
    return len(actual_set & predicted_set) / k

def calculate_metrics(exact_labels, hnsw_labels, K):
    """Calculate recall, precision, and MRR."""
    recall_scores = []
    precision_scores = []
    mrr_scores = []

    for i in range(len(exact_labels)):
        recall = recall_at_k(exact_labels[i], hnsw_labels[i], K)
        precision = precision_at_k(exact_labels[i], hnsw_labels[i], K)
        mrr = mean_reciprocal_rank(exact_labels[i], hnsw_labels[i])

        recall_scores.append(recall)
        precision_scores.append(precision)
        mrr_scores.append(mrr)

    average_recall = np.mean(recall_scores)
    average_precision = np.mean(precision_scores)
    average_mrr = np.mean(mrr_scores)

    return average_recall, average_precision, average_mrr


def benchmark_hnsw(hnsw_class, embeddings, test_embeddings, K):
    """Benchmark HNSW or HNSW_V2."""
    hnsw = hnsw_class(max_level=3)

    start_time_hnsw_add = time.time()
    for i, emb in enumerate(embeddings):
        hnsw.add_node(emb, label=f"emb{i}")
    hnsw_add_time = time.time() - start_time_hnsw_add

    hnsw_knn_labels = []
    start_time_hnsw_search = time.time()
    for emb in test_embeddings:
        top_k_nodes = hnsw.search_knn(emb, k=K)
        hnsw_knn_labels.append([int(node.label.replace('emb', '')) for node in top_k_nodes])
    hnsw_search_time = time.time() - start_time_hnsw_search

    return hnsw_knn_labels, hnsw_add_time, hnsw_search_time

In [3]:
# ------------------------ #
#    Main Function
# ------------------------ #

def run_benchmark(num_embeddings, embedding_dim, K):
    np.random.seed(42)
    
    """Run the full benchmark for HNSW and HNSW_V2."""
    # Generate embeddings and test embeddings
    embeddings = generate_random_embeddings(num_embeddings, embedding_dim)
    test_embeddings = generate_random_embeddings(num_embeddings, embedding_dim)

    # Perform exact KNN using scikit-learn for ground truth
    start_time_knn = time.time()
    knn_model = NearestNeighbors(n_neighbors=K, algorithm='brute', metric='euclidean')
    knn_model.fit(embeddings)
    distances, indices = knn_model.kneighbors(test_embeddings)
    knn_exact_time = time.time() - start_time_knn

    exact_knn_labels = [indices[i] for i in range(len(test_embeddings))]
    exact_knn_distances = [distances[i] for i in range(len(test_embeddings))]

    # Benchmark HNSW (V1)
    hnsw_v1_labels, hnsw_v1_add_time, hnsw_v1_search_time = benchmark_hnsw(HNSW, embeddings, test_embeddings, K)

    # Benchmark HNSW_V2
    hnsw_v2_labels, hnsw_v2_add_time, hnsw_v2_search_time = benchmark_hnsw(HNSW_V2, embeddings, test_embeddings, K)

    # Calculate metrics for HNSW (V1)
    v1_recall, v1_precision, v1_mrr = calculate_metrics(exact_knn_labels, hnsw_v1_labels, K)

    # Calculate metrics for HNSW_V2
    v2_recall, v2_precision, v2_mrr = calculate_metrics(exact_knn_labels, hnsw_v2_labels, K)

    # Store the results in a pandas DataFrame
    benchmark_df = pd.DataFrame({
        'method': ['HNSW_V1', 'HNSW_V2'],
        'num_embeddings': [num_embeddings, num_embeddings],
        'embedding_dim': [embedding_dim, embedding_dim],
        'average_recall': [v1_recall, v2_recall],
        'average_precision': [v1_precision, v2_precision],
        'average_mrr': [v1_mrr, v2_mrr],
        'add_time': [hnsw_v1_add_time, hnsw_v2_add_time],
        'search_time': [hnsw_v1_search_time, hnsw_v2_search_time],
        'knn_exact_search_time': [knn_exact_time, knn_exact_time]
    })

    return benchmark_df

In [4]:
# ------------------------ #
#    Running the Benchmark
# ------------------------ #
# Parameters
num_embeddings = 1000
embedding_dim = 128
K = 10

# Run the benchmark and display the results
benchmark_results = run_benchmark(num_embeddings, embedding_dim, K)

In [5]:
benchmark_results

Unnamed: 0,method,num_embeddings,embedding_dim,average_recall,average_precision,average_mrr,add_time,search_time,knn_exact_search_time
0,HNSW_V1,1000,128,0.0336,0.0336,1.0,0.082868,0.087959,0.039304
1,HNSW_V2,1000,128,0.2366,0.2366,1.0,0.237785,0.566206,0.039304


In [6]:
benchmark_results

Unnamed: 0,method,num_embeddings,embedding_dim,average_recall,average_precision,average_mrr,add_time,search_time,knn_exact_search_time
0,HNSW_V1,1000,128,0.0336,0.0336,1.0,0.060096,0.117145,0.077236
1,HNSW_V2,1000,128,0.2366,0.2366,1.0,0.22602,0.546387,0.077236


# Run Full Benchmark

In [14]:
# Define the grid search parameters
num_embeddings_values = [1000, int(1e4), int(1e5), int(1e6)]  # Example values for the number of embeddings
embedding_dim_values = [128, 512, 738, 1024]      # Example values for the dimension of embeddings
K_values = [10, 30, 50, 100, 200]

In [15]:
from tqdm import tqdm

# Initialize an empty DataFrame to store all results
all_benchmark_results = pd.DataFrame()

# Total number of iterations for tqdm
total_iterations = len(num_embeddings_values) * len(embedding_dim_values) * len(K_values)

# Perform grid search with progress tracking
with tqdm(total=total_iterations) as pbar:
    for num_embeddings in num_embeddings_values:
        for embedding_dim in embedding_dim_values:
            for K in K_values:
                # Print the current combination
                # print(f"Iter num_embeddings={num_embeddings}, embedding_dim={embedding_dim}, K={K}")

                # Run the benchmark for the current set of parameters
                benchmark_results = run_benchmark(num_embeddings, embedding_dim, K)

                # Concatenate the results to the main DataFrame
                all_benchmark_results = pd.concat([all_benchmark_results, benchmark_results], ignore_index=True)

                # Update the progress bar
                pbar.update(1)


 25%|██▌       | 20/80 [00:34<02:22,  2.37s/it]

In [13]:
all_benchmark_results.sort_values(by='average_recall', ascending=False)

Unnamed: 0,method,num_embeddings,embedding_dim,average_recall,average_precision,average_mrr,add_time,search_time,knn_exact_search_time
1,HNSW_V2,500,64,0.3076,0.3076,1.0,0.1194,0.199148,0.142072
13,HNSW_V2,500,256,0.2944,0.2944,1.0,0.102188,0.225911,0.005473
7,HNSW_V2,500,128,0.2912,0.2912,1.0,0.095428,0.267385,0.003436
15,HNSW_V2,500,256,0.267,0.267,1.0,0.101869,0.228468,0.006688
19,HNSW_V2,1000,64,0.266,0.266,1.0,0.289906,0.526098,0.005271
3,HNSW_V2,500,64,0.2654,0.2654,1.0,0.088877,0.19808,0.004132
25,HNSW_V2,1000,128,0.2576,0.2576,1.0,0.223142,0.548747,0.005359
9,HNSW_V2,500,128,0.2574,0.2574,1.0,0.096699,0.220557,0.004457
37,HNSW_V2,2000,64,0.2536,0.2536,1.0,0.584274,1.490377,0.009748
31,HNSW_V2,1000,256,0.2516,0.2516,1.0,0.365954,0.760787,0.007333
