In [1]:
import hnswlib
import numpy as np

In [2]:
# Set parameters
dim = 128  # Dimension of the vectors
num_elements = 10000  # Number of elements to add

# Initialize the index
p = hnswlib.Index(space='l2', dim=dim)  # 'l2' refers to the Euclidean distance

In [3]:
# Generate random data
data = np.float32(np.random.random((num_elements, dim)))

In [9]:
data

array([[0.8515988 , 0.01755763, 0.06527183, ..., 0.9496456 , 0.6817592 ,
        0.6892076 ],
       [0.6909492 , 0.21134596, 0.90046054, ..., 0.46829513, 0.3788647 ,
        0.10205414],
       [0.22164924, 0.20613472, 0.04461031, ..., 0.39052486, 0.05016416,
        0.6990841 ],
       ...,
       [0.63896793, 0.84920627, 0.09040519, ..., 0.8414225 , 0.90276164,
        0.3783352 ],
       [0.22978273, 0.433766  , 0.6155778 , ..., 0.18770008, 0.9650767 ,
        0.32443455],
       [0.9382895 , 0.72919345, 0.40884885, ..., 0.41671327, 0.30841056,
        0.62401426]], dtype=float32)

In [11]:
# Set the number of threads used during the build process
p.set_num_threads(4)

# Build the index
p.init_index(max_elements=num_elements, ef_construction=200, M=16)
p.add_items(data)

In [12]:
# Set the exploration factor (ef)
p.set_ef(50)  # ef should always be greater than k

In [13]:
# Generate random queries
# query_data = np.float32(np.random.random((100, dim)))
query_data = np.float32(np.random.random((1, dim)))

# Perform the query
labels, distances = p.knn_query(query_data, k=10)

In [14]:
# len(labels)
labels
# distances

array([[6871, 3444, 6394, 2600, 3170,  586, 2451, 4215,  249, 3933]],
      dtype=uint64)

In [9]:
assert len(labels) == 100  # Ensure we get results for all queries
assert labels.shape[1] == 10  # Ensure each query returns 10 nearest neighbors

In [10]:
# Compare results with brute-force search
from sklearn.metrics.pairwise import euclidean_distances

true_distances = euclidean_distances(query_data, data)
true_labels = np.argsort(true_distances, axis=1)[:, :10]

# Check if the HNSW results match the brute-force results
accuracy = np.mean([np.isin(labels[i], true_labels[i]).sum() for i in range(len(labels))])
print(f"Accuracy: {accuracy}")

Accuracy: 6.46


In [11]:
import time

start_time = time.time()
p.knn_query(query_data, k=10)
end_time = time.time()
print(f"Query time: {end_time - start_time} seconds")

Query time: 0.00299072265625 seconds
