In [1]:
import numpy as np
import time
from annoy import AnnoyIndex
import faiss
import hnswlib

In [2]:
# ===============================
# Build 1mio rows with 5D
# ===============================
n_data = 1_000_000   # try 100_000 if you have limited memory
dim = 5
X = np.random.random((n_data, dim)).astype(np.float32)

# Query point
query = np.random.random((1, dim)).astype(np.float32)
k = 10

In [3]:

# ===============================
# 2. Annoy
# ===============================
print("=== Annoy ===")
ann_index = AnnoyIndex(dim, 'euclidean')

start = time.time()
for i in range(n_data):
    ann_index.add_item(i, X[i])
ann_index.build(10)  # 10 trees
build_time = time.time() - start

start = time.time()
neighbors = ann_index.get_nns_by_vector(query[0], k, include_distances=True)
query_time = time.time() - start

print("Build time:", build_time, "detik")
print("Query time:", query_time, "detik")
print("Neighbors:", neighbors[0][:5], "...")

=== Annoy ===
Build time: 4.283308029174805 detik
Query time: 0.0030825138092041016 detik
Neighbors: [388641, 152487, 387839, 40051, 570333] ...


In [5]:
# ===============================
# 3. FAISS (Flat Index)
# ===============================
print("\n=== FAISS (IndexFlatL2) ===")
faiss_index = faiss.IndexFlatL2(dim)

start = time.time()
faiss_index.add(X)
build_time = time.time() - start

start = time.time()
distances, indices = faiss_index.search(query, k)
query_time = time.time() - start

print("Build time:", build_time, "detik")
print("Query time:", query_time, "detik")
print("Neighbors:", indices[0][:5], "...")


=== FAISS (IndexFlatL2) ===
Build time: 0.012629985809326172 detik
Query time: 0.00649714469909668 detik
Neighbors: [388641 152487 387839  40051 570333] ...


In [6]:
# ===============================
# 4. HNSW (hnswlib)
# ===============================
print("\n=== HNSW (hnswlib) ===")
hnsw_index = hnswlib.Index(space='l2', dim=dim)

start = time.time()
hnsw_index.init_index(max_elements=n_data, ef_construction=200, M=16)
hnsw_index.add_items(X)
build_time = time.time() - start

hnsw_index.set_ef(50)

start = time.time()
labels, distances = hnsw_index.knn_query(query, k=k)
query_time = time.time() - start

print("Build time:", build_time, "detik")
print("Query time:", query_time, "detik")
print("Neighbors:", labels[0][:5], "...")


=== HNSW (hnswlib) ===
Build time: 41.89698791503906 detik
Query time: 0.0 detik
Neighbors: [388641 152487 387839  40051 570333] ...


When evaluated on a large-scale dataset, the performance characteristics of the three Approximate Nearest Neighbor algorithms vary significantly. FAISS (IndexFlatL2) demonstrates the most efficient index construction with a build time of only 0.01 seconds, making it the optimal choice when the index must be created rapidly.

However, for query performance, HNSW is the clear leader, achieving an unparalleled query time of 0.0 seconds. Annoy presents a balanced profile, with a build time of 4.28 seconds and a query time of 0.003 seconds. This analysis indicates that the ideal algorithm depends on the specific application: FAISS excels at fast indexing, while HNSW provides the fastest search capabilities once the index is built.