# Praktikum 4 - Perbandingan

In [None]:
import numpy as np
import time
from annoy import AnnoyIndex
import faiss
import hnswlib

# ==========================================
# 1. Build the dataset: 1 million 5-D points
# ==========================================
dim = 5
n_data = 1_000_000   # try 100_000 first if RAM is limited

# Random data matrix of shape (n_data, dim), cast to float32.
X = np.random.random_sample(size=(n_data, dim)).astype(np.float32)

# A single random query row with the same dimensionality, and the number of
# neighbours requested from every index below.
query = np.random.random_sample(size=(1, dim)).astype(np.float32)
k = 10

# ===============================
# 2. Annoy
# ===============================
print("=== Annoy ===")
ann_index = AnnoyIndex(dim, 'euclidean')

# time.perf_counter() is used instead of time.time(): it is monotonic and has
# the highest available resolution, which matters because the query below
# finishes in well under a millisecond.
start = time.perf_counter()
# Annoy takes items one at a time; the row position doubles as the item id.
for i, vector in enumerate(X):
    ann_index.add_item(i, vector)
ann_index.build(10)  # 10 trees
build_time = time.perf_counter() - start

start = time.perf_counter()
# With include_distances=True the result is a pair (ids, distances).
neighbors = ann_index.get_nns_by_vector(query[0], k, include_distances=True)
query_time = time.perf_counter() - start

print("Build time:", build_time, "detik")
print("Query time:", query_time, "detik")
# Show only the first five neighbour ids.
print("Neighbors:", neighbors[0][:5], "...")

# ===============================
# 3. FAISS (Flat Index)
# ===============================
print("\n=== FAISS (IndexFlatL2) ===")
faiss_index = faiss.IndexFlatL2(dim)

# time.perf_counter() is used instead of time.time(): it is monotonic and has
# the highest available resolution, so millisecond-scale intervals like the
# ones measured here are not distorted by a coarse or adjusted wall clock.
start = time.perf_counter()
faiss_index.add(X)
build_time = time.perf_counter() - start

start = time.perf_counter()
# search() returns one row of distances and one row of ids per query row.
distances, indices = faiss_index.search(query, k)
query_time = time.perf_counter() - start

print("Build time:", build_time, "detik")
print("Query time:", query_time, "detik")
# Show only the first five neighbour ids of the single query.
print("Neighbors:", indices[0][:5], "...")

# ===============================
# 4. HNSW (hnswlib)
# ===============================
print("\n=== HNSW (hnswlib) ===")
hnsw_index = hnswlib.Index(space='l2', dim=dim)

# time.perf_counter() is used instead of time.time(): it is monotonic and has
# the highest available resolution, which matters because the query below
# finishes in well under a millisecond.
start = time.perf_counter()
# Index initialisation is deliberately counted as part of the build time.
hnsw_index.init_index(max_elements=n_data, ef_construction=200, M=16)
hnsw_index.add_items(X)
build_time = time.perf_counter() - start

# Query-time search parameter; set outside the timed sections.
hnsw_index.set_ef(50)

start = time.perf_counter()
labels, distances = hnsw_index.knn_query(query, k=k)
query_time = time.perf_counter() - start

print("Build time:", build_time, "detik")
print("Query time:", query_time, "detik")
# Show only the first five neighbour ids of the single query.
print("Neighbors:", labels[0][:5], "...")


=== Annoy ===
Build time: 22.626758098602295 detik
Query time: 0.00037932395935058594 detik
Neighbors: [905756, 154246, 715986, 3684, 512305] ...

=== FAISS (IndexFlatL2) ===
Build time: 0.009208440780639648 detik
Query time: 0.006433963775634766 detik
Neighbors: [905756 154246 715986   3684 512305] ...

=== HNSW (hnswlib) ===
Build time: 176.4186987876892 detik
Query time: 0.0002739429473876953 detik
Neighbors: [905756 154246 715986   3684 512305] ...


Lakukan percobaan dengan metrik jarak (distance metric) yang berbeda. Catat hasilnya pada tabel yang Anda buat sendiri, seperti pada Praktikum 1.

In [None]:
# Library-neutral metric names to benchmark; each one is translated to the
# library-specific spelling inside the comparison loop.
list_metrics_comparison = [
    'euclidean',
    'inner_product',
]

In [None]:
# One dict per (metric, n_points, n_dims, library) measurement.
comparison_results = []

# Column layout shared by the table header and every result row.
_ROW_FORMAT = "{:<15} | {:<15} | {:<10} | {:<10} | {:<15} | {:<15}"


def _record_result(metric, n_points, n_dims, library,
                   build_time=np.nan, query_time=np.nan):
    """Store one measurement in ``comparison_results`` and print it as a table row.

    Args:
        metric: Library-neutral metric name (e.g. 'euclidean').
        n_points: Number of data points in the benchmarked dataset.
        n_dims: Dimensionality of the benchmarked dataset.
        library: Display name of the library ('Annoy', 'FAISS' or 'HNSW').
        build_time: Index build time in seconds; NaN when the run failed.
        query_time: Query time in seconds; NaN when the run failed.
    """
    comparison_results.append({
        'Metric': metric,
        'N Points': n_points,
        'Dimensions': n_dims,
        'Library': library,
        'Build Time (s)': build_time,
        'Query Time (s)': query_time,
    })
    # Print the row immediately so the table fills in while the (long) sweep
    # is still running; previously only the header was ever printed.
    print(_ROW_FORMAT.format(
        metric, n_points, n_dims, library,
        f"{build_time:.6f}", f"{query_time:.6f}",
    ))


print(_ROW_FORMAT.format(
    'Metric', 'N Points', 'Dimensions', 'Library',
    'Build Time (s)', 'Query Time (s)',
))
print("-" * 100)

# NOTE(review): list_n_points and list_n_dims are not defined in this cell;
# they are presumably created by an earlier cell -- confirm before running.
for metric in list_metrics_comparison:
    # Annoy uses 'euclidean', 'angular', 'manhattan', 'hamming', or 'dot':
    # map 'inner_product' to 'dot', pass anything else through unchanged.
    annoy_metric = {'inner_product': 'dot'}.get(metric, metric)

    # HNSW uses 'l2', 'ip', or 'cosine':
    # map 'euclidean' to 'l2' and 'inner_product' to 'ip'.
    hnsw_metric = {'euclidean': 'l2', 'inner_product': 'ip'}.get(metric, metric)

    for n_points in list_n_points:
        for n_dims in list_n_dims:
            print(f"Running: Metric={metric}, N_Points={n_points}, Dimensions={n_dims}")

            # Generate a fresh random dataset and query for this configuration.
            X = np.random.random((n_points, n_dims)).astype(np.float32)
            query = np.random.random((1, n_dims)).astype(np.float32)
            k = 10

            # Every library is benchmarked best-effort: a failure is reported
            # and recorded as a NaN row so the sweep keeps going and every
            # configuration still has exactly one row per library.
            # time.perf_counter() is used for all timings because it is
            # monotonic and high-resolution, unlike time.time().

            # --- Annoy ---
            annoy_build_time = annoy_query_time = np.nan
            try:
                ann_index = AnnoyIndex(n_dims, annoy_metric)
                start = time.perf_counter()
                for i, vector in enumerate(X):
                    ann_index.add_item(i, vector)
                ann_index.build(10)  # 10 trees
                annoy_build_time = time.perf_counter() - start

                start = time.perf_counter()
                neighbors = ann_index.get_nns_by_vector(query[0], k)
                annoy_query_time = time.perf_counter() - start
            except Exception as e:
                print(f"  Annoy failed: {e}")
                # Discard a partial measurement (build succeeded, query failed).
                annoy_build_time = annoy_query_time = np.nan
            _record_result(metric, n_points, n_dims, 'Annoy',
                           annoy_build_time, annoy_query_time)

            # --- FAISS ---
            faiss_build_time = faiss_query_time = np.nan
            try:
                if metric == 'euclidean':
                    faiss_index = faiss.IndexFlatL2(n_dims)
                elif metric == 'inner_product':
                    faiss_index = faiss.IndexFlatIP(n_dims)
                else:
                    faiss_index = None
                    print(f"  FAISS does not support metric: {metric}")

                if faiss_index is not None:
                    start = time.perf_counter()
                    faiss_index.add(X)
                    faiss_build_time = time.perf_counter() - start

                    start = time.perf_counter()
                    distances, indices = faiss_index.search(query, k)
                    faiss_query_time = time.perf_counter() - start
            except Exception as e:
                print(f"  FAISS failed: {e}")
                faiss_build_time = faiss_query_time = np.nan
            # An unsupported metric is recorded as a NaN row as well, matching
            # how failures are recorded for the other libraries.
            _record_result(metric, n_points, n_dims, 'FAISS',
                           faiss_build_time, faiss_query_time)

            # --- HNSW ---
            hnsw_build_time = hnsw_query_time = np.nan
            try:
                hnsw_index = hnswlib.Index(space=hnsw_metric, dim=n_dims)
                start = time.perf_counter()
                hnsw_index.init_index(max_elements=n_points, ef_construction=200, M=16)
                hnsw_index.add_items(X)
                hnsw_build_time = time.perf_counter() - start

                # Query-time search parameter; set outside the timed sections.
                hnsw_index.set_ef(50)

                start = time.perf_counter()
                labels, distances = hnsw_index.knn_query(query, k=k)
                hnsw_query_time = time.perf_counter() - start
            except Exception as e:
                print(f"  HNSW failed: {e}")
                hnsw_build_time = hnsw_query_time = np.nan
            _record_result(metric, n_points, n_dims, 'HNSW',
                           hnsw_build_time, hnsw_query_time)
print("-" * 100)

Metric          | N Points        | Dimensions | Library    | Build Time (s)  | Query Time (s) 
----------------------------------------------------------------------------------------------------
Running: Metric=euclidean, N_Points=1000, Dimensions=2
Running: Metric=euclidean, N_Points=1000, Dimensions=5
Running: Metric=euclidean, N_Points=1000000, Dimensions=2
Running: Metric=euclidean, N_Points=1000000, Dimensions=5
Running: Metric=inner_product, N_Points=1000, Dimensions=2
Running: Metric=inner_product, N_Points=1000, Dimensions=5
Running: Metric=inner_product, N_Points=1000000, Dimensions=2
Running: Metric=inner_product, N_Points=1000000, Dimensions=5
----------------------------------------------------------------------------------------------------
