# Praktikum 4

In [None]:
import numpy as np
import time
from annoy import AnnoyIndex
import faiss
import hnswlib

# ===============================
# 1. Build a dataset of 1 million 5-D points
# ===============================
# Benchmark parameters: number of points, dimensionality, neighbours per query.
# Try n_data = 100_000 first if RAM is limited.
n_data, dim, k = 1_000_000, 5, 10

# Uniform random points in [0, 1), stored as float32.
X = np.random.random(size=(n_data, dim)).astype(np.float32)

# Single query point, kept 2-D (1 x dim) so it can be passed as a batch.
query = np.random.random(size=(1, dim)).astype(np.float32)

# ===============================
# 2. Annoy
# ===============================
print("=== Annoy ===")
ann_index = AnnoyIndex(dim, 'euclidean')

# time.perf_counter() is a monotonic, high-resolution clock; time.time() is
# wall-clock time and is too coarse/unstable for sub-millisecond queries.
start = time.perf_counter()
for i, vector in enumerate(X):
    ann_index.add_item(i, vector)
ann_index.build(10)  # 10 trees
build_time = time.perf_counter() - start

start = time.perf_counter()
# include_distances=True is requested, so element [0] (printed below) holds
# the neighbour ids.
neighbors = ann_index.get_nns_by_vector(query[0], k, include_distances=True)
query_time = time.perf_counter() - start

print("Build time:", build_time, "detik")
print("Query time:", query_time, "detik")
print("Neighbors:", neighbors[0][:5], "...")

# ===============================
# 3. FAISS (Flat Index)
# ===============================
print("\n=== FAISS (IndexFlatL2) ===")
faiss_index = faiss.IndexFlatL2(dim)

# time.perf_counter() is a monotonic, high-resolution clock; time.time() is
# wall-clock time and is too coarse/unstable for millisecond-scale timings.
start = time.perf_counter()
faiss_index.add(X)
build_time = time.perf_counter() - start

start = time.perf_counter()
# search() takes a 2-D batch of queries; `query` has shape (1, dim).
distances, indices = faiss_index.search(query, k)
query_time = time.perf_counter() - start

print("Build time:", build_time, "detik")
print("Query time:", query_time, "detik")
print("Neighbors:", indices[0][:5], "...")

# ===============================
# 4. HNSW (hnswlib)
# ===============================
print("\n=== HNSW (hnswlib) ===")
hnsw_index = hnswlib.Index(space='l2', dim=dim)

# time.perf_counter() is a monotonic, high-resolution clock; time.time() is
# wall-clock time and is too coarse/unstable for sub-millisecond queries.
# Index initialisation is deliberately counted as part of the build time.
start = time.perf_counter()
hnsw_index.init_index(max_elements=n_data, ef_construction=200, M=16)
hnsw_index.add_items(X)
build_time = time.perf_counter() - start

# Query-time ef is set outside the timed regions.
hnsw_index.set_ef(50)

start = time.perf_counter()
labels, distances = hnsw_index.knn_query(query, k=k)
query_time = time.perf_counter() - start

print("Build time:", build_time, "detik")
print("Query time:", query_time, "detik")
print("Neighbors:", labels[0][:5], "...")


=== Annoy ===
Build time: 22.078585386276245 detik
Query time: 0.0002734661102294922 detik
Neighbors: [129179, 559276, 630783, 213191, 559393] ...

=== FAISS (IndexFlatL2) ===
Build time: 0.01495671272277832 detik
Query time: 0.0058858394622802734 detik
Neighbors: [129179 559276 630783 213191 559393] ...

=== HNSW (hnswlib) ===
Build time: 179.58612179756165 detik
Query time: 0.00035119056701660156 detik
Neighbors: [129179 559276 630783 213191 559393] ...


In [None]:
import numpy as np
import time
import pandas as pd
from annoy import AnnoyIndex
import faiss
import hnswlib

# ========================================
# Experiment runner for a single metric
# ========================================
def _result_row(library, metric, dim, n_data, build_time, query_time):
    """Return one result record keyed by the results-table column names."""
    return {
        'Library': library,
        'Metric Distance': metric,
        'Dimensi Data': dim,
        'Jumlah Data': n_data,
        'Waktu Build (detik)': round(build_time, 4),
        'Waktu Query (detik)': round(query_time, 4),
    }


def run_experiment(metric='euclidean', n_data=1_000_000, dim=5, k=10):
    """Benchmark build and single-query time for every library matching `metric`.

    The metric name is matched literally against each library's branch:

    * Annoy   runs for 'euclidean' and 'angular'.
    * FAISS   runs for 'euclidean' / 'L2' (IndexFlatL2) and 'IP' / 'inner'
      (IndexFlatIP).
    * hnswlib runs for 'l2' and 'cosine'.

    Args:
        metric: Metric name selecting which libraries are benchmarked.
        n_data: Number of random points to index.
        dim: Dimensionality of each point.
        k: Number of neighbours requested from each query.

    Returns:
        A list with one dict per library that ran, holding the library name,
        metric, dimension, data size, and build/query times in seconds
        rounded to 4 decimals. The list is empty when `metric` matches no
        library.

    Note:
        Seeds NumPy's global RNG with 42 so every call uses the same data;
        this resets the global random state as a side effect.
    """
    np.random.seed(42)
    X = np.random.random((n_data, dim)).astype(np.float32)
    query = np.random.random((1, dim)).astype(np.float32)

    results = []

    # time.perf_counter() is used throughout: it is monotonic and
    # high-resolution, whereas time.time() is wall-clock time and too coarse
    # for the sub-millisecond query timings measured here.

    # ========== 1. Annoy ==========
    if metric in ('euclidean', 'angular'):
        ann_index = AnnoyIndex(dim, metric)
        start = time.perf_counter()
        for i, vector in enumerate(X):
            ann_index.add_item(i, vector)
        ann_index.build(10)
        build_time = time.perf_counter() - start

        start = time.perf_counter()
        ann_index.get_nns_by_vector(query[0], k, include_distances=True)
        query_time = time.perf_counter() - start

        results.append(
            _result_row('Annoy', metric, dim, n_data, build_time, query_time)
        )

    # ========== 2. FAISS ==========
    if metric in ('euclidean', 'L2'):
        faiss_index = faiss.IndexFlatL2(dim)
    elif metric in ('IP', 'inner'):
        faiss_index = faiss.IndexFlatIP(dim)
    else:
        faiss_index = None

    if faiss_index is not None:
        start = time.perf_counter()
        faiss_index.add(X)
        build_time = time.perf_counter() - start

        start = time.perf_counter()
        faiss_index.search(query, k)
        query_time = time.perf_counter() - start

        results.append(
            _result_row('FAISS', metric, dim, n_data, build_time, query_time)
        )

    # ========== 3. HNSW ==========
    if metric in ('l2', 'cosine'):
        hnsw_index = hnswlib.Index(space=metric, dim=dim)
        # Index initialisation is counted as part of the build time.
        start = time.perf_counter()
        hnsw_index.init_index(max_elements=n_data, ef_construction=200, M=16)
        hnsw_index.add_items(X)
        build_time = time.perf_counter() - start

        hnsw_index.set_ef(50)
        start = time.perf_counter()
        hnsw_index.knn_query(query, k=k)
        query_time = time.perf_counter() - start

        results.append(
            _result_row('HNSWLIB', metric, dim, n_data, build_time, query_time)
        )

    return results


# ========================================
# Run every metric combination
# ========================================
# Each name is matched literally by run_experiment, so library-specific
# spellings ('L2' vs 'l2') are listed separately.
metrics = ['euclidean', 'angular', 'L2', 'IP', 'l2', 'cosine']
all_results = []

for metric in metrics:
    print(f"Menjalankan percobaan untuk metric: {metric}")
    # Raise n_data to 1_000_000 if enough RAM is available.
    all_results += run_experiment(metric=metric, n_data=100_000, dim=5)

# ========================================
# Results table
# ========================================
# Column order for the printed table; names match the keys of each result dict.
column_order = [
    'Library',
    'Metric Distance',
    'Dimensi Data',
    'Jumlah Data',
    'Waktu Build (detik)',
    'Waktu Query (detik)',
]
df = pd.DataFrame(all_results, columns=column_order)

print("\n=== HASIL PERCOBAAN ANN, FAISS, HNSW ===")
print(df.to_string(index=False))


Menjalankan percobaan untuk metric: euclidean
Menjalankan percobaan untuk metric: angular
Menjalankan percobaan untuk metric: L2
Menjalankan percobaan untuk metric: IP
Menjalankan percobaan untuk metric: l2
Menjalankan percobaan untuk metric: cosine

=== HASIL PERCOBAAN ANN, FAISS, HNSW ===
Library Metric Distance  Dimensi Data  Jumlah Data  Waktu Build (detik)  Waktu Query (detik)
  Annoy       euclidean             5       100000               1.6861               0.0001
  FAISS       euclidean             5       100000               0.0005               0.0006
  Annoy         angular             5       100000               2.3241               0.0001
  FAISS              L2             5       100000               0.0004               0.0006
  FAISS              IP             5       100000               0.0004               0.0005
HNSWLIB              l2             5       100000              14.4342               0.0001
HNSWLIB          cosine             5       100000       