Nama : Aditya Atadewa  
Kelas : TI 3G  
NIM : 2341720174  
Absen : 01  

# Praktikum 6

Melakukan percobaan penggunaan ANNOY, FAISS, dan HNSWLIB pada dataset sekunder berukuran besar (Micro Spotify) pada link berikut: https://www.kaggle.com/datasets/bwandowando/spotify-songs-with-attributes-and-lyrics/data.

In [1]:
!pip install -q annoy faiss-cpu hnswlib

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/647.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━[0m [32m450.6/647.5 kB[0m [31m14.4 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m645.1/647.5 kB[0m [31m15.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m647.5/647.5 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for annoy (setup.py) ... [?25l[?25hdone
  Building wheel for hnswlib (pyproject.toml) ... [?25l[?25hdone


In [2]:
import kagglehub

path = kagglehub.dataset_download("bwandowando/spotify-songs-with-attributes-and-lyrics")


Using Colab cache for faster access to the 'spotify-songs-with-attributes-and-lyrics' dataset.


In [None]:
import pandas as pd
import numpy as np
import os
import time
import faiss
from annoy import AnnoyIndex
import hnswlib
from sklearn.neighbors import NearestNeighbors
from joblib import Parallel, delayed
from sklearn.preprocessing import StandardScaler

# Use all available CPU cores where possible
n_cores = os.cpu_count() or 1
os.environ.setdefault('OMP_NUM_THREADS', str(n_cores))
os.environ.setdefault('OPENBLAS_NUM_THREADS', str(n_cores))
os.environ.setdefault('MKL_NUM_THREADS', str(n_cores))
# Tell faiss to use multiple threads (if built with OpenMP)
try:
    faiss.omp_set_num_threads(n_cores)
except Exception:
    pass

# -------------------------------
# Load dataset (drop NaNs in chosen features)
# -------------------------------
df = pd.read_csv(f'{path}/songs_with_attributes_and_lyrics.csv')

features = ['danceability', 'energy', 'loudness', 'speechiness',
            'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']
df = df[features].dropna().reset_index(drop=True)
X = df.values

# Standardize and cast to float32 (required by faiss/hnswlib)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X).astype(np.float32)

n = X_scaled.shape[0]
k = 10  # jumlah nearest neighbors
# To keep this runnable on limited RAM, sample up to 1000 query points
n_queries = min(1000, n)
rng = np.random.default_rng(42)
query_idx = rng.choice(n, size=n_queries, replace=False)
# Xq = X_scaled[query_idx]
Xq = X_scaled

# -------------------------------
# Exact Nearest Neighbor (brute-force) - only for the sampled queries
# -------------------------------
t0 = time.time()
nn = NearestNeighbors(n_neighbors=k, algorithm='brute', metric='euclidean', n_jobs=-1)
nn.fit(X_scaled)
dist_exact, idx_exact = nn.kneighbors(Xq)
time_exact = time.time() - t0
print(f"Exact NN (queries={n_queries}) done in {time_exact:.3f} s")

# -------------------------------
# Annoy (build + query on sampled points)
# -------------------------------
t0 = time.time()
fdim = X_scaled.shape[1]
index_annoy = AnnoyIndex(fdim, 'euclidean')
for i, v in enumerate(X_scaled):
    index_annoy.add_item(i, v.tolist())
n_trees = 50
index_annoy.build(n_trees)
t_build_annoy = time.time() - t0

tq = time.time()
# Annoy: parallelize queries using joblib (threading) to utilize multiple cores
def _query_annoy(v):
    return index_annoy.get_nns_by_vector(v.tolist(), k)
idx_annoy = Parallel(n_jobs=n_cores, prefer='threads')(delayed(_query_annoy)(v) for v in Xq)
time_query_annoy = time.time() - tq
print(f"Annoy build: {t_build_annoy:.3f} s, query all: {time_query_annoy:.3f} s")

# -------------------------------
# HNSW (hnswlib)
# -------------------------------
t0 = time.time()
p = hnswlib.Index(space='l2', dim=fdim)
p.init_index(max_elements=n, ef_construction=200, M=16)
p.add_items(X_scaled)
p.set_ef(200)
t_build_hnsw = time.time() - t0

tq = time.time()
# hnswlib supports num_threads in knn_query
idx_hnsw, dist_hnsw = p.knn_query(Xq, k=k, num_threads=n_cores)
time_query_hnsw = time.time() - tq
print(f"HNSW build: {t_build_hnsw:.3f} s, query all: {time_query_hnsw:.3f} s")

# -------------------------------
# FAISS IVF (train on full set, query sampled points)
# -------------------------------
t0 = time.time()
quantizer = faiss.IndexFlatL2(fdim)
nlist = 100
index_faiss = faiss.IndexIVFFlat(quantizer, fdim, nlist, faiss.METRIC_L2)
# FAISS requires float32 and contiguous arrays
index_faiss.train(np.ascontiguousarray(X_scaled))
index_faiss.add(np.ascontiguousarray(X_scaled))
index_faiss.nprobe = 10
t_build_faiss = time.time() - t0

tq = time.time()
# FAISS can use multiple threads via set_num_threads if available
try:
    faiss.omp_set_num_threads(n_cores)
except Exception:
    pass
D_faiss, idx_faiss = index_faiss.search(np.ascontiguousarray(Xq), k)
time_query_faiss = time.time() - tq
print(f"FAISS build: {t_build_faiss:.3f} s, query all: {time_query_faiss:.3f} s")

# -------------------------------
# Evaluate recall@k for each ANN vs exact
# -------------------------------
def recall_at_k(true_idx, pred_idx, k):
    # true_idx: (n_queries, k), pred_idx: iterable of length n_queries with lists/arrays
    total = 0.0
    n = len(true_idx)
    for t, p in zip(true_idx, pred_idx):
        pset = set(p.tolist() if hasattr(p, 'tolist') else p)
        total += len(pset.intersection(set(t[:k]))) / float(k)
    return total / n

rec_annoy = recall_at_k(idx_exact, idx_annoy, k)
rec_hnsw = recall_at_k(idx_exact, idx_hnsw, k)
rec_faiss = recall_at_k(idx_exact, idx_faiss, k)

print('\nSummary (build time | query time for sampled points | recall@k)')
print(f"Exact:  - | {time_exact:.3f} s (queries only) | recall=1.00")
print(f"Annoy:  {t_build_annoy:.3f} s | {time_query_annoy:.3f} s | recall@{k}={rec_annoy:.4f}")
print(f"HNSW:   {t_build_hnsw:.3f} s | {time_query_hnsw:.3f} s | recall@{k}={rec_hnsw:.4f}")
print(f"FAISS:  {t_build_faiss:.3f} s | {time_query_faiss:.3f} s | recall@{k}={rec_faiss:.4f}")

# show top-5 neighbors for the first sampled query (original dataset index)
qid = query_idx[0]
print("\nTop-5 neighbors for first sampled query (dataset index = {})".format(int(qid)))
print(f"Exact NN: {idx_exact[0][:5]}")
print(f"Annoy:    {idx_annoy[0][:5]}")
print(f"HNSW:     {idx_hnsw[0][:5]}")
print(f"FAISS:    {idx_faiss[0][:5]}")

Exact NN (queries=1000) done in 3798.416 s
Annoy build: 75.093 s, query all: 253.940 s
HNSW build: 165.886 s, query all: 137.792 s
FAISS build: 0.426 s, query all: 717.884 s

Summary (build time | query time for sampled points | recall@k)
Exact:  - | 3798.416 s (queries only) | recall=1.00
Annoy:  75.093 s | 253.940 s | recall@10=0.9945
HNSW:   165.886 s | 137.792 s | recall@10=0.9955
FAISS:  0.426 s | 717.884 s | recall@10=0.9982

Top-5 neighbors for first sampled query (dataset index = 287796)
Exact NN: [     0 394553 764272 837727 749223]
Annoy:    [0, 394553, 764272, 837727, 749223]
HNSW:     [     0 394553 764272 837727 749223]
FAISS:    [     0 394553 764272 837727 749223]
