# Praktikum 6

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from annoy import AnnoyIndex
import hnswlib
import faiss
import time

# ===============================
#  Load dataset (local)
# ===============================
# A raw string keeps the Windows backslashes literal without manual escaping;
# the resulting path is character-for-character the same as before.
df = pd.read_csv(r"C:\kuliah\Machine Leaning\2341720003_ML_2025\data\songs_with_attributes_and_lyrics.csv")

# Check the initial size
print("Total data:", df.shape)

# ===============================
#  Prepare numeric features
# ===============================
features = ['danceability', 'energy', 'loudness', 'speechiness',
            'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']

# Work on a reproducible random subset. Capping the size at len(df) avoids the
# ValueError that DataFrame.sample raises when asked for more rows than exist.
df = df.sample(min(10000, len(df)), random_state=42)

# Drop sampled rows with a missing feature value so every search library
# below receives finite vectors; this is a no-op when the sample has no NaN.
df = df.dropna(subset=features)
print("Subset data:", df.shape)

X = df[features].to_numpy()

# Standardize the features (zero mean, unit variance per column) so that no
# single attribute dominates the Euclidean distance.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

k = 10  # number of nearest neighbors to retrieve

# ===============================
#  Exact Nearest Neighbor
# ===============================
# Brute-force Euclidean search; serves as the reference the approximate
# indexes below are compared against.
# perf_counter() is the monotonic, high-resolution clock meant for measuring
# short durations; time.time() is a wall clock that can be adjusted mid-run.
start = time.perf_counter()
nn = NearestNeighbors(n_neighbors=k, algorithm='brute', metric='euclidean')
nn.fit(X_scaled)
# The indexed points are also the queries, so each row's result is expected
# to start with the row itself.
dist_exact, idx_exact = nn.kneighbors(X_scaled)
# The measured span covers both fitting and querying.
time_exact = time.perf_counter() - start
print(f"Exact NN done in {time_exact:.3f} s")

# ===============================
#  Annoy
# ===============================
# perf_counter() is the monotonic, high-resolution clock meant for measuring
# short durations; time.time() is a wall clock that can be adjusted mid-run.
start = time.perf_counter()
f = X_scaled.shape[1]  # vector dimensionality
index_annoy = AnnoyIndex(f, 'euclidean')
# Fix the seed before adding items so the random-projection forest, and with
# it the returned neighbors, is reproducible across runs.
index_annoy.set_seed(42)
for i, v in enumerate(X_scaled):
    index_annoy.add_item(i, v)
index_annoy.build(10)  # 10 trees
# Query every indexed vector; each entry is a plain Python list of item ids.
idx_annoy = [index_annoy.get_nns_by_vector(v, k) for v in X_scaled]
# The measured span covers adding items, building the forest and querying.
time_annoy = time.perf_counter() - start
print(f"Annoy done in {time_annoy:.3f} s")

# ===============================
#  HNSW
# ===============================
# perf_counter() is the monotonic, high-resolution clock meant for measuring
# short durations; time.time() is a wall clock that can be adjusted mid-run.
start = time.perf_counter()
p_hnsw = hnswlib.Index(space='l2', dim=X_scaled.shape[1])
p_hnsw.init_index(max_elements=X_scaled.shape[0], ef_construction=100, M=8)
p_hnsw.add_items(X_scaled)
p_hnsw.set_ef(100)  # query-time ef; kept well above k
# knn_query returns (labels, distances), in that order.
# NOTE(review): hnswlib documents the 'l2' space as squared L2 - confirm
# before comparing dist_hnsw numerically with dist_exact.
idx_hnsw, dist_hnsw = p_hnsw.knn_query(X_scaled, k=k)
# The measured span covers index construction, insertion and querying.
time_hnsw = time.perf_counter() - start
print(f"HNSW done in {time_hnsw:.3f} s")

# ===============================
#  FAISS IVF
# ===============================
# perf_counter() is the monotonic, high-resolution clock meant for measuring
# short durations; time.time() is a wall clock that can be adjusted mid-run.
start = time.perf_counter()
# FAISS works on C-contiguous float32 matrices. Convert once, explicitly,
# instead of relying on the Python wrapper to coerce the float64 array on
# every call (older wrappers reject non-float32 input outright).
X_faiss = X_scaled.astype('float32', order='C')
quantizer = faiss.IndexFlatL2(X_faiss.shape[1])
# Inverted-file index with 50 coarse cells over the flat L2 quantizer.
index_faiss = faiss.IndexIVFFlat(quantizer, X_faiss.shape[1], 50, faiss.METRIC_L2)
index_faiss.train(X_faiss)
index_faiss.add(X_faiss)
index_faiss.nprobe = 5  # number of cells visited per query
dist_faiss, idx_faiss = index_faiss.search(X_faiss, k)
# The measured span covers conversion, training, adding and searching.
time_faiss = time.perf_counter() - start
print(f"FAISS IVF done in {time_faiss:.3f} s")

# ===============================
#  Sample results
# ===============================
# Print the first five neighbor ids each method returned for the first row.
# The padded labels keep the columns aligned exactly as before.
print("\nTop-5 neighbors for first song:")
for method_label, neighbor_ids in (
    ("Exact NN: ", idx_exact),
    ("Annoy:    ", idx_annoy),
    ("HNSW:     ", idx_hnsw),
    ("FAISS:    ", idx_faiss),
):
    print(f"{method_label}{neighbor_ids[0][:5]}")

# ===============================
#  Runtime summary
# ===============================
# One line per method with the elapsed seconds measured in its section.
print("\nSummary runtime:")
for method_label, elapsed_seconds in (
    ("Exact NN :", time_exact),
    ("Annoy    :", time_annoy),
    ("HNSW     :", time_hnsw),
    ("FAISS    :", time_faiss),
):
    print(f"{method_label} {elapsed_seconds:.3f} s")


Total data: (955320, 17)
Subset data: (10000, 17)
Subset data: (10000, 17)


[WinError 2] The system cannot find the file specified
  File "c:\Users\sfati\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
        "wmic CPU Get NumberOfCores /Format:csv".split(),
        capture_output=True,
        text=True,
    )
  File "c:\Users\sfati\anaconda3\Lib\subprocess.py", line 554, in run
    with Popen(*popenargs, **kwargs) as process:
         ~~~~~^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\sfati\anaconda3\Lib\subprocess.py", line 1039, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
    ~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
                        pass_fds, cwd, env,
                        ^^^^^^^^^^^^^^^^^^^
    ...<5 lines>...
                        gid, gids, uid, umask,
                        ^^^^^^^^^^^^^^^^^^^^^^
                        start_new_session, process_group)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^

Exact NN done in 0.518 s
Annoy done in 0.461 s
HNSW done in 0.205 s
Annoy done in 0.461 s
HNSW done in 0.205 s
FAISS IVF done in 0.057 s

Top-5 neighbors for first song:
Exact NN: [   0 6847 7747 5962 4665]
Annoy:    [0, 6847, 7747, 5962, 4665]
HNSW:     [   0 6847 7747 5962 4665]
FAISS:    [   0 6847 7747 5962 4665]

Summary runtime:
Exact NN : 0.518 s
Annoy    : 0.461 s
HNSW     : 0.205 s
FAISS    : 0.057 s
FAISS IVF done in 0.057 s

Top-5 neighbors for first song:
Exact NN: [   0 6847 7747 5962 4665]
Annoy:    [0, 6847, 7747, 5962, 4665]
HNSW:     [   0 6847 7747 5962 4665]
FAISS:    [   0 6847 7747 5962 4665]

Summary runtime:
Exact NN : 0.518 s
Annoy    : 0.461 s
HNSW     : 0.205 s
FAISS    : 0.057 s


**Analisis**
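Pada contoh top-5 untuk lagu pertama, semua metode approximate (Annoy, HNSW, FAISS) menghasilkan tetangga yang sama persis dengan brute-force. Dari sisi waktu, FAISS IVF (0.057 s) dan HNSW (0.205 s) jauh lebih cepat daripada Exact NN (0.518 s), sedangkan Annoy (0.461 s) hanya sedikit lebih cepat.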
FAISS adalah yang paling efisien secara waktu, sehingga paling direkomendasikan untuk dataset besar seperti jutaan lagu Spotify.
