# Praktikum 6

In [None]:
# Mount Google Drive inside the Colab runtime; the dataset CSV is read from
# a path under /content/drive further below.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
import time
import faiss
from annoy import AnnoyIndex
import hnswlib
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler

# -------------------------------
# Load the dataset and build the feature matrix
# -------------------------------
print("Loading dataset...")
df = pd.read_csv('/content/drive/MyDrive/Dataset/songs_with_attributes_and_lyrics.csv')

# Columns that together form the vector representation of one song.
features = [
    'danceability',
    'energy',
    'loudness',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
]
X = df.loc[:, features].values
print(f"Dataset loaded with {X.shape[0]} samples and {X.shape[1]} features.")


# Standardize the features so that columns measured on larger scales do not
# dominate the Euclidean distances used by every index below.
print("Scaling features...")
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
print("Features scaled.")

# Number of nearest neighbors requested from every search method.
k = 10

# -------------------------------
# Exact nearest neighbors (brute force), restricted to a small subset
# -------------------------------
print("\nRunning Exact NN (Brute Force) on a subset...")
# The first n_exact_samples rows serve both as the searchable points and as
# the queries, so every returned index lies in 0 .. n_exact_samples - 1 and
# neighbors outside that slice can never be reported.
n_exact_samples = 1000
X_exact_subset = X_scaled[:n_exact_samples]

# The timer covers fitting the estimator and running all subset queries.
start = time.time()
nn = NearestNeighbors(
    n_neighbors=k,
    algorithm='brute',
    metric='euclidean',
).fit(X_exact_subset)
# Each query row is itself one of the fitted points.
dist_exact_subset, idx_exact_subset = nn.kneighbors(X_exact_subset)
time_exact_subset = time.time() - start
print(f"Exact NN on subset done in {time_exact_subset:.3f} s")


# -------------------------------
# Annoy index over the full dataset
# -------------------------------
print("\nRunning Annoy on the full dataset...")
# The timer covers adding items, building the index and the subset queries.
start = time.time()
f = X_scaled.shape[1]  # dimensionality of every stored vector
index_annoy = AnnoyIndex(f, 'euclidean')
# Item ids are the row positions in X_scaled.
for row_id, vector in enumerate(X_scaled):
    index_annoy.add_item(row_id, vector)
index_annoy.build(10)  # argument is the number of trees
# Query with the same rows that Exact NN used, one neighbor list per row.
idx_annoy_subset = [
    index_annoy.get_nns_by_vector(X_scaled[query_row], k)
    for query_row in range(n_exact_samples)
]
time_annoy = time.time() - start
print(f"Annoy done in {time_annoy:.3f} s")

# -------------------------------
# HNSW index (hnswlib) over the full dataset
# -------------------------------
print("\nRunning HNSW on the full dataset...")
# The timer covers index construction, insertion and the subset queries.
start = time.time()
p_hnsw = hnswlib.Index(space='l2', dim=X_scaled.shape[1])
p_hnsw.init_index(
    max_elements=X_scaled.shape[0],
    ef_construction=200,
    M=16,
)
p_hnsw.add_items(X_scaled)  # no explicit ids are passed
p_hnsw.set_ef(200)  # query-time ef, set after all items are inserted
# Query with the same rows that Exact NN used; knn_query returns the labels
# first and the distances second.
# NOTE(review): hnswlib's 'l2' space is documented as squared L2, so these
# distances are presumably not on the same scale as dist_exact_subset - confirm.
hnsw_result = p_hnsw.knn_query(X_scaled[:n_exact_samples], k=k)
idx_hnsw_subset, dist_hnsw_subset = hnsw_result
time_hnsw = time.time() - start
print(f"HNSW done in {time_hnsw:.3f} s")

# -------------------------------
# FAISS IVF (inverted-file) index over the full dataset
# -------------------------------
print("\nRunning FAISS IVF on the full dataset...")
# The timer covers data conversion, training, insertion and the subset queries.
start = time.time()
# FAISS operates on C-contiguous float32 matrices, while the scaler output is
# normally float64. Convert once up front instead of depending on an implicit
# conversion inside train(), add() and search().
X_faiss = np.ascontiguousarray(X_scaled, dtype=np.float32)
n_dims = X_faiss.shape[1]
quantizer = faiss.IndexFlatL2(n_dims)
# The SWIG-generated constructor takes positional arguments only:
# (quantizer, dimension, nlist, metric). Here nlist = 100 inverted lists.
index_faiss = faiss.IndexIVFFlat(quantizer, n_dims, 100, faiss.METRIC_L2)
index_faiss.train(X_faiss)
index_faiss.add(X_faiss)
index_faiss.nprobe = 10  # inverted lists visited per query
# Query with the same rows that Exact NN used; search returns the distances
# first and the indices second.
dist_faiss_subset, idx_faiss_subset = index_faiss.search(X_faiss[:n_exact_samples], k)
time_faiss = time.time() - start
print(f"FAISS IVF done in {time_faiss:.3f} s")

# -------------------------------
# Example: show the top-5 neighbors of the first item (first row of the subset)
# -------------------------------
print("\nTop-5 neighbors for first song (from subset):")
# Exact NN was fitted on the first n_exact_samples rows only, whereas Annoy,
# HNSW and FAISS index every row of X_scaled. The approximate methods can
# therefore return rows the exact search never saw, so the lists are not
# expected to match.
print(f"Exact NN: {idx_exact_subset[0][:5]}")
print(f"Annoy:    {idx_annoy_subset[0][:5]}")
print(f"HNSW:     {idx_hnsw_subset[0][:5]}")
print(f"FAISS:    {idx_faiss_subset[0][:5]}")

# -------------------------------
# Show the timing summary
# Note: the Exact NN time covers fitting and querying the subset only; the
# other timings cover building the index on the full dataset plus querying
# the subset, so the figures are not directly comparable.
# -------------------------------
print("\n=== Ringkasan Waktu (detik) ===")
print(f"Exact NN (Subset): {time_exact_subset:.3f}")
print(f"Annoy (Full):      {time_annoy:.3f}")
print(f"HNSW (Full):       {time_hnsw:.3f}")
print(f"FAISS (Full):      {time_faiss:.3f}")

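Karena ukuran dataset yang besar, beberapa operasi seperti pencarian brute force membutuhkan waktu yang lama jika dijalankan di cloud (Google Colab). Oleh karena itu, Exact NN hanya dijalankan pada subset 1000 data pertama, sedangkan Annoy, HNSW, dan FAISS dijalankan pada seluruh dataset.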