<a href="https://colab.research.google.com/github/FarrelAD/2341720081_ML_2025/blob/main/JS07/P6_JS07.ipynb" target="_blank">
    <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

# Preparation

In [1]:
!pip install -q faiss-cpu annoy hnswlib

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.6/23.6 MB[0m [31m84.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for hnswlib (pyproject.toml) ... [?25l[?25hdone


In [2]:
import os
import time
from pathlib import Path

import faiss
import hnswlib
import kagglehub
import numpy as np
import pandas as pd
from annoy import AnnoyIndex
from IPython.display import display
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler

# Load Dataset

In [3]:
# Fetch the latest version of the Spotify songs dataset from Kaggle.
# kagglehub is imported in the notebook's import cell; dataset_download
# returns the local directory that holds the dataset files.
dataset_path = kagglehub.dataset_download("bwandowando/spotify-songs-with-attributes-and-lyrics")

print("Path to dataset files:", dataset_path)

Path to dataset files: /kaggle/input/spotify-songs-with-attributes-and-lyrics


In [4]:
# Enumerate the downloaded files so the CSV to load can be identified.
# os is imported in the notebook's import cell. Note that os.listdir returns
# entries in arbitrary order, so a position in `files` is not a stable way to
# refer to a particular file.
files = os.listdir(dataset_path)
print(files)

['songs_with_lyrics_and_timestamps.csv', 'songs_with_attributes_and_lyrics.csv']


In [5]:
# Select the attributes+lyrics CSV by name rather than by its position in
# `files`: os.listdir order is arbitrary, so an index could silently load the
# other CSV (the timestamps file) on a different machine or filesystem.
SPOTIFY_CSV_NAME = "songs_with_attributes_and_lyrics.csv"

dataset_path = Path(dataset_path)
spotify_dataset_path = dataset_path / SPOTIFY_CSV_NAME
if not spotify_dataset_path.is_file():
    # Fail early with the directory listing instead of a bare read_csv error.
    raise FileNotFoundError(
        f"{SPOTIFY_CSV_NAME} not found in {dataset_path}; available files: {files}"
    )

df = pd.read_csv(spotify_dataset_path)
display(df.describe())
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# display() only appends a stray "None" to the cell output.
df.info()

Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
count,955320.0,955320.0,955320.0,955320.0,955320.0,955320.0,955320.0,955320.0,955320.0,955320.0
mean,0.55071,0.652442,-7.833732,0.083638,0.282962,0.081875,0.22019,0.488119,122.226093,234144.1
std,0.169784,0.238824,3.792018,0.092929,0.3118,0.212789,0.195938,0.251468,29.536303,90683.68
min,0.0,0.0,-60.0,0.0,0.0,0.0,0.0,0.0,0.0,1586.0
25%,0.436,0.482,-9.75,0.0345,0.0119,0.0,0.0989,0.282,99.021,184933.0
50%,0.558,0.687,-7.041,0.0478,0.142,3.9e-05,0.137,0.477,120.661,221307.0
75%,0.675,0.857,-5.148,0.087625,0.518,0.00866,0.285,0.69,140.094,265640.0
max,0.993,1.0,4.882,0.966,0.996,1.0,1.0,1.0,246.13,5764624.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 955320 entries, 0 to 955319
Data columns (total 17 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   id                955320 non-null  object 
 1   name              955309 non-null  object 
 2   album_name        385557 non-null  object 
 3   artists           955318 non-null  object 
 4   danceability      955320 non-null  float64
 5   energy            955320 non-null  float64
 6   key               955320 non-null  object 
 7   loudness          955320 non-null  float64
 8   mode              955320 non-null  object 
 9   speechiness       955320 non-null  float64
 10  acousticness      955320 non-null  float64
 11  instrumentalness  955320 non-null  float64
 12  liveness          955320 non-null  float64
 13  valence           955320 non-null  float64
 14  tempo             955320 non-null  float64
 15  duration_ms       955320 non-null  float64
 16  lyrics            95

None

# Feature Selection

In [6]:
# Numeric audio descriptors that form the vector representation of each song
# for the nearest-neighbor searches.
features = [
    'danceability',
    'energy',
    'loudness',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
]

# One row per song, one column per selected feature.
X = df[features].to_numpy()

# Data Preprocessing

In [7]:
# Standardize every feature column before measuring Euclidean distances, so
# wide-range columns such as tempo and loudness do not dominate the 0-1 ones.
# The fitted scaler is kept so the same transform can be reapplied later.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# K-Nearest Neighbor

In [8]:
# Number of neighbors retrieved per song by every method in this notebook.
# The songs are queried against an index that contains them, so a song's own
# entry normally occupies one of the k result slots.
k = 10

## Exact Nearest Neighbor

In [9]:
# Exact baseline: brute-force Euclidean k-NN, used as ground truth for the
# approximate methods. The timer covers both fitting and querying.
# perf_counter() is monotonic, so a long run is not distorted by system clock
# adjustments the way time.time() differences can be.
start = time.perf_counter()
nn = NearestNeighbors(n_neighbors=k, algorithm='brute', metric='euclidean')
nn.fit(X_scaled)

# Query with the fitted matrix itself: row i holds the neighbors of song i.
dist_exact, idx_exact = nn.kneighbors(X_scaled)
time_exact = time.perf_counter() - start

print(f"Exact NN done in {time_exact:.3f} s")

Exact NN done in 1527.607 s


## Annoy

In [10]:
# Annoy: forest of random-projection trees. The timer covers inserting the
# items, building the forest and querying every song, measured with the
# monotonic perf_counter() so clock adjustments cannot skew the duration.
start = time.perf_counter()
f = X_scaled.shape[1]  # vector dimensionality expected by the index
index_annoy = AnnoyIndex(f, 'euclidean')

# Item ids are the row positions, so results line up with the other methods.
for i, v in enumerate(X_scaled):
    index_annoy.add_item(i, v)

index_annoy.build(10)  # number of trees in the forest
# One query per song; each result is a plain Python list of k item ids.
idx_annoy = [index_annoy.get_nns_by_vector(v, k) for v in X_scaled]
time_annoy = time.perf_counter() - start

print(f"Annoy done in {time_annoy:.3f} s")

Annoy done in 46.450 s


## HNSW

In [11]:
# HNSW: graph-based approximate search. The timer covers index construction
# and the batch query, measured with the monotonic perf_counter() so clock
# adjustments cannot skew the duration.
start = time.perf_counter()

p_hnsw = hnswlib.Index(space='l2', dim=X_scaled.shape[1])
p_hnsw.init_index(max_elements=X_scaled.shape[0], ef_construction=200, M=16)
# No explicit ids are passed, so the labels returned by queries are compared
# against row positions in the summary below.
p_hnsw.add_items(X_scaled)
p_hnsw.set_ef(200)  # query-time ef, set after the index is built

# knn_query returns (labels, distances), in that order.
# NOTE(review): hnswlib documents the 'l2' space as squared L2, so dist_hnsw is
# presumably not on the same scale as dist_exact - confirm before comparing.
idx_hnsw, dist_hnsw = p_hnsw.knn_query(X_scaled, k=k)
time_hnsw = time.perf_counter() - start

print(f"HNSW done in {time_hnsw:.3f} s")

HNSW done in 105.003 s


## FAISS IVF

In [12]:
# FAISS IVF-Flat: vectors are bucketed into coarse cells and only a few cells
# are scanned per query. The timer covers conversion, training, adding and
# searching, measured with the monotonic perf_counter().
start = time.perf_counter()

# FAISS works on C-contiguous float32 arrays. Convert the float64 matrix once
# instead of handing it to train/add/search separately: older Python bindings
# reject float64 input and newer ones would re-convert it on every call.
X_faiss = np.ascontiguousarray(X_scaled, dtype=np.float32)
dim_faiss = X_faiss.shape[1]

quantizer = faiss.IndexFlatL2(dim_faiss)
index_faiss = faiss.IndexIVFFlat(quantizer, dim_faiss, 100, faiss.METRIC_L2)  # 100 coarse cells

index_faiss.train(X_faiss)
index_faiss.add(X_faiss)
index_faiss.nprobe = 10  # cells scanned per query

# NOTE(review): FAISS L2 indexes are documented to return squared distances, so
# dist_faiss is presumably not on the same scale as dist_exact - confirm.
dist_faiss, idx_faiss = index_faiss.search(X_faiss, k)
time_faiss = time.perf_counter() - start

print(f"FAISS IVF done in {time_faiss:.3f} s")

FAISS IVF done in 272.158 s


# Result Summary

In [13]:
# Side-by-side sanity check: the first five neighbor ids each method returned
# for the first song. Prefixes are padded so the id lists line up in a column.
print("Top-5 neighbors for first song:")
for row_prefix, neighbor_ids in (
    ("Exact NN: ", idx_exact),
    ("Annoy:    ", idx_annoy),
    ("HNSW:     ", idx_hnsw),
    ("FAISS:    ", idx_faiss),
):
    print(f"{row_prefix}{neighbor_ids[0][:5]}")

Top-5 neighbors for first song:
Exact NN: [     0 394553 764272 837727 749223]
Annoy:    [0, 837727, 749223, 833164, 523698]
HNSW:     [     0 394553 764272 837727 749223]
FAISS:    [     0 394553 764272 837727 749223]
