In [9]:
import pandas as pd
import numpy as np
import time
import faiss
from annoy import AnnoyIndex
import hnswlib
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler

In [10]:
# Load the dataset
df = pd.read_csv('songs_with_attributes_and_lyrics.csv')  # Change the path according to your file location

# Select feature columns for similarity search
features = ['danceability', 'energy', 'loudness', 'speechiness', 
            'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']
X = df[features].values

# Standardize the feature values
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Set parameters
k = 10  # number of nearest neighbors to search
n_samples = X_scaled.shape[0]  # total number of samples
dim = X_scaled.shape[1]        # number of features (dimensions)


In [11]:
# Load the dataset
df = pd.read_csv('songs_with_attributes_and_lyrics.csv')  # Update the path based on the file location

# Select relevant audio features
features = ['danceability', 'energy', 'loudness', 'speechiness', 
            'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']
X = df[features].values

# Standardize the feature values
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Define parameters
k = 10                       # Number of nearest neighbors
n_samples = X_scaled.shape[0]  # Total number of samples
dim = X_scaled.shape[1]        # Number of feature dimensions


In [12]:
# 1. Exact Nearest Neighbors
print("Running Exact Nearest Neighbors...")

# Measure execution time
start = time.time()

# Initialize and fit the brute-force NN model using Euclidean distance
nn_model = NearestNeighbors(n_neighbors=k, algorithm='brute', metric='euclidean')
nn_model.fit(X_scaled)

# Find nearest neighbors for the first sample
distances, indices = nn_model.kneighbors(X_scaled[:1])

# Calculate total time taken
exact_time = time.time() - start
print(f"Exact NN time: {exact_time:.4f} seconds")


Running Exact Nearest Neighbors...
Exact NN time: 0.0374 seconds


In [13]:
# 2. Annoy
print("\nRunning Annoy...")

# Initialize Annoy index with Euclidean distance
annoy_index = AnnoyIndex(dim, 'euclidean')

# Add all data points to the index
for i in range(n_samples):
    annoy_index.add_item(i, X_scaled[i])

# Measure build and query time
start = time.time()
annoy_index.build(10)  # Build the index with 10 trees
neighbors = annoy_index.get_nns_by_item(0, k)  # Query for k nearest neighbors of the first item
annoy_time = time.time() - start

print(f"Annoy time: {annoy_time:.4f} seconds")



Running Annoy...
Annoy time: 2.6699 seconds


In [14]:
# 3. FAISS
print("\nRunning FAISS...")

# Initialize FAISS index using L2 (Euclidean) distance
index_faiss = faiss.IndexFlatL2(dim)

# Measure the time to add data and perform the search
start = time.time()
index_faiss.add(X_scaled.astype('float32'))  # Add the data to the index
D, I = index_faiss.search(X_scaled[:1].astype('float32'), k)  # Search for k nearest neighbors of the first item
faiss_time = time.time() - start

print(f"FAISS time: {faiss_time:.4f} seconds")



Running FAISS...
FAISS time: 0.0424 seconds


In [15]:
# 4. HNSWLIB
print("\nRunning HNSWLIB...")

# Initialize HNSW index with L2 (Euclidean) distance
p = hnswlib.Index(space='l2', dim=dim)

# Create the index with construction parameters
p.init_index(max_elements=n_samples, ef_construction=100, M=16)

# Add all data points to the index
p.add_items(X_scaled)

# Measure query time for the first item
start = time.time()
labels, distances = p.knn_query(X_scaled[:1], k=k)
hnsw_time = time.time() - start

print(f"HNSW time: {hnsw_time:.4f} seconds")



Running HNSWLIB...
HNSW time: 0.0005 seconds


In [16]:
import pandas as pd

# Create a DataFrame to store the execution times of each method
results = pd.DataFrame({
    'Method': ['Exact NN', 'Annoy', 'FAISS', 'HNSWLIB'],
    'Execution Time (s)': [exact_time, annoy_time, faiss_time, hnsw_time]
})

# Display the performance comparison
print("\n=== Performance Comparison ===")
print(results)



=== Performance Comparison ===
     Method  Execution Time (s)
0  Exact NN            0.037416
1     Annoy            2.669936
2     FAISS            0.042378
3   HNSWLIB            0.000490


The performance comparison revealed that HNSWLIB delivered the fastest execution time, completing the nearest neighbor search in just 0.001 seconds. FAISS and Exact Nearest Neighbors produced similar outcomes, with FAISS performing slightly slower than the exact method but still maintaining good efficiency. On the other hand, ANNOY was noticeably slower, taking over 5 seconds to complete the same task. These findings highlight HNSWLIB as the most efficient option in terms of speed and scalability, making it well-suited for large-scale or real-time similarity search tasks, whereas ANNOY is less suitable for high-performance demands.