# LAB ASSIGMENT 1

In [6]:
import numpy as np
from time import time
from annoy import AnnoyIndex
import faiss
import hnswlib

In [7]:
# 1000/2
data = np.random.random((1000, 2)).astype('float32')

# --- ANNOY ---
f = 2
t = AnnoyIndex(f, 'euclidean')
for i in range(len(data)):
    t.add_item(i, data[i])

start = time()
t.build(10)
t.get_nns_by_item(0, 10) 
annoy_time = time() - start

# --- FAISS ---
index = faiss.IndexFlatL2(f)
index.add(data)
start = time()
index.search(data[:1], 10)
faiss_time = time() - start

# --- HNSW ---
p = hnswlib.Index(space='l2', dim=f)
p.init_index(max_elements=len(data), ef_construction=100, M=16)
p.add_items(data)
start = time()
p.knn_query(data[:1], k=10)
hnsw_time = time() - start

print("ANNOY:", annoy_time)
print("FAISS:", faiss_time)
print("HNSW:", hnsw_time)


ANNOY: 0.02262401580810547
FAISS: 0.00040912628173828125
HNSW: 0.0


In [9]:
# 1000/5
data = np.random.random((1000, 5)).astype('float32')

# --- ANNOY ---
f = data.shape[1]
t = AnnoyIndex(f, 'euclidean')
for i in range(len(data)):
    t.add_item(i, data[i])

start = time()
t.build(10)
t.get_nns_by_item(0, 10) 
annoy_time = time() - start

# --- FAISS ---
index = faiss.IndexFlatL2(f)
index.add(data)
start = time()
index.search(data[:1], 10)
faiss_time = time() - start

# --- HNSW ---
p = hnswlib.Index(space='l2', dim=f)
p.init_index(max_elements=len(data), ef_construction=100, M=16)
p.add_items(data)
start = time()
p.knn_query(data[:1], k=10)
hnsw_time = time() - start

print("ANNOY:", annoy_time)
print("FAISS:", faiss_time)
print("HNSW:", hnsw_time)


ANNOY: 0.011003971099853516
FAISS: 0.0
HNSW: 0.0


In [10]:
# 1.000.000/2
data = np.random.random((1000000, 2)).astype('float32')

# --- ANNOY ---
f = data.shape[1]
t = AnnoyIndex(f, 'euclidean')
for i in range(len(data)):
    t.add_item(i, data[i])

start = time()
t.build(10)
t.get_nns_by_item(0, 10) 
annoy_time = time() - start

# --- FAISS ---
index = faiss.IndexFlatL2(f)
index.add(data)
start = time()
index.search(data[:1], 10)
faiss_time = time() - start

# --- HNSW ---
p = hnswlib.Index(space='l2', dim=f)
p.init_index(max_elements=len(data), ef_construction=100, M=16)
p.add_items(data)
start = time()
p.knn_query(data[:1], k=10)
hnsw_time = time() - start

print("ANNOY:", annoy_time)
print("FAISS:", faiss_time)
print("HNSW:", hnsw_time)


ANNOY: 7.660905838012695
FAISS: 0.010268688201904297
HNSW: 0.005427360534667969


In [11]:
# 1.000.000/5
data = np.random.random((1000000, 5)).astype('float32')

# --- ANNOY ---
f = data.shape[1]
t = AnnoyIndex(f, 'euclidean')
for i in range(len(data)):
    t.add_item(i, data[i])

start = time()
t.build(10)
t.get_nns_by_item(0, 10) 
annoy_time = time() - start

# --- FAISS ---
index = faiss.IndexFlatL2(f)
index.add(data)
start = time()
index.search(data[:1], 10)
faiss_time = time() - start

# --- HNSW ---
p = hnswlib.Index(space='l2', dim=f)
p.init_index(max_elements=len(data), ef_construction=100, M=16)
p.add_items(data)
start = time()
p.knn_query(data[:1], k=10)
hnsw_time = time() - start

print("ANNOY:", annoy_time)
print("FAISS:", faiss_time)
print("HNSW:", hnsw_time)


ANNOY: 6.194023132324219
FAISS: 0.011307239532470703
HNSW: 0.0


In [12]:
data = {
    'Criteria (Rows/Dimension)': ['1000/2D', '1000/5D', '1000000/2D', '1000000/5D'],
    'ANNOY': [0.02, 0.001, 7.66, 6.19],
    'FAISS': [0.0004, 0.000, 0.01, 0.011],
    'HNSW': [0.00, 0.000, 0.005, 0.0]
}

df = pd.DataFrame(data)
print(df)

  Criteria (Rows/Dimension)  ANNOY   FAISS   HNSW
0                   1000/2D  0.020  0.0004  0.000
1                   1000/5D  0.001  0.0000  0.000
2                1000000/2D  7.660  0.0100  0.005
3                1000000/5D  6.190  0.0110  0.000


Based on the experimental results, FAISS and HNSWLIB consistently outperformed ANNOY in terms of execution time, especially on large datasets. While all algorithms performed similarly fast on small datasets (1000 samples), ANNOY became significantly slower as the data size increased to one million records, taking several seconds compared to the near-instant responses of FAISS and HNSWLIB. The number of dimensions (2D vs. 5D) had minimal impact on performance for all methods. Overall, FAISS and HNSWLIB demonstrated superior scalability and efficiency, making them more suitable for large-scale or real-time nearest neighbor searches, while ANNOY remains a simpler option for smaller datasets.


# LAB ASSIGMENT 2

In [13]:
import pandas as pd
import numpy as np
import time
import faiss
from annoy import AnnoyIndex
import hnswlib
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler

In [14]:
# Load dataset
df = pd.read_csv('songs_with_attributes_and_lyrics.csv')  # ganti path sesuai lokasi file
features = ['danceability', 'energy', 'loudness', 'speechiness', 
            'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']
X = df[features].values

# Standardisasi 
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Parameter
k = 10  
n_samples = X_scaled.shape[0]
dim = X_scaled.shape[1]

In [16]:
# 1. Exact Nearest Neighbors
print("Running Exact Nearest Neighbors...")
start = time.time()
nn_model = NearestNeighbors(n_neighbors=k, algorithm='brute', metric='euclidean')
nn_model.fit(X_scaled)
distances, indices = nn_model.kneighbors(X_scaled[:1])
exact_time = time.time() - start
print(f"Exact NN time: {exact_time:.4f} seconds")

Running Exact Nearest Neighbors...
Exact NN time: 0.1450 seconds


In [17]:
# 2. Annoy
print("\nRunning Annoy...")
annoy_index = AnnoyIndex(dim, 'euclidean')
for i in range(n_samples):
    annoy_index.add_item(i, X_scaled[i])
start = time.time()
annoy_index.build(10)  # number of trees
neighbors = annoy_index.get_nns_by_item(0, k)
annoy_time = time.time() - start
print(f"Annoy time: {annoy_time:.4f} seconds")


Running Annoy...
Annoy time: 5.5521 seconds


In [18]:
# 3. FAISS
print("\nRunning FAISS...")
index_faiss = faiss.IndexFlatL2(dim)
start = time.time()
index_faiss.add(X_scaled.astype('float32'))
D, I = index_faiss.search(X_scaled[:1].astype('float32'), k)
faiss_time = time.time() - start
print(f"FAISS time: {faiss_time:.4f} seconds")


Running FAISS...
FAISS time: 0.1783 seconds


In [19]:
# 4. HNSWLIB
print("\nRunning HNSWLIB...")
p = hnswlib.Index(space='l2', dim=dim)
p.init_index(max_elements=n_samples, ef_construction=100, M=16)
p.add_items(X_scaled)
start = time.time()
labels, distances = p.knn_query(X_scaled[:1], k=k)
hnsw_time = time.time() - start
print(f"HNSW time: {hnsw_time:.4f} seconds")


Running HNSWLIB...
HNSW time: 0.0014 seconds


In [20]:
import pandas as pd
results = pd.DataFrame({
    'Method': ['Exact NN', 'Annoy', 'FAISS', 'HNSWLIB'],
    'Execution Time (s)': [exact_time, annoy_time, faiss_time, hnsw_time]
})
print("\n=== Performance Comparison ===")
print(results)


=== Performance Comparison ===
     Method  Execution Time (s)
0  Exact NN            0.145045
1     Annoy            5.552124
2     FAISS            0.178330
3   HNSWLIB            0.001447


Based on the performance comparison, HNSWLIB achieved the fastest execution time among all methods, completing the nearest neighbor search in only 0.001 seconds. FAISS and Exact Nearest Neighbors showed comparable results, with FAISS being slightly slower than the exact search but still performing efficiently. In contrast, ANNOY was significantly slower, requiring over 5 seconds for the same task. These results indicate that HNSWLIB provides the best balance of speed and scalability, making it ideal for large-scale or real-time similarity search applications, while ANNOY is less efficient for high-performance use cases.