In [2]:
!pip install hdbscan

Collecting hdbscan
  Downloading hdbscan-0.8.40-cp312-cp312-macosx_10_13_universal2.whl.metadata (15 kB)
Downloading hdbscan-0.8.40-cp312-cp312-macosx_10_13_universal2.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m2.7 MB/s[0m  [33m0:00:00[0mm eta [36m0:00:01[0m
[?25hInstalling collected packages: hdbscan
Successfully installed hdbscan-0.8.40


In [3]:
import faiss
import numpy as np
import hdbscan
import json



In [4]:
# ---------------------------
# 1. Load FAISS index
# ---------------------------
index = faiss.read_index("faiss_index.bin")
d = index.d
n = index.ntotal
vectors = np.zeros((n, d), dtype=np.float32)
index.reconstruct_n(0, n, vectors)  # get all vectors



array([[-1.8594841e-02,  2.2015141e-02,  1.1109457e-02, ...,
         3.8874050e-04,  2.7885014e-04,  1.1467549e-04],
       [-1.8594801e-02,  2.2015095e-02,  1.1109433e-02, ...,
        -8.3451348e-05,  2.4739438e-04,  8.6424167e-05],
       [-3.0911859e-02,  2.2164574e-02,  3.4151442e-04, ...,
         1.8507249e-04,  2.9084086e-04,  1.5904200e-04],
       ...,
       [-4.1083451e-02,  4.1397393e-02,  5.8253305e-03, ...,
        -4.1099440e-04,  3.5400767e-04,  1.6999725e-04],
       [-4.3202877e-02,  4.6174712e-02,  6.6729584e-03, ...,
         4.5829275e-04,  3.6781872e-04,  1.2223603e-04],
       [-3.9981917e-02,  3.8247123e-02, -3.4121319e-04, ...,
         7.5836346e-04,  3.5032665e-04,  2.0373805e-04]], dtype=float32)

In [7]:
# ---------------------------
# 2. HDBSCAN clustering
# ---------------------------
clusterer = hdbscan.HDBSCAN(
    min_cluster_size=5,
    metric='euclidean'  # or 'cosine' if vectors are normalized
)
labels = clusterer.fit_predict(vectors)




In [10]:
unique_labels = set(labels)
cluster_centroids = {}
for lbl in unique_labels:
    if lbl == -1:
        continue  # skip noise
    cluster_centroids[int(lbl)] = vectors[labels == lbl].mean(axis=0).tolist()


In [11]:
# ---------------------------
# 4. Export cluster info
# ---------------------------
cluster_data = {
    "labels": labels.tolist(),          # cluster label for each song (same order as FAISS)
    "centroids": cluster_centroids      # cluster centroids for reuse
}

with open("clusters.json", "w") as f:
    json.dump(cluster_data, f, indent=2)

print("Clusters saved to 'clusters.json'")
print(f"Found {len(cluster_centroids)} clusters (noise excluded)")

Clusters saved to 'clusters.json'
Found 2 clusters (noise excluded)
