In [11]:
import os
import sys
from pathlib import Path

# Resolve the project root as the parent of the kernel's starting working
# directory, then make it both the working directory and an import root.
# The `globals()` guard keeps this cell idempotent: without it, re-running the
# cell after the first `os.chdir` would climb one more directory each time,
# and the relative "data/..." paths used by later cells would stop resolving.
if "PROJECT_ROOT" not in globals():
    PROJECT_ROOT = Path.cwd().parent
    os.chdir(PROJECT_ROOT)
# Put the root at the front of sys.path once, so repeated runs do not stack
# duplicate entries.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

print("Project root:", PROJECT_ROOT)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score



Project root: c:\Users\Adit\Desktop\425Proj\Project


In [12]:
# Load the manifest CSV and preview three of its columns.
df = pd.read_csv(
    "data/fma_manifest_5k_5genres_lyrics_whisper_dropped_removed.csv"
)
preview_cols = ["track_id", "genre", "lyrics_source"]
df.loc[:, preview_cols].head()


Unnamed: 0,track_id,genre,lyrics_source
0,32339,Pop,whisper
1,88892,Folk,genius
2,78851,Hip-Hop,genius
3,97570,Hip-Hop,
4,88868,Folk,genius


EASY TASKS

In [13]:
# Baseline audio features (MFCC), read from a precomputed .npy file.
mfcc_file = "data/audio_features_keptwhisper.npy"
X_mfcc = np.load(mfcc_file)
np.shape(X_mfcc)

(4182, 40)

In [14]:
# PCA + KMeans clustering (baseline)
#
# Project the MFCC features onto 8 principal components, cluster the
# projection into 3 groups, and score the clustering in that same space.
# `random_state` is passed to PCA as well as KMeans: PCA only consults it when
# a randomized/arpack SVD solver is selected, so it is harmless otherwise and
# keeps the projection (and therefore the scores below) repeatable across runs.
X_pca = PCA(n_components=8, random_state=42).fit_transform(X_mfcc)
labels_pca = KMeans(n_clusters=3, random_state=42).fit_predict(X_pca)

print("Silhouette:", silhouette_score(X_pca, labels_pca))
print("Calinski-Harabasz:", calinski_harabasz_score(X_pca, labels_pca))


Silhouette: 0.32368966937065125
Calinski-Harabasz: 3646.490335835081


In [15]:
# Dense VAE latent representation, read from a precomputed .npy file.
vae_latents_file = "data/audio_latents_vae.npy"
Z_vae = np.load(vae_latents_file)
np.shape(Z_vae)

(4182, 8)

In [16]:
# VAE + KMeans clustering: cluster the latent vectors into 3 groups and
# report the same two internal metrics, computed in the latent space.
kmeans_vae = KMeans(n_clusters=3, random_state=42)
labels_vae = kmeans_vae.fit_predict(Z_vae)

for metric_label, metric_fn in (
    ("Silhouette:", silhouette_score),
    ("Calinski-Harabasz:", calinski_harabasz_score),
):
    print(metric_label, metric_fn(Z_vae, labels_vae))


Silhouette: 0.35022908449172974
Calinski-Harabasz: 2327.941725387813


The dense VAE latent representation achieved a higher Silhouette score than the PCA baseline, indicating improved cluster separation. However, PCA produced a higher Calinski-Harabasz index, likely due to its variance-maximising objective, which may favour cluster dispersion even when semantic coherence is weaker.
This suggests that the VAE learns a more compact and semantically meaningful embedding despite capturing less global variance.