In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import umap.umap_ as umap

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the features extracted from the audio files
df = pd.read_csv("../data/features_audio.csv")

# Derive country and style from the file name, which is assumed to follow the
# pattern "<country>_<style>_..." (TODO confirm against the dataset).
# Split once, vectorized, instead of re-splitting every name per column.
name_parts = df["filename"].str.split("_")
df["country"] = name_parts.str[0]
df["style"] = name_parts.str[1]

# `.str[1]` yields NaN when a name has no "_" (or is missing); fail loudly with
# a clear message rather than carrying NaN labels into the plots and metrics.
if df["style"].isna().any():
    bad_names = df.loc[df["style"].isna(), "filename"].tolist()
    raise ValueError(f"filenames not matching '<country>_<style>_...': {bad_names}")

# Strip only the trailing extension; a plain replace would also remove ".wav"
# if it appeared in the middle of a name.
df["track_id"] = df["filename"].str.removesuffix(".wav")


In [3]:
# Pick the feature columns by name prefix (MFCCs and spectral descriptors)
feature_cols = [
    name
    for name in df.columns
    if name.startswith(("mfcc", "centroid", "rolloff", "zcr", "flatness"))
]
X = df.loc[:, feature_cols].copy()

# Standardize the selected features with StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [4]:
# Project the standardized features onto the first two principal components
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Transposing gives one row per component, unpacked into one column each
df["pca_1"], df["pca_2"] = X_pca.T

In [5]:
# 2-D t-SNE embedding of the standardized features (seeded for reproducibility)
tsne = TSNE(n_components=2, perplexity=5, random_state=42)
X_tsne = tsne.fit_transform(X_scaled)

# Transposing gives one row per embedding axis, unpacked into one column each
df["tsne_1"], df["tsne_2"] = X_tsne.T


[WinError 2] O sistema não pode encontrar o arquivo especificado
  File "C:\Users\psene\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.12_3.12.2800.0_x64__qbz5n2kfra8p0\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.12_3.12.2800.0_x64__qbz5n2kfra8p0\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.12_3.12.2800.0_x64__qbz5n2kfra8p0\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executab

In [6]:
# 2-D UMAP embedding of the standardized features (seeded for reproducibility)
reducer = umap.UMAP(n_components=2, random_state=42)
X_umap = reducer.fit_transform(X_scaled)

# Transposing gives one row per embedding axis, unpacked into one column each
df["umap_1"], df["umap_2"] = X_umap.T

  warn(


In [7]:
# PCA projection: colour encodes country, marker symbol encodes style
fig = px.scatter(
    data_frame=df,
    x="pca_1",
    y="pca_2",
    color="country",
    symbol="style",
    hover_name="track_id",
    title="Timbre Space - PCA Projection",
)
fig.show()


In [8]:
# t-SNE projection: colour encodes country, marker symbol encodes style
fig = px.scatter(
    data_frame=df,
    x="tsne_1",
    y="tsne_2",
    color="country",
    symbol="style",
    hover_name="track_id",
    title="Timbre Space - t-SNE Projection",
)
fig.show()

In [9]:
# UMAP projection: colour encodes country, marker symbol encodes style
fig = px.scatter(
    data_frame=df,
    x="umap_1",
    y="umap_2",
    color="country",
    symbol="style",
    hover_name="track_id",
    title="Timbre Space - UMAP Projection",
)
fig.show()


In [10]:
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

def evaluate_embedding(X, labels):
    """Score how well `labels` separate the points in `X`.

    Args:
        X: Coordinates of the samples (one row per sample).
        labels: Group label of each sample, aligned with the rows of `X`.

    Returns:
        A 3-tuple ``(silhouette, calinski_harabasz, davies_bouldin)`` holding
        the results of the three sklearn clustering metrics, in that order.
    """
    metrics = (silhouette_score, calinski_harabasz_score, davies_bouldin_score)
    return tuple(metric(X, labels) for metric in metrics)

# Score each 2-D projection against the country and style labels.
# Keys are "<method>_<label>" (PCA_country, PCA_style, tSNE_country, ...),
# inserted in the order: PCA, t-SNE, UMAP; country before style.
results = {
    f"{method}_{label_col}": evaluate_embedding(
        df[[f"{prefix}_1", f"{prefix}_2"]], df[label_col]
    )
    for method, prefix in (("PCA", "pca"), ("tSNE", "tsne"), ("UMAP", "umap"))
    for label_col in ("country", "style")
}


In [11]:
# One row per evaluated configuration, one column per metric, rounded to 3 decimals
summary = (
    pd.DataFrame(results, index=["Silhouette", "Calinski-Harabasz", "Davies-Bouldin"])
    .T
    .round(3)
)
summary


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin
PCA_country,-0.204,1.818,14.684
PCA_style,-0.293,3.073,11.965
tSNE_country,-0.221,1.598,4.958
tSNE_style,-0.315,1.394,8.32
UMAP_country,-0.197,1.866,6.484
UMAP_style,-0.351,2.625,7.417


In [12]:
from sklearn.cluster import KMeans

# Fit KMeans with 6 clusters on the standardized features and keep each
# sample's cluster label alongside its row
kmeans = KMeans(n_clusters=6, random_state=42).fit(X_scaled)
df["cluster_kmeans"] = kmeans.labels_


In [13]:
# Evaluate the KMeans clusters in each projected space
# (adds the keys PCA_kmeans, tSNE_kmeans and UMAP_kmeans, in that order)
results.update(
    {
        f"{method}_kmeans": evaluate_embedding(
            df[[f"{prefix}_1", f"{prefix}_2"]], df["cluster_kmeans"]
        )
        for method, prefix in (("PCA", "pca"), ("tSNE", "tsne"), ("UMAP", "umap"))
    }
)


In [14]:
# KMeans clusters on the PCA projection (labels cast to str before colouring)
fig = px.scatter(
    data_frame=df,
    x="pca_1",
    y="pca_2",
    color=df["cluster_kmeans"].astype(str),
    hover_name="track_id",
    title="KMeans Clusters - PCA Projection",
)
fig.show()


In [15]:
# KMeans clusters on the t-SNE projection (labels cast to str before colouring)
fig = px.scatter(
    data_frame=df,
    x="tsne_1",
    y="tsne_2",
    color=df["cluster_kmeans"].astype(str),
    hover_name="track_id",
    title="KMeans Clusters - t-SNE Projection",
)
fig.show()


In [16]:
# KMeans clusters on the UMAP projection (labels cast to str before colouring)
fig = px.scatter(
    data_frame=df,
    x="umap_1",
    y="umap_2",
    color=df["cluster_kmeans"].astype(str),
    hover_name="track_id",
    title="KMeans Clusters - UMAP Projection",
)
fig.show()


In [17]:
# Rebuild the summary table so it includes the KMeans cluster scores
summary = (
    pd.DataFrame(results, index=["Silhouette", "Calinski-Harabasz", "Davies-Bouldin"])
    .T
    .round(3)
)
summary


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin
PCA_country,-0.204,1.818,14.684
PCA_style,-0.293,3.073,11.965
tSNE_country,-0.221,1.598,4.958
tSNE_style,-0.315,1.394,8.32
UMAP_country,-0.197,1.866,6.484
UMAP_style,-0.351,2.625,7.417
PCA_kmeans,0.143,29.046,1.568
tSNE_kmeans,0.089,13.234,3.276
UMAP_kmeans,0.186,40.467,1.162


In [20]:
import hdbscan


In [21]:
# Run HDBSCAN on the UMAP coordinates (the most promising space so far)
clusterer = hdbscan.HDBSCAN(min_cluster_size=4)
df["cluster_hdbscan"] = clusterer.fit_predict(df.loc[:, ["umap_1", "umap_2"]])



'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.


'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.



In [22]:
# HDBSCAN clusters on the UMAP projection (labels cast to str before colouring)
fig = px.scatter(
    data_frame=df,
    x="umap_1",
    y="umap_2",
    color=df["cluster_hdbscan"].astype(str),
    hover_name="track_id",
    title="HDBSCAN Clusters - UMAP Projection",
)
fig.show()


In [23]:
# Evaluate the HDBSCAN clusters, ignoring the points labelled -1 (outliers).
# NOTE(review): these clusters were fit on the same 2-D UMAP coordinates they
# are scored on, and outliers are excluded, whereas the KMeans rows score
# clusters fit on the full standardized features with every point included.
# This row is therefore not directly comparable with the others.
mask = df["cluster_hdbscan"] != -1

if df.loc[mask, "cluster_hdbscan"].nunique() < 2:
    # The sklearn metrics raise ValueError with fewer than two clusters (e.g.
    # when every point is marked as an outlier); record NaN instead of aborting.
    results["UMAP_hdbscan"] = (np.nan, np.nan, np.nan)
else:
    # Single .loc selections instead of chained indexing
    results["UMAP_hdbscan"] = evaluate_embedding(
        df.loc[mask, ["umap_1", "umap_2"]],
        df.loc[mask, "cluster_hdbscan"],
    )


In [24]:
# Rebuild the summary table so it includes the HDBSCAN scores
summary = (
    pd.DataFrame(results, index=["Silhouette", "Calinski-Harabasz", "Davies-Bouldin"])
    .T
    .round(3)
)
summary


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin
PCA_country,-0.204,1.818,14.684
PCA_style,-0.293,3.073,11.965
tSNE_country,-0.221,1.598,4.958
tSNE_style,-0.315,1.394,8.32
UMAP_country,-0.197,1.866,6.484
UMAP_style,-0.351,2.625,7.417
PCA_kmeans,0.143,29.046,1.568
tSNE_kmeans,0.089,13.234,3.276
UMAP_kmeans,0.186,40.467,1.162
UMAP_hdbscan,0.485,61.826,0.767
