In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score

# Load the dataset; change the file name below if your copy is named differently.
csv_path = "SpotifyFeatures.csv"
df = pd.read_csv(csv_path)
df.head()


In [None]:
# Print a concise summary of `df` (columns, dtypes, non-null counts).
df.info()


In [None]:
# Display descriptive statistics for `df`.
df.describe()

In [None]:
# Annotated heatmap of the pairwise correlations between the numeric columns.
corr_matrix = df.corr(numeric_only=True)

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", ax=ax)
ax.set_title("Mapa de Correlación")
plt.show()

In [None]:
# Standardize the numeric feature columns before dimensionality reduction.
numerical_cols = [
    'danceability',
    'energy',
    'loudness',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
    'duration_ms',
]

# Fit the scaler on the selected columns, then apply it to the same data.
scaler = StandardScaler()
scaler.fit(df[numerical_cols])
X_scaled = scaler.transform(df[numerical_cols])

In [None]:
# --- PCA: project the standardized features onto two principal components ---
# random_state is fixed so the projection is reproducible when scikit-learn
# selects a randomized solver; this matches the seeded t-SNE and KMeans steps.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_scaled)

plt.figure(figsize=(8,6))
plt.scatter(X_pca[:,0], X_pca[:,1], alpha=0.5)
plt.title("PCA - Visualización en 2D")
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.grid(True)
plt.show()

# Fraction of the total variance captured by each of the two components.
print("Varianza explicada por PCA:", pca.explained_variance_ratio_)

# --- t-SNE: non-linear 2D embedding of the same standardized features ---
# NOTE(review): t-SNE can be slow on large inputs — confirm the dataset size is
# acceptable, or embed a sample instead.
tsne = TSNE(n_components=2, perplexity=30, random_state=42)
X_tsne = tsne.fit_transform(X_scaled)

plt.figure(figsize=(8,6))
plt.scatter(X_tsne[:,0], X_tsne[:,1], alpha=0.5)
plt.title("t-SNE - Visualización en 2D")
plt.grid(True)
plt.show()

In [None]:
# Unsupervised clustering: KMeans with 4 clusters on the 2D PCA projection.
# n_init is set explicitly because its default differs between scikit-learn
# releases (10 before 1.4, 'auto' afterwards); pinning it keeps the resulting
# labels consistent across versions and avoids the FutureWarning on 1.2/1.3.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
labels = kmeans.fit_predict(X_pca)

# Scatter of the PCA projection, colored by assigned cluster.
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=labels, cmap='viridis', alpha=0.6)
plt.title("Clusters con KMeans sobre PCA")
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.show()

In [None]:
# DBSCAN on the t-SNE embedding.
# This cell is currently disabled: every statement below is commented out.
# Uncommenting it requires `X_tsne` from the t-SNE step.
# dbscan = DBSCAN(eps=3, min_samples=5)
# labels_db = dbscan.fit_predict(X_tsne)

# plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=labels_db, cmap='rainbow')
# plt.title("Clusters con DBSCAN sobre t-SNE")
# plt.show()

In [None]:
# Cluster evaluation: silhouette score of the KMeans labels on the PCA projection.
# NOTE(review): silhouette_score is based on pairwise distances, so it can be
# slow on large inputs; its `sample_size` / `random_state` parameters give a
# subsampled estimate if runtime becomes a problem — confirm against data size.
score = silhouette_score(X_pca, labels)
print(f"Silhouette Score: {score}")

In [None]:
# Attach the cluster labels to a copy of the original data (`df` itself is left untouched).
df_clusters = df.assign(cluster=labels)


In [None]:
# Characterize each cluster by the mean of every numeric feature.
features_by_cluster = df_clusters.groupby('cluster')[numerical_cols]
features_by_cluster.mean()

In [None]:
# Dominant genre: the most frequent `genre` value within each cluster.
def _most_frequent(values):
    """Return the first entry of ``values.value_counts()``, i.e. the most frequent value."""
    return values.value_counts().index[0]


df_clusters.groupby('cluster')['genre'].agg(_most_frequent)