# 🎭 Spotify Mood Clustering Analysis

This notebook implements and analyzes mood-based clustering of songs using Spotify audio features.

## Objectives:
1. Load and preprocess the data
2. Implement KMeans clustering
3. Analyze cluster characteristics
4. Visualize clusters in 2D/3D
5. Evaluate clustering quality
6. Generate mood labels for clusters


In [None]:
# Import libraries
import sys
import warnings
from pathlib import Path

# Make the project's local modules (features, cluster, viz) importable
sys.path.append('../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.decomposition import PCA
import umap

warnings.filterwarnings('ignore')

from features import AudioFeatureProcessor
from cluster import MoodClusterer
from viz import MoodVisualizer


In [None]:
# Load and process data
# The dataset path is relative to the notebook's working directory.
df = pd.read_csv('../data/dataset.csv')
print(f"Dataset shape: {df.shape}")

# Initialize processor
# Fit and transform in one step, with outlier removal requested, so the
# processed frame may have a different shape than the raw one.
processor = AudioFeatureProcessor(scaler_type='standard')
df_processed = processor.fit_transform(df, remove_outliers=True)
print(f"Processed dataset shape: {df_processed.shape}")

# Get feature columns for clustering
# Materialize the column list once so it can be scanned twice below.
feature_cols = list(processor.get_audio_feature_columns())
available_cols = [col for col in feature_cols if col in df_processed.columns]

# Surface expected columns that are absent instead of dropping them silently,
# so a reduced feature set is visible in the notebook output.
missing_cols = [col for col in feature_cols if col not in df_processed.columns]
if missing_cols:
    print(f"Warning: expected feature columns not found: {missing_cols}")

# Fail fast with a clear message rather than passing a zero-column matrix
# on to the clustering step.
if not available_cols:
    raise ValueError(
        "None of the expected audio feature columns are present in the "
        "processed data; cannot build a feature matrix."
    )

X = df_processed[available_cols].values
print(f"Feature matrix shape: {X.shape}")
print(f"Features used: {available_cols}")


In [None]:
# Find optimal number of clusters
# Search cluster counts from 2 through 15 with a fixed random_state.
clusterer = MoodClusterer(algorithm='kmeans', random_state=42)
optimal_params = clusterer.find_optimal_clusters(X, min_clusters=2, max_clusters=15)

print("Optimal clustering parameters:")
print(f"Optimal clusters: {optimal_params['optimal_clusters']}")
# The three quality metrics share one output format, so print them in a loop.
for metric_label, metric_key in (
    ("Silhouette score", 'silhouette_score'),
    ("Calinski-Harabasz score", 'calinski_harabasz_score'),
    ("Davies-Bouldin score", 'davies_bouldin_score'),
):
    print(f"{metric_label}: {optimal_params[metric_key]:.3f}")

# Plot silhouette scores
# One row per candidate cluster count, taken from the search results.
results_df = pd.DataFrame(optimal_params['all_results'])
silhouette_fig, silhouette_ax = plt.subplots(figsize=(10, 6))
silhouette_ax.plot(results_df['n_clusters'], results_df['silhouette_score'], 'bo-')
silhouette_ax.set_xlabel('Number of Clusters')
silhouette_ax.set_ylabel('Silhouette Score')
silhouette_ax.set_title('Silhouette Score vs Number of Clusters')
silhouette_ax.grid(True)
plt.show()


In [None]:
# Fit clustering model with optimal parameters
# Uses the cluster count chosen by the search in the previous cell.
clusterer.fit(X, n_clusters=optimal_params['optimal_clusters'])
print(f"Clustering completed with {len(clusterer.cluster_stats)} clusters")

# Display cluster information
print("\n=== CLUSTER INFORMATION ===")
# Iterate over the stats directly; each entry carries its own 'cluster_id',
# so no positional index is needed.
for stat in clusterer.cluster_stats:
    # Fall back to a generic "Cluster <id>" name when no mood label is registered
    mood_label = clusterer.mood_labels.get(stat['cluster_id'], f"Cluster {stat['cluster_id']}")
    print(f"\nCluster {stat['cluster_id']}: {mood_label}")
    print(f"  Size: {stat['size']} songs ({stat['percentage']:.1f}%)")
    # Only the first five mean feature values are shown, to keep the line short
    print(f"  Mean features: {[f'{x:.3f}' for x in stat['mean_features'][:5]]}...")


In [None]:
# Create visualizations
visualizer = MoodVisualizer()

# Both projections take the same positional inputs: the feature matrix X plus
# the clusterer's cluster_labels and mood_labels.
scatter_inputs = (X, clusterer.cluster_labels, clusterer.mood_labels)

# UMAP visualization
print("Creating UMAP visualization...")
fig_umap = visualizer.create_cluster_scatter_plot(*scatter_inputs, method='umap')
fig_umap.show()

# PCA visualization
print("Creating PCA visualization...")
fig_pca = visualizer.create_cluster_scatter_plot(*scatter_inputs, method='pca')
fig_pca.show()


In [None]:
# Both plots below take the same inputs: the processed frame plus the
# clusterer's cluster_labels and mood_labels.
cluster_plot_inputs = (df_processed, clusterer.cluster_labels, clusterer.mood_labels)

# Feature distribution analysis
print("Creating feature distribution plots...")
fig_dist = visualizer.create_feature_distribution_plot(*cluster_plot_inputs)
fig_dist.show()

# Cluster heatmap
print("Creating cluster heatmap...")
fig_heatmap = visualizer.create_cluster_heatmap(*cluster_plot_inputs)
fig_heatmap.show()


In [None]:
# Save models
# NOTE(review): these paths are relative to the notebook's working directory,
# whereas the data and source are read from '../' — confirm that 'models/'
# (rather than '../models/') is the intended location.
processor_path = 'models/feature_processor.pkl'
clusterer_path = 'models/clusterer.pkl'

print("Saving models...")
# Create the target directory up front in case the save helpers do not
# create it themselves.
for artifact_path in (processor_path, clusterer_path):
    Path(artifact_path).parent.mkdir(parents=True, exist_ok=True)
processor.save_processor(processor_path)
clusterer.save_model(clusterer_path)
print("✅ Models saved successfully!")

# Summary
print("\n=== CLUSTERING ANALYSIS SUMMARY ===")
print(f"Dataset: {len(df):,} songs")
print(f"Processed: {len(df_processed):,} songs")
print(f"Features: {len(available_cols)} audio features")
print(f"Clusters: {len(clusterer.cluster_stats)} mood clusters")
print(f"Optimal clusters: {optimal_params['optimal_clusters']}")
# This is the silhouette score reported by the cluster-count search
print(f"Silhouette score: {optimal_params['silhouette_score']:.3f}")

print("\n🎭 Mood Clusters:")
for stat in clusterer.cluster_stats:
    # Fall back to a generic "Cluster <id>" name when no mood label is registered
    mood_label = clusterer.mood_labels.get(stat['cluster_id'], f"Cluster {stat['cluster_id']}")
    print(f"  • {mood_label}: {stat['size']} songs ({stat['percentage']:.1f}%)")
