# Experiment 5: K-Means Clustering on Spotify Data

This experiment involves performing K-Means clustering on the Spotify dataset using two different methods:
1. **Method 1: Using Scikit-Learn Library**
2. **Method 2: Implementation from Scratch**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import warnings
warnings.filterwarnings('ignore')

## 1. Data Loading and Preprocessing

In [None]:
# Read the Spotify dataset from the CSV one level above the working directory
csv_path = '../spotify.csv'
df = pd.read_csv(csv_path)

# Report the (rows, columns) shape, then preview the first rows
print(f"Dataset shape: {df.shape}")
df.head()

In [None]:
# Audio-feature columns used as the clustering inputs
features = [
    'danceability',
    'energy',
    'loudness',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
]
X = df[features]

# Replace any missing value with the mean of its own column
column_means = X.mean()
X = X.fillna(column_means)

# Standardize the features: learn the scaling from X, then apply it to X
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

print("Preprocessed data shape:", X_scaled.shape)

## 2. Method 1: K-Means using Scikit-Learn

In [None]:
# Elbow method: record the within-cluster sum of squares (the fitted model's
# inertia_) for every candidate K from 1 to 10
candidate_ks = range(1, 11)
wcss = []
for i in candidate_ks:
    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=42).fit(X_scaled)
    wcss.append(kmeans.inertia_)

# Plot WCSS against K; the bend in the curve suggests a reasonable cluster count
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(candidate_ks, wcss, marker='o', linestyle='--')
ax.set_title('Elbow Method')
ax.set_xlabel('Number of clusters')
ax.set_ylabel('WCSS')
plt.show()

In [None]:
# Fit the final scikit-learn model. K=5 is an assumed elbow for this dataset;
# adjust it after inspecting the elbow plot.
k = 5
kmeans_model = KMeans(n_clusters=k, init='k-means++', random_state=42)
kmeans_model.fit(X_scaled)
y_kmeans = kmeans_model.labels_

# Store each row's cluster id on the dataframe and show the cluster sizes
df['cluster_sklearn'] = y_kmeans
print(f"Cluster counts (Sklearn):\n{df['cluster_sklearn'].value_counts()}")

## 3. Method 2: K-Means Implementation from Scratch

In [None]:
class ScratchKMeans:
    """K-Means clustering (Lloyd's algorithm) implemented with NumPy.

    Parameters
    ----------
    n_clusters : int, default 5
        Number of clusters (and centroids) to form.
    max_iter : int, default 100
        Maximum number of assign/update iterations.
    tol : float, default 1e-4
        Training stops once every centroid coordinate moves by less than
        this amount between two consecutive iterations.
    random_state : int or None, default 42
        Seed of the private random generator used for centroid
        initialization and for re-seeding empty clusters. The generator is
        local to ``fit``, so NumPy's global random state is left untouched.

    Attributes
    ----------
    centroids : numpy.ndarray or None
        Array of shape (n_clusters, n_features) after ``fit``; ``None``
        before the model has been fitted.
    """

    def __init__(self, n_clusters=5, max_iter=100, tol=1e-4, random_state=42):
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.tol = tol
        self.random_state = random_state
        self.centroids = None

    def fit(self, X):
        """Learn the centroids from ``X`` and return ``self``.

        ``X`` is any 2-D array-like of shape (n_samples, n_features).

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, or ``n_clusters`` is not between 1 and the
            number of samples.
        """
        # Accept lists and DataFrames as well as ndarrays
        X = np.asarray(X, dtype=float)
        if X.ndim != 2:
            raise ValueError(
                f"X must be 2-dimensional (n_samples, n_features), got ndim={X.ndim}"
            )
        n_samples = X.shape[0]
        if not 1 <= self.n_clusters <= n_samples:
            raise ValueError(
                f"n_clusters={self.n_clusters} must be between 1 and "
                f"n_samples={n_samples}"
            )

        # A private generator draws the same sequence a global
        # np.random.seed(random_state) would, without the global side effect
        rng = np.random.RandomState(self.random_state)

        # Start from n_clusters distinct, randomly chosen samples
        self.centroids = X[rng.permutation(n_samples)[:self.n_clusters]]

        for _ in range(self.max_iter):
            # Assignment step: nearest centroid for every sample
            labels = self._assign_clusters(X)

            # Update step: each centroid becomes the mean of its members
            new_centroids = np.empty_like(self.centroids)
            for cluster_id in range(self.n_clusters):
                members = X[labels == cluster_id]
                if len(members) > 0:
                    new_centroids[cluster_id] = members.mean(axis=0)
                else:
                    # An empty cluster is re-seeded with a random sample
                    new_centroids[cluster_id] = X[rng.randint(0, n_samples)]

            converged = np.all(np.abs(new_centroids - self.centroids) < self.tol)

            # Keep the latest update even on the converging iteration
            self.centroids = new_centroids
            if converged:
                break

        return self

    def _assign_clusters(self, X):
        """Return, for each row of ``X``, the index of its nearest centroid."""
        # Broadcasting yields an (n_samples, n_clusters, n_features) array
        diffs = X[:, np.newaxis, :] - self.centroids
        # Squared distances are enough: sqrt is monotonic, so the nearest
        # centroid is the same either way
        squared_distances = (diffs ** 2).sum(axis=2)
        return np.argmin(squared_distances, axis=1)

    def predict(self, X):
        """Return the nearest-centroid cluster index for each row of ``X``.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.centroids is None:
            raise RuntimeError("ScratchKMeans must be fitted before calling predict().")
        return self._assign_clusters(np.asarray(X, dtype=float))

# Fit the from-scratch model with the same K as the scikit-learn model, so the
# two clusterings stay comparable if K is changed later
scratch_kmeans = ScratchKMeans(n_clusters=k)
scratch_kmeans.fit(X_scaled)
y_scratch = scratch_kmeans.predict(X_scaled)

# Store each row's cluster id on the dataframe and show the cluster sizes
df['cluster_scratch'] = y_scratch
print(f"Cluster counts (Scratch):\n{df['cluster_scratch'].value_counts()}")

## 4. Visualization and Comparison

In [None]:
# Project the standardized features onto their first two principal components
# so the clusters can be drawn in 2D. The seed matches the random_state used
# by the K-Means models and keeps the projection repeatable when PCA selects
# its randomized solver.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_scaled)

# Draw both clusterings over the same projection, side by side
fig, axes = plt.subplots(1, 2, figsize=(15, 6))
panels = [
    ('cluster_sklearn', 'viridis', 'K-Means (Scikit-Learn)'),
    ('cluster_scratch', 'magma', 'K-Means (From Scratch)'),
]
for ax, (column, cmap, title) in zip(axes, panels):
    # Colour each point by the cluster id stored in the given dataframe column
    ax.scatter(X_pca[:, 0], X_pca[:, 1], c=df[column], cmap=cmap, alpha=0.5)
    ax.set_title(title)
    ax.set_xlabel('PCA Component 1')
    ax.set_ylabel('PCA Component 2')

fig.tight_layout()
plt.show()

In [None]:
# Cross-tabulate the two labelings: rows are scikit-learn cluster ids, columns
# are from-scratch cluster ids, and each cell counts the rows sharing both
comparison = pd.crosstab(index=df['cluster_sklearn'], columns=df['cluster_scratch'])
print("Comparison Matrix (Sklearn vs Scratch):", comparison, sep="\n")