# Module 13: Clustering

**Goal:** Learn to discover natural groups in data using K-means and evaluate cluster quality.

**Prerequisites:** Module 12 (Embeddings)

**Expected Runtime:** ~25 minutes

**Outputs:**
- Customer segmentation
- Elbow and silhouette analysis
- Cluster profiles

---

## Setup

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans, DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, silhouette_samples
from sklearn.datasets import make_blobs, make_moons
import warnings
# Silence every warning category for the whole notebook so cell output stays
# readable. NOTE(review): this also hides library deprecation notices.
warnings.filterwarnings('ignore')

# Seed NumPy's global random generator so the synthetic data is reproducible.
np.random.seed(42)
# Default figure size (width, height in inches) for plots that do not set one.
plt.rcParams['figure.figsize'] = (12, 5)

## Part 1: Generate Customer Data

Create synthetic RFM (Recency, Frequency, Monetary) customer data.

In [None]:
# Generate synthetic customer behavior data
n_customers = 500

# Latent segments (unknown to K-means). Each feature maps to the (low, high)
# bounds of the uniform distribution its values are drawn from.
segments = {
    'loyal_high_value': {'recency': (5, 10), 'frequency': (8, 12), 'monetary': (200, 500)},
    'loyal_low_value': {'recency': (5, 15), 'frequency': (6, 10), 'monetary': (30, 80)},
    'at_risk': {'recency': (30, 60), 'frequency': (2, 5), 'monetary': (50, 150)},
    'new': {'recency': (3, 10), 'frequency': (1, 3), 'monetary': (40, 120)},
}

# Split customers evenly across however many segments are defined above
# (integer division: any remainder is dropped), computed once up front.
n_per_segment = n_customers // len(segments)

# One row per customer. The draw order within a row (recency, frequency,
# monetary) determines which random numbers each feature receives, so it is
# kept fixed to give the same data for a given seed.
data = [
    {
        'recency_days': np.random.uniform(*params['recency']),
        'frequency': np.random.uniform(*params['frequency']),
        'monetary': np.random.uniform(*params['monetary']),
        'true_segment': seg_name,
    }
    for seg_name, params in segments.items()
    for _ in range(n_per_segment)
]

df = pd.DataFrame(data)
# Shuffle so rows are not grouped by segment, then renumber the index.
df = df.sample(frac=1).reset_index(drop=True)

print("Customer Data Sample:")
df.head(10)

In [None]:
# Visualize the raw data as three pairwise scatter plots
fig, axes = plt.subplots(1, 3, figsize=(14, 4))

# Color by true segment (we know this, algorithm doesn't)
colors = {'loyal_high_value': '#22c55e', 'loyal_low_value': '#0ea5e9',
          'at_risk': '#f97316', 'new': '#8b5cf6'}

# Draw one scatter layer per true segment on each panel. Only the first
# panel gets labels, because it is the only one that shows a legend.
for seg in df['true_segment'].unique():
    mask = df['true_segment'] == seg
    axes[0].scatter(df.loc[mask, 'recency_days'], df.loc[mask, 'frequency'],
                    c=colors[seg], label=seg, alpha=0.6)
    axes[1].scatter(df.loc[mask, 'recency_days'], df.loc[mask, 'monetary'],
                    c=colors[seg], alpha=0.6)
    axes[2].scatter(df.loc[mask, 'frequency'], df.loc[mask, 'monetary'],
                    c=colors[seg], alpha=0.6)

axes[0].set_xlabel('Recency (days)')
axes[0].set_ylabel('Frequency')
axes[0].set_title('True Segments (Unknown to Model)')
axes[0].legend()

axes[1].set_xlabel('Recency (days)')
axes[1].set_ylabel('Monetary ($)')

axes[2].set_xlabel('Frequency')
axes[2].set_ylabel('Monetary ($)')

plt.tight_layout()
plt.show()

# The original message began with mis-decoded UTF-8 bytes; restored to the
# intended light-bulb emoji.
print("💡 The algorithm doesn't know these segments exist. Can it discover them?")

## Part 2: Prepare Features

**Critical:** Always scale features before K-means!

In [None]:
# Select features for clustering
features = ['recency_days', 'frequency', 'monetary']
X = df[features].values

# Scale features: fit the scaler on X and transform it in one step
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Display names, in the same order as the columns listed in `features`
feature_labels = ['Recency', 'Frequency', 'Monetary']

print("Before scaling:")
for label, column in zip(feature_labels, X.T):
    print(f"  {label} range: {column.min():.1f} - {column.max():.1f}")

print("\nAfter scaling (mean=0, std=1):")
for label, column in zip(feature_labels, X_scaled.T):
    print(f"  {label} range: {column.min():.2f} - {column.max():.2f}")

# The original message began with mis-decoded UTF-8 bytes; restored to the
# intended light-bulb emoji.
print("\n💡 Without scaling, monetary ($) would dominate the distance calculations.")

## Part 3: Elbow Method

Find the "elbow" in the inertia curve to choose K.

In [None]:
# Calculate inertia (within-cluster sum of squares) for each candidate K
k_range = range(2, 11)
inertias = [
    KMeans(n_clusters=k, random_state=42, n_init=10).fit(X_scaled).inertia_
    for k in k_range
]

# K marked on the plot as the elbow. This is a fixed annotation, not a value
# computed from the curve; adjust it if the bend appears elsewhere.
elbow_k = 4

# Plot elbow curve
plt.figure(figsize=(10, 5))
plt.plot(k_range, inertias, 'bo-', linewidth=2, markersize=8)
plt.axvline(x=elbow_k, color='r', linestyle='--', label=f'Elbow at K={elbow_k}')
plt.xlabel('Number of Clusters (K)')
plt.ylabel('Inertia (Within-cluster Sum of Squares)')
plt.title('Elbow Method for Optimal K')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

# The original message contained mis-decoded UTF-8 bytes; restored to the
# intended light-bulb emoji and em dash.
print("💡 Look for where the curve 'bends' — adding more clusters gives diminishing returns.")

## Part 4: Silhouette Analysis

Silhouette score measures how similar points are to their own cluster vs. other clusters.

In [None]:
# Calculate the mean silhouette score for each candidate K
silhouette_scores = []

for k in k_range:
    km = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels = km.fit_predict(X_scaled)
    score = silhouette_score(X_scaled, labels)
    silhouette_scores.append(score)
    print(f"K={k}: Silhouette = {score:.3f}")

# argmax returns a position in the score list; map it back to the K value
# at the same position in k_range.
best_k = k_range[np.argmax(silhouette_scores)]
# The original message contained mis-decoded UTF-8 bytes; restored to the
# intended check mark.
print(f"\n✓ Best K by silhouette: {best_k}")

In [None]:
# Visualize silhouette scores
# Rule-of-thumb cutoff used for the reference line, its label and the tip below
good_threshold = 0.5

plt.figure(figsize=(10, 5))
# Highlight the bar for the best K; all other bars are grey.
# Note: this rebinds the name `colors` to a list of bar colors.
colors = ['#14b8a6' if k == best_k else '#94a3b8' for k in k_range]
plt.bar(k_range, silhouette_scores, color=colors)
plt.axhline(y=good_threshold, color='r', linestyle='--',
            label=f'Good threshold ({good_threshold})')
plt.xlabel('Number of Clusters (K)')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Score by K')
plt.legend()
plt.show()

# The original message began with mis-decoded UTF-8 bytes; restored to the
# intended light-bulb emoji.
print(f"💡 Silhouette > {good_threshold} indicates good cluster separation.")

## Part 5: Fit K-Means with Best K

In [None]:
# Fit final model and store each customer's cluster label on the DataFrame
kmeans = KMeans(n_clusters=4, random_state=42, n_init=10)
df['cluster'] = kmeans.fit_predict(X_scaled)

# Visualize clusters on two pairwise views
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# One color per cluster label; needs at least kmeans.n_clusters entries
cluster_colors = ['#ef4444', '#22c55e', '#0ea5e9', '#f97316']

# Take the cluster count from the fitted model so the plot cannot drift
# out of sync with n_clusters above.
for c in range(kmeans.n_clusters):
    mask = df['cluster'] == c
    axes[0].scatter(df.loc[mask, 'recency_days'], df.loc[mask, 'frequency'],
                    c=cluster_colors[c], label=f'Cluster {c}', alpha=0.6)
    axes[1].scatter(df.loc[mask, 'frequency'], df.loc[mask, 'monetary'],
                    c=cluster_colors[c], alpha=0.6)

axes[0].set_xlabel('Recency (days)')
axes[0].set_ylabel('Frequency')
axes[0].set_title('K-Means Clusters')
axes[0].legend()

axes[1].set_xlabel('Frequency')
axes[1].set_ylabel('Monetary ($)')
axes[1].set_title('Clusters by Spend')

plt.tight_layout()
plt.show()

## Part 6: Cluster Profiles

What does each cluster look like? Create actionable profiles.

In [None]:
# Summarize every cluster: mean/std of each feature plus its dominant true segment
agg_spec = {
    'recency_days': ['mean', 'std'],
    'frequency': ['mean', 'std'],
    'monetary': ['mean', 'std'],
    # Most common true segment in the cluster; 'mixed' when no mode exists
    'true_segment': lambda seg: seg.mode().iloc[0] if len(seg.mode()) > 0 else 'mixed',
}
profiles = df.groupby('cluster').agg(agg_spec).round(1)

# Flatten the two-level (feature, statistic) header into "feature_statistic"
profiles.columns = [f"{feature}_{stat}" for feature, stat in profiles.columns]

# Cluster sizes as absolute counts and as a percentage of all customers
cluster_sizes = df['cluster'].value_counts().sort_index()
profiles['size'] = cluster_sizes
profiles['pct'] = (profiles['size'] / len(df) * 100).round(1)

print("=== Cluster Profiles ===")
profiles

In [None]:
# Name clusters based on profiles
def _segment_label(rec, freq, mon):
    """Return a business-friendly name for a cluster.

    The rules are checked in order and the first match wins; a cluster that
    matches none of them is labelled "New/Developing".

    rec:  mean days since last purchase for the cluster
    freq: mean purchase frequency for the cluster
    mon:  mean spend for the cluster
    """
    if rec < 15 and freq > 7 and mon > 150:
        return "Loyal High-Value"
    if rec < 15 and freq > 5 and mon < 100:
        return "Loyal Low-Value"
    if rec > 25:
        return "At-Risk"
    return "New/Developing"


cluster_names = {}

# Iterate over the clusters actually present in `profiles` instead of a
# hard-coded count, so this cell keeps working if K changes.
for c in profiles.index:
    rec = profiles.loc[c, 'recency_days_mean']
    freq = profiles.loc[c, 'frequency_mean']
    mon = profiles.loc[c, 'monetary_mean']

    name = _segment_label(rec, freq, mon)

    # Store plain int keys, matching the integer cluster labels.
    cluster_names[int(c)] = name
    print(f"Cluster {c}: {name}")
    print(f"  - Recency: {rec:.0f} days")
    print(f"  - Frequency: {freq:.1f} orders")
    print(f"  - Avg Spend: ${mon:.0f}")
    print(f"  - Size: {profiles.loc[c, 'pct']}%")
    print()

## Part 7: Compare to True Segments

How well did K-means recover the true (hidden) segments?

In [None]:
# Cross-tabulation: rows are true segments, columns are K-means clusters;
# margins=True appends row and column totals labelled "All".
crosstab = pd.crosstab(df['true_segment'], df['cluster'], margins=True)
print("=== True Segment vs Cluster ===")
print(crosstab)

# The original message began with mis-decoded UTF-8 bytes; restored to the
# intended light-bulb emoji.
print("\n💡 K-means discovered segments similar to the true underlying structure!")

## Part 8: TODO - K-Means on Non-Spherical Data

See how K-means struggles with moon-shaped clusters.

In [None]:
# Generate moon-shaped data
X_moons, y_moons = make_moons(n_samples=300, noise=0.1, random_state=42)

# Try K-means. n_init is set explicitly, as in every other KMeans call in
# this notebook, so the result does not depend on the installed
# scikit-learn version's default.
km_moons = KMeans(n_clusters=2, random_state=42, n_init=10)
labels_km = km_moons.fit_predict(X_moons)

# TODO: Try DBSCAN
# dbscan = DBSCAN(eps=0.3, min_samples=5)
# labels_db = dbscan.fit_predict(X_moons)

fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# True labels
axes[0].scatter(X_moons[:, 0], X_moons[:, 1], c=y_moons, cmap='viridis')
axes[0].set_title('True Clusters')

# K-means
axes[1].scatter(X_moons[:, 0], X_moons[:, 1], c=labels_km, cmap='viridis')
axes[1].set_title(f'K-Means (silhouette: {silhouette_score(X_moons, labels_km):.2f})')

# Placeholder for DBSCAN: uncolored points until the TODO above is completed
axes[2].scatter(X_moons[:, 0], X_moons[:, 1], c='gray', alpha=0.5)
axes[2].set_title('DBSCAN (TODO: uncomment above)')

plt.tight_layout()
plt.show()

# The original message began with mis-decoded UTF-8 bytes; restored to the
# intended light-bulb emoji.
print("💡 K-means assumes spherical clusters. DBSCAN can find arbitrary shapes.")

## Part 9: TODO - Stakeholder Summary

Explain to a marketing manager:
1. What segments you discovered and their characteristics
2. How confident you are in these segments (use metrics)
3. Recommended actions for each segment

### Your Summary:

*Write your explanation here...*

---

## Key Takeaways

1. **Always scale features** before K-means
2. **Use elbow + silhouette** to choose K
3. **Silhouette > 0.5** indicates good separation
4. **Name clusters** with business-friendly labels
5. **K-means fails** on non-spherical shapes → use DBSCAN

### Next Steps
- Explore the interactive playground
- Complete the quiz
- Try clustering on your own data