# L03: K-Nearest Neighbors & K-Means

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Digital-AI-Finance/methods-algorithms/blob/master/notebooks/L03_knn_kmeans.ipynb)

**Course**: Methods and Algorithms - MSc Data Science

---

## Learning Objectives

By the end of this notebook, you will be able to:

1. Apply KNN for classification with appropriate K selection
2. Implement K-Means clustering and evaluate cluster quality
3. Compare distance metrics and their effects
4. Perform customer segmentation with business interpretation

## Setup

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, silhouette_score, silhouette_samples

# Seed NumPy's global RNG so the synthetic data generated below is repeatable.
np.random.seed(42)

# Notebook-wide plotting defaults: figure size (inches) and base font size.
plt.rcParams.update({
    'figure.figsize': (10, 6),
    'font.size': 12,
})

print('Setup complete!')

## 1. Generate Customer Data

In [None]:
# Customer segmentation data: three synthetic Gaussian segments of equal size.
n_customers = 300
n_segments = 3
# Per-segment size is derived from n_customers so the two cannot drift apart.
# (If n_customers is not a multiple of n_segments, the remainder is dropped.)
n_per_segment = n_customers // n_segments

# Each segment is standard-normal noise, stretched per feature and shifted to
# the segment centre. Column 0 becomes monthly_transactions; column 1 is
# multiplied by 100 below to become avg_transaction_value.

# Segment 1: High-value frequent customers
seg1 = np.random.randn(n_per_segment, 2) * np.array([0.5, 0.3]) + np.array([8, 7])
# Segment 2: Low-value occasional customers
seg2 = np.random.randn(n_per_segment, 2) * np.array([0.6, 0.4]) + np.array([2, 3])
# Segment 3: Medium-value regular customers
seg3 = np.random.randn(n_per_segment, 2) * np.array([0.7, 0.5]) + np.array([5, 5])

X = np.vstack([seg1, seg2, seg3])
# Labels follow the stacking order above: 0 = seg1, 1 = seg2, 2 = seg3.
true_labels = np.repeat(np.arange(n_segments), n_per_segment)

df = pd.DataFrame({
    'monthly_transactions': X[:, 0],
    'avg_transaction_value': X[:, 1] * 100,
    'segment': true_labels
})

print(f'Dataset shape: {df.shape}')
df.head()

In [None]:
# Scatter plot of the ground-truth segments, one colour and legend entry each.
colors = ['red', 'green', 'blue']
labels = ['High-Value', 'Low-Value', 'Medium-Value']

plt.figure(figsize=(10, 6))
for seg_id, (seg_color, seg_label) in enumerate(zip(colors, labels)):
    # Rows whose ground-truth label equals this segment id
    members = df[true_labels == seg_id]
    plt.scatter(
        members['monthly_transactions'],
        members['avg_transaction_value'],
        c=seg_color, label=seg_label, alpha=0.6, s=60,
    )

plt.title('Customer Segments (Ground Truth)')
plt.xlabel('Monthly Transactions')
plt.ylabel('Avg Transaction Value ($)')
plt.grid(True, alpha=0.3)
plt.legend()
plt.show()

## 2. K-Nearest Neighbors (Classification)

In [None]:
# Prepare data: raw feature matrix and target labels
X_features = df[['monthly_transactions', 'avg_transaction_value']].values
y = df['segment'].values

# Split BEFORE scaling so the held-out rows never influence the scaler's
# mean/std (fitting the scaler on all rows leaks test-set statistics).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_features, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then apply that same transform
# to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# The whole dataset in the same scaled space as X_train / X_test, used by the
# later plotting and clustering cells.
X_scaled = scaler.transform(X_features)

print(f'Training samples: {len(X_train)}')
print(f'Test samples: {len(X_test)}')

In [None]:
# Choose K by mean 5-fold cross-validated accuracy on the training split
k_range = range(1, 21)
cv_scores = [
    cross_val_score(
        KNeighborsClassifier(n_neighbors=k),
        X_train, y_train, cv=5, scoring='accuracy',
    ).mean()
    for k in k_range
]

# np.argmax returns the first maximum, so ties resolve to the smallest K
best_k = k_range[np.argmax(cv_scores)]

# Accuracy curve with the selected K marked by a dashed vertical line
plt.figure(figsize=(10, 5))
plt.plot(k_range, cv_scores, 'o-', linewidth=2, markersize=8)
plt.axvline(x=best_k, color='red', linestyle='--', label=f'Best K={best_k}')
plt.title('KNN: Finding Optimal K')
plt.xlabel('K (Number of Neighbors)')
plt.ylabel('Cross-Validation Accuracy')
plt.xticks(k_range)
plt.grid(True, alpha=0.3)
plt.legend()
plt.show()

print(f'Best K: {best_k} with CV accuracy: {max(cv_scores):.3f}')

In [None]:
# Fit the final classifier with the cross-validated K on the training split
# (fit() returns the estimator itself, so construction and fitting chain).
knn = KNeighborsClassifier(n_neighbors=best_k).fit(X_train, y_train)

# Measure accuracy on the held-out test split
y_pred = knn.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Test Accuracy: {accuracy:.3f}')

In [None]:
# Decision regions: classify every node of a fine grid covering the data
h = 0.02  # grid step, in scaled feature units

# Pad the data's bounding box by one unit on every side
(x_min, y_min) = X_scaled.min(axis=0) - 1
(x_max, y_max) = X_scaled.max(axis=0) + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# Flatten the grid into (n_points, 2) rows, predict, then restore grid shape
grid_points = np.column_stack([xx.ravel(), yy.ravel()])
Z = knn.predict(grid_points).reshape(xx.shape)

plt.figure(figsize=(10, 6))
# Shaded predicted regions underneath, actual customers on top
plt.contourf(xx, yy, Z, alpha=0.3, cmap='RdYlBu')
plt.scatter(
    X_scaled[:, 0], X_scaled[:, 1],
    c=y, cmap='RdYlBu', edgecolors='black', s=50,
)
plt.title(f'KNN Decision Boundaries (K={best_k})')
plt.xlabel('Monthly Transactions (scaled)')
plt.ylabel('Avg Transaction Value (scaled)')
plt.colorbar(label='Segment')
plt.show()

## 3. K-Means Clustering

In [None]:
# Elbow method: within-cluster sum of squares (inertia) for K = 1..10
K_range = range(1, 11)
inertias = [
    KMeans(n_clusters=k, random_state=42, n_init=10).fit(X_scaled).inertia_
    for k in K_range
]

plt.figure(figsize=(10, 5))
plt.plot(K_range, inertias, 'o-', linewidth=2, markersize=10)
plt.title('Elbow Method for Optimal K')
plt.xlabel('Number of Clusters (K)')
plt.ylabel('Inertia (WCSS)')
plt.xticks(K_range)
plt.grid(True, alpha=0.3)

# Highlight the elbow at K=3; K_range starts at 1, so K=3 sits at index 2
elbow_k = 3
elbow_inertia = inertias[elbow_k - 1]
plt.scatter([elbow_k], [elbow_inertia], c='red', s=200, zorder=5)
plt.annotate(
    'Elbow',
    xy=(elbow_k, elbow_inertia),
    xytext=(elbow_k + 1, elbow_inertia + 50),
    arrowprops=dict(arrowstyle='->'),
)
plt.show()

In [None]:
# Silhouette scores for K = 2..10 (the score needs at least two clusters).
# The candidate range is defined once so the loop, the plot and the final
# lookup cannot get out of sync.
sil_k_range = range(2, 11)
silhouette_scores = []

for k in sil_k_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    # Named k_labels (not `labels`) so the legend-label list used by the
    # ground-truth plot is not overwritten with an array of cluster ids.
    k_labels = kmeans.fit_predict(X_scaled)
    silhouette_scores.append(silhouette_score(X_scaled, k_labels))

plt.figure(figsize=(10, 5))
plt.plot(sil_k_range, silhouette_scores, 'o-', linewidth=2, markersize=10)
plt.xlabel('Number of Clusters (K)')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Score for Different K')
plt.xticks(sil_k_range)
plt.grid(True, alpha=0.3)
plt.show()

# np.argmax returns the first maximum, so ties resolve to the smallest K
best_sil_k = sil_k_range[int(np.argmax(silhouette_scores))]
print(f'Best K by silhouette: {best_sil_k}')

In [None]:
# Final K-Means with K=3
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
cluster_labels = kmeans.fit_predict(X_scaled)
centroids = kmeans.cluster_centers_

plt.figure(figsize=(10, 6))
# Keep a handle on the data scatter: it is the artist that carries the
# cluster-id -> colour mapping the colorbar must describe.
cluster_points = plt.scatter(X_scaled[:, 0], X_scaled[:, 1], c=cluster_labels,
                             cmap='viridis', alpha=0.6, s=60)
plt.scatter(centroids[:, 0], centroids[:, 1], c='red', marker='X',
            s=300, edgecolors='black', linewidth=2, label='Centroids')
plt.xlabel('Monthly Transactions (scaled)')
plt.ylabel('Avg Transaction Value (scaled)')
plt.title('K-Means Clustering (K=3)')
plt.legend()
# Pass the mappable explicitly. A bare plt.colorbar() uses the current
# (most recently created) mappable, which here is the solid-red centroid
# scatter, so the bar would not reflect the cluster colouring.
# Ticks are placed at the integer cluster ids only.
plt.colorbar(cluster_points, ticks=list(range(kmeans.n_clusters)), label='Cluster')
plt.show()

## 4. Cluster Profiling

In [None]:
# Attach each customer's K-Means cluster id to the dataframe
df['cluster'] = cluster_labels

# Per-cluster summary: mean/std/count of transactions, mean/std of value
profile_spec = {
    'monthly_transactions': ['mean', 'std', 'count'],
    'avg_transaction_value': ['mean', 'std'],
}
grouped = df.groupby('cluster')
cluster_profile = grouped.agg(profile_spec).round(2)

print('Cluster Profiles:')
print(cluster_profile)

In [None]:
# Business interpretation: name each cluster from its mean activity and spend
def _name_segment(avg_trans, avg_value):
    """Return (segment name, strategy) for a cluster's mean monthly
    transactions and mean transaction value."""
    if avg_trans > 6 and avg_value > 600:
        return 'High-Value Frequent', 'Premium loyalty program, exclusive offers'
    if avg_trans < 4 and avg_value < 400:
        return 'Low-Value Occasional', 'Re-engagement campaigns, incentivize activity'
    # Everything that is neither clearly high nor clearly low
    return 'Medium-Value Regular', 'Cross-sell opportunities, upgrade potential'


print('\nBusiness Interpretation:')
for cluster_id in range(3):
    cluster_data = df.loc[df['cluster'] == cluster_id]
    count = len(cluster_data)
    avg_trans = cluster_data['monthly_transactions'].mean()
    avg_value = cluster_data['avg_transaction_value'].mean()
    segment_name, strategy = _name_segment(avg_trans, avg_value)

    # One print per cluster; the joined lines match five separate prints
    print('\n'.join([
        f'\nCluster {cluster_id}: {segment_name}',
        f'  - Size: {count} customers',
        f'  - Avg Monthly Transactions: {avg_trans:.1f}',
        f'  - Avg Transaction Value: ${avg_value:.0f}',
        f'  - Strategy: {strategy}',
    ]))

## Exercises

### Exercise 1: Distance Metrics
Compare KNN performance with Euclidean vs Manhattan distance.

In [None]:
# Exercise 1: Compare distance metrics
# KNeighborsClassifier and cross_val_score are already imported in the Setup
# cell, so they are not re-imported here.

print("Comparing Distance Metrics:\n")
metrics = ['euclidean', 'manhattan']
for metric in metrics:
    # Use a dedicated name: rebinding `knn` here would replace the fitted
    # Section 2 model with an unfitted estimator, and re-running the
    # decision-boundary cell afterwards would then fail.
    knn_metric = KNeighborsClassifier(n_neighbors=5, metric=metric)
    # 5-fold cross-validated accuracy; reported as mean (+/- 2 std)
    scores = cross_val_score(knn_metric, X_scaled, y, cv=5)
    print(f'{metric.capitalize():12s}: {scores.mean():.3f} (+/- {scores.std()*2:.3f})')

# Interpretation
print("\n--- Interpretation ---")
print("Euclidean distance: shortest path (as the crow flies)")
print("Manhattan distance: grid-based path (city blocks)")
print("Similar performance indicates well-separated clusters.")

### Exercise 2: Weighted KNN
Compare uniform vs distance weighting.

In [None]:
# Exercise 2: Compare uniform vs distance weighting
print("Comparing Weighting Schemes:\n")
weights_options = ['uniform', 'distance']
for weight in weights_options:
    # Use a dedicated name: rebinding `knn` here would replace the fitted
    # Section 2 model with an unfitted estimator.
    knn_weighted = KNeighborsClassifier(n_neighbors=5, weights=weight)
    # 5-fold cross-validated accuracy; reported as mean (+/- 2 std)
    scores = cross_val_score(knn_weighted, X_scaled, y, cv=5)
    print(f'{weight.capitalize():12s}: {scores.mean():.3f} (+/- {scores.std()*2:.3f})')

# Interpretation
print("\n--- Interpretation ---")
print("Uniform: all K neighbors vote equally")
# scikit-learn's weights='distance' weights each neighbor by the inverse of
# its distance (1/d), not the inverse square.
print("Distance: closer neighbors have more influence (w = 1/d)")
print("Distance weighting helps when some neighbors are much closer.")

## Summary

Key takeaways:

1. KNN is an instance-based (lazy) learner, used here for classification
2. K-Means is an iterative clustering algorithm for unsupervised grouping
3. Feature scaling is critical for both algorithms because both rely on distance calculations
4. Cluster profiling provides actionable business insights