# K-Nearest Neighbors (KNN) Classification

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Digital-AI-Finance/methods-algorithms/blob/master/notebooks/L03_knn.ipynb)

**Learning Objectives:**
- Understand how KNN classifies new data points using neighbor voting
- Visualize decision boundaries and see how K affects model complexity
- Learn why feature scaling is critical for distance-based methods
- Use cross-validation to select the optimal K

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from matplotlib.colors import ListedColormap
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Seed NumPy's global RNG so any np.random draws repeat across runs
np.random.seed(42)

# Notebook-wide matplotlib defaults, assigned one key at a time
plt.rcParams['figure.figsize'] = (10, 6)
plt.rcParams['font.size'] = 12
plt.rcParams['axes.titlesize'] = 14
plt.rcParams['axes.labelsize'] = 12

# ML color palette (hex codes shared by the plots in this notebook)
ML_PURPLE, ML_BLUE = '#3333B2', '#0066CC'
ML_ORANGE, ML_GREEN, ML_RED = '#FF7F0E', '#2CA02C', '#D62728'

print('Setup complete.')

## 1. Generate Data

In [None]:
# Synthetic binary problem with exactly two informative features,
# so every sample can be drawn directly on a 2-D plot.
X, y = make_classification(
    n_samples=200,
    n_features=2,
    n_informative=2,
    n_redundant=0,
    n_clusters_per_class=1,
    flip_y=0.1,  # fraction of samples whose label is assigned at random (label noise)
    random_state=42,
)

# Hold out 30% of the samples for the final evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

print(f'Training samples: {X_train.shape[0]}')
print(f'Test samples:     {X_test.shape[0]}')
print(f'Class distribution: {np.bincount(y)}')

In [None]:
# Visual 1: Scatter plot of raw data
# One color per sample: blue for label 0, orange for anything else
sample_colors = [ML_ORANGE if label != 0 else ML_BLUE for label in y]

# Legend entries are built by hand because the points are colored individually
class_handles = [
    mpatches.Patch(color=ML_BLUE, label='Class 0'),
    mpatches.Patch(color=ML_ORANGE, label='Class 1'),
]

fig, ax = plt.subplots(figsize=(10, 6))
scatter = ax.scatter(
    X[:, 0], X[:, 1],
    c=sample_colors, edgecolors='white', s=60, alpha=0.8,
)
ax.set_xlabel('Feature 1')
ax.set_ylabel('Feature 2')
ax.set_title('Two-Class Dataset')
ax.legend(handles=class_handles, loc='upper right')
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

## 2. How KNN Works

In [None]:
# Visual 2: Show a query point and its K=3 nearest neighbors
query_point = np.array([[0.5, 0.5]])
qx, qy = query_point[0]

knn_3 = KNeighborsClassifier(n_neighbors=3)
knn_3.fit(X_train, y_train)

# kneighbors returns one row per query; there is a single query, so row 0 is used below
distances, indices = knn_3.kneighbors(query_point)

fig, ax = plt.subplots(figsize=(10, 6))

# Background: every training point, faded
train_colors = [ML_ORANGE if label != 0 else ML_BLUE for label in y_train]
ax.scatter(X_train[:, 0], X_train[:, 1], c=train_colors,
           edgecolors='white', s=50, alpha=0.5)

# Highlight the K=3 nearest neighbors: enlarged marker, dashed connector, distance label
for nbr_idx, nbr_dist in zip(indices[0], distances[0]):
    nx, ny = X_train[nbr_idx]
    nbr_color = ML_ORANGE if y_train[nbr_idx] != 0 else ML_BLUE
    ax.scatter(nx, ny, c=nbr_color, s=200, edgecolors='black', linewidths=2, zorder=5)
    ax.plot([qx, nx], [qy, ny], 'k--', alpha=0.6, linewidth=1.5)
    # Place the distance label at the midpoint of the connector
    label_pos = ((qx + nx) / 2, (qy + ny) / 2)
    ax.annotate(f'd={nbr_dist:.2f}', label_pos, fontsize=9,
                bbox=dict(boxstyle='round,pad=0.2', facecolor='white', alpha=0.8))

# Dashed circle centered on the query, 5% wider than the farthest of the 3 neighbors
radius = distances[0].max() * 1.05
circle = plt.Circle(query_point[0], radius, fill=False, color=ML_RED,
                    linestyle='--', linewidth=2)
ax.add_patch(circle)

# Query point drawn last with the highest zorder so it sits on top
ax.scatter(qx, qy, c=ML_RED, marker='*', s=400, edgecolors='black',
           linewidths=1.5, zorder=10, label='Query point')

pred = knn_3.predict(query_point)[0]
ax.set_title(f'KNN with K=3: Query Point Classified as Class {pred}')
ax.set_xlabel('Feature 1')
ax.set_ylabel('Feature 2')
ax.legend(loc='upper right')
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

In [None]:
# Visual 3: Decision boundaries for K=1, K=5, K=15
def plot_decision_boundary(ax, X, y, k, title):
    """Fit a k-nearest-neighbors classifier on (X, y) and draw its decision regions.

    The classifier is evaluated on a regular grid (0.05 spacing) that extends
    0.5 units beyond the data in both directions; the predicted regions are
    shaded and the samples are drawn on top. The accuracy shown in the panel
    title is computed on the same data the classifier was fitted on.

    Parameters
    ----------
    ax : matplotlib axes to draw on.
    X : 2-D array whose columns 0 and 1 are used as the plot coordinates.
    y : labels for the rows of X; label 0 is drawn blue, anything else orange.
    k : number of neighbors passed to KNeighborsClassifier.
    title : first line of the panel title.
    """
    step = 0.05
    pad = 0.5
    feat_1, feat_2 = X[:, 0], X[:, 1]
    grid_x = np.arange(feat_1.min() - pad, feat_1.max() + pad, step)
    grid_y = np.arange(feat_2.min() - pad, feat_2.max() + pad, step)
    xx, yy = np.meshgrid(grid_x, grid_y)

    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(X, y)

    # Flatten the grid into (n_points, 2), predict, then restore the grid shape
    grid_points = np.column_stack((xx.ravel(), yy.ravel()))
    Z = model.predict(grid_points).reshape(xx.shape)

    region_cmap = ListedColormap(['#cce0ff', '#ffe0cc'])
    ax.contourf(xx, yy, Z, alpha=0.4, cmap=region_cmap)

    point_colors = [ML_ORANGE if label != 0 else ML_BLUE for label in y]
    ax.scatter(feat_1, feat_2, c=point_colors, edgecolors='white', s=30, alpha=0.8)

    train_acc = model.score(X, y)
    ax.set_title(f'{title}\nTrain Acc: {train_acc:.2f}')
    ax.set_xlabel('Feature 1')
    ax.set_ylabel('Feature 2')
    ax.grid(True, alpha=0.3)

# One panel per neighborhood size, all fitted on the training split
k_values = [1, 5, 15]
panel_titles = ['K=1 (Overfitting)', 'K=5 (Balanced)', 'K=15 (Underfitting)']

fig, axes = plt.subplots(1, 3, figsize=(18, 5))
for ax, k, title in zip(axes, k_values, panel_titles):
    plot_decision_boundary(ax, X_train, y_train, k, title)

# y=1.02 lifts the overall title slightly above the panel titles
fig.suptitle('Effect of K on Decision Boundaries', fontsize=16, y=1.02)
fig.tight_layout()
plt.show()

## 3. Feature Scaling Matters

In [None]:
# Visual 4: Before/after scaling comparison
# Stretch feature 1 by 1000 and shrink feature 2 by 100 to create very different scales
X_unscaled = X_train * np.array([1000, 0.01])

# Standardize the distorted data
scaler = StandardScaler()
scaler.fit(X_unscaled)
X_scaled = scaler.transform(X_unscaled)

# Axis labels for the unscaled panel report the observed value ranges
f1_lo, f1_hi = X_unscaled[:, 0].min(), X_unscaled[:, 0].max()
f2_lo, f2_hi = X_unscaled[:, 1].min(), X_unscaled[:, 1].max()

train_colors = [ML_ORANGE if label != 0 else ML_BLUE for label in y_train]

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# (axes, data, title, x label, y label, note text, note color) for each panel
panels = [
    (axes[0], X_unscaled, 'Before Scaling (Unscaled)',
     f'Feature 1 (range: {f1_lo:.0f} to {f1_hi:.0f})',
     f'Feature 2 (range: {f2_lo:.3f} to {f2_hi:.3f})',
     'Feature 1 dominates\ndistance calculations!', ML_RED),
    (axes[1], X_scaled, 'After StandardScaler',
     'Feature 1 (standardized)',
     'Feature 2 (standardized)',
     'Both features contribute\nequally to distances', ML_GREEN),
]

for panel, points, heading, x_text, y_text, note, note_color in panels:
    panel.scatter(points[:, 0], points[:, 1], c=train_colors,
                  edgecolors='white', s=50, alpha=0.7)
    panel.set_title(heading)
    panel.set_xlabel(x_text)
    panel.set_ylabel(y_text)
    panel.grid(True, alpha=0.3)
    # Note box is positioned in axes-fraction coordinates: centered, near the bottom edge
    panel.annotate(note,
                   xy=(0.5, 0.05), xycoords='axes fraction', fontsize=11,
                   ha='center', color=note_color, fontweight='bold',
                   bbox=dict(boxstyle='round', facecolor='lightyellow'))

fig.suptitle('Why Feature Scaling Matters for KNN', fontsize=14)
fig.tight_layout()
plt.show()

## 4. Finding the Best K

In [None]:
# Visual 5: Cross-validation accuracy curve
from sklearn.pipeline import make_pipeline

# Scaled copies of the train/test splits. The scaler is fitted on the training
# split only and then applied to the test split.
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)

# Score K = 1..20 with 5-fold cross-validation on the training split.
# Scaling is part of the pipeline so that, inside every CV split, the scaler is
# fitted on the training folds only. Cross-validating on data that was scaled
# beforehand would let each validation fold influence the scaling statistics.
k_range = range(1, 21)
cv_scores = []
for k in k_range:
    knn_pipeline = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=k))
    scores = cross_val_score(knn_pipeline, X_train, y_train, cv=5, scoring='accuracy')
    cv_scores.append(scores.mean())

# argmax returns the first maximum, so ties resolve to the smallest K
best_idx = int(np.argmax(cv_scores))
best_k = k_range[best_idx]
best_score = cv_scores[best_idx]

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(list(k_range), cv_scores, 'o-', color=ML_PURPLE, linewidth=2, markersize=6)
ax.axvline(x=best_k, color=ML_RED, linestyle='--', linewidth=2, label=f'Best K={best_k}')
ax.scatter([best_k], [best_score], color=ML_RED, s=200, zorder=5, edgecolors='black')
# Offset the label from the marker so the arrow stays visible
ax.annotate(f'Best K={best_k}\nAcc={best_score:.3f}',
            xy=(best_k, best_score), xytext=(best_k + 2, best_score - 0.02),
            fontsize=11, arrowprops=dict(arrowstyle='->', color='black'),
            bbox=dict(boxstyle='round', facecolor='lightyellow'))
ax.set_xlabel('K (Number of Neighbors)')
ax.set_ylabel('5-Fold Cross-Validation Accuracy')
ax.set_title('Selecting the Optimal K via Cross-Validation')
ax.set_xticks(list(k_range))
ax.legend(loc='lower right')
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

print(f'Best K: {best_k} with CV accuracy: {best_score:.3f}')

## 5. Distance Metrics

In [None]:
# Visual 6: Unit balls for Euclidean, Manhattan, Chebyshev
# Each panel shades the set of points at distance <= 1 from the origin under one metric.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# Euclidean (L2) - circle, traced parametrically
theta = np.linspace(0, 2 * np.pi, 500)
x_l2 = np.cos(theta)
y_l2 = np.sin(theta)
axes[0].fill(x_l2, y_l2, alpha=0.3, color=ML_BLUE)
axes[0].plot(x_l2, y_l2, color=ML_BLUE, linewidth=2)
axes[0].set_title('Euclidean (p=2)\nCircle', fontsize=13)
axes[0].set_aspect('equal')
axes[0].annotate(r'$d = \sqrt{\sum(x_i - y_i)^2}$', xy=(0, -0.3),
                 fontsize=12, ha='center')

# Manhattan (L1) - diamond, given by its four vertices
# (the first vertex is repeated so the outline closes)
x_l1 = np.array([1, 0, -1, 0, 1])
y_l1 = np.array([0, 1, 0, -1, 0])
axes[1].fill(x_l1, y_l1, alpha=0.3, color=ML_ORANGE)
axes[1].plot(x_l1, y_l1, color=ML_ORANGE, linewidth=2)
axes[1].set_title('Manhattan (p=1)\nDiamond', fontsize=13)
axes[1].set_aspect('equal')
axes[1].annotate(r'$d = \sum|x_i - y_i|$', xy=(0, -0.3),
                 fontsize=12, ha='center')

# Chebyshev (L-inf) - square, given by its four corners (first corner repeated)
x_linf = np.array([1, 1, -1, -1, 1])
y_linf = np.array([1, -1, -1, 1, 1])
axes[2].fill(x_linf, y_linf, alpha=0.3, color=ML_GREEN)
axes[2].plot(x_linf, y_linf, color=ML_GREEN, linewidth=2)
axes[2].set_title(r'Chebyshev ($p=\infty$)' + '\nSquare', fontsize=13)
axes[2].set_aspect('equal')
axes[2].annotate(r'$d = \max|x_i - y_i|$', xy=(0, -0.3),
                 fontsize=12, ha='center')

# Shared framing: identical limits on every panel so the three shapes are directly comparable
for ax in axes:
    ax.set_xlim(-1.5, 1.5)
    ax.set_ylim(-1.5, 1.5)
    ax.axhline(0, color='gray', linewidth=0.5)
    ax.axvline(0, color='gray', linewidth=0.5)
    ax.grid(True, alpha=0.3)
    ax.set_xlabel('x')
    ax.set_ylabel('y')

plt.suptitle('Unit Balls for Different Distance Metrics', fontsize=14)
plt.tight_layout()
plt.show()

## 6. Final Model & Evaluation

In [None]:
# Fit the final classifier on the scaled training split, using the K chosen by cross-validation
knn_final = KNeighborsClassifier(n_neighbors=best_k)
knn_final.fit(X_train_s, y_train)

# Evaluate on the held-out, scaled test split
y_pred = knn_final.predict(X_test_s)
acc = accuracy_score(y_test, y_pred)

# The headline accuracy is followed by one blank line, then the per-class report
print(f'Test Accuracy with K={best_k}: {acc:.3f}', end='\n\n')
class_names = ['Class 0', 'Class 1']
print(classification_report(y_test, y_pred, target_names=class_names))

In [None]:
# Visual 7: Decision boundary with test points overlaid
fig, ax = plt.subplots(figsize=(10, 6))

# Prediction grid: 0.05 spacing, extended 1 unit beyond the scaled training data
step, margin = 0.05, 1
col_0, col_1 = X_train_s[:, 0], X_train_s[:, 1]
xx, yy = np.meshgrid(
    np.arange(col_0.min() - margin, col_0.max() + margin, step),
    np.arange(col_1.min() - margin, col_1.max() + margin, step),
)
grid_points = np.column_stack((xx.ravel(), yy.ravel()))
Z = knn_final.predict(grid_points).reshape(xx.shape)

region_cmap = ListedColormap(['#cce0ff', '#ffe0cc'])
ax.contourf(xx, yy, Z, alpha=0.4, cmap=region_cmap)

# Training points (small, faded)
train_colors = [ML_ORANGE if label != 0 else ML_BLUE for label in y_train]
ax.scatter(col_0, col_1, c=train_colors, s=20, alpha=0.3, label='Train')

# Test points (large, bold), colored by their true label
test_colors = [ML_ORANGE if label != 0 else ML_BLUE for label in y_test]
ax.scatter(X_test_s[:, 0], X_test_s[:, 1], c=test_colors,
           s=80, edgecolors='black', linewidths=1.5, alpha=0.9, label='Test')

# Ring every test point whose prediction disagrees with its true label
wrong = y_test != y_pred
if np.any(wrong):
    wrong_points = X_test_s[wrong]
    ax.scatter(wrong_points[:, 0], wrong_points[:, 1],
               facecolors='none', edgecolors=ML_RED, s=200, linewidths=2.5,
               label='Misclassified')

ax.set_xlabel('Feature 1 (scaled)')
ax.set_ylabel('Feature 2 (scaled)')
ax.set_title(f'Final KNN (K={best_k}) Decision Boundary with Test Points')
ax.legend(loc='upper right')
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

In [None]:
# Visual 8: Confusion matrix heatmap
cm = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(figsize=(7, 6))
im = ax.imshow(cm, interpolation='nearest', cmap='Blues')
ax.set_title(f'Confusion Matrix (K={best_k})', fontsize=14)
fig.colorbar(im, ax=ax, shrink=0.8)

# Label both axes with the class names
classes = ['Class 0', 'Class 1']
tick_marks = [0, 1]
ax.set_xticks(tick_marks)
ax.set_xticklabels(classes)
ax.set_yticks(tick_marks)
ax.set_yticklabels(classes)

# Write each count into its cell. Cells above half the largest count get
# white text, the rest get black text.
thresh = cm.max() / 2
for row, col in np.ndindex(2, 2):
    count = cm[row, col]
    if count > thresh:
        text_color = 'white'
    else:
        text_color = 'black'
    # imshow puts matrix rows on the y axis, so the text position is (col, row)
    ax.text(col, row, str(count), ha='center', va='center',
            fontsize=20, fontweight='bold', color=text_color)

ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
ax.grid(False)
fig.tight_layout()
plt.show()

## Summary

**Key Takeaways:**
- **KNN is a lazy learner** -- it stores all training data and classifies new points by majority vote among the K nearest neighbors
- **K controls the bias-variance tradeoff**: small K = complex boundary (low bias, high variance; prone to overfitting), large K = smooth boundary (high bias, low variance; prone to underfitting)
- **Feature scaling is essential** because KNN relies on distances -- unscaled features with larger ranges dominate the distance calculation
- **Use cross-validation** to find the optimal K rather than guessing