## Student Information & LOs

**Name:** [Your Name]  
**Roll Number:** [Your Roll Number]  
**Date:** [DD/MM/YYYY]  

### Learning Outcomes
- [ ] LO1: Understand clustering concepts
- [ ] LO2: Implement K-Means algorithm
- [ ] LO3: Implement Hierarchical clustering
- [ ] LO4: Implement DBSCAN
- [ ] LO5: Determine optimal number of clusters
- [ ] LO6: Calculate silhouette score
- [ ] LO7: Visualize cluster assignments
- [ ] LO8: Compare clustering algorithms
- [ ] LO9: Handle outliers in clustering
- [ ] LO10: Select appropriate algorithm

In [None]:
# Environment Setup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_blobs, load_iris
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from scipy.cluster.hierarchy import dendrogram, linkage
import warnings
warnings.filterwarnings('ignore')

# Confirmation banner (emoji restored from mis-decoded UTF-8 bytes).
print("📚 Libraries imported successfully!")

# Matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*".
# Fall back to the legacy name so the notebook also runs on older installs.
_preferred_style = 'seaborn-v0_8-darkgrid'
plt.style.use(
    _preferred_style if _preferred_style in plt.style.available
    else 'seaborn-darkgrid'
)

## Phase 1: Data Preparation

In [None]:
# Synthetic data: 300 two-dimensional points drawn around 4 blob centres
X, y_true = make_blobs(
    n_samples=300,
    n_features=2,
    centers=4,
    cluster_std=0.8,
    random_state=42,
)

# Standardise the features before any distance-based clustering
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

print(f"Dataset shape: {X_scaled.shape}")
print(f"True clusters: {len(np.unique(y_true))}")

# Scatter plot of the scaled points, without any cluster colouring yet
feature_1, feature_2 = X_scaled.T
plt.scatter(feature_1, feature_2, alpha=0.6)
plt.title('Original Data')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.show()

## Phase 2: K-Means Clustering

In [None]:
# Fit K-Means with 4 clusters and read the assignments off the fitted model
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
kmeans.fit(X_scaled)
kmeans_labels = kmeans.labels_

# Report cluster quality: silhouette score and within-cluster inertia
kmeans_silhouette = silhouette_score(X_scaled, kmeans_labels)
print(f"K-Means Silhouette Score: {kmeans_silhouette:.4f}")
print(f"Inertia: {kmeans.inertia_:.4f}")

# Points coloured by assigned cluster, with the centroids drawn on top
points_x, points_y = X_scaled.T
centroid_x, centroid_y = kmeans.cluster_centers_.T
plt.scatter(points_x, points_y, c=kmeans_labels, cmap='viridis', alpha=0.6)
plt.scatter(centroid_x, centroid_y, c='red', marker='X', s=200, label='Centroids')
plt.title('K-Means Clustering')
plt.legend()
plt.show()

## Phase 3: Find Optimal K

In [None]:
# Sweep K and record inertia (elbow method) and silhouette score for each fit
K_range = range(2, 11)
inertias, silhouette_scores = [], []

for k in K_range:
    km = KMeans(n_clusters=k, n_init=10, random_state=42).fit(X_scaled)
    inertias += [km.inertia_]
    silhouette_scores += [silhouette_score(X_scaled, km.labels_)]

# One panel per criterion, drawn side by side
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
panels = (
    (inertias, 'Inertia', 'Elbow Method'),
    (silhouette_scores, 'Silhouette Score', 'Silhouette Method'),
)
for ax, (values, y_label, panel_title) in zip(axes, panels):
    ax.plot(K_range, values, marker='o')
    ax.set_xlabel('K')
    ax.set_ylabel(y_label)
    ax.set_title(panel_title)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Pick the K whose silhouette score is highest (argmax keeps the first on ties)
best_index = np.argmax(silhouette_scores)
optimal_k = K_range[best_index]
print(f"Optimal K: {optimal_k}")

## Phase 4: Hierarchical Clustering

In [None]:
# Agglomerative clustering into 4 groups using Ward linkage
hierarchical = AgglomerativeClustering(linkage='ward', n_clusters=4)
hierarchical.fit(X_scaled)
hier_labels = hierarchical.labels_

# Silhouette score of the resulting assignment
hier_silhouette = silhouette_score(X_scaled, hier_labels)
print(f"Hierarchical Silhouette Score: {hier_silhouette:.4f}")

# Scatter plot coloured by hierarchical cluster label
hier_x, hier_y = X_scaled.T
plt.scatter(hier_x, hier_y, c=hier_labels, cmap='viridis', alpha=0.6)
plt.title('Hierarchical Clustering (Ward)')
plt.show()

## Phase 5: Dendrogram

In [None]:
# Create dendrogram (using sample for speed)
# A seeded generator draws the same rows - and therefore the same dendrogram -
# on every run, in line with the fixed random_state=42 used for the data set
# and the K-Means fits.
rng = np.random.default_rng(42)
n_points = X_scaled.shape[0]
sample_size = min(50, n_points)  # never request more rows than exist
sample_indices = rng.choice(n_points, sample_size, replace=False)
X_sample = X_scaled[sample_indices]

# Ward linkage on the sampled rows only
linkage_matrix = linkage(X_sample, method='ward')

# Leaf labels on the x-axis are positions within X_sample, not rows of X_scaled
plt.figure(figsize=(12, 5))
dendrogram(linkage_matrix)
plt.title('Dendrogram')
plt.xlabel('Sample')
plt.ylabel('Distance')
plt.show()

## Phase 6: DBSCAN Clustering

In [None]:
# DBSCAN clustering
dbscan = DBSCAN(eps=0.5, min_samples=5)
dbscan_labels = dbscan.fit_predict(X_scaled)

# DBSCAN marks noise points with the label -1; they are outliers, not a cluster.
noise_mask = dbscan_labels == -1
n_clusters = len(set(dbscan_labels[~noise_mask]))
n_outliers = int(noise_mask.sum())

print(f"DBSCAN - Clusters found: {n_clusters}")
print(f"DBSCAN - Outliers: {n_outliers}")

if n_clusters > 1:
    # Score only the clustered points. Passing the -1 labels through would make
    # silhouette_score treat all noise points as one additional cluster and
    # distort the result.
    dbscan_silhouette = silhouette_score(X_scaled[~noise_mask], dbscan_labels[~noise_mask])
    print(f"DBSCAN Silhouette Score: {dbscan_silhouette:.4f}")

# Visualize DBSCAN
# One palette entry per distinct label; real clusters index it by their id,
# noise is always drawn as black crosses.
colors = plt.cm.Spectral(np.linspace(0, 1, len(set(dbscan_labels))))
for label in set(dbscan_labels):
    if label == -1:
        color = 'black'
        marker = 'x'
    else:
        color = colors[label]
        marker = 'o'

    mask = dbscan_labels == label
    plt.scatter(X_scaled[mask, 0], X_scaled[mask, 1],
                c=[color], marker=marker, s=50, alpha=0.6)

plt.title('DBSCAN Clustering')
plt.show()

## Phase 7: Algorithm Comparison

In [None]:
# Comparison metrics
def _metrics_row(algorithm, data, labels):
    """Build one comparison-table row of internal validity scores.

    Args:
        algorithm: Display name used in the 'Algorithm' column.
        data: Feature matrix holding the points that were clustered.
        labels: Cluster label for each row of `data` (noise already removed).

    Returns:
        dict with the algorithm name, the number of distinct labels, and the
        silhouette, Davies-Bouldin and Calinski-Harabasz scores.
    """
    return {
        'Algorithm': algorithm,
        'Clusters': len(np.unique(labels)),
        'Silhouette': silhouette_score(data, labels),
        'Davies-Bouldin': davies_bouldin_score(data, labels),
        'Calinski-Harabasz': calinski_harabasz_score(data, labels),
    }


comparison_data = [
    _metrics_row('K-Means', X_scaled, kmeans_labels),
    _metrics_row('Hierarchical', X_scaled, hier_labels),
]

if n_clusters > 1:
    # DBSCAN labels noise as -1. Drop those points before scoring so that the
    # noise is not evaluated as if it were one more cluster.
    clustered = dbscan_labels != -1
    comparison_data.append(
        _metrics_row('DBSCAN', X_scaled[clustered], dbscan_labels[clustered])
    )

comparison_df = pd.DataFrame(comparison_data)
print("\nClustering Algorithm Comparison:")
print(comparison_df.to_string(index=False))

## Phase 8: Practical Tests

In [None]:
# TEST 1: K-MEANS
# Checks the fitted K-Means model: silhouette above 0.4, exactly 4 distinct
# labels and a (4, 2) centroid array. The outcome is stored in test1_result.
# (Emoji in the messages restored from mis-decoded UTF-8 bytes.)
print("🧪 TEST 1: K-MEANS CLUSTERING")
try:
    assert kmeans_silhouette > 0.4, "K-Means silhouette too low!"
    assert len(np.unique(kmeans_labels)) == 4, "K-Means didn't create 4 clusters!"
    assert kmeans.cluster_centers_.shape == (4, 2), "Centroids shape incorrect!"
except AssertionError as e:
    print(f"❌ TEST 1 FAILED: {e}")
    test1_result = "FAILED"
else:
    print(f"✅ TEST 1 PASSED: Silhouette = {kmeans_silhouette:.4f}")
    test1_result = "PASSED"

In [None]:
# TEST 2: HIERARCHICAL
# Checks the hierarchical clustering: silhouette above 0.4 and exactly 4
# distinct labels. The outcome is stored in test2_result.
# (Emoji in the messages restored from mis-decoded UTF-8 bytes.)
print("\n🧪 TEST 2: HIERARCHICAL CLUSTERING")
try:
    assert hier_silhouette > 0.4, "Hierarchical silhouette too low!"
    assert len(np.unique(hier_labels)) == 4, "Hierarchical didn't create 4 clusters!"
except AssertionError as e:
    print(f"❌ TEST 2 FAILED: {e}")
    test2_result = "FAILED"
else:
    print(f"✅ TEST 2 PASSED: Silhouette = {hier_silhouette:.4f}")
    test2_result = "PASSED"

In [None]:
# TEST 3: DBSCAN
# Checks that DBSCAN found at least one cluster and that the outlier count is
# non-negative. The outcome is stored in test3_result.
# (Emoji in the messages restored from mis-decoded UTF-8 bytes.)
print("\n🧪 TEST 3: DBSCAN CLUSTERING")
try:
    assert n_clusters > 0, "DBSCAN found no clusters!"
    assert n_outliers >= 0, "Outlier count invalid!"
except AssertionError as e:
    print(f"❌ TEST 3 FAILED: {e}")
    test3_result = "FAILED"
else:
    print(f"✅ TEST 3 PASSED: {n_clusters} clusters, {n_outliers} outliers")
    test3_result = "PASSED"

In [None]:
# TEST 4: OPTIMAL K
# Checks that the selected K lies strictly between 1 and 11, i.e. inside the
# range(2, 11) sweep. The outcome is stored in test4_result.
# (Emoji in the messages restored from mis-decoded UTF-8 bytes.)
print("\n🧪 TEST 4: OPTIMAL K SELECTION")
try:
    assert optimal_k > 1, "Optimal K invalid!"
    assert optimal_k < 11, "Optimal K out of range!"
except AssertionError as e:
    print(f"❌ TEST 4 FAILED: {e}")
    test4_result = "FAILED"
else:
    print(f"✅ TEST 4 PASSED: Optimal K = {optimal_k}")
    test4_result = "PASSED"

In [None]:
# TEST 5: COMPARISON
# Checks that the comparison table has at least two rows and that every
# silhouette and Davies-Bouldin score in it is positive. The outcome is stored
# in test5_result.
# (Emoji in the messages restored from mis-decoded UTF-8 bytes.)
print("\n🧪 TEST 5: ALGORITHM COMPARISON")
try:
    assert len(comparison_df) >= 2, "Comparison incomplete!"
    assert (comparison_df['Silhouette'] > 0).all(), "Invalid silhouette scores!"
    assert (comparison_df['Davies-Bouldin'] > 0).all(), "Invalid DB scores!"
except AssertionError as e:
    print(f"❌ TEST 5 FAILED: {e}")
    test5_result = "FAILED"
else:
    print(f"✅ TEST 5 PASSED: {len(comparison_df)} algorithms compared")
    test5_result = "PASSED"

## Results Summary

In [None]:
# Single source of truth for the outcomes: the table, the pass count and the
# total are all derived from this mapping, so they cannot drift apart.
_results_by_test = {
    'Test 1: K-Means': test1_result,
    'Test 2: Hierarchical': test2_result,
    'Test 3: DBSCAN': test3_result,
    'Test 4: Optimal K': test4_result,
    'Test 5: Comparison': test5_result,
}

test_summary = pd.DataFrame(
    [{'Test': name, 'Result': outcome} for name, outcome in _results_by_test.items()]
)

_banner = "=" * 60
print("\n" + _banner)
print("TEST RESULTS SUMMARY")
print(_banner)
print(test_summary.to_string(index=False))

# Score line (emoji restored from mis-decoded UTF-8 bytes).
passed = sum(outcome == "PASSED" for outcome in _results_by_test.values())
total = len(_results_by_test)
print(f"\n📊 SCORE: {passed}/{total} TESTS PASSED ({passed*100/total:.0f}%)")
print(_banner)