# Clustering Algorithm Analysis

This notebook demonstrates how to run and compare different clustering algorithms using our framework.

In [None]:
# Import libraries
# Scientific/plotting stack plus the stdlib helpers needed for the path setup below.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sys
from pathlib import Path

# Add src to path
# NOTE: '../src' is a relative path, so it is resolved against the kernel's current
# working directory — the notebook must be run from its own folder for this to work.
sys.path.append('../src')

# Project clustering wrappers and the synthetic data generator.
# NOTE(review): presumably `clustering_analysis` lives under ../src, which is why this
# import has to come after the sys.path tweak above — confirm against the repo layout.
from clustering_analysis import (
    ClusteringExperiment,
    KMeansClusterer, FuzzyCMeansClusterer, GaussianMixtureClusterer,
    DBSCANClusterer, SpectralClusterer,
    SyntheticDataGenerator
)

## Load Sample Dataset

Let's generate a synthetic sample dataset (controlled by `bar_omega`, `K`, `p`, and `n`) for analysis.

In [None]:
# Initialize data generator
# NOTE(review): the argument looks like the directory the generator works with —
# confirm against SyntheticDataGenerator.
generator = SyntheticDataGenerator("../data/synthetic")

# Generate a sample dataset.
# `bar_omega` and `K` are reused by later cells (plot title, n_clusters).
bar_omega = 0.1  # presumably the average cluster overlap — TODO confirm with the generator
K = 3            # number of clusters
p = 2  # 2D for visualization
n = 1000         # presumably the number of samples — TODO confirm

X, y_true = generator.generate_dataset(bar_omega, K, p, n, random_state=42)

# Compute the label histogram once and convert it to native Python types:
# under NumPy >= 2 a dict of NumPy scalars prints as {np.int64(0): np.int64(334), ...}.
cluster_labels, cluster_counts = np.unique(y_true, return_counts=True)
cluster_distribution = dict(zip(cluster_labels.tolist(), cluster_counts.tolist()))

print(f"Dataset shape: {X.shape}")
print(f"True clusters: {len(cluster_labels)}")
print(f"Cluster distribution: {cluster_distribution}")

## Visualize Original Dataset

In [None]:
# Plot the original dataset, one scatter series per true cluster
fig, ax = plt.subplots(figsize=(10, 6))

# Shared palette: later cells read `colors` as well, so it must stay defined here.
colors = ['red', 'blue', 'green', 'orange', 'purple']

# Index the palette by position (wrapping around) instead of by raw label, so the
# cell keeps working with more clusters than colors or with non 0-based labels.
for color_idx, cluster_id in enumerate(np.unique(y_true)):
    mask = y_true == cluster_id
    ax.scatter(X[mask, 0], X[mask, 1],
               c=colors[color_idx % len(colors)], alpha=0.7,
               label=f'True Cluster {cluster_id}', s=30)

ax.set_title(f'Original Dataset (BarOmega = {bar_omega})')
ax.set_xlabel('Feature 1')
ax.set_ylabel('Feature 2')
ax.legend()
ax.grid(True, alpha=0.3)
plt.show()

## Individual Algorithm Testing

Let's test two of the clustering algorithms (K-Means and DBSCAN) individually before running the full comparison.

### K-Means Clustering

In [None]:
# Test K-Means
from sklearn.preprocessing import StandardScaler

# Standardize data; `X_scaled` is reused by the later clustering and metrics cells.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Initialize and fit K-Means on the standardized features
kmeans = KMeansClusterer(n_clusters=K)
kmeans.fit(X_scaled)
y_kmeans = kmeans.labels_

print(f"K-Means execution time: {kmeans.get_execution_time():.4f} seconds")
print(f"Predicted clusters: {len(np.unique(y_kmeans))}")

# Plot results side by side, in the original (unscaled) coordinates
fig, (ax_true, ax_pred) = plt.subplots(1, 2, figsize=(12, 5))

panels = (
    (ax_true, y_true, 'True Labels'),
    (ax_pred, y_kmeans, 'K-Means Results'),
)
for ax, labels, title in panels:
    # Palette is indexed by position and wraps around, so more than len(colors)
    # clusters (or labels that are not 0..k-1) cannot raise IndexError.
    for color_idx, cluster_id in enumerate(np.unique(labels)):
        mask = labels == cluster_id
        ax.scatter(X[mask, 0], X[mask, 1],
                   c=colors[color_idx % len(colors)], alpha=0.7, s=30)
    ax.set_title(title)
    ax.set_xlabel('Feature 1')
    ax.set_ylabel('Feature 2')

plt.tight_layout()
plt.show()

### DBSCAN Clustering

In [None]:
# Test DBSCAN with parameter optimization
# NOTE(review): the optimizer is handed the ground-truth labels, so presumably the
# parameters are tuned against them and the scores are optimistic compared with a
# purely unsupervised run — confirm against DBSCANClusterer.fit.
dbscan = DBSCANClusterer(n_clusters=K)
dbscan.fit(X_scaled, optimize=True, true_labels=y_true)
y_dbscan = dbscan.labels_

# Label -1 marks noise points; compute the mask once and reuse it below.
noise_mask = y_dbscan == -1
n_noise = int(np.count_nonzero(noise_mask))
cluster_ids = np.unique(y_dbscan[~noise_mask])

print(f"DBSCAN execution time: {dbscan.get_execution_time():.4f} seconds")
print(f"Best parameters: {dbscan.best_params}")
print(f"Predicted clusters: {len(cluster_ids)} (excluding noise)")
print(f"Noise points: {n_noise}")

# Plot results side by side, in the original (unscaled) coordinates
fig, (ax_true, ax_pred) = plt.subplots(1, 2, figsize=(12, 5))

# True labels (palette indexed by position, wrapping around)
for color_idx, cluster_id in enumerate(np.unique(y_true)):
    mask = y_true == cluster_id
    ax_true.scatter(X[mask, 0], X[mask, 1],
                    c=colors[color_idx % len(colors)], alpha=0.7, s=30)
ax_true.set_title('True Labels')
ax_true.set_xlabel('Feature 1')
ax_true.set_ylabel('Feature 2')

# DBSCAN results: noise points first, in black, so the clusters are drawn on top
if n_noise:
    ax_pred.scatter(X[noise_mask, 0], X[noise_mask, 1],
                    c='black', alpha=0.5, s=10, label='Noise')

for color_idx, cluster_id in enumerate(cluster_ids):
    mask = y_dbscan == cluster_id
    ax_pred.scatter(X[mask, 0], X[mask, 1],
                    c=colors[color_idx % len(colors)], alpha=0.7, s=30)

ax_pred.set_title('DBSCAN Results')
ax_pred.set_xlabel('Feature 1')
ax_pred.set_ylabel('Feature 2')
# Only the noise series carries a label, so a legend is only useful when noise exists.
if n_noise:
    ax_pred.legend()

plt.tight_layout()
plt.show()

## Comprehensive Algorithm Comparison

Let's run all algorithms and compare their performance.

In [None]:
# Initialize all algorithms: one fresh clusterer per display name, all asked for K clusters
clusterer_classes = {
    'K-Means': KMeansClusterer,
    'Fuzzy C-Means': FuzzyCMeansClusterer,
    'Gaussian Mixture': GaussianMixtureClusterer,
    'DBSCAN': DBSCANClusterer,
    'Spectral': SpectralClusterer,
}
algorithms = {label: cls(n_clusters=K) for label, cls in clusterer_classes.items()}

# Algorithms that are fitted with parameter optimization against the true labels
tuned_algorithms = ('DBSCAN', 'Spectral')

# Run all algorithms, collecting per-algorithm summaries and label arrays.
# A failing algorithm is recorded (error message, None prediction) rather than
# aborting the whole comparison.
results = {}
predictions = {}

for name, algorithm in algorithms.items():
    print(f"\nRunning {name}...")

    try:
        # Fit algorithm
        fit_kwargs = {}
        if name in tuned_algorithms:
            fit_kwargs = {'optimize': True, 'true_labels': y_true}
        algorithm.fit(X_scaled, **fit_kwargs)

        # Store results (label -1 is excluded from the cluster count)
        elapsed = algorithm.get_execution_time()
        non_noise_labels = algorithm.labels_[algorithm.labels_ != -1]
        results[name] = {
            'execution_time': elapsed,
            'n_clusters': len(np.unique(non_noise_labels)),
            'algorithm': algorithm,
        }
        predictions[name] = algorithm.labels_

        print(f"  Execution time: {results[name]['execution_time']:.4f} seconds")
        print(f"  Clusters found: {results[name]['n_clusters']}")

    except Exception as exc:
        print(f"  Error: {exc}")
        results[name] = {'error': str(exc)}
        predictions[name] = None

## Visualize All Results

In [None]:
# Plot all results: the ground truth plus one panel per algorithm that produced labels
successful = {name: pred for name, pred in predictions.items() if pred is not None}
n_algorithms = len(successful)

# Size the grid from the number of panels so that no algorithm is silently dropped
# (five algorithms + ground truth still gives the 2x3 grid at 18x12 inches).
n_panels = n_algorithms + 1
n_cols = 3
n_rows = (n_panels + n_cols - 1) // n_cols  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 6 * n_rows), squeeze=False)
axes = axes.ravel()

# Plot true labels first (palette indexed by position, wrapping around)
ax = axes[0]
for color_idx, cluster_id in enumerate(np.unique(y_true)):
    mask = y_true == cluster_id
    ax.scatter(X[mask, 0], X[mask, 1],
               c=colors[color_idx % len(colors)], alpha=0.7, s=20)
ax.set_title('True Labels')
ax.set_xlabel('Feature 1')
ax.set_ylabel('Feature 2')
ax.grid(True, alpha=0.3)

# Plot algorithm results
for ax, (name, y_pred) in zip(axes[1:], successful.items()):
    # Handle noise points (label -1, e.g. DBSCAN): drawn first, small and black
    noise_mask = y_pred == -1
    if np.any(noise_mask):
        ax.scatter(X[noise_mask, 0], X[noise_mask, 1],
                   c='black', alpha=0.3, s=10)

    # Plot clusters; enumerate over all unique labels so that the color of each
    # cluster matches its position in the sorted label list, noise included.
    for color_idx, cluster_id in enumerate(np.unique(y_pred)):
        if cluster_id == -1:
            continue
        mask = y_pred == cluster_id
        ax.scatter(X[mask, 0], X[mask, 1],
                   c=colors[color_idx % len(colors)], alpha=0.7, s=20)

    # Add execution time to title
    exec_time = results[name].get('execution_time', 0)
    ax.set_title(f'{name}\n({exec_time:.3f}s)')
    ax.set_xlabel('Feature 1')
    ax.set_ylabel('Feature 2')
    ax.grid(True, alpha=0.3)

# Hide unused subplots
for unused_ax in axes[n_panels:]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

## Calculate and Compare Metrics

In [None]:
# Calculate metrics for each algorithm
from clustering_analysis import ClusteringMetrics


def format_metric(value):
    """Return `value` with three decimals, or 'N/A' when it is missing or non-numeric.

    Formatting the 'N/A' placeholder itself with ':.3f' raises ValueError, so the
    numeric check has to happen before the format spec is applied.
    """
    if isinstance(value, (int, float, np.integer, np.floating)):
        return f"{value:.3f}"
    return "N/A"


metrics_calculator = ClusteringMetrics()
metrics_results = []

for name, y_pred in predictions.items():
    # Algorithms that failed earlier have no predictions to score
    if y_pred is None:
        continue

    print(f"\nCalculating metrics for {name}...")

    # Calculate all metrics on the standardized features
    metrics = metrics_calculator.calculate_all_metrics(
        X_scaled, y_pred, y_true
    )

    # Add algorithm name and execution time so each row is self-describing
    metrics['algorithm'] = name
    metrics['execution_time'] = results[name].get('execution_time', 0)
    metrics_results.append(metrics)

    # Print key metrics
    print(f"  Adjusted Rand Score: {format_metric(metrics.get('adjusted_rand_score'))}")
    print(f"  Silhouette Score: {format_metric(metrics.get('silhouette_score'))}")
    print(f"  Execution Time: {metrics['execution_time']:.3f}s")

# Create DataFrame for easy comparison (one row per successfully scored algorithm)
metrics_df = pd.DataFrame(metrics_results)
print("\n" + "="*50)
print("METRICS COMPARISON")
print("="*50)
display(metrics_df.round(3))

## Performance Visualization

In [None]:
# Create performance comparison plots
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Bar panels: (axes, metrics_df column, title, y-axis label)
bar_panels = [
    (axes[0, 0], 'adjusted_rand_score', 'Adjusted Rand Score', 'Score'),
    (axes[0, 1], 'silhouette_score', 'Silhouette Score', 'Score'),
    (axes[1, 0], 'execution_time', 'Execution Time', 'Time (seconds)'),
]
for ax, column, title, ylabel in bar_panels:
    # Hide the panel instead of leaving an empty frame (or raising KeyError on an
    # empty metrics_df) when the metric was not computed.
    if column not in metrics_df.columns:
        ax.set_visible(False)
        continue
    ax.bar(metrics_df['algorithm'], metrics_df[column])
    ax.set_title(title)
    ax.set_ylabel(ylabel)
    ax.tick_params(axis='x', rotation=45)

# Performance vs Time scatter, one annotated point per algorithm
ax = axes[1, 1]
if {'adjusted_rand_score', 'execution_time'} <= set(metrics_df.columns):
    ax.scatter(metrics_df['execution_time'], metrics_df['adjusted_rand_score'])
    annotated_points = zip(metrics_df['algorithm'],
                           metrics_df['execution_time'],
                           metrics_df['adjusted_rand_score'])
    for label, seconds, score in annotated_points:
        ax.annotate(label, (seconds, score))
    ax.set_xlabel('Execution Time (seconds)')
    ax.set_ylabel('Adjusted Rand Score')
    ax.set_title('Performance vs Speed')
else:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

## Summary and Recommendations

Based on the analysis above, let's summarize the performance of each algorithm:

In [None]:
# Create summary table

# (exclusive lower bound on ARI, rating), checked in order; anything else is "Poor".
ARI_RATINGS = ((0.8, "Excellent"), (0.6, "Good"), (0.4, "Fair"))
# (exclusive upper bound on seconds, rating), checked in order; anything else is "Slow".
SPEED_RATINGS = ((0.01, "Very Fast"), (0.1, "Fast"), (1.0, "Moderate"))


def rate_performance(ari):
    """Map an Adjusted Rand Score to a qualitative rating ('N/A' when it is missing)."""
    if pd.isna(ari):
        return "N/A"
    for lower_bound, rating in ARI_RATINGS:
        if ari > lower_bound:
            return rating
    return "Poor"


def rate_speed(seconds):
    """Map an execution time in seconds to a qualitative speed rating."""
    for upper_bound, rating in SPEED_RATINGS:
        if seconds < upper_bound:
            return rating
    return "Slow"


def format_score(value, digits=3):
    """Format a score with `digits` decimals, or 'N/A' when it is missing/NaN."""
    return "N/A" if pd.isna(value) else f"{value:.{digits}f}"


summary_data = []

for _, row in metrics_df.iterrows():
    # A missing metric stays NaN so it is reported as "N/A" instead of being
    # presented as a genuine score of 0.000 with a "Poor" rating.
    ari = row.get('adjusted_rand_score', np.nan)
    silhouette = row.get('silhouette_score', np.nan)
    exec_time = row['execution_time']

    summary_data.append({
        'Algorithm': row['algorithm'],
        'Performance': rate_performance(ari),
        'Speed': rate_speed(exec_time),
        'ARI': format_score(ari),
        'Silhouette': format_score(silhouette),
        'Time (s)': f"{exec_time:.4f}"
    })

summary_df = pd.DataFrame(summary_data)
print("ALGORITHM PERFORMANCE SUMMARY")
print("="*50)
display(summary_df)

## Conclusion

In this notebook, we:

1. **Generated** a synthetic dataset with controlled cluster overlap
2. **Applied** five different clustering algorithms
3. **Visualized** the clustering results
4. **Calculated** comprehensive evaluation metrics
5. **Compared** algorithm performance

### Key Findings:

The performance of clustering algorithms depends heavily on:
- **Dataset characteristics** (overlap, shape, density)
- **Algorithm assumptions** (spherical vs arbitrary shapes)
- **Parameter tuning** (especially for DBSCAN and Spectral)
- **Data preprocessing** (standardization importance)

### Next Steps:

1. Test on datasets with different overlap levels
2. Analyze performance vs dataset size and dimensions
3. Explore real-world datasets
4. Fine-tune algorithm parameters

This framework provides a systematic approach to clustering algorithm evaluation and comparison.