# Clustering Results Visualization

This notebook demonstrates how to load and visualize clustering experiment results.

In [None]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sys
from pathlib import Path

# Add src to path
sys.path.append('../src')

from clustering_analysis.visualization import ClusteringVisualizer

# Set style
# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8*";
# older releases only know the legacy "seaborn" name. Use whichever name this
# installation provides. If neither is present, fall through to the modern
# name so the usual "style not found" error is raised instead of being hidden.
_seaborn_style = next(
    (name for name in ('seaborn-v0_8', 'seaborn') if name in plt.style.available),
    'seaborn-v0_8',
)
plt.style.use(_seaborn_style)
sns.set_palette("husl")

## Load Experiment Results

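First, let's load the results from our clustering experiments. If no `*_metrics.csv` files are found in the results directory, the cell below generates synthetic sample data so the rest of the notebook can still run; in that case the figures illustrate the workflow, not real experiment outcomes.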

In [None]:
# Define paths
results_dir = Path('../data/results/metrics')
figures_dir = Path('../data/results/figures')

# Create directories if they don't exist
results_dir.mkdir(parents=True, exist_ok=True)
figures_dir.mkdir(parents=True, exist_ok=True)

# Synthetic fallback file written by this cell when no results exist. Its name
# matches the "*_metrics.csv" pattern below, so it must be told apart from
# real experiment output.
sample_file = results_dir / 'sample_metrics.csv'

# Load all results files. Path.glob yields entries in arbitrary order, so sort
# them to keep the load order (and everything derived from it) reproducible.
results_files = sorted(results_dir.glob("*_metrics.csv"))

# A previous run may have left the synthetic sample file behind. As soon as
# real results are present, drop it so synthetic rows are never mixed into
# the analysis of real experiments.
real_results_files = [f for f in results_files if f.name != sample_file.name]
if real_results_files and len(real_results_files) < len(results_files):
    print(f"Ignoring previously generated sample data: {sample_file.name}")
    results_files = real_results_files

print(f"Found {len(results_files)} results files:")
for f in results_files:
    print(f"  - {f.name}")

# If no results files, create sample data
if not results_files:
    print("\nNo results files found. Creating sample data...")

    # Create sample results: one row per (algorithm, overlap level)
    algorithms = ['K-Means', 'Fuzzy C-Means', 'Gaussian Mixture', 'DBSCAN', 'Spectral']
    bar_omega_values = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5]

    sample_results = []

    # Fixed seed so the synthetic data is identical on every fresh run
    np.random.seed(42)

    for algorithm in algorithms:
        for bar_omega in bar_omega_values:
            # Simulate performance degradation with overlap
            base_ari = 0.95 - (bar_omega * 1.2) + np.random.normal(0, 0.05)
            base_sil = 0.7 - (bar_omega * 0.8) + np.random.normal(0, 0.05)

            # Algorithm-specific adjustments
            if algorithm == 'DBSCAN':
                # Penalised at low overlap, boosted at high overlap
                base_ari += 0.1 if bar_omega > 0.2 else -0.1
            elif algorithm == 'K-Means':
                # Extra penalty only at the highest overlap levels
                base_ari -= 0.1 if bar_omega > 0.3 else 0

            sample_results.append({
                'algorithm': algorithm,
                'experiment': 'bar_omega_variation',
                'bar_omega': bar_omega,
                'K': 3,
                'P': 5,
                'N': 5000,
                # Clamp the simulated scores to the ranges used here:
                # [0, 1] for ARI and [-1, 1] for silhouette
                'adjusted_rand_score': max(0, min(1, base_ari)),
                'silhouette_score': max(-1, min(1, base_sil)),
                'fit_time': np.random.exponential(0.1),
                'dunn_index': max(0, np.random.gamma(2, 0.1)),
                'calinski_harabasz_score': np.random.gamma(100, 1)
            })

    # Save sample results so the following cells can load them like real output
    sample_df = pd.DataFrame(sample_results)
    sample_df.to_csv(sample_file, index=False)

    print(f"Created sample data with {len(sample_results)} results")
    results_files = [sample_file]

## Combine All Results

In [None]:
# Read every metrics CSV and pool its rows into one flat list of record dicts
all_results = []

for csv_path in results_files:
    print(f"Loading {csv_path.name}...")
    frame = pd.read_csv(csv_path)
    records = frame.to_dict('records')
    all_results += records
    print(f"  Loaded {len(frame)} results")

print(f"\nTotal results loaded: {len(all_results)}")

# Tabular view of the pooled records, used by the analysis cells
results_df = pd.DataFrame(all_results)

column_names = list(results_df.columns)
algorithm_names = results_df['algorithm'].unique()
print(f"Columns: {column_names}")
print(f"Algorithms: {algorithm_names}")

# Preview the first rows
display(results_df.head())

## Algorithm Performance Overview

In [None]:
# Overview statistics, one table per metric
def _summarize_by_algorithm(frame, column, decimals):
    """Return per-algorithm count/mean/std/min/max of ``column``, rounded to ``decimals`` places."""
    stat_names = ['count', 'mean', 'std', 'min', 'max']
    return frame.groupby('algorithm')[column].agg(stat_names).round(decimals)


# Clustering quality (Adjusted Rand Index)
if 'adjusted_rand_score' in results_df.columns:
    performance_summary = _summarize_by_algorithm(results_df, 'adjusted_rand_score', 3)

    print("Algorithm Performance Summary (Adjusted Rand Index):")
    display(performance_summary)

# Execution time
if 'fit_time' in results_df.columns:
    time_summary = _summarize_by_algorithm(results_df, 'fit_time', 4)

    print("\nExecution Time Summary (seconds):")
    display(time_summary)

## Performance Comparison Visualizations

In [None]:
# Initialize visualizer
visualizer = ClusteringVisualizer(output_dir=str(figures_dir))

# 1. Algorithm comparison boxplot
if 'adjusted_rand_score' in results_df.columns:
    plt.figure(figsize=(12, 6))

    # Pin the x-axis category order explicitly (order of first appearance).
    # groupby() returns algorithms sorted alphabetically, which generally
    # differs from the order seaborn draws the boxes in; without a shared
    # order the mean markers below would be drawn over the wrong boxes.
    algorithm_order = list(pd.unique(results_df['algorithm'].dropna()))

    sns.boxplot(data=results_df, x='algorithm', y='adjusted_rand_score',
                order=algorithm_order)
    plt.title('Algorithm Performance Comparison\n(Adjusted Rand Index)', fontsize=14)
    plt.ylabel('Adjusted Rand Index', fontsize=12)
    plt.xlabel('Algorithm', fontsize=12)
    plt.xticks(rotation=45)
    plt.grid(True, alpha=0.3)

    # Add mean markers, aligned to the box positions via the shared order
    means = (
        results_df.groupby('algorithm')['adjusted_rand_score']
        .mean()
        .reindex(algorithm_order)
    )
    for i, (algorithm, mean_val) in enumerate(means.items()):
        plt.scatter(i, mean_val, color='red', s=100, zorder=5, marker='D')
        plt.text(i, mean_val + 0.02, f'{mean_val:.3f}', ha='center', fontweight='bold')

    plt.tight_layout()
    plt.show()

## Parameter Sensitivity Analysis

In [None]:
# Bar Omega sensitivity analysis
def _finish_overlap_panel(y_label, panel_title):
    """Apply the shared x-label, legend and grid to the current subplot."""
    plt.xlabel('Bar Omega (Cluster Overlap)', fontsize=10)
    plt.ylabel(y_label, fontsize=10)
    plt.title(panel_title, fontsize=11)
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.grid(True, alpha=0.3)


# Only meaningful when the overlap parameter was actually varied
if 'bar_omega' in results_df.columns and results_df['bar_omega'].nunique() > 1:
    plt.figure(figsize=(14, 8))

    # Quality metrics to plot, restricted to those present in the results
    metric_cols = ['adjusted_rand_score', 'silhouette_score']
    available_metrics = [col for col in metric_cols if col in results_df.columns]

    algorithm_names = results_df['algorithm'].unique()

    # One panel per quality metric: mean +/- std across overlap levels
    for panel_index, metric in enumerate(available_metrics, start=1):
        plt.subplot(2, 2, panel_index)
        metric_label = metric.replace('_', ' ').title()

        for algorithm in algorithm_names:
            algo_rows = results_df.loc[results_df['algorithm'] == algorithm]

            # A single result cannot form a curve; skip such algorithms
            if len(algo_rows) <= 1:
                continue

            overlap_stats = (
                algo_rows.groupby('bar_omega')[metric]
                .agg(['mean', 'std'])
                .reset_index()
            )
            plt.errorbar(overlap_stats['bar_omega'], overlap_stats['mean'],
                         yerr=overlap_stats['std'], label=algorithm,
                         marker='o', capsize=3)

        _finish_overlap_panel(metric_label, f'{metric_label} vs Cluster Overlap')

    # Extra panel: mean execution time across overlap levels
    if 'fit_time' in results_df.columns:
        plt.subplot(2, 2, len(available_metrics) + 1)

        for algorithm in algorithm_names:
            algo_rows = results_df.loc[results_df['algorithm'] == algorithm]
            mean_times = algo_rows.groupby('bar_omega')['fit_time'].mean().reset_index()
            plt.plot(mean_times['bar_omega'], mean_times['fit_time'],
                     label=algorithm, marker='o')

        _finish_overlap_panel('Execution Time (seconds)',
                              'Execution Time vs Cluster Overlap')

    plt.tight_layout()
    plt.show()

## Performance vs Speed Analysis

In [None]:
# Performance vs execution time scatter plot
if 'adjusted_rand_score' in results_df.columns and 'fit_time' in results_df.columns:
    plt.figure(figsize=(12, 8))

    # One row per algorithm: average quality and average runtime
    algo_summary = (
        results_df
        .groupby('algorithm')
        .agg({'adjusted_rand_score': 'mean', 'fit_time': 'mean'})
        .reset_index()
    )

    # One colour per algorithm, evenly spaced over the Set3 colormap
    palette = plt.cm.Set3(np.linspace(0, 1, len(algo_summary)))

    points = zip(
        algo_summary['algorithm'],
        algo_summary['fit_time'],
        algo_summary['adjusted_rand_score'],
        palette,
    )
    for algo_name, avg_time, avg_ari, shade in points:
        plt.scatter(avg_time, avg_ari, s=200, alpha=0.7, color=shade, label=algo_name)

        # Label each point with its algorithm name, slightly offset
        plt.annotate(algo_name, (avg_time, avg_ari),
                     xytext=(10, 5), textcoords='offset points', fontsize=10)

    plt.xlabel('Average Execution Time (seconds)', fontsize=12)
    plt.ylabel('Average Adjusted Rand Index', fontsize=12)
    plt.title('Algorithm Performance vs Speed Trade-off', fontsize=14)

    # Quadrant lines through the across-algorithm averages
    mean_time = algo_summary['fit_time'].mean()
    mean_performance = algo_summary['adjusted_rand_score'].mean()

    plt.axhline(y=mean_performance, color='gray', linestyle='--', alpha=0.5)
    plt.axvline(x=mean_time, color='gray', linestyle='--', alpha=0.5)

    # Quadrant captions, placed at fixed multiples of the two averages
    quadrant_captions = [
        (0.1, 1.05, 'Fast & Good'),
        (1.5, 1.05, 'Slow & Good'),
        (0.1, 0.95, 'Fast & Poor'),
        (1.5, 0.95, 'Slow & Poor'),
    ]
    for time_factor, score_factor, caption in quadrant_captions:
        plt.text(mean_time * time_factor, mean_performance * score_factor, caption,
                 fontsize=12, fontweight='bold', ha='left')

    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

## Create Comprehensive Visualization Report

In [None]:
# Use the visualizer to create comprehensive report
import traceback  # only needed to dump the stack if report generation fails

print("Creating comprehensive visualization report...")

try:
    visualizer.create_comprehensive_report(all_results)
    print(f"\nComprehensive report created in: {figures_dir}")

    # Enumerate every PNG currently present in the figures directory
    png_paths = list(figures_dir.glob("*.png"))
    print(f"\nGenerated {len(png_paths)} visualization files:")
    for png_path in png_paths:
        print(f"  - {png_path.name}")

    # Mention the markdown summary only if it is present on disk
    report_file = figures_dir / 'analysis_report.md'
    if report_file.exists():
        print(f"\nAnalysis report: {report_file}")

except Exception as e:
    # Best effort: report the failure with its traceback and keep the notebook running
    print(f"Error creating comprehensive report: {e}")
    traceback.print_exc()

## Algorithm Ranking Analysis

In [None]:
# Create algorithm ranking based on different criteria
ranking_metrics = ['adjusted_rand_score', 'silhouette_score', 'fit_time']
available_ranking_metrics = [m for m in ranking_metrics if m in results_df.columns]

if available_ranking_metrics:
    print("Algorithm Ranking Analysis")
    print("=" * 50)

    # metric name -> per-algorithm mean, sorted best-first
    rankings = {}

    for metric in available_ranking_metrics:
        # Calculate mean scores
        metric_means = results_df.groupby('algorithm')[metric].mean().sort_values(
            ascending=(metric == 'fit_time')  # Lower is better for time
        )

        rankings[metric] = metric_means

        print(f"\n{metric.replace('_', ' ').title()} Ranking:")
        for rank, (algorithm, score) in enumerate(metric_means.items(), 1):
            print(f"  {rank}. {algorithm}: {score:.4f}")

    # Create ranking visualization.
    # squeeze=False makes plt.subplots always return a 2-D array of axes, so a
    # single available metric is handled by the same code path. The earlier
    # version special-cased one metric inside a "more than one metric" guard,
    # which made that branch unreachable and produced no chart at all.
    n_panels = len(available_ranking_metrics)
    fig, axes = plt.subplots(1, n_panels, figsize=(5 * n_panels, 6), squeeze=False)

    for ax, metric in zip(axes[0], available_ranking_metrics):
        ranking = rankings[metric]
        metric_label = metric.replace('_', ' ').title()

        bars = ax.barh(range(len(ranking)), ranking.values)
        ax.set_yticks(range(len(ranking)))
        ax.set_yticklabels(ranking.index)
        ax.set_xlabel(metric_label)
        ax.set_title(f'Ranking by {metric_label}')

        # Color bars by rank (first entry of the ranking = best)
        colors = plt.cm.RdYlGn_r(np.linspace(0.2, 0.8, len(ranking)))
        for bar, color in zip(bars, colors):
            bar.set_color(color)

    plt.tight_layout()
    plt.show()

## Summary and Conclusions

In [None]:
# Generate summary statistics and conclusions
print("Clustering Analysis Summary")
print("=" * 50)
print(f"Total experiments analyzed: {len(results_df)}")
print(f"Algorithms compared: {', '.join(map(str, results_df['algorithm'].unique()))}")

# Count distinct experimental conditions using only the condition columns that
# are actually present; results files are not guaranteed to carry all four.
condition_cols = [c for c in ['bar_omega', 'K', 'P', 'N'] if c in results_df.columns]
if condition_cols:
    n_conditions = len(results_df.drop_duplicates(condition_cols))
    print(f"Unique experimental conditions: {n_conditions}")
else:
    print("Unique experimental conditions: n/a (no condition columns in results)")

# Group once per metric and reuse the grouped object below
has_ari = 'adjusted_rand_score' in results_df.columns
if has_ari:
    ari_by_algorithm = results_df.groupby('algorithm')['adjusted_rand_score']

# Best performing algorithm overall
if has_ari:
    ari_means = ari_by_algorithm.mean()
    best_algorithm = ari_means.idxmax()
    best_score = ari_means.max()
    print(f"\nBest performing algorithm: {best_algorithm} (ARI: {best_score:.3f})")

# Fastest algorithm
if 'fit_time' in results_df.columns:
    time_means = results_df.groupby('algorithm')['fit_time'].mean()
    fastest_algorithm = time_means.idxmin()
    fastest_time = time_means.min()
    print(f"Fastest algorithm: {fastest_algorithm} ({fastest_time:.4f} seconds)")

# Most consistent algorithm (lowest std in performance)
if has_ari:
    # std is NaN for an algorithm with a single result; such algorithms cannot
    # be compared on consistency, so leave them out instead of failing.
    ari_stds = ari_by_algorithm.std().dropna()
    if not ari_stds.empty:
        most_consistent = ari_stds.idxmin()
        consistency_score = ari_stds.min()
        print(f"Most consistent algorithm: {most_consistent} (std: {consistency_score:.3f})")
    else:
        print("Most consistent algorithm: n/a (need at least two results per algorithm)")

print("\n" + "=" * 50)
print("Analysis Complete!")
print(f"Visualizations saved in: {figures_dir}")
print("\nRecommendations:")
print("1. Check algorithm-specific performance under different overlap conditions")
print("2. Consider computational constraints when selecting algorithms")
print("3. Validate results on real datasets before final selection")
print("4. Fine-tune parameters for best-performing algorithms")