In [8]:
import os
import sys
import subprocess
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.io as pio
import time
import psutil
import threading
import ctypes
from kmeans import (
    KMeansSingle, 
    KMeansMulti, 
    KMeansSK, 
    DataGenerator, 
    KMeansBenchmark, 
    ClusteringVisualizer,
    HAS_IPYWIDGETS,
    InteractiveDashboard
)


In [9]:


# Route Plotly output to the "browser" renderer: every ``fig.show()`` call in
# this notebook opens the figure in the system web browser rather than
# rendering it inline in the cell output.
pio.renderers.default = "browser"

def compile_openmp_library(verbose=True):
    """Compile ``kmeans_openmp.c`` into a shared library using gcc with OpenMP.

    The output file is ``kmeans_openmp.dll`` on Windows and
    ``kmeans_openmp.so`` on every other platform. Source and output paths are
    relative to the current working directory.

    Args:
        verbose: When True, print progress messages. Failure diagnostics are
            printed regardless of this flag.

    Returns:
        bool: True if gcc exited with status 0; False if gcc could not be
        launched or exited with a non-zero status.
    """
    if verbose:
        print("Compiling OpenMP library...")

    lib_name = "kmeans_openmp.dll" if sys.platform.startswith('win') else "kmeans_openmp.so"

    # An argument list (no shell) avoids shell quoting issues and makes a
    # missing compiler surface as an OSError we can report cleanly.
    compiler_cmd = [
        "gcc", "-fopenmp", "-O3", "-shared",
        "-o", lib_name, "-fPIC", "kmeans_openmp.c",
    ]

    try:
        subprocess.run(
            compiler_cmd,
            check=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE
        )
    except OSError as e:
        # gcc is not installed, not on PATH, or not executable.
        print(f"Compilation failed: {e}")
        return False
    except subprocess.CalledProcessError as e:
        print(f"Compilation failed: {e}")
        # Compiler output is not guaranteed to be valid UTF-8; never let the
        # error report itself raise.
        stderr_text = e.stderr.decode('utf-8', errors='replace') if e.stderr else ""
        print(f"Stderr: {stderr_text}")
        return False

    if verbose:
        print(f"Compilation successful: {lib_name}")
    return True




In [None]:

def demo_simple_clustering(n_samples=10000, n_clusters=5):
    """Fit each K-means implementation on one synthetic 2-D dataset and report it.

    For every implementation (single-core, multi-core, scikit-learn wrapper)
    this prints the inertia, silhouette score and Calinski-Harabasz score, and
    shows a 2-D cluster plot.

    Args:
        n_samples: Number of points passed to ``DataGenerator.make_blobs_data``.
        n_clusters: Number of blobs to generate and clusters to fit.
    """
    # Imported once up front rather than on every loop iteration.
    from sklearn.metrics import silhouette_score, calinski_harabasz_score

    print(f"\n=== Demo: Simple Clustering (n={n_samples}, k={n_clusters}) ===")

    X, y_true = DataGenerator.make_blobs_data(
        n_samples=n_samples,
        n_features=2,
        n_clusters=n_clusters,
        random_state=42
    )

    models = {
        "Single-Core": KMeansSingle(n_clusters=n_clusters, random_state=42),
        "Multi-Core": KMeansMulti(n_clusters=n_clusters, random_state=42),
        "Scikit-Learn": KMeansSK(n_clusters=n_clusters, random_state=42)
    }

    for name, model in models.items():
        print(f"\nRunning {name}...")
        model.fit(X)

        inertia = model.inertia_
        print(f"Inertia: {inertia:.2f}")

        # Clustering quality metrics computed from the fitted labels.
        sil_score = silhouette_score(X, model.labels_)
        ch_score = calinski_harabasz_score(X, model.labels_)
        print(f"Silhouette Score: {sil_score:.4f}")
        print(f"Calinski-Harabasz Score: {ch_score:.2f}")

        # The scatter plot is only produced for two-column data.
        if X.shape[1] == 2:
            fig = ClusteringVisualizer.plot_2d_clusters(
                X,
                model.labels_,
                model.centroids_,
                title=f"K-means Clustering ({name}, k={n_clusters})"
            )

            fig.show()


def run_benchmark_comparison():
    """Benchmark all three K-means implementations over a range of sample sizes.

    The scaling benchmark is run for the 'single', 'multi' and 'sklearn'
    methods, its results are written to ``kmeans_benchmark_results.csv``, the
    first per-metric figure (if any) is shown, and a comparison dashboard is
    displayed.

    Returns:
        The results object returned by ``KMeansBenchmark.run_scaling_benchmark``.
    """
    print("\n=== Running Benchmark Comparison ===")

    # Benchmark configuration, shared by the run and by the dashboard.
    csv_path = "kmeans_benchmark_results.csv"
    method_names = ['single', 'multi', 'sklearn']
    sizes = [1000, 5000, 20000, 50000, 100000]
    cluster_count = 5
    feature_count = 2

    bench = KMeansBenchmark(random_state=42)
    results_df = bench.run_scaling_benchmark(
        methods=method_names,
        n_samples_range=sizes,
        n_features=feature_count,
        n_clusters=cluster_count,
    )

    results_df.to_csv(csv_path, index=False)
    print("Benchmark results saved to kmeans_benchmark_results.csv")

    per_metric_figures = bench.plot_results(
        results_df,
        metrics=['time', 'mem_usage_mb', 'energy_est', 'silhouette'],
    )
    if per_metric_figures:
        print("Showing execution time comparison...")
        per_metric_figures[0].show()

    print("Creating comparison dashboard...")
    ClusteringVisualizer.plot_comparison_dashboard(
        results_df,
        methods=method_names,
        n_samples=sizes,
        n_clusters=cluster_count,
        features=feature_count,
    ).show()

    return results_df


def launch_interactive_dashboard():
    """Display the interactive K-means dashboard when ipywidgets is available.

    If ``HAS_IPYWIDGETS`` is false, installation instructions are printed and
    nothing else happens.
    """
    if HAS_IPYWIDGETS:
        InteractiveDashboard().display()
        print("Interactive dashboard launched. Use the controls to experiment with K-means.")
        return

    # ipywidgets is missing: explain how to get the dashboard working.
    install_hint = (
        "Interactive dashboard requires ipywidgets. Please install with:",
        "pip install ipywidgets",
        "And ensure you're running in a Jupyter notebook.",
    )
    for hint_line in install_hint:
        print(hint_line)




In [11]:

class ResourceMonitor:
    """Sample CPU and memory usage on a background thread.

    While running, a daemon thread records once per ``interval`` seconds:

    * the system-wide CPU utilisation reported by ``psutil.cpu_percent``
      (all processes, not only this one), and
    * the resident set size of the current Python process, converted with
      ``bytes / 1024**2`` (labelled "MB" in plots).

    Typical use: ``start()``, run the workload, then ``stop()`` to get the
    collected series. ``plot_results()`` renders them with Plotly.
    """

    def __init__(self, interval=0.1):
        """Create an idle monitor.

        Args:
            interval: Seconds between consecutive samples.
        """
        self.interval = interval
        self.cpu_percentages = []
        self.memory_usages = []
        self.timestamps = []
        self.start_time = None
        self.running = False
        self.thread = None
        self.process = psutil.Process(os.getpid())
        # Signals the sampling thread to exit; lets stop() interrupt the wait
        # between samples instead of sitting out a full sleep.
        self._stop_event = threading.Event()

    def _record_sample(self):
        """Append one (timestamp, CPU %, memory) sample to the series."""
        cpu_percent = psutil.cpu_percent(interval=None)
        memory_mb = self.process.memory_info().rss / (1024 * 1024)
        elapsed = time.time() - self.start_time

        self.cpu_percentages.append(cpu_percent)
        self.memory_usages.append(memory_mb)
        self.timestamps.append(elapsed)

    def _monitor(self):
        """Sampling loop executed on the background thread."""
        stop_event = self._stop_event

        # psutil documents that the first non-blocking cpu_percent() call
        # returns a meaningless 0.0 that should be ignored. Discard it here,
        # on the sampling thread, so every recorded value covers a real
        # measurement window.
        psutil.cpu_percent(interval=None)

        # wait() doubles as the inter-sample delay and returns True as soon as
        # stop() sets the event.
        while not stop_event.wait(self.interval):
            self._record_sample()

        # The workload finished before the first interval elapsed: record one
        # sample covering the whole run so callers never see empty series.
        if not self.timestamps:
            self._record_sample()

    def start(self):
        """Reset the collected series and begin sampling in a daemon thread.

        If a previous sampling thread is still alive it is stopped first, so
        two threads never append to the same lists.
        """
        if self.thread is not None and self.thread.is_alive():
            self.stop()

        self.cpu_percentages = []
        self.memory_usages = []
        self.timestamps = []
        self.start_time = time.time()
        self._stop_event = threading.Event()
        self.running = True
        self.thread = threading.Thread(target=self._monitor)
        self.thread.daemon = True
        self.thread.start()

    def stop(self):
        """Stop sampling and return the collected data.

        Returns:
            dict: ``'timestamps'`` (seconds since ``start()``),
            ``'cpu_percentages'`` and ``'memory_usages'``. The three lists have
            equal length and are copies, so later sampling cannot mutate them.
        """
        self.running = False
        self._stop_event.set()
        if self.thread:
            self.thread.join(timeout=1.0)
        return {
            'timestamps': list(self.timestamps),
            'cpu_percentages': list(self.cpu_percentages),
            'memory_usages': list(self.memory_usages)
        }

    def plot_results(self):
        """Build a two-row Plotly figure of CPU and memory usage over time.

        Returns:
            The Plotly figure (not shown; the caller decides when to display).
        """
        # Plotly is imported lazily so the monitor can be used without it.
        import plotly.graph_objects as go
        from plotly.subplots import make_subplots

        fig = make_subplots(
            rows=2,
            cols=1,
            subplot_titles=("CPU Usage (%)", "Memory Usage (MB)"),
            vertical_spacing=0.15
        )

        fig.add_trace(
            go.Scatter(
                x=self.timestamps,
                y=self.cpu_percentages,
                mode='lines',
                name='CPU %',
                line=dict(color='blue')
            ),
            row=1, col=1
        )

        fig.add_trace(
            go.Scatter(
                x=self.timestamps,
                y=self.memory_usages,
                mode='lines',
                name='Memory (MB)',
                line=dict(color='red')
            ),
            row=2, col=1
        )

        fig.update_layout(
            title_text="Resource Utilization Over Time",
            height=700,
            width=1000
        )

        fig.update_xaxes(title_text="Time (seconds)", row=1, col=1)
        fig.update_xaxes(title_text="Time (seconds)", row=2, col=1)
        fig.update_yaxes(title_text="CPU Usage (%)", row=1, col=1)
        fig.update_yaxes(title_text="Memory Usage (MB)", row=2, col=1)

        return fig


In [12]:

def run_parallel_efficiency_analysis(n_samples=20000, n_clusters=5, n_features=2):
    """Compare single-core and multi-core K-means fit times on one dataset.

    Speedup is ``single_time / multi_time`` and parallel efficiency is the
    speedup divided by the number of logical CPUs reported by
    ``os.cpu_count()``.

    Args:
        n_samples: Number of synthetic points to generate.
        n_clusters: Number of blobs to generate and clusters to fit.
        n_features: Dimensionality of the synthetic data.

    Returns:
        pandas.DataFrame: Two rows (Single-Core, Multi-Core) with columns
        'Method', 'Time (s)', 'Relative Speed', 'Cores Used', 'Efficiency'.
    """
    print("\n=== Running Parallel Efficiency Analysis ===")

    X, _ = DataGenerator.make_blobs_data(
        n_samples=n_samples,
        n_features=n_features,
        n_clusters=n_clusters,
        random_state=42
    )

    # perf_counter is monotonic and high resolution; time.time() can be coarse
    # or jump when the system clock is adjusted.
    single_core = KMeansSingle(n_clusters=n_clusters, random_state=42)
    start_time = time.perf_counter()
    single_core.fit(X)
    single_time = time.perf_counter() - start_time
    print(f"Single-core time: {single_time:.4f} seconds")

    multi_core = KMeansMulti(n_clusters=n_clusters, random_state=42)
    start_time = time.perf_counter()
    multi_core.fit(X)
    multi_time = time.perf_counter() - start_time
    print(f"Multi-core time: {multi_time:.4f} seconds")

    # Guard against a zero measured duration rather than raising.
    speedup = single_time / multi_time if multi_time > 0 else float("inf")
    # os.cpu_count() may return None when the count is undeterminable.
    num_cores = os.cpu_count() or 1
    efficiency = speedup / num_cores

    print(f"Number of CPU cores: {num_cores}")
    print(f"Speedup: {speedup:.2f}x")
    print(f"Parallel efficiency: {efficiency:.2f}")

    results = {
        'Method': ['Single-Core', 'Multi-Core'],
        'Time (s)': [single_time, multi_time],
        'Relative Speed': [1.0, speedup],
        'Cores Used': [1, num_cores],
        'Efficiency': [1.0, efficiency]
    }
    return pd.DataFrame(results)


def run_thread_scaling_test(n_samples=20000, n_clusters=5, thread_counts=None):
    """Time ``KMeansMulti`` at several ``OMP_NUM_THREADS`` settings and plot the scaling.

    For each thread count the environment variable ``OMP_NUM_THREADS`` is set,
    a fresh ``KMeansMulti`` is constructed and fitted, and the elapsed time is
    recorded. Speedup is measured against a 1-thread run; if the first
    requested count is not 1, an extra 1-thread baseline run is performed.
    The previous value of ``OMP_NUM_THREADS`` is restored before returning.

    NOTE(review): OpenMP runtimes commonly read ``OMP_NUM_THREADS`` only once
    at initialisation. Whether ``KMeansMulti`` honours a change made here
    depends on its implementation, which is not visible in this file - confirm.

    Args:
        n_samples: Number of synthetic 2-D points to generate.
        n_clusters: Number of blobs to generate and clusters to fit.
        thread_counts: Iterable of positive thread counts. Defaults to the
            powers of two up to the CPU count, plus the CPU count itself.

    Returns:
        pandas.DataFrame: One row per thread count with columns
        'num_threads', 'time', 'speedup', 'efficiency'.

    Raises:
        ValueError: If ``thread_counts`` is empty or contains a value below 1.
    """
    print("\n=== Running Thread Scaling Test ===")

    # Determine the thread counts to test.
    # os.cpu_count() may return None when the count is undeterminable.
    max_cores = os.cpu_count() or 1
    if thread_counts is None:
        thread_counts = [1]
        power = 1
        while 2**power <= max_cores:
            thread_counts.append(2**power)
            power += 1

        if max_cores not in thread_counts:
            thread_counts.append(max_cores)

    thread_counts = list(thread_counts)
    if not thread_counts or any(count < 1 for count in thread_counts):
        raise ValueError("thread_counts must contain at least one positive thread count")

    X, _ = DataGenerator.make_blobs_data(
        n_samples=n_samples,
        n_features=2,
        n_clusters=n_clusters,
        random_state=42
    )

    def _timed_fit(num_threads):
        """Fit a fresh KMeansMulti with OMP_NUM_THREADS set; return seconds."""
        os.environ["OMP_NUM_THREADS"] = str(num_threads)
        started = time.perf_counter()
        model = KMeansMulti(n_clusters=n_clusters, random_state=42)
        model.fit(X)
        return time.perf_counter() - started

    results = []
    single_thread_time = None
    # Remember the caller's setting so this test does not leak its last
    # thread count into everything that runs afterwards.
    previous_omp = os.environ.get("OMP_NUM_THREADS")

    try:
        for num_threads in thread_counts:
            print(f"Testing with {num_threads} thread(s)...")

            elapsed_time = _timed_fit(num_threads)

            if num_threads == 1:
                single_thread_time = elapsed_time
                speedup = 1.0
            else:
                if single_thread_time is None:
                    # No 1-thread measurement yet (custom thread_counts that
                    # does not start with 1): take the baseline now.
                    single_thread_time = _timed_fit(1)
                speedup = single_thread_time / elapsed_time if elapsed_time > 0 else float("inf")

            efficiency = speedup / num_threads

            results.append({
                'num_threads': num_threads,
                'time': elapsed_time,
                'speedup': speedup,
                'efficiency': efficiency
            })

            print(f"  Time: {elapsed_time:.4f}s, Speedup: {speedup:.2f}x, Efficiency: {efficiency:.2f}")
    finally:
        if previous_omp is None:
            os.environ.pop("OMP_NUM_THREADS", None)
        else:
            os.environ["OMP_NUM_THREADS"] = previous_omp

    df = pd.DataFrame(results)

    from plotly.subplots import make_subplots
    import plotly.graph_objects as go

    fig = make_subplots(
        rows=1,
        cols=2,
        subplot_titles=("Execution Time vs Threads", "Speedup & Efficiency vs Threads"),
        horizontal_spacing=0.15
    )

    # Left panel: raw execution time.
    fig.add_trace(
        go.Scatter(
            x=df['num_threads'],
            y=df['time'],
            mode='lines+markers',
            name='Execution Time',
            line=dict(color='blue')
        ),
        row=1, col=1
    )

    # Right panel: measured speedup and efficiency...
    fig.add_trace(
        go.Scatter(
            x=df['num_threads'],
            y=df['speedup'],
            mode='lines+markers',
            name='Speedup',
            line=dict(color='green')
        ),
        row=1, col=2
    )

    fig.add_trace(
        go.Scatter(
            x=df['num_threads'],
            y=df['efficiency'],
            mode='lines+markers',
            name='Efficiency',
            line=dict(color='red')
        ),
        row=1, col=2
    )

    # ...with dashed reference lines for ideal (linear) speedup and an
    # efficiency of 1.0.
    fig.add_trace(
        go.Scatter(
            x=df['num_threads'],
            y=df['num_threads'],
            mode='lines',
            name='Ideal Speedup',
            line=dict(color='green', dash='dash')
        ),
        row=1, col=2
    )

    fig.add_trace(
        go.Scatter(
            x=df['num_threads'],
            y=[1.0] * len(df),
            mode='lines',
            name='Ideal Efficiency',
            line=dict(color='red', dash='dash')
        ),
        row=1, col=2
    )

    fig.update_layout(
        title_text=f"K-means Thread Scaling (n_samples={n_samples}, n_clusters={n_clusters})",
        height=500,
        width=1200
    )

    fig.update_xaxes(title_text="Number of Threads", row=1, col=1)
    fig.update_xaxes(title_text="Number of Threads", row=1, col=2)
    fig.update_yaxes(title_text="Execution Time (s)", row=1, col=1)
    fig.update_yaxes(title_text="Speedup / Efficiency", row=1, col=2)

    fig.show()

    return df


def run_resource_monitored_comparison(n_samples=20000, n_clusters=5):
    """Run each K-means implementation under resource monitoring and chart the results together.

    Every implementation is fitted on the same synthetic 2-D dataset while a
    ``ResourceMonitor`` samples CPU and memory usage. The traces are combined
    into one 2x3 figure (CPU on the top row, memory on the bottom row, one
    column per implementation), which is then shown.

    Args:
        n_samples: Number of synthetic points to generate.
        n_clusters: Number of blobs to generate and clusters to fit.

    Returns:
        dict: Maps implementation name to a dict with keys 'execution_time'
        (seconds), 'monitoring_data' (the dict returned by
        ``ResourceMonitor.stop``) and 'model' (the fitted model).
    """
    print(f"\n=== Resource-Monitored K-means Comparison (n={n_samples}, k={n_clusters}) ===")

    X, _ = DataGenerator.make_blobs_data(
        n_samples=n_samples,
        n_features=2,
        n_clusters=n_clusters,
        random_state=42
    )

    models = {
        "Single-Core": KMeansSingle(n_clusters=n_clusters, random_state=42),
        "Multi-Core": KMeansMulti(n_clusters=n_clusters, random_state=42),
        "Scikit-Learn": KMeansSK(n_clusters=n_clusters, random_state=42)
    }

    results = {}

    from plotly.subplots import make_subplots
    import plotly.graph_objects as go

    fig = make_subplots(
        rows=2,
        cols=3,
        subplot_titles=(
            "Single-Core CPU Usage", "Multi-Core CPU Usage", "Scikit-learn CPU Usage",
            "Single-Core Memory Usage", "Multi-Core Memory Usage", "Scikit-learn Memory Usage"
        ),
        vertical_spacing=0.15,
        horizontal_spacing=0.05
    )

    colors = {
        "Single-Core": "blue",
        "Multi-Core": "green",
        "Scikit-Learn": "red"
    }

    # Subplot columns are 1-based and follow the insertion order of `models`,
    # which matches the order of the subplot titles above.
    for col_idx, (name, model) in enumerate(models.items(), start=1):
        print(f"\nRunning {name} with resource monitoring...")

        monitor = ResourceMonitor(interval=0.05)
        monitor.start()

        start_time = time.perf_counter()
        model.fit(X)
        execution_time = time.perf_counter() - start_time

        monitoring_data = monitor.stop()

        results[name] = {
            'execution_time': execution_time,
            'monitoring_data': monitoring_data,
            'model': model
        }

        # A very short fit can finish before the monitor records anything;
        # fall back to 0.0 instead of letting max() raise on an empty series.
        peak_cpu = max(monitoring_data['cpu_percentages'], default=0.0)
        peak_memory = max(monitoring_data['memory_usages'], default=0.0)

        print(f"  Execution time: {execution_time:.4f} seconds")
        print(f"  Peak CPU usage: {peak_cpu:.1f}%")
        print(f"  Peak memory usage: {peak_memory:.1f} MB")

        fig.add_trace(
            go.Scatter(
                x=monitoring_data['timestamps'],
                y=monitoring_data['cpu_percentages'],
                mode='lines',
                name=f'{name} CPU',
                line=dict(color=colors[name])
            ),
            row=1, col=col_idx
        )

        fig.add_trace(
            go.Scatter(
                x=monitoring_data['timestamps'],
                y=monitoring_data['memory_usages'],
                mode='lines',
                name=f'{name} Memory',
                line=dict(color=colors[name])
            ),
            row=2, col=col_idx
        )

    fig.update_layout(
        title_text=f"Resource Usage Comparison: K-means Implementations (n={n_samples}, k={n_clusters})",
        height=800,
        width=1200,
        showlegend=False
    )

    for i in range(1, 4):
        fig.update_yaxes(title_text="CPU (%)", row=1, col=i)
        fig.update_yaxes(title_text="Memory (MB)", row=2, col=i)
        fig.update_xaxes(title_text="Time (s)", row=2, col=i)

    fig.show()

    return results


In [13]:

def run_comprehensive_performance_analysis(n_samples=20000, n_clusters=5, thread_counts=None):
    """Run method comparison, thread scaling and resource monitoring; show one 3x2 figure.

    Three experiments are run on the same synthetic 2-D dataset:

    1. Fit time of the single-core, multi-core and scikit-learn implementations.
    2. ``KMeansMulti`` fit time at several ``OMP_NUM_THREADS`` settings, with
       speedup measured against a 1-thread run (an extra 1-thread baseline is
       taken when the first requested count is not 1).
    3. CPU / memory sampling of one ``KMeansMulti`` fit via ``ResourceMonitor``.

    ``OMP_NUM_THREADS`` is restored to its previous value after experiment 2,
    so experiment 3 runs with the caller's environment.

    NOTE(review): OpenMP runtimes commonly read ``OMP_NUM_THREADS`` only once
    at initialisation. Whether ``KMeansMulti`` honours a change made here
    depends on its implementation, which is not visible in this file - confirm.

    Args:
        n_samples: Number of synthetic points to generate.
        n_clusters: Number of blobs to generate and clusters to fit.
        thread_counts: Iterable of positive thread counts. Defaults to the
            powers of two up to the CPU count, plus the CPU count itself.

    Returns:
        dict: 'method_comparison' (dict with 'methods', 'times',
        'efficiencies'), 'thread_scaling' (pandas.DataFrame) and
        'resource_monitoring' (dict returned by ``ResourceMonitor.stop``).

    Raises:
        ValueError: If ``thread_counts`` is empty or contains a value below 1.
    """
    print("\n=== Comprehensive Performance Analysis ===")

    # os.cpu_count() may return None when the count is undeterminable.
    max_cores = os.cpu_count() or 1
    if thread_counts is None:
        thread_counts = [1]
        power = 1
        while 2**power <= max_cores:
            thread_counts.append(2**power)
            power += 1

        if max_cores not in thread_counts:
            thread_counts.append(max_cores)

    # Validate before doing any expensive work.
    thread_counts = list(thread_counts)
    if not thread_counts or any(count < 1 for count in thread_counts):
        raise ValueError("thread_counts must contain at least one positive thread count")

    def _ratio(numerator, denominator):
        """Divide two durations, tolerating a zero measured denominator."""
        return numerator / denominator if denominator > 0 else float("inf")

    X, _ = DataGenerator.make_blobs_data(
        n_samples=n_samples,
        n_features=2,
        n_clusters=n_clusters,
        random_state=42
    )

    import plotly.graph_objects as go
    from plotly.subplots import make_subplots

    fig = make_subplots(
        rows=3,
        cols=2,
        subplot_titles=(
            "Method Comparison - Execution Time", "Method Comparison - Efficiency",
            "Thread Scaling - Execution Time", "Thread Scaling - Speedup & Efficiency",
            "Resource Monitoring - CPU Usage", "Resource Monitoring - Memory Usage"
        ),
        vertical_spacing=0.12,
        horizontal_spacing=0.08,
        specs=[
            [{"type": "bar"}, {"type": "bar"}],
            [{"type": "scatter"}, {"type": "scatter"}],
            [{"type": "scatter"}, {"type": "scatter"}]
        ]
    )

    # ---- Experiment 1: method comparison -----------------------------------
    print("Running method comparison...")

    single_core = KMeansSingle(n_clusters=n_clusters, random_state=42)
    start_time = time.perf_counter()
    single_core.fit(X)
    single_time = time.perf_counter() - start_time

    multi_core = KMeansMulti(n_clusters=n_clusters, random_state=42)
    start_time = time.perf_counter()
    multi_core.fit(X)
    multi_time = time.perf_counter() - start_time

    sklearn_core = KMeansSK(n_clusters=n_clusters, random_state=42)
    start_time = time.perf_counter()
    sklearn_core.fit(X)
    sklearn_time = time.perf_counter() - start_time

    multi_efficiency = _ratio(single_time, multi_time) / max_cores

    methods = ["Single-Core", "Multi-Core", "Scikit-Learn"]
    times = [single_time, multi_time, sklearn_time]
    # The scikit-learn entry is its raw speedup over the single-core run; it
    # is not normalised by a worker count, unlike the multi-core entry.
    efficiencies = [1.0, multi_efficiency, _ratio(single_time, sklearn_time)]

    fig.add_trace(
        go.Bar(
            x=methods,
            y=times,
            name="Execution Time",
            text=[f"{t:.4f}s" for t in times],
            textposition="auto",
            marker_color=["blue", "green", "red"]
        ),
        row=1, col=1
    )

    fig.add_trace(
        go.Bar(
            x=methods,
            y=efficiencies,
            name="Efficiency",
            text=[f"{e:.2f}" for e in efficiencies],
            textposition="auto",
            marker_color=["blue", "green", "red"]
        ),
        row=1, col=2
    )

    # ---- Experiment 2: thread scaling --------------------------------------
    print("Running thread scaling test...")

    def _timed_fit(num_threads):
        """Fit a fresh KMeansMulti with OMP_NUM_THREADS set; return seconds."""
        os.environ["OMP_NUM_THREADS"] = str(num_threads)
        started = time.perf_counter()
        scaling_model = KMeansMulti(n_clusters=n_clusters, random_state=42)
        scaling_model.fit(X)
        return time.perf_counter() - started

    scaling_results = []
    single_thread_time = None
    # Remember the caller's setting so the last tested thread count does not
    # leak into the monitoring run below or into later code.
    previous_omp = os.environ.get("OMP_NUM_THREADS")

    try:
        for num_threads in thread_counts:
            print(f"  Testing with {num_threads} thread(s)...")

            elapsed_time = _timed_fit(num_threads)

            if num_threads == 1:
                single_thread_time = elapsed_time
                speedup = 1.0
            else:
                if single_thread_time is None:
                    # Custom thread_counts that does not start with 1: take
                    # the 1-thread baseline now instead of failing.
                    single_thread_time = _timed_fit(1)
                speedup = _ratio(single_thread_time, elapsed_time)

            efficiency = speedup / num_threads

            scaling_results.append({
                'num_threads': num_threads,
                'time': elapsed_time,
                'speedup': speedup,
                'efficiency': efficiency
            })
    finally:
        if previous_omp is None:
            os.environ.pop("OMP_NUM_THREADS", None)
        else:
            os.environ["OMP_NUM_THREADS"] = previous_omp

    scaling_df = pd.DataFrame(scaling_results)

    fig.add_trace(
        go.Scatter(
            x=scaling_df['num_threads'],
            y=scaling_df['time'],
            mode='lines+markers',
            name='Execution Time',
            line=dict(color='blue')
        ),
        row=2, col=1
    )

    fig.add_trace(
        go.Scatter(
            x=scaling_df['num_threads'],
            y=scaling_df['speedup'],
            mode='lines+markers',
            name='Speedup',
            line=dict(color='green')
        ),
        row=2, col=2
    )

    fig.add_trace(
        go.Scatter(
            x=scaling_df['num_threads'],
            y=scaling_df['efficiency'],
            mode='lines+markers',
            name='Efficiency',
            line=dict(color='red')
        ),
        row=2, col=2
    )

    # Dashed reference line: ideal (linear) speedup.
    fig.add_trace(
        go.Scatter(
            x=scaling_df['num_threads'],
            y=scaling_df['num_threads'],
            mode='lines',
            name='Ideal Speedup',
            line=dict(color='green', dash='dash')
        ),
        row=2, col=2
    )

    # ---- Experiment 3: resource monitoring ---------------------------------
    print("Running resource monitoring test...")
    monitor = ResourceMonitor(interval=0.05)
    monitor.start()

    model = KMeansMulti(n_clusters=n_clusters, random_state=42)
    model.fit(X)

    monitoring_data = monitor.stop()

    fig.add_trace(
        go.Scatter(
            x=monitoring_data['timestamps'],
            y=monitoring_data['cpu_percentages'],
            mode='lines',
            name='CPU Usage',
            line=dict(color='purple')
        ),
        row=3, col=1
    )

    fig.add_trace(
        go.Scatter(
            x=monitoring_data['timestamps'],
            y=monitoring_data['memory_usages'],
            mode='lines',
            name='Memory Usage',
            line=dict(color='orange')
        ),
        row=3, col=2
    )

    fig.update_layout(
        title_text=f"Comprehensive K-means Performance Analysis (n={n_samples}, k={n_clusters})",
        height=1200,
        width=1200,
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="right",
            x=1
        )
    )

    fig.update_yaxes(title_text="Time (seconds)", row=1, col=1)
    fig.update_yaxes(title_text="Efficiency", row=1, col=2)

    fig.update_xaxes(title_text="Number of Threads", row=2, col=1)
    fig.update_xaxes(title_text="Number of Threads", row=2, col=2)
    fig.update_yaxes(title_text="Time (seconds)", row=2, col=1)
    fig.update_yaxes(title_text="Value", row=2, col=2)

    fig.update_xaxes(title_text="Time (seconds)", row=3, col=1)
    fig.update_xaxes(title_text="Time (seconds)", row=3, col=2)
    fig.update_yaxes(title_text="CPU Usage (%)", row=3, col=1)
    fig.update_yaxes(title_text="Memory Usage (MB)", row=3, col=2)

    fig.show()

    return {
        "method_comparison": {
            "methods": methods,
            "times": times,
            "efficiencies": efficiencies
        },
        "thread_scaling": scaling_df,
        "resource_monitoring": monitoring_data
    }


In [14]:

def main():
    """Run the full demo: compile if needed, demo clustering, analyse, benchmark, summarise.

    Steps, in order:

    1. Compile the OpenMP library if this platform's library file is missing.
    2. Run the simple clustering demo.
    3. Run the comprehensive performance analysis.
    4. Run the benchmark comparison and print, per sample size, the fastest
       method and the method with the highest silhouette score.
    """
    print("="*80)
    print("K-Means Clustering Implementation and Benchmarking Tool")
    print("="*80)

    # Check only the library this platform produces and loads (same naming
    # rule as compile_openmp_library); a stray library for the other platform
    # must not suppress compilation.
    lib_name = "kmeans_openmp.dll" if sys.platform.startswith('win') else "kmeans_openmp.so"

    if not os.path.exists(lib_name):
        success = compile_openmp_library()
        if not success:
            print("WARNING: Failed to compile OpenMP library. Multi-core K-means will not be available.")

    demo_simple_clustering(n_samples=10000, n_clusters=5)

    print("\n=== Enhanced Performance Analysis ===")
    # Called for its figures and printed output; the returned dict is unused.
    run_comprehensive_performance_analysis(n_samples=20000, n_clusters=5)

    results = run_benchmark_comparison()

    print("\n=== Summary ===")
    print("Best method by execution time:")
    # idxmin/idxmax return, per sample size, the row label of the best run.
    best_time = results.loc[results.groupby('n_samples')['time'].idxmin()]
    print(best_time[['n_samples', 'method', 'time']])

    print("\nBest method by clustering quality (silhouette score):")
    best_quality = results.loc[results.groupby('n_samples')['silhouette'].idxmax()]
    print(best_quality[['n_samples', 'method', 'silhouette']])

    print("\nTo launch the interactive dashboard in a Jupyter notebook, use:")
    print("from main import launch_interactive_dashboard")
    print("launch_interactive_dashboard()")


# Run the whole pipeline only when this code is executed directly (as a script
# or as a notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main() 

K-Means Clustering Implementation and Benchmarking Tool

=== Demo: Simple Clustering (n=10000, k=5) ===

Running Single-Core...
Inertia: 30581.51
Silhouette Score: 0.6131
Calinski-Harabasz Score: 45895.13

Running Multi-Core...
Inertia: 19490.06
Silhouette Score: 0.6695
Calinski-Harabasz Score: 73435.21

Running Scikit-Learn...
Inertia: 19490.06
Silhouette Score: 0.6695
Calinski-Harabasz Score: 73435.21

=== Enhanced Performance Analysis ===

=== Comprehensive Performance Analysis ===
Running method comparison...
Running thread scaling test...
  Testing with 1 thread(s)...
  Testing with 2 thread(s)...
  Testing with 4 thread(s)...
  Testing with 8 thread(s)...
  Testing with 11 thread(s)...
Running resource monitoring test...

=== Running Benchmark Comparison ===
Running single with 1000 samples...
Running single with 5000 samples...
Running single with 20000 samples...
Running single with 50000 samples...
Running multi with 1000 samples...
Running multi with 5000 samples...
Running m