## Comparing different algorithms based on different scores

### Scores:  
(a) Calinski- Harbask Score  
(b) Davies Bouldin Score  
(c) Dunn- Index  
(d) Silhouette Score  

### Algorithms: 
(a) K-Means  
(b) DBSCANS  
(c) HDBSCANS  
(d) OPTICS  

In [2]:
import numpy as np
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt

from OPTICS import optics_clustering
from kmeans import k_means
from HDBSCAN import hdbscan_clustering
from DBSCAN import dbscan_clustering

from cluster_data import normalize_data, unnormalize
from cluster_plotter import ClusterPlotter
from cluster_data import run_clustering
import cluster_data
import scores
import cluster_plotter

# Files for uncorr_observed_data
uncorr_obs_files = {
    year: f"ogs{year}01_12_det.ele_ucorr" if year != 2002 else f"ogs{year}08_12_det.ele_ucorr"
    for year in range(2002, 2024) if year != 2018  # Exclude 2018, missing
}

# Standard year ranges
standard_year_ranges = {f"{start}-{start + 3}": np.arange(start, start + 4) for start in [2002, 2006, 2010, 2014]}
standard_year_ranges["2019-2023"] = np.arange(2019, 2024)

# Running ranges
running_ranges = cluster_data.generate_running_year_ranges(2002, 2023, 4)
year_ranges = standard_year_ranges

# Bin observed data
binned_data = cluster_data.bin_observed_data(uncorr_obs_files, year_ranges, print_res=False)
results_per_year_range = {}

# Directory for saving plots
plot_dir = "Images/comparison_algorithms"
os.makedirs(plot_dir, exist_ok=True)

# Process each binned data set
for cluster_data, year_range in binned_data:
    print(f"\nRunning clustering for Year Range: {year_range}")

    # Prepare data array
    data_array = np.array([cluster_data.inc, cluster_data.raan]).T
    normalized_data, data_min, data_max = normalize_data(data_array)

    # Algorithms and parameters
    algorithms = {
        'K-Means': {'model': k_means, 'params': [{'n_clusters': 3}, {'n_clusters': 4}, {'n_clusters': 5}]},
        'DBSCAN': {'model': dbscan_clustering, 'params': [{'eps': 0.05, 'min_samples': 25}, {'eps': 0.01, 'min_samples': 25}]},
        'HDBSCAN': {'model': hdbscan_clustering, 'params': [{'min_samples': 5, 'min_cluster_size': 15}, {'min_samples': 10, 'min_cluster_size': 20}]},
        'OPTICS': {'model': optics_clustering, 'params': [{'min_samples': 50, 'max_eps': 100, 'xi': 0.005}, {'min_samples': 100, 'max_eps': 200, 'xi': 0.002}]}
    }

    results = []
    calinski_scores = []
    dunn_scores = []
    global_min_calinski = float("inf")
    global_max_calinski = -float("inf")
    global_min_dunn = float("inf")
    global_max_dunn = -float("inf")

    for algo_name, algo_info in algorithms.items():
        for params in algo_info['params']:
            print(f"\nRunning {algo_name} with parameters {params}")

            if algo_name == "K-Means":
                model = k_means(normalized_data, k=params['n_clusters'])
                labels = model.labels
                n_clusters = len(set(labels))

            elif algo_name == "DBSCAN":
                model = dbscan_clustering(normalized_data, eps=params['eps'], min_samples=params['min_samples'])
                labels = model.labels
                n_clusters = len(set(labels)) - (1 if -1 in labels else 0)

            elif algo_name == "HDBSCAN":
                model = hdbscan_clustering(normalized_data, min_cluster_size=params['min_cluster_size'], min_samples=params['min_samples'])
                labels = model.labels
                n_clusters = len(set(labels)) - (1 if -1 in labels else 0)

            elif algo_name == "OPTICS":
                model = optics_clustering(normalized_data, min_samples=params['min_samples'], max_eps=params['max_eps'], xi=params['xi'])
                labels = model.labels
                n_clusters = len(set(labels)) - (1 if -1 in labels else 0)

            if n_clusters > 1:
                try:
                    davies_bouldin = 1 - scores.DB_score(model)
                    calinski_harabasz = scores.CH_score(model)
                    silhouette = scores.sil_score(model)
                    dunn = scores.dunn_index_score(model)

                    calinski_scores.append(calinski_harabasz)
                    dunn_scores.append(dunn)

                    results.append({
                        "Algorithm": algo_name,
                        "Parameters": params,
                        "Davies-Bouldin": davies_bouldin,
                        "Calinski-Harabasz": calinski_harabasz,
                        "Silhouette Score": silhouette,
                        "Dunn Index": dunn,
                        "Clusters": n_clusters
                    })

                except ValueError:
                    pass

    if calinski_scores:
        global_min_calinski = np.min(calinski_scores)
        global_max_calinski = np.max(calinski_scores)

    if dunn_scores:
        global_min_dunn = np.min(dunn_scores)
        global_max_dunn = np.max(dunn_scores)

    for result in results:
        if result["Calinski-Harabasz"] is not None:
            result["Calinski-Harabasz"] = scores.normalize_CH_score(
                result["Calinski-Harabasz"], global_min_calinski, global_max_calinski
            )
        if result["Dunn Index"] is not None:
            result["Dunn Index"] = scores.normalize_dunn_index(
                result["Dunn Index"], global_min_dunn, global_max_dunn
            )

    df = pd.DataFrame(results)

    best_params_per_algorithm = {}

    for algo_name in algorithms.keys():
        algo_df = df[df["Algorithm"] == algo_name]
        if algo_df.empty:
            print(f"No data for {algo_name}, skipping...")
            continue

        best_params = {}
        for score in ["Davies-Bouldin", "Calinski-Harabasz", "Silhouette Score", "Dunn Index"]:
            if algo_df[score].isna().all():
                print(f"Warning: All values for {score} are NaN for {algo_name}. Skipping...")
                best_params[score] = {"Parameters": None, "Score": None}
            else:
                if score in ["Davies-Bouldin", "Dunn Index"]:
                    best_row = algo_df.loc[algo_df[score].idxmin()]
                else:
                    best_row = algo_df.loc[algo_df[score].idxmax()]
                best_params[score] = {
                    "Parameters": best_row["Parameters"],
                    "Score": best_row[score]
                }

        best_params_per_algorithm[algo_name] = pd.DataFrame(best_params).T

    for algo_name, best_params_df in best_params_per_algorithm.items():
        print(f"\nBest Parameters for {algo_name}")
        display(best_params_df)
        table_filename = os.path.join(plot_dir, f"best_params_{algo_name}_{'-'.join(map(str, year_range))}.png")
        cluster_plotter.save_table_as_image(
            best_params_df,
            table_filename,
            title=f"Best Parameters for {algo_name}"
        )

    # Heatmap
    score_columns = ["Davies-Bouldin", "Calinski-Harabasz", "Silhouette Score", "Dunn Index"]
    heatmap_data = df.pivot_table(values=score_columns, index=["Algorithm"], aggfunc="mean")
    plt.figure(figsize=(12, 8))
    sns.heatmap(heatmap_data, annot=True, cmap="Reds", linewidths=0.5)
    plt.title("Heatmap of Scores for Different Parameter Sets")
    plt.tight_layout()
    plt.savefig(os.path.join(plot_dir, f"heatmap_{'-'.join(map(str, year_range))}.png"))
    plt.close()

    # Radar plot
    score_values = {}
    for algo_name in algorithms.keys():
        best_davies_bouldin = df[df["Algorithm"] == algo_name]["Davies-Bouldin"].max()
        best_calinski_harabasz = df[df["Algorithm"] == algo_name]["Calinski-Harabasz"].max()
        best_silhouette = df[df["Algorithm"] == algo_name]["Silhouette Score"].max()
        best_dunn = df[df["Algorithm"] == algo_name]["Dunn Index"].max()
        score_values[algo_name] = [best_davies_bouldin, best_calinski_harabasz, best_silhouette, best_dunn]

    categories = ["Davies-Bouldin", "Calinski-Harabasz", "Silhouette Score", "Dunn Index"]
    N = len(categories)
    angles = np.linspace(0, 2 * np.pi, N, endpoint=False).tolist()
    angles += angles[:1]
    fig, ax = plt.subplots(figsize=(6, 6), subplot_kw=dict(polar=True))
    for algo in score_values:
        values = score_values[algo]
        values += values[:1]
        ax.plot(angles, values, label=algo, linewidth=2)
        ax.fill(angles, values, alpha=0.25)

    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(categories)
    plt.title("Comparison of Clustering Algorithms (K-Means, DBSCAN, HDBSCAN, OPTICS)", size=15)
    plt.legend(loc='upper right', bbox_to_anchor=(1.1, 1.1))
    plt.tight_layout()
    plt.savefig(os.path.join(plot_dir, f"radar_plot_{'-'.join(map(str, year_range))}.png"))
    plt.close()

    # Correlation matrix
    correlation_matrix = df[score_columns].corr()
    plt.figure(figsize=(8, 6))
    sns.heatmap(correlation_matrix, annot=True, cmap="Blues", cbar=True, fmt='.2f', linewidths=0.5)
    plt.title("Correlation Matrix of Clustering Scores")
    plt.tight_layout()
    plt.savefig(os.path.join(plot_dir, f"correlation_matrix_{'-'.join(map(str, year_range))}.png"))
    plt.close()


Running clustering for Year Range: 2002-2005

Running K-Means with parameters {'n_clusters': 3}

Running K-Means with parameters {'n_clusters': 4}

Running K-Means with parameters {'n_clusters': 5}

Running DBSCAN with parameters {'eps': 0.05, 'min_samples': 25}

Running DBSCAN with parameters {'eps': 0.01, 'min_samples': 25}

Running HDBSCAN with parameters {'min_samples': 5, 'min_cluster_size': 15}

Running HDBSCAN with parameters {'min_samples': 10, 'min_cluster_size': 20}

Running OPTICS with parameters {'min_samples': 50, 'max_eps': 100, 'xi': 0.005}

Running OPTICS with parameters {'min_samples': 100, 'max_eps': 200, 'xi': 0.002}

Best Parameters for K-Means


Unnamed: 0,Parameters,Score
Davies-Bouldin,{'n_clusters': 4},0.209839
Calinski-Harabasz,{'n_clusters': 3},1.0
Silhouette Score,{'n_clusters': 5},0.459686
Dunn Index,{'n_clusters': 4},0.343581



Best Parameters for DBSCAN


Unnamed: 0,Parameters,Score
Davies-Bouldin,"{'eps': 0.01, 'min_samples': 25}",-1.901187
Calinski-Harabasz,"{'eps': 0.05, 'min_samples': 25}",0.053585
Silhouette Score,"{'eps': 0.05, 'min_samples': 25}",0.215159
Dunn Index,"{'eps': 0.01, 'min_samples': 25}",0.011469



Best Parameters for HDBSCAN


Unnamed: 0,Parameters,Score
Davies-Bouldin,"{'min_samples': 10, 'min_cluster_size': 20}",-1.000928
Calinski-Harabasz,"{'min_samples': 10, 'min_cluster_size': 20}",0.0005
Silhouette Score,"{'min_samples': 5, 'min_cluster_size': 15}",-0.136247
Dunn Index,"{'min_samples': 10, 'min_cluster_size': 20}",0.0



Best Parameters for OPTICS


Unnamed: 0,Parameters,Score
Davies-Bouldin,"{'min_samples': 100, 'max_eps': 200, 'xi': 0.002}",-1.136251
Calinski-Harabasz,"{'min_samples': 100, 'max_eps': 200, 'xi': 0.002}",0.039089
Silhouette Score,"{'min_samples': 100, 'max_eps': 200, 'xi': 0.002}",-0.242306
Dunn Index,"{'min_samples': 50, 'max_eps': 100, 'xi': 0.005}",0.018425



Running clustering for Year Range: 2006-2009

Running K-Means with parameters {'n_clusters': 3}

Running K-Means with parameters {'n_clusters': 4}

Running K-Means with parameters {'n_clusters': 5}

Running DBSCAN with parameters {'eps': 0.05, 'min_samples': 25}

Running DBSCAN with parameters {'eps': 0.01, 'min_samples': 25}

Running HDBSCAN with parameters {'min_samples': 5, 'min_cluster_size': 15}

Running HDBSCAN with parameters {'min_samples': 10, 'min_cluster_size': 20}

Running OPTICS with parameters {'min_samples': 50, 'max_eps': 100, 'xi': 0.005}

Running OPTICS with parameters {'min_samples': 100, 'max_eps': 200, 'xi': 0.002}

Best Parameters for K-Means


Unnamed: 0,Parameters,Score
Davies-Bouldin,{'n_clusters': 5},0.164513
Calinski-Harabasz,{'n_clusters': 3},1.0
Silhouette Score,{'n_clusters': 5},0.422183
Dunn Index,{'n_clusters': 5},0.404486



Best Parameters for DBSCAN


Unnamed: 0,Parameters,Score
Davies-Bouldin,"{'eps': 0.01, 'min_samples': 25}",-0.869086
Calinski-Harabasz,"{'eps': 0.01, 'min_samples': 25}",0.011453
Silhouette Score,"{'eps': 0.01, 'min_samples': 25}",-0.218456
Dunn Index,"{'eps': 0.01, 'min_samples': 25}",0.182071



Best Parameters for HDBSCAN


Unnamed: 0,Parameters,Score
Davies-Bouldin,"{'min_samples': 5, 'min_cluster_size': 15}",-0.707233
Calinski-Harabasz,"{'min_samples': 10, 'min_cluster_size': 20}",0.013609
Silhouette Score,"{'min_samples': 10, 'min_cluster_size': 20}",-0.166881
Dunn Index,"{'min_samples': 5, 'min_cluster_size': 15}",0.0



Best Parameters for OPTICS


Unnamed: 0,Parameters,Score
Davies-Bouldin,"{'min_samples': 100, 'max_eps': 200, 'xi': 0.002}",-1.366662
Calinski-Harabasz,"{'min_samples': 100, 'max_eps': 200, 'xi': 0.002}",0.014751
Silhouette Score,"{'min_samples': 100, 'max_eps': 200, 'xi': 0.002}",-0.195511
Dunn Index,"{'min_samples': 50, 'max_eps': 100, 'xi': 0.005}",0.033199



Running clustering for Year Range: 2010-2013

Running K-Means with parameters {'n_clusters': 3}

Running K-Means with parameters {'n_clusters': 4}

Running K-Means with parameters {'n_clusters': 5}

Running DBSCAN with parameters {'eps': 0.05, 'min_samples': 25}

Running DBSCAN with parameters {'eps': 0.01, 'min_samples': 25}

Running HDBSCAN with parameters {'min_samples': 5, 'min_cluster_size': 15}

Running HDBSCAN with parameters {'min_samples': 10, 'min_cluster_size': 20}

Running OPTICS with parameters {'min_samples': 50, 'max_eps': 100, 'xi': 0.005}

Running OPTICS with parameters {'min_samples': 100, 'max_eps': 200, 'xi': 0.002}

Best Parameters for K-Means


Unnamed: 0,Parameters,Score
Davies-Bouldin,{'n_clusters': 4},0.200715
Calinski-Harabasz,{'n_clusters': 5},1.0
Silhouette Score,{'n_clusters': 5},0.460841
Dunn Index,{'n_clusters': 3},0.256815



Best Parameters for DBSCAN


Unnamed: 0,Parameters,Score
Davies-Bouldin,"{'eps': 0.05, 'min_samples': 25}",-0.869367
Calinski-Harabasz,"{'eps': 0.05, 'min_samples': 25}",0.087468
Silhouette Score,"{'eps': 0.05, 'min_samples': 25}",0.163783
Dunn Index,"{'eps': 0.01, 'min_samples': 25}",0.120987



Best Parameters for HDBSCAN


Unnamed: 0,Parameters,Score
Davies-Bouldin,"{'min_samples': 10, 'min_cluster_size': 20}",-0.778007
Calinski-Harabasz,"{'min_samples': 5, 'min_cluster_size': 15}",0.020047
Silhouette Score,"{'min_samples': 5, 'min_cluster_size': 15}",0.090677
Dunn Index,"{'min_samples': 5, 'min_cluster_size': 15}",0.0



Best Parameters for OPTICS


Unnamed: 0,Parameters,Score
Davies-Bouldin,"{'min_samples': 100, 'max_eps': 200, 'xi': 0.002}",-0.955293
Calinski-Harabasz,"{'min_samples': 100, 'max_eps': 200, 'xi': 0.002}",0.021876
Silhouette Score,"{'min_samples': 50, 'max_eps': 100, 'xi': 0.005}",-0.058436
Dunn Index,"{'min_samples': 50, 'max_eps': 100, 'xi': 0.005}",0.058352



Running clustering for Year Range: 2014-2017

Running K-Means with parameters {'n_clusters': 3}

Running K-Means with parameters {'n_clusters': 4}

Running K-Means with parameters {'n_clusters': 5}

Running DBSCAN with parameters {'eps': 0.05, 'min_samples': 25}

Running DBSCAN with parameters {'eps': 0.01, 'min_samples': 25}

Running HDBSCAN with parameters {'min_samples': 5, 'min_cluster_size': 15}

Running HDBSCAN with parameters {'min_samples': 10, 'min_cluster_size': 20}

Running OPTICS with parameters {'min_samples': 50, 'max_eps': 100, 'xi': 0.005}

Running OPTICS with parameters {'min_samples': 100, 'max_eps': 200, 'xi': 0.002}

Best Parameters for K-Means


Unnamed: 0,Parameters,Score
Davies-Bouldin,{'n_clusters': 4},0.135107
Calinski-Harabasz,{'n_clusters': 5},1.0
Silhouette Score,{'n_clusters': 5},0.386652
Dunn Index,{'n_clusters': 3},0.413028



Best Parameters for DBSCAN


Unnamed: 0,Parameters,Score
Davies-Bouldin,"{'eps': 0.05, 'min_samples': 25}",-2.062006
Calinski-Harabasz,"{'eps': 0.05, 'min_samples': 25}",0.045174
Silhouette Score,"{'eps': 0.05, 'min_samples': 25}",0.187555
Dunn Index,"{'eps': 0.05, 'min_samples': 25}",0.327439



Best Parameters for HDBSCAN


Unnamed: 0,Parameters,Score
Davies-Bouldin,"{'min_samples': 10, 'min_cluster_size': 20}",-1.272849
Calinski-Harabasz,"{'min_samples': 10, 'min_cluster_size': 20}",0.027888
Silhouette Score,"{'min_samples': 10, 'min_cluster_size': 20}",0.020374
Dunn Index,"{'min_samples': 5, 'min_cluster_size': 15}",0.0



Best Parameters for OPTICS


Unnamed: 0,Parameters,Score
Davies-Bouldin,"{'min_samples': 50, 'max_eps': 100, 'xi': 0.005}",-2.734553
Calinski-Harabasz,"{'min_samples': 100, 'max_eps': 200, 'xi': 0.002}",0.090969
Silhouette Score,"{'min_samples': 50, 'max_eps': 100, 'xi': 0.005}",-0.055318
Dunn Index,"{'min_samples': 50, 'max_eps': 100, 'xi': 0.005}",0.275315



Running clustering for Year Range: 2019-2023

Running K-Means with parameters {'n_clusters': 3}

Running K-Means with parameters {'n_clusters': 4}

Running K-Means with parameters {'n_clusters': 5}

Running DBSCAN with parameters {'eps': 0.05, 'min_samples': 25}

Running DBSCAN with parameters {'eps': 0.01, 'min_samples': 25}

Running HDBSCAN with parameters {'min_samples': 5, 'min_cluster_size': 15}

Running HDBSCAN with parameters {'min_samples': 10, 'min_cluster_size': 20}

Running OPTICS with parameters {'min_samples': 50, 'max_eps': 100, 'xi': 0.005}

Running OPTICS with parameters {'min_samples': 100, 'max_eps': 200, 'xi': 0.002}

Best Parameters for K-Means


Unnamed: 0,Parameters,Score
Davies-Bouldin,{'n_clusters': 4},0.179386
Calinski-Harabasz,{'n_clusters': 3},1.0
Silhouette Score,{'n_clusters': 5},0.403812
Dunn Index,{'n_clusters': 3},0.211766



Best Parameters for DBSCAN


Unnamed: 0,Parameters,Score
Davies-Bouldin,"{'eps': 0.05, 'min_samples': 25}",-3.31901
Calinski-Harabasz,"{'eps': 0.05, 'min_samples': 25}",0.026501
Silhouette Score,"{'eps': 0.05, 'min_samples': 25}",0.02881
Dunn Index,"{'eps': 0.01, 'min_samples': 25}",0.165296



Best Parameters for HDBSCAN


Unnamed: 0,Parameters,Score
Davies-Bouldin,"{'min_samples': 5, 'min_cluster_size': 15}",-1.762645
Calinski-Harabasz,"{'min_samples': 10, 'min_cluster_size': 20}",0.021846
Silhouette Score,"{'min_samples': 5, 'min_cluster_size': 15}",-0.064504
Dunn Index,"{'min_samples': 5, 'min_cluster_size': 15}",0.0



Best Parameters for OPTICS


Unnamed: 0,Parameters,Score
Davies-Bouldin,"{'min_samples': 100, 'max_eps': 200, 'xi': 0.002}",-0.929478
Calinski-Harabasz,"{'min_samples': 100, 'max_eps': 200, 'xi': 0.002}",0.034196
Silhouette Score,"{'min_samples': 100, 'max_eps': 200, 'xi': 0.002}",-0.152033
Dunn Index,"{'min_samples': 100, 'max_eps': 200, 'xi': 0.002}",0.105361
