## Comparing different algorithms based on different scores

### Scores:  
(a) Calinski- Harbask Score  
(b) Davies Bouldin Score  
(c) Dunn- Index  
(d) Silhouette Score  

### Algorithms: 
(a) K-Means  
(b) DBSCANS  
(c) HDBSCANS  
(d) OPTICS  

In [6]:
import numpy as np
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt

from OPTICS import optics_clustering
from kmeans import k_means
from HDBSCAN import hdbscan_clustering
from DBSCAN import dbscan_clustering

from cluster_data import normalize_data, unnormalize
from cluster_plotter import ClusterPlotter
from cluster_data import run_clustering
import cluster_data
import scores
import cluster_plotter

# Files for uncorr_observed_data
uncorr_obs_files = {
    year: f"ogs{year}01_12_det.ele_ucorr" if year != 2002 else f"ogs{year}08_12_det.ele_ucorr"
    for year in range(2002, 2024) if year != 2018  # Exclude 2018, missing
}

# Standard year ranges
standard_year_ranges = {f"{start}-{start + 3}": np.arange(start, start + 4) for start in [2002, 2006, 2010, 2014]}
standard_year_ranges["2019-2023"] = np.arange(2019, 2024)

# Running ranges
running_ranges = cluster_data.generate_running_year_ranges(2002, 2023, 4)
year_ranges = standard_year_ranges

# Bin observed data
binned_data = cluster_data.bin_observed_data(uncorr_obs_files, year_ranges, print_res=False)
results_per_year_range = {}

# Directory for saving plots
plot_dir = "Images/comparison_algorithms"
os.makedirs(plot_dir, exist_ok=True)
cluster_plotter.clear_directory(plot_dir)

# Process each binned data set
for cluster_data, year_range in binned_data:
    print(f"\nRunning clustering for Year Range: {year_range}")

    # Prepare data array
    data_array = np.array([cluster_data.inc, cluster_data.raan]).T
    normalized_data, data_min, data_max = normalize_data(data_array)

    # Algorithms and parameters
    algorithms = {
        'K-Means': {'model': k_means, 'params': [{'n_clusters': 3}, {'n_clusters': 4}, {'n_clusters': 5}]},
        'DBSCAN': {'model': dbscan_clustering, 'params': [{'eps': 0.05, 'min_samples': 25}, {'eps': 0.01, 'min_samples': 30}, {'eps': 0.02, 'min_samples': 35}]},
        'HDBSCAN': {'model': hdbscan_clustering, 'params': [{'min_samples': 5, 'min_cluster_size': 15}, {'min_samples': 10, 'min_cluster_size': 20}, {'min_samples': 15, 'min_cluster_size': 25}]},
        'OPTICS': {'model': optics_clustering, 'params': [{'min_samples': 50, 'max_eps': 100, 'xi': 0.005}, {'min_samples': 100, 'max_eps': 200, 'xi': 0.002}, {'min_samples': 150, 'max_eps': 500, 'xi': 0.001}]}
    }

    results = []
    calinski_scores = []
    dunn_scores = []
    davies_bouldin_scores = []
    silhouette_scores = []
    global_min_calinski = float("inf")
    global_max_calinski = 0.0
    global_min_dunn = float("inf")
    global_max_dunn = 0.0
    global_min_sil = -1.0
    global_max_sil = 1.0
    global_min_db = 0.0
    global_max_db = float("inf")

    for algo_name, algo_info in algorithms.items():
        for params in algo_info['params']:
            print(f"\nRunning {algo_name} with parameters {params}")

            if algo_name == "K-Means":
                model = k_means(normalized_data, k=params['n_clusters'])
                labels = model.labels
                n_clusters = len(set(labels))

            elif algo_name == "DBSCAN":
                model = dbscan_clustering(normalized_data, eps=params['eps'], min_samples=params['min_samples'])
                labels = model.labels
                n_clusters = len(set(labels)) - (1 if -1 in labels else 0)

            elif algo_name == "HDBSCAN":
                model = hdbscan_clustering(normalized_data, min_cluster_size=params['min_cluster_size'], min_samples=params['min_samples'])
                labels = model.labels
                n_clusters = len(set(labels)) - (1 if -1 in labels else 0)

            elif algo_name == "OPTICS":
                model = optics_clustering(normalized_data, min_samples=params['min_samples'], max_eps=params['max_eps'], xi=params['xi'])
                labels = model.labels
                n_clusters = len(set(labels)) - (1 if -1 in labels else 0)

            if n_clusters > 1:
                try:
                    davies_bouldin = scores.DB_score(model)  # Ensure this function is defined correctly
                    calinski_harabasz = scores.CH_score(model)
                    silhouette = scores.sil_score(model)
                    dunn = scores.dunn_index_score(model)

                    calinski_scores.append(calinski_harabasz)
                    dunn_scores.append(dunn)
                    davies_bouldin_scores.append(davies_bouldin)
                    silhouette_scores.append(silhouette)

                    results.append({
                        "Algorithm": algo_name,
                        "Parameters": params,
                        "Davies-Bouldin": davies_bouldin,
                        "Calinski-Harabasz": calinski_harabasz,
                        "Silhouette Score": silhouette,
                        "Dunn Index": dunn,
                        "Clusters": n_clusters
                    })

                except ValueError as e:
                    print(f"Error calculating scores: {e}")
                    pass

    # Updating the global min/max values
    if calinski_scores:
        global_min_calinski = np.min(calinski_scores)
        global_max_calinski = np.max(calinski_scores)

    if dunn_scores:
        global_min_dunn = np.min(dunn_scores)
        global_max_dunn = np.max(dunn_scores)

    if davies_bouldin_scores:
        global_min_db = np.min(davies_bouldin_scores)
        global_max_db = np.max(davies_bouldin_scores)

    if silhouette_scores:
        global_min_silhouette = np.min(silhouette_scores)
        global_max_silhouette = np.max(silhouette_scores)

    # Reversing Davies-Bouldin and normalizing all scores
    for result in results:
        if result["Calinski-Harabasz"] is not None:
            result["Calinski-Harabasz"] = scores.normalize_score(
                result["Calinski-Harabasz"], global_min_calinski, global_max_calinski
            )
        if result["Dunn Index"] is not None:
            result["Dunn Index"] = scores.normalize_score(
                result["Dunn Index"], global_min_dunn, global_max_dunn
            )
        if result["Silhouette Score"] is not None:
            result["Silhouette Score"] = scores.normalize_score(
                result["Silhouette Score"], global_min_sil, global_max_sil
            )
        if result["Davies-Bouldin"] is not None:
            davies_bouldin_reversed = global_max_db - result["Davies-Bouldin"]
            result["Davies-Bouldin"] = scores.normalize_score(
                davies_bouldin_reversed, global_min_db, global_max_db
            )

    # Convert results to DataFrame
    df = pd.DataFrame(results)

    best_params_per_algorithm = {}

    for algo_name in algorithms.keys():
        algo_df = df[df["Algorithm"] == algo_name]
        if algo_df.empty:
            print(f"No data for {algo_name}, skipping...")
            continue

        best_params = {}
        for score in ["Davies-Bouldin", "Calinski-Harabasz", "Silhouette Score", "Dunn Index"]:
            if algo_df[score].isna().all():
                print(f"Warning: All values for {score} are NaN for {algo_name}. Skipping...")
                best_params[score] = {"Parameters": None, "Score": None}
            else:
                if score in ["Davies-Bouldin", "Dunn Index"]:
                    best_row = algo_df.loc[algo_df[score].idxmin()]
                else:
                    best_row = algo_df.loc[algo_df[score].idxmax()]
                best_params[score] = {
                    "Parameters": best_row["Parameters"],
                    "Score": best_row[score]
                }

        best_params_per_algorithm[algo_name] = pd.DataFrame(best_params).T

    for algo_name, best_params_df in best_params_per_algorithm.items():
        print(f"\nBest Parameters for {algo_name}")
        display(best_params_df)
        table_filename = os.path.join(plot_dir, f"best_params_{algo_name}_{'-'.join(map(str, year_range))}.png")

        # Heatmap
        score_columns = ["Davies-Bouldin", "Calinski-Harabasz", "Silhouette Score", "Dunn Index"]
        heatmap_data = df.pivot_table(values=score_columns, index=["Algorithm"], aggfunc="mean")
        plt.figure(figsize=(12, 8))
        sns.heatmap(heatmap_data, annot=True, cmap="Reds", linewidths=0.5)
        plt.title("Heatmap of Scores for Different Parameter Sets")
        plt.tight_layout()
        plt.savefig(os.path.join(plot_dir, f"heatmap_{'-'.join(map(str, year_range))}.png"))
        plt.close()

        # Radar plot
        score_values = {}
        for algo_name in algorithms.keys():
            best_davies_bouldin = df[df["Algorithm"] == algo_name]["Davies-Bouldin"].max()
            best_calinski_harabasz = df[df["Algorithm"] == algo_name]["Calinski-Harabasz"].max()
            best_silhouette = df[df["Algorithm"] == algo_name]["Silhouette Score"].max()
            best_dunn = df[df["Algorithm"] == algo_name]["Dunn Index"].max()
            score_values[algo_name] = [best_davies_bouldin, best_calinski_harabasz, best_silhouette, best_dunn]

        categories = ["Davies-Bouldin", "Calinski-Harabasz", "Silhouette Score", "Dunn Index"]
        N = len(categories)
        angles = np.linspace(0, 2 * np.pi, N, endpoint=False).tolist()
        angles += angles[:1]
        fig, ax = plt.subplots(figsize=(6, 6), subplot_kw=dict(polar=True))
        for algo in score_values:
            values = score_values[algo]
            values += values[:1]
            ax.plot(angles, values, label=algo, linewidth=2)
            ax.fill(angles, values, alpha=0.25)

        ax.set_xticks(angles[:-1])
        ax.set_xticklabels(categories)
        plt.title("Comparison of Clustering Algorithms (K-Means, DBSCAN, HDBSCAN, OPTICS)", size=15)
        plt.legend(loc='upper right', bbox_to_anchor=(1.1, 1.1))
        plt.tight_layout()
        plt.savefig(os.path.join(plot_dir, f"radar_plot_{'-'.join(map(str, year_range))}.png"))
        plt.close()

    # Correlation matrix for each algorithm
    for algo_name in algorithms.keys():
        algo_df = df[df["Algorithm"] == algo_name]
        if not algo_df.empty:
            correlation_matrix = algo_df[score_columns].corr("pearson")
            plt.figure(figsize=(8, 6))
            sns.heatmap(correlation_matrix, annot=True, cmap="Blues", cbar=True, fmt='.2f', linewidths=0.5)
            plt.title(f"Correlation Matrix of Clustering Scores for {algo_name} Algorithm")
            plt.tight_layout()
            plt.savefig(os.path.join(plot_dir, f"correlation_matrix_{algo_name}_{'-'.join(map(str, year_range))}.png"))
            plt.close()



Running clustering for Year Range: 2002-2005

Running K-Means with parameters {'n_clusters': 3}

Running K-Means with parameters {'n_clusters': 4}

Running K-Means with parameters {'n_clusters': 5}

Running DBSCAN with parameters {'eps': 0.05, 'min_samples': 25}

Running DBSCAN with parameters {'eps': 0.01, 'min_samples': 30}

Running DBSCAN with parameters {'eps': 0.02, 'min_samples': 35}

Running HDBSCAN with parameters {'min_samples': 5, 'min_cluster_size': 15}

Running HDBSCAN with parameters {'min_samples': 10, 'min_cluster_size': 20}

Running HDBSCAN with parameters {'min_samples': 15, 'min_cluster_size': 25}

Running OPTICS with parameters {'min_samples': 50, 'max_eps': 100, 'xi': 0.005}

Running OPTICS with parameters {'min_samples': 100, 'max_eps': 200, 'xi': 0.002}

Running OPTICS with parameters {'min_samples': 150, 'max_eps': 500, 'xi': 0.001}

Best Parameters for K-Means


Unnamed: 0,Parameters,Score
Davies-Bouldin,{'n_clusters': 5},0.80938
Calinski-Harabasz,{'n_clusters': 3},1.0
Silhouette Score,{'n_clusters': 5},0.691267
Dunn Index,{'n_clusters': 5},0.211779



Best Parameters for DBSCAN


Unnamed: 0,Parameters,Score
Davies-Bouldin,"{'eps': 0.01, 'min_samples': 30}",0.288449
Calinski-Harabasz,"{'eps': 0.05, 'min_samples': 25}",0.053585
Silhouette Score,"{'eps': 0.05, 'min_samples': 25}",0.607579
Dunn Index,"{'eps': 0.01, 'min_samples': 30}",0.020287



Best Parameters for HDBSCAN


Unnamed: 0,Parameters,Score
Davies-Bouldin,"{'min_samples': 15, 'min_cluster_size': 25}",-0.134439
Calinski-Harabasz,"{'min_samples': 15, 'min_cluster_size': 25}",0.013453
Silhouette Score,"{'min_samples': 5, 'min_cluster_size': 15}",0.431876
Dunn Index,"{'min_samples': 10, 'min_cluster_size': 20}",0.0



Best Parameters for OPTICS


Unnamed: 0,Parameters,Score
Davies-Bouldin,"{'min_samples': 100, 'max_eps': 200, 'xi': 0.002}",0.536798
Calinski-Harabasz,"{'min_samples': 150, 'max_eps': 500, 'xi': 0.001}",0.074333
Silhouette Score,"{'min_samples': 150, 'max_eps': 500, 'xi': 0.001}",0.492063
Dunn Index,"{'min_samples': 150, 'max_eps': 500, 'xi': 0.001}",0.00012



Running clustering for Year Range: 2006-2009

Running K-Means with parameters {'n_clusters': 3}

Running K-Means with parameters {'n_clusters': 4}

Running K-Means with parameters {'n_clusters': 5}

Running DBSCAN with parameters {'eps': 0.05, 'min_samples': 25}

Running DBSCAN with parameters {'eps': 0.01, 'min_samples': 30}

Running DBSCAN with parameters {'eps': 0.02, 'min_samples': 35}

Running HDBSCAN with parameters {'min_samples': 5, 'min_cluster_size': 15}

Running HDBSCAN with parameters {'min_samples': 10, 'min_cluster_size': 20}

Running HDBSCAN with parameters {'min_samples': 15, 'min_cluster_size': 25}

Running OPTICS with parameters {'min_samples': 50, 'max_eps': 100, 'xi': 0.005}

Running OPTICS with parameters {'min_samples': 100, 'max_eps': 200, 'xi': 0.002}

Running OPTICS with parameters {'min_samples': 150, 'max_eps': 500, 'xi': 0.001}

Best Parameters for K-Means


Unnamed: 0,Parameters,Score
Davies-Bouldin,{'n_clusters': 5},0.567155
Calinski-Harabasz,{'n_clusters': 3},1.0
Silhouette Score,{'n_clusters': 5},0.713524
Dunn Index,{'n_clusters': 3},0.384176



Best Parameters for DBSCAN


Unnamed: 0,Parameters,Score
Davies-Bouldin,"{'eps': 0.02, 'min_samples': 35}",-0.392232
Calinski-Harabasz,"{'eps': 0.02, 'min_samples': 35}",0.033563
Silhouette Score,"{'eps': 0.02, 'min_samples': 35}",0.409467
Dunn Index,"{'eps': 0.01, 'min_samples': 30}",0.362207



Best Parameters for HDBSCAN


Unnamed: 0,Parameters,Score
Davies-Bouldin,"{'min_samples': 15, 'min_cluster_size': 25}",0.124848
Calinski-Harabasz,"{'min_samples': 15, 'min_cluster_size': 25}",0.023349
Silhouette Score,"{'min_samples': 10, 'min_cluster_size': 20}",0.41656
Dunn Index,"{'min_samples': 5, 'min_cluster_size': 15}",0.0



Best Parameters for OPTICS


Unnamed: 0,Parameters,Score
Davies-Bouldin,"{'min_samples': 100, 'max_eps': 200, 'xi': 0.002}",-0.207573
Calinski-Harabasz,"{'min_samples': 150, 'max_eps': 500, 'xi': 0.001}",0.028076
Silhouette Score,"{'min_samples': 150, 'max_eps': 500, 'xi': 0.001}",0.410157
Dunn Index,"{'min_samples': 150, 'max_eps': 500, 'xi': 0.001}",0.023164



Running clustering for Year Range: 2010-2013

Running K-Means with parameters {'n_clusters': 3}

Running K-Means with parameters {'n_clusters': 4}

Running K-Means with parameters {'n_clusters': 5}

Running DBSCAN with parameters {'eps': 0.05, 'min_samples': 25}

Running DBSCAN with parameters {'eps': 0.01, 'min_samples': 30}

Running DBSCAN with parameters {'eps': 0.02, 'min_samples': 35}

Running HDBSCAN with parameters {'min_samples': 5, 'min_cluster_size': 15}

Running HDBSCAN with parameters {'min_samples': 10, 'min_cluster_size': 20}

Running HDBSCAN with parameters {'min_samples': 15, 'min_cluster_size': 25}

Running OPTICS with parameters {'min_samples': 50, 'max_eps': 100, 'xi': 0.005}

Running OPTICS with parameters {'min_samples': 100, 'max_eps': 200, 'xi': 0.002}

Running OPTICS with parameters {'min_samples': 150, 'max_eps': 500, 'xi': 0.001}

Best Parameters for K-Means


Unnamed: 0,Parameters,Score
Davies-Bouldin,{'n_clusters': 4},0.369316
Calinski-Harabasz,{'n_clusters': 5},1.0
Silhouette Score,{'n_clusters': 5},0.738578
Dunn Index,{'n_clusters': 3},0.176429



Best Parameters for DBSCAN


Unnamed: 0,Parameters,Score
Davies-Bouldin,"{'eps': 0.05, 'min_samples': 25}",-0.511935
Calinski-Harabasz,"{'eps': 0.05, 'min_samples': 25}",0.093309
Silhouette Score,"{'eps': 0.05, 'min_samples': 25}",0.581892
Dunn Index,"{'eps': 0.01, 'min_samples': 30}",0.02125



Best Parameters for HDBSCAN


Unnamed: 0,Parameters,Score
Davies-Bouldin,"{'min_samples': 10, 'min_cluster_size': 20}",-0.438044
Calinski-Harabasz,"{'min_samples': 15, 'min_cluster_size': 25}",0.040479
Silhouette Score,"{'min_samples': 5, 'min_cluster_size': 15}",0.545338
Dunn Index,"{'min_samples': 5, 'min_cluster_size': 15}",0.0



Best Parameters for OPTICS


Unnamed: 0,Parameters,Score
Davies-Bouldin,"{'min_samples': 150, 'max_eps': 500, 'xi': 0.001}",-0.582687
Calinski-Harabasz,"{'min_samples': 150, 'max_eps': 500, 'xi': 0.001}",0.158085
Silhouette Score,"{'min_samples': 150, 'max_eps': 500, 'xi': 0.001}",0.52252
Dunn Index,"{'min_samples': 150, 'max_eps': 500, 'xi': 0.001}",0.021374



Running clustering for Year Range: 2014-2017

Running K-Means with parameters {'n_clusters': 3}

Running K-Means with parameters {'n_clusters': 4}

Running K-Means with parameters {'n_clusters': 5}

Running DBSCAN with parameters {'eps': 0.05, 'min_samples': 25}

Running DBSCAN with parameters {'eps': 0.01, 'min_samples': 30}

Running DBSCAN with parameters {'eps': 0.02, 'min_samples': 35}

Running HDBSCAN with parameters {'min_samples': 5, 'min_cluster_size': 15}

Running HDBSCAN with parameters {'min_samples': 10, 'min_cluster_size': 20}

Running HDBSCAN with parameters {'min_samples': 15, 'min_cluster_size': 25}

Running OPTICS with parameters {'min_samples': 50, 'max_eps': 100, 'xi': 0.005}

Running OPTICS with parameters {'min_samples': 100, 'max_eps': 200, 'xi': 0.002}

Running OPTICS with parameters {'min_samples': 150, 'max_eps': 500, 'xi': 0.001}

Best Parameters for K-Means


Unnamed: 0,Parameters,Score
Davies-Bouldin,{'n_clusters': 4},0.706348
Calinski-Harabasz,{'n_clusters': 5},1.0
Silhouette Score,{'n_clusters': 5},0.703338
Dunn Index,{'n_clusters': 3},0.84201



Best Parameters for DBSCAN


Unnamed: 0,Parameters,Score
Davies-Bouldin,"{'eps': 0.05, 'min_samples': 25}",-0.043587
Calinski-Harabasz,"{'eps': 0.05, 'min_samples': 25}",0.046384
Silhouette Score,"{'eps': 0.05, 'min_samples': 25}",0.593778
Dunn Index,"{'eps': 0.02, 'min_samples': 35}",0.0



Best Parameters for HDBSCAN


Unnamed: 0,Parameters,Score
Davies-Bouldin,"{'min_samples': 15, 'min_cluster_size': 25}",0.013583
Calinski-Harabasz,"{'min_samples': 15, 'min_cluster_size': 25}",0.046888
Silhouette Score,"{'min_samples': 15, 'min_cluster_size': 25}",0.550419
Dunn Index,"{'min_samples': 5, 'min_cluster_size': 15}",0.115949



Best Parameters for OPTICS


Unnamed: 0,Parameters,Score
Davies-Bouldin,"{'min_samples': 50, 'max_eps': 100, 'xi': 0.005}",-0.272804
Calinski-Harabasz,"{'min_samples': 150, 'max_eps': 500, 'xi': 0.001}",0.140658
Silhouette Score,"{'min_samples': 150, 'max_eps': 500, 'xi': 0.001}",0.489608
Dunn Index,"{'min_samples': 150, 'max_eps': 500, 'xi': 0.001}",0.564253



Running clustering for Year Range: 2019-2023

Running K-Means with parameters {'n_clusters': 3}

Running K-Means with parameters {'n_clusters': 4}

Running K-Means with parameters {'n_clusters': 5}

Running DBSCAN with parameters {'eps': 0.05, 'min_samples': 25}

Running DBSCAN with parameters {'eps': 0.01, 'min_samples': 30}

Running DBSCAN with parameters {'eps': 0.02, 'min_samples': 35}

Running HDBSCAN with parameters {'min_samples': 5, 'min_cluster_size': 15}

Running HDBSCAN with parameters {'min_samples': 10, 'min_cluster_size': 20}

Running HDBSCAN with parameters {'min_samples': 15, 'min_cluster_size': 25}

Running OPTICS with parameters {'min_samples': 50, 'max_eps': 100, 'xi': 0.005}

Running OPTICS with parameters {'min_samples': 100, 'max_eps': 200, 'xi': 0.002}

Running OPTICS with parameters {'min_samples': 150, 'max_eps': 500, 'xi': 0.001}

Best Parameters for K-Means


Unnamed: 0,Parameters,Score
Davies-Bouldin,{'n_clusters': 4},0.77014
Calinski-Harabasz,{'n_clusters': 3},1.0
Silhouette Score,{'n_clusters': 5},0.700971
Dunn Index,{'n_clusters': 4},0.29094



Best Parameters for DBSCAN


Unnamed: 0,Parameters,Score
Davies-Bouldin,"{'eps': 0.05, 'min_samples': 25}",-0.21902
Calinski-Harabasz,"{'eps': 0.05, 'min_samples': 25}",0.027048
Silhouette Score,"{'eps': 0.05, 'min_samples': 25}",0.514405
Dunn Index,"{'eps': 0.01, 'min_samples': 30}",0.0681



Best Parameters for HDBSCAN


Unnamed: 0,Parameters,Score
Davies-Bouldin,"{'min_samples': 5, 'min_cluster_size': 15}",0.220256
Calinski-Harabasz,"{'min_samples': 15, 'min_cluster_size': 25}",0.024865
Silhouette Score,"{'min_samples': 5, 'min_cluster_size': 15}",0.467748
Dunn Index,"{'min_samples': 5, 'min_cluster_size': 15}",0.0



Best Parameters for OPTICS


Unnamed: 0,Parameters,Score
Davies-Bouldin,"{'min_samples': 100, 'max_eps': 200, 'xi': 0.002}",0.455414
Calinski-Harabasz,"{'min_samples': 150, 'max_eps': 500, 'xi': 0.001}",0.043664
Silhouette Score,"{'min_samples': 100, 'max_eps': 200, 'xi': 0.002}",0.423984
Dunn Index,"{'min_samples': 100, 'max_eps': 200, 'xi': 0.002}",0.131232
