In [1]:
import os
import numpy as np
import pandas as pd

# clustering backends
from kmeans import k_means
from DBSCAN import dbscan_clustering
from HDBSCAN import hdbscan_clustering
from OPTICS import optics_clustering

# your utilities
import cluster_data
from cluster_data import (
    run_clustering_dbcv_score,
    normalize_data,
    unnormalize,
    generate_running_year_ranges,
    bin_data_for_clustering
)
from cluster_plotter import ClusterPlotter
import cluster_plotter
import scores

# ------------------------------------------------------------------------
# Algorithm registry, built once: display name -> the clustering callable
# ('model') and the list of keyword-argument sets to try for it ('params').
# Insertion order is the order in which the algorithms are run.
# ------------------------------------------------------------------------
algorithms = {
    display_name: {'model': cluster_fn, 'params': param_grid}
    for display_name, cluster_fn, param_grid in (
        ('K-Means', k_means, [dict(k=3), dict(k=4), dict(k=5)]),
        ('DBSCAN', dbscan_clustering, [
            dict(eps=0.05, min_samples=25),
            dict(eps=0.01, min_samples=30),
            dict(eps=0.02, min_samples=35),
        ]),
        ('HDBSCAN', hdbscan_clustering, [
            dict(min_samples=5, min_cluster_size=15),
            dict(min_samples=10, min_cluster_size=20),
            dict(min_samples=15, min_cluster_size=25),
        ]),
        ('OPTICS', optics_clustering, [
            dict(min_samples=50, max_eps=100, xi=0.005),
            dict(min_samples=100, max_eps=200, xi=0.002),
            dict(min_samples=150, max_eps=500, xi=0.001),
        ]),
    )
}

# Running-window widths to sweep over: 3, 4, 5 and 6.
bins = list(range(3, 7))

# Output directory for every figure produced by this cell. It is created if
# missing and then handed to cluster_plotter.clear_directory
# (presumably to drop files left over from a previous run -- confirm).
plot_dir = "Images/binning_width_all_algorithms"
os.makedirs(plot_dir, exist_ok=True)
cluster_plotter.clear_directory(plot_dir)

# Parallel accumulators: one entry is appended to each per clustering run and
# all three are passed to the summary plot at the end of the cell.
array_of_metrics, array_of_yearranges, array_of_binwidths = [], [], []

# ------------------------------------------------------------------------
# Sweep over running-window widths.
#
# For every width b in `bins`, the years 2002-2023 are split into running
# ranges, the data is binned per range, and every algorithm/parameter set in
# `algorithms` is run on the normalized (inc, raan) points of each bin.
# Per run, the first four metric entries, the year range and the bin width
# are appended to the array_of_* lists, and one scatter plot is written to
# `plot_dir`.
# ------------------------------------------------------------------------
for b in bins:
    running_ranges = generate_running_year_ranges(2002, 2023, b)
    binned_data    = bin_data_for_clustering(running_ranges, print_res=False)

    # Each binned chunk is a (data, year_range) pair.
    for data_chunk, year_range in binned_data:
        print(f"\n=== Year range {year_range}, bin width = {b}")

        # Two-column point array (inc, raan), then normalize it. The min/max
        # returned here are needed later to map results back for plotting.
        X = np.vstack((data_chunk.inc, data_chunk.raan)).T
        X_norm, X_min, X_max = normalize_data(X)

        for algo_name, algo_info in algorithms.items():
            model_fn = algo_info['model']

            for params in algo_info['params']:
                # Human-readable label used in the log and in the plot title.
                param_str = ", ".join(f"{k}={v}" for k, v in params.items())
                run_label = f"{algo_name} ({param_str})"
                print(f"  • {run_label}")

                # NOTE(review): the recorded output of this cell shows the very
                # first run (K-Means, k=3) aborting with
                # "ValueError: k_means requires at least two arguments: (data, k)".
                # The helper's signature is not visible from this notebook;
                # confirm whether it expects k positionally rather than via
                # **params before relying on this sweep.
                result, runtime, n_clusters, pts_per_clust, metrics = run_clustering_dbcv_score(
                    model_fn,
                    run_label,
                    X_norm,
                    X_min,
                    X_max,
                    plot=False,
                    **params
                )

                # Optional per-cluster details, rounded to 3 decimals when the
                # helper returned them (entries 4..6 of `metrics`). They are
                # not consumed anywhere else in this cell.
                if len(metrics) >= 7:
                    stds   = {k: tuple(round(v, 3) for v in vals) for k, vals in metrics[4].items()}
                    sqdens = {k: round(v, 3) for k, v in metrics[5].items()}
                    hulld  = {k: round(v, 3) for k, v in metrics[6].items()}
                else:
                    stds = sqdens = hulld = None

                # Collect for the summary plot after the sweep.
                array_of_metrics.append(metrics[:4])
                array_of_yearranges.append(year_range)
                array_of_binwidths.append(b)

                # Plot the clustering in un-normalized space.
                X_unnorm, centers = unnormalize(
                    result.data,
                    result.cluster_centers,
                    X_min,
                    X_max
                )
                plotter = ClusterPlotter(X_unnorm, result.labels, centers)

                # The parameter set must be part of the file name: without it
                # the three grid entries of one algorithm share the same
                # (algorithm, year range, bin) name and overwrite each other.
                param_tag = "_".join(f"{k}-{v}" for k, v in params.items())
                fname   = f"{algo_name}_{param_tag}_{year_range}_{b}.png"
                title   = f"{run_label} — years {year_range}, bin={b}"
                plotter.clusters_2d_plot(title, os.path.join(plot_dir, fname))

# Once every run is collected, plot how the scores vary with the binning.
scores.plot_scores_for_different_binnings(
    array_of_metrics,
    array_of_yearranges,
    array_of_binwidths,
    plot_dir
)


=== Year range 2002-2004, bin width = 3
  • K-Means (k=3)


ValueError: k_means requires at least two arguments: (data, k)

In [None]:
import numpy as np
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt

from OPTICS import optics_clustering
from kmeans import k_means
from HDBSCAN import hdbscan_clustering
from DBSCAN import dbscan_clustering

from cluster_data import normalize_data, unnormalize
from cluster_plotter import ClusterPlotter
from cluster_data import run_clustering
import cluster_data
import scores
import cluster_plotter


# Observation file name per year for the uncorrelated observed data.
# 2002 uses the "08_12" file name, every other year the "01_12" one.
uncorr_obs_files = dict(
    (year, f"ogs{year}08_12_det.ele_ucorr" if year == 2002 else f"ogs{year}01_12_det.ele_ucorr")
    for year in range(2002, 2024)
    if year != 2018  # 2018 is missing, so it is excluded
)

# Standard year ranges: four 4-year blocks keyed "start-end", plus a final
# 5-year block for 2019-2023.
standard_year_ranges = dict(
    (f"{start}-{start + 3}", np.arange(start, start + 4))
    for start in (2002, 2006, 2010, 2014)
)
standard_year_ranges.update({"2019-2023": np.arange(2019, 2024)})

# Running ranges are generated as well, but the standard ranges are the ones
# selected for binning below.
running_ranges = cluster_data.generate_running_year_ranges(2002, 2023, 4)
year_ranges = standard_year_ranges

# Bin the observed data by the selected year ranges.
binned_data = cluster_data.bin_observed_data(uncorr_obs_files, year_ranges, print_res=False)
results_per_year_range = dict()

# Directory for saving plots; created if missing, then passed to
# cluster_plotter.clear_directory.
plot_dir = "Images/comparison_algorithms_dbcv"
os.makedirs(plot_dir, exist_ok=True)
cluster_plotter.clear_directory(plot_dir)

# Score columns evaluated per algorithm (also used for the correlation matrices).
score_columns = ["DBCV Score"]

# Algorithms and parameter grids. These do not depend on the year range, so
# they are built once instead of on every loop iteration.
algorithms = {
    'K-Means': {'model': k_means, 'params': [{'n_clusters': 3}, {'n_clusters': 4}, {'n_clusters': 5}]},
    'DBSCAN': {'model': dbscan_clustering, 'params': [{'eps': 0.05, 'min_samples': 25}, {'eps': 0.01, 'min_samples': 30}, {'eps': 0.02, 'min_samples': 35}]},
    'HDBSCAN': {'model': hdbscan_clustering, 'params': [{'min_samples': 5, 'min_cluster_size': 15}, {'min_samples': 10, 'min_cluster_size': 20}, {'min_samples': 15, 'min_cluster_size': 25}]},
    'OPTICS': {'model': optics_clustering, 'params': [{'min_samples': 50, 'max_eps': 100, 'xi': 0.005}, {'min_samples': 100, 'max_eps': 200, 'xi': 0.002}, {'min_samples': 150, 'max_eps': 500, 'xi': 0.001}]}
}

# Process each binned data set: run every algorithm/parameter combination,
# score it with DBCV, report the best parameters per algorithm and save the
# per-algorithm score correlation matrix.
for cluster_data_single, year_range in binned_data:
    print(f"\nRunning clustering for Year Range: {year_range}")

    # Tag used in output file names. year_range prints as e.g. "2002-2005",
    # i.e. it is a string; joining a string with '-' would interleave dashes
    # between its characters ("2-0-0-2---2-0-0-5"), so strings are used as-is
    # and only real sequences of years are joined.
    range_tag = year_range if isinstance(year_range, str) else '-'.join(map(str, year_range))

    # Prepare data array: one row per object, columns (inc, raan).
    data_array = np.array([cluster_data_single.inc, cluster_data_single.raan]).T
    normalized_data, data_min, data_max = normalize_data(data_array)

    results = []
    dbcv_scores = []

    # Fallback bounds for score normalization (presumably the theoretical
    # DBCV range -- confirm). Despite the names they are replaced below by the
    # min/max over THIS year range only, not over all year ranges.
    global_min = -1
    global_max = 1

    for algo_name, algo_info in algorithms.items():
        for params in algo_info['params']:
            print(f"\nRunning {algo_name} with parameters {params}")

            # The grid (and the reported "Parameters") use 'n_clusters' for
            # K-Means, but k_means takes that value as `k`; all other
            # algorithms accept their grid entries as keyword arguments.
            if algo_name == "K-Means":
                call_kwargs = {'k': params['n_clusters']}
            else:
                call_kwargs = params
            model = algo_info['model'](normalized_data, **call_kwargs)
            labels = model.labels

            # For the density-based algorithms the label -1 is not counted as
            # a cluster; K-Means labels are counted as they are.
            has_noise_label = algo_name != "K-Means" and -1 in labels
            n_clusters = len(set(labels)) - (1 if has_noise_label else 0)

            # Runs with fewer than two clusters are not scored.
            if n_clusters > 1:
                try:
                    print(f"Calculating DBCV score for {algo_name} with parameters {params}")
                    dbcv_score = scores.DBCV_score(model)
                    print(dbcv_score)
                    dbcv_scores.append(dbcv_score)

                    results.append({
                        "Algorithm": algo_name,
                        "Parameters": params,
                        "DBCV Score": dbcv_score,
                        "Clusters": n_clusters
                    })

                except ValueError as e:
                    # Best effort: report the failure and go on with the next
                    # parameter set.
                    print(f"Error calculating scores: {e}")

    # Without any scored run the DataFrame below would have no columns and
    # df["Algorithm"] would raise KeyError, so skip this year range.
    if not results:
        print(f"No scored clustering results for {year_range}, skipping...")
        continue

    # Updating the min/max values used for normalization.
    if dbcv_scores:
        global_min = np.min(dbcv_scores)
        global_max = np.max(dbcv_scores)

    # NOTE(review): when all scores are equal (e.g. a single scored run),
    # global_min == global_max; check how scores.normalize_score handles that.
    for result in results:
        if result["DBCV Score"] is not None:
            result["DBCV Score"] = scores.normalize_score(
                result["DBCV Score"], global_min, global_max
            )

    df = pd.DataFrame(results)

    best_params_per_algorithm = {}

    for algo_name in algorithms.keys():
        algo_df = df[df["Algorithm"] == algo_name]
        if algo_df.empty:
            print(f"No data for {algo_name}, skipping...")
            continue

        best_params = {}
        for score in score_columns:
            if algo_df[score].isna().all():
                print(f"Warning: All values for {score} are NaN for {algo_name}. Skipping...")
                best_params[score] = {"Parameters": None, "Score": None}
            else:
                # Davies-Bouldin is the only lower-is-better score here; the
                # Dunn index, like DBCV, is conventionally maximized, so it
                # must not be selected with idxmin.
                if score == "Davies-Bouldin":
                    best_row = algo_df.loc[algo_df[score].idxmin()]
                else:
                    best_row = algo_df.loc[algo_df[score].idxmax()]
                best_params[score] = {
                    "Parameters": best_row["Parameters"],
                    "Score": best_row[score]
                }

        # One row per score, columns "Parameters" and "Score".
        best_params_per_algorithm[algo_name] = pd.DataFrame(best_params).T

    for algo_name, best_params_df in best_params_per_algorithm.items():
        print(f"\nBest Parameters for {algo_name}")
        display(best_params_df)
        # Save best parameters as CSV
        csv_filename = os.path.join(plot_dir, f"best_params_{algo_name}_{range_tag}.csv")
        best_params_df.to_csv(csv_filename)

    # Correlation matrix for each algorithm. With the single entry currently
    # in score_columns this is a 1x1 matrix; it becomes informative once more
    # score columns are added.
    for algo_name in algorithms.keys():
        algo_df = df[df["Algorithm"] == algo_name]
        if not algo_df.empty:
            correlation_matrix = algo_df[score_columns].corr("pearson")
            plt.figure(figsize=(8, 6))
            sns.heatmap(correlation_matrix, annot=True, cmap="Blues", cbar=True, fmt='.2f', linewidths=0.5)
            plt.title(f"Correlation Matrix of Clustering Scores for {algo_name} Algorithm")
            plt.tight_layout()
            plt.savefig(os.path.join(plot_dir, f"correlation_matrix_{algo_name}_{range_tag}.png"))
            # Close the figure so figures do not pile up across the loops.
            plt.close()


Running clustering for Year Range: 2002-2005

Running K-Means with parameters {'n_clusters': 3}
Calculating DBCV score for K-Means with parameters {'n_clusters': 3}
-0.9539274486321931

Running K-Means with parameters {'n_clusters': 4}
Calculating DBCV score for K-Means with parameters {'n_clusters': 4}
-0.9452196175603331

Running K-Means with parameters {'n_clusters': 5}
Calculating DBCV score for K-Means with parameters {'n_clusters': 5}
-0.9367540828320728

Running DBSCAN with parameters {'eps': 0.05, 'min_samples': 25}
Calculating DBCV score for DBSCAN with parameters {'eps': 0.05, 'min_samples': 25}
-0.794687843346411

Running DBSCAN with parameters {'eps': 0.01, 'min_samples': 30}
Calculating DBCV score for DBSCAN with parameters {'eps': 0.01, 'min_samples': 30}
