In [1]:
import numpy as np
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt

from DBSCAN import dbscan_clustering

from cluster_data import normalize_data, unnormalize
from cluster_plotter import ClusterPlotter
from cluster_data import run_clustering
import cluster_data
import scores
import cluster_plotter
import my_dbcv_module

# Files for uncorr_observed_data
uncorr_obs_files = {
    year: f"ogs{year}01_12_det.ele_ucorr" if year != 2002 else f"ogs{year}08_12_det.ele_ucorr"
    for year in range(2002, 2024) if year != 2018  # Exclude 2018, missing
}

# Standard year ranges
standard_year_ranges = {f"{start}-{start + 3}": np.arange(start, start + 4) for start in [2002, 2006, 2010, 2014]}
standard_year_ranges["2019-2023"] = np.arange(2019, 2024)

# Running ranges
running_ranges = cluster_data.generate_running_year_ranges(2002, 2023, 4)
year_ranges = running_ranges

# Bin observed data
binned_data = cluster_data.bin_observed_data(uncorr_obs_files, year_ranges, print_res=False)

# Output directory setup
plot_dir = "Images/DBSCAN_parameter_tuning"
os.makedirs(plot_dir, exist_ok=True)
#cluster_plotter.clear_directory(plot_dir)

# DBSCAN parameter grid
eps_values = [0.02, 0.025, 0.05, 0.02, 0.01, 0.015] 
min_samples_values = [30, 35, 40, 45, 10, 15, 25]

# Main loop
for cluster_data_single, year_range in binned_data:
    print(f"\nRunning DBSCAN for Year Range: {year_range}")

    data_array = np.array([cluster_data_single.inc, cluster_data_single.raan]).T
    normalized_data, data_min, data_max = normalize_data(data_array)

    results = []

    for eps in eps_values:
        for min_samples in min_samples_values:
            print(f"  -> eps={eps}, min_samples={min_samples}")
            result_dbscan, time_dbscan, n_clusters_dbscan, points_per_cluster_dbscan, metrics_dbscan = run_clustering(
                dbscan_clustering, "DBSCAN", normalized_data, data_min, data_max, plot=False, eps=eps, min_samples=min_samples
            )

            # Compute DBCV Score (Rust)
            if n_clusters_dbscan > 1:
                dbcv_score = scores.DBCV_score_rust(result_dbscan)
            else:
                dbcv_score = None

            # Save 2D plot
            unnormalized_data, _ = unnormalize(result_dbscan.data, None, data_min, data_max)
            plotter = ClusterPlotter(unnormalized_data, result_dbscan.labels, None)
            plot_filename = os.path.join(plot_dir, f"dbscan_{year_range}_eps{eps}_min{min_samples}.png")
            title = f"DBSCAN: {year_range}, eps={eps}, min_samples={min_samples}, DBCV={dbcv_score:.3f}" if dbcv_score is not None else ""
            plotter.clusters_2d_plot(title, plot_filename)

            noise_points = np.sum(result_dbscan.labels == -1)

            results.append({
                "Parameters": f"eps={eps}, min_samples={min_samples}",
                "DBCV Score": dbcv_score,
                "Year Range": year_range,
                "Runtime (s)": f"{time_dbscan:.3f}",
                "Clusters": n_clusters_dbscan,
                "Points per Cluster": points_per_cluster_dbscan,
                "Noise Points": noise_points,
                "Davies-Bouldin": f"{metrics_dbscan[0]:.3f}" if metrics_dbscan and isinstance(metrics_dbscan[0], (int, float)) else None,
                "Calinski-Harabasz": f"{metrics_dbscan[1]:.3f}" if metrics_dbscan and isinstance(metrics_dbscan[1], (int, float)) else None
            })

    # Save DBCV score scatter plot
    df = pd.DataFrame(results)

    # Create a heatmap of DBCV scores
    heatmap_df = df_filtered.copy()
    heatmap_df[['eps', 'min_samples']] = heatmap_df["Parameters"].str.extract(r"eps=([\d.]+), min_samples=(\d+)")
    heatmap_df["eps"] = heatmap_df["eps"].astype(float)
    heatmap_df["min_samples"] = heatmap_df["min_samples"].astype(int)
    heatmap_df = heatmap_df.pivot(index="min_samples", columns="eps", values="DBCV Score")

    if not heatmap_df.empty:
        plt.figure(figsize=(10, 6))
        sns.heatmap(heatmap_df, annot=True, fmt=".3f", cmap="coolwarm", center=0, cbar_kws={"label": "DBCV Score"})
        plt.title(f"DBSCAN DBCV Score Heatmap – Year Range {year_range}")
        plt.xlabel("eps")
        plt.ylabel("min_samples")
        plt.tight_layout()

        heatmap_path = os.path.join(plot_dir, f"dbcv_heatmap_{year_range}.png")
        plt.savefig(heatmap_path)
        plt.close()

    df_filtered = df[df["DBCV Score"].notnull()]  # Only keep valid scores

    if not df_filtered.empty:
        plt.figure(figsize=(7, 4))
        ax = plt.gca()
        ax.scatter(df_filtered['Parameters'], df_filtered['DBCV Score'], color='blue')

        for i, row in df_filtered.iterrows():
            ax.text(i, row["DBCV Score"] + 0.01, f"{row['DBCV Score']:.3f}", ha="center", va="bottom", fontsize=7, rotation=45)

        ax.set_title(f"DBCV Scores – Year Range {year_range}")
        ax.set_ylim(-1, 1)
        ax.set_xlabel("DBSCAN Parameters")
        ax.set_ylabel("DBCV Score")
        plt.xticks(rotation=45)
        plt.grid(True)
        plt.tight_layout()

        plot_path = os.path.join(plot_dir, f"dbcv_scores_{year_range}.png")
        plt.savefig(plot_path)
        plt.close()

    # Show DataFrame inline if needed
    display(df)


Running DBSCAN for Year Range: 2002-2005
  -> eps=0.02, min_samples=30
Runtime for dbscan_clustering: 0.033479 seconds
Plot saved as: Images/DBSCAN_parameter_tuning\dbscan_2002-2005_eps0.02_min30_4.png
  -> eps=0.02, min_samples=35
Runtime for dbscan_clustering: 0.049251 seconds
Plot saved as: Images/DBSCAN_parameter_tuning\dbscan_2002-2005_eps0.02_min35_2.png
  -> eps=0.02, min_samples=40
Runtime for dbscan_clustering: 0.092319 seconds
Plot saved as: Images/DBSCAN_parameter_tuning\dbscan_2002-2005_eps0.02_min40_2.png
  -> eps=0.02, min_samples=45
Runtime for dbscan_clustering: 0.095549 seconds
Plot saved as: Images/DBSCAN_parameter_tuning\dbscan_2002-2005_eps0.02_min45_2.png
  -> eps=0.02, min_samples=10
Runtime for dbscan_clustering: 0.095950 seconds
Plot saved as: Images/DBSCAN_parameter_tuning\dbscan_2002-2005_eps0.02_min10_2.png
  -> eps=0.02, min_samples=15
Runtime for dbscan_clustering: 0.115619 seconds
Plot saved as: Images/DBSCAN_parameter_tuning\dbscan_2002-2005_eps0.02_min1

NameError: name 'df_filtered' is not defined