# PROOF Plots in 2D

## KMeans

In [1]:
import cluster_data
from cluster_data import run_clustering, normalize_data, unnormalize
import numpy as np
import pandas as pd
import os
from kmeans import k_means
from cluster_plotter import ClusterPlotter
from clustering_utils import ClusterData
import cluster_plotter

IAC_range = {"2005-2008": np.arange(2005, 2009)}

binned_data = cluster_data.bin_data_for_clustering(IAC_range, print_res=False)

k_values = [9]

results_per_year_range = {}

# Output directory for plots
plot_dir = "Images\IAC_plots\KMeans_2d"
os.makedirs(plot_dir, exist_ok=True)
cluster_plotter.clear_directory(plot_dir)

for cluster_data, year_range in binned_data:  
    print(f"\nRunning K-Means for Year Range: {year_range}")

    data_array = np.array([cluster_data.inc, cluster_data.raan]).T
    normalized_data, data_min, data_max = normalize_data(data_array)

    for k in k_values:
        result_kmeans, _, _, _, _ = run_clustering(
            k_means, f"K-means (k={k})", normalized_data, data_min, data_max, k, plot=False, init='kmeans++'
        )

        unnormalized_data, cluster_centers = unnormalize(
            result_kmeans.data, result_kmeans.cluster_centers, data_min, data_max
        )

        plotter = ClusterPlotter(unnormalized_data, result_kmeans.labels, cluster_centers)
        plot_filename = os.path.join(plot_dir, f"kmeans_{year_range}_k{k}.svg")
        title = f"k-Means: years = {year_range}, k = {k}"
        plotter.clusters_2d_plot(title, plot_filename)


  plot_dir = "Images\IAC_plots\KMeans_2d"


{'2005-2008': {'geo': ['../input/stat_Master_05_geo_s1.crs', '../input/stat_Master_06_geo_s1.crs', '../input/stat_Master_07_geo_s1.crs', '../input/stat_Master_08_geo_s1.crs'], 'gto': ['../input/stat_Master_05_gto_s1.crs', '../input/stat_Master_06_gto_s1.crs', '../input/stat_Master_07_gto_s1.crs', '../input/stat_Master_08_gto_s1.crs'], 'fol': ['../input/stat_Master_05_fol_s1.crs', '../input/stat_Master_06_fol_s1.crs', '../input/stat_Master_07_fol_s1.crs', '../input/stat_Master_08_fol_s1.crs']}}

Running K-Means for Year Range: 2005-2008
Runtime for k_means: 0.234668 seconds
Plot saved as: Images\IAC_plots\KMeans_2d\kmeans_2005-2008_k9.svg


## DBSCAN

In [5]:
import cluster_data
from cluster_data import run_clustering, normalize_data, unnormalize
import numpy as np
import pandas as pd
import os
import cluster_plotter
from DBSCAN import dbscan_clustering
from cluster_plotter import ClusterPlotter
from clustering_utils import ClusterData

# Define year ranges
IAC_range = {"2005-2009": np.arange(2005, 2010)}

binned_data = cluster_data.bin_data_for_clustering(IAC_range, print_res=False)

results_per_year_range = {}

# Output directory for plots
plot_dir = "Images\IAC_plots\DBSCAN_2d_3years"
os.makedirs(plot_dir, exist_ok=True)
cluster_plotter.clear_directory(plot_dir)

for cluster_data, year_range in binned_data:  
    print(f"\nRunning DBSCAN for Year Range: {year_range}")

    data_array = np.array([cluster_data.inc, cluster_data.raan]).T
    normalized_data, data_min, data_max = normalize_data(data_array)

    eps_values = [0.015]
    min_samples_values = [27]

    for eps in eps_values:
        for min_samples in min_samples_values:
            result_dbscan, _, _, _, _ = run_clustering(
                dbscan_clustering, "DBSCAN", normalized_data, data_min, data_max,
                plot=False, eps=eps, min_samples=min_samples
            )

            unnormalized_data, _ = unnormalize(
                result_dbscan.data, None, data_min, data_max
            )

            # --- Relabel clusters ---
            labels = result_dbscan.labels.copy()
            labels = np.where(labels == -1, -1, labels + 1)  # -1 stays Noise, others start at 1

            plotter = ClusterPlotter(unnormalized_data, labels, None)  # No cluster centers
            plot_filename = os.path.join(plot_dir, f"dbscan_{year_range}_eps{eps}_min{min_samples}.svg")
            title = f"DBSCAN: years = {year_range}, eps = {eps}, min_samples = {min_samples}"
            plotter.clusters_2d_plot(title, plot_filename, show_centers=False)


  plot_dir = "Images\IAC_plots\DBSCAN_2d_3years"


{'2005-2009': {'geo': ['../input/stat_Master_05_geo_s1.crs', '../input/stat_Master_06_geo_s1.crs', '../input/stat_Master_07_geo_s1.crs', '../input/stat_Master_08_geo_s1.crs', '../input/stat_Master_09_geo_s1.crs'], 'gto': ['../input/stat_Master_05_gto_s1.crs', '../input/stat_Master_06_gto_s1.crs', '../input/stat_Master_07_gto_s1.crs', '../input/stat_Master_08_gto_s1.crs', '../input/stat_Master_09_gto_s1.crs'], 'fol': ['../input/stat_Master_05_fol_s1.crs', '../input/stat_Master_06_fol_s1.crs', '../input/stat_Master_07_fol_s1.crs', '../input/stat_Master_08_fol_s1.crs', '../input/stat_Master_09_fol_s1.crs']}}

Running DBSCAN for Year Range: 2005-2009
Runtime for dbscan_clustering: 0.125320 seconds
Plot saved as: Images\IAC_plots\DBSCAN_2d_3years\dbscan_2005-2009_eps0.015_min27.svg


# PROOF plots in > 2D

## KMeans

In [6]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import cluster_data_pca_08072025 as cdp
from cluster_data_pca_08072025 import run_clustering, normalize_data
from kmeans import k_means
import cluster_plotter

# --- Style settings ---
plt.rcParams.update({
    "font.size": 16,
    "axes.labelsize": 20,
    "axes.titlesize": 18,
    "legend.fontsize": 14,
    "xtick.labelsize": 14,
    "ytick.labelsize": 14
})

# --- Setup ---
year_range = "2005-2008"
data_set = {year_range: np.arange(2005, 2009)}
plot_dir = "Images/IAC_plots/KMeans_2Dplus"
os.makedirs(plot_dir, exist_ok=True)
cluster_plotter.clear_directory(plot_dir)

# --- Load and prepare data ---
binned = cdp.bin_data_for_clustering(data_set, print_res=False)
data_obj, _ = binned[0]

# --- Full feature set for clustering (9D) ---
X_all = np.vstack([
    data_obj.ecc,
    data_obj.sem_maj,
    data_obj.inc,
    data_obj.raan,
    data_obj.perigee,
    data_obj.true_lat,
    data_obj.mean_motion,
    data_obj.mag_obj,
    data_obj.diameter
]).T

norm_X_all, X_min, X_max = normalize_data(X_all)

# --- Subset of features just for plotting (6D) ---
X_plot = np.vstack([
    data_obj.mean_motion,   # mean motion
    data_obj.ecc,           # eccentricity
    data_obj.inc,           # inclination
    data_obj.raan,          # RAAN
    data_obj.true_lat,      # argument of latitude
    data_obj.mag_obj        # magnitude
]).T

# --- K-Means parameters ---
k_values = [7]

# Extreme contrast color palette
extreme_colors = [
    "#FF000094", "#3DC53D", "#4F4FF3", '#FFFF00', "#A148A1", '#00FFFF',
    '#FF8000', '#8000FF', "#F157A4", '#00FF80', '#804000', '#000000',
    '#808080', '#404040'
]

# Map features to LaTeX labels with units
axis_labels = {
    "mean motion": r"$n$ [rev/day]",
    "ecc": r"$e$",
    "inc": r"$i$ [°]",
    "raan": r"$\Omega$ [°]",
    "arg lat": r"$\lambda$ [°]",
    "mag": r"$m$"
}

for k in k_values:
    # --- Run clustering in 9D ---
    clustering, duration, n_clusters, pts_per_cluster, _ = run_clustering(
        k_means, f"K-Means (k={k})",
        norm_X_all, X_min, X_max, k, init='kmeans++'
    )
    labels = clustering.labels

    # --- Remap labels: Noise stays -1, clusters go 1..N ---
    unique_labels = sorted(set(labels) - {-1})  # all cluster labels except noise
    label_map = {old: i+1 for i, old in enumerate(unique_labels)}  # shift to 1..N
    label_map[-1] = "Noise"  # keep noise as string

    df = pd.DataFrame(X_plot, columns=[
        "mean motion", "ecc", "inc", "raan", "arg lat", "mag"
    ])
    df['cluster'] = [label_map[l] for l in labels]


    # --- Pairplot with extreme contrast colors ---
    palette_map = {i+1: extreme_colors[i] for i in range(len(unique_labels))}
    if -1 in labels:
        palette_map["Noise"] = "red"

    pp = sns.pairplot(
        df, hue='cluster', diag_kind='kde',
        plot_kws={'alpha': 0.8, 's': 15, 'marker': 'o', 'edgecolor': None},
        corner=True,
        hue_order=[*range(1, len(unique_labels)+1)] + (["Noise"] if -1 in labels else []),
        palette=palette_map
    )


    # Replace axis labels with LaTeX
    for ax in pp.axes.flatten():
        if ax is not None:
            xlabel = ax.get_xlabel()
            ylabel = ax.get_ylabel()
            if xlabel in axis_labels:
                ax.set_xlabel(axis_labels[xlabel])
            if ylabel in axis_labels:
                ax.set_ylabel(axis_labels[ylabel])

    # Force labels also on diagonal plots
    for i in range(len(pp.diag_axes)):
        ax = pp.diag_axes[i]
        var = pp.x_vars[i]
        if var in axis_labels:
            ax.set_xlabel(axis_labels[var])
            ax.set_ylabel(axis_labels[var])

    # Fix the first diagonal label explicitly
    if len(pp.diag_axes) > 0:
        first_diag_ax = pp.diag_axes[0]
        first_var = pp.x_vars[0]
        if first_var in axis_labels:
            first_diag_ax.set_ylabel(axis_labels[first_var])

    # Enlarge legend & markers with black borders
    for lh in pp._legend.legend_handles:
        lh.set_markersize(20)
        lh.set_alpha(1.0)
        #lh.set_markeredgecolor('black')
        lh.set_markeredgewidth(0)

    pp._legend.set_title("Cluster", prop={'size': 20})
    for text in pp._legend.get_texts():
        text.set_fontsize(20)

    pp._legend.set_bbox_to_anchor((0.9, 0.6))
    pp._legend.set_frame_on(True)     
    pp._legend.get_frame().set_edgecolor("grey")
    pp._legend.get_frame().set_linewidth(1.0)

    # Title
    """pp.fig.suptitle(
        f"K-Means 9D→6D Clusters {year_range} (k={k})",
        y=1.02, fontsize=20
    )"""

    fname = f"pairplot_{year_range}_k{k}.svg"
    pp.savefig(os.path.join(plot_dir, fname), bbox_inches="tight", transparent=True)
    plt.close(pp.fig)


Runtime for k_means: 0.050317 seconds


## DBSCAN

In [7]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import cluster_data_pca_08072025 as cdp
from cluster_data_pca_08072025 import run_clustering, normalize_data
from DBSCAN import dbscan_clustering
import cluster_plotter

# --- Style settings ---
plt.rcParams.update({
    "font.size": 16,
    "axes.labelsize": 20,
    "axes.titlesize": 18,
    "legend.fontsize": 14,
    "xtick.labelsize": 14,
    "ytick.labelsize": 14
})

year_range = "2005-2008"
data_set = {year_range: np.arange(2005, 2009)}
plot_dir = "Images/IAC_plots/DBSCAN_2Dplus"
os.makedirs(plot_dir, exist_ok=True)
cluster_plotter.clear_directory(plot_dir)

# --- Load and prepare data ---
binned = cdp.bin_data_for_clustering(data_set, print_res=False)
data_obj, _ = binned[0]

# --- Full feature set for clustering (9D) ---
X_all = np.vstack([
    data_obj.ecc,
    data_obj.sem_maj,
    data_obj.inc,
    data_obj.raan,
    data_obj.perigee,
    data_obj.true_lat,
    data_obj.mean_motion,
    data_obj.mag_obj,
    data_obj.diameter
]).T

norm_X_all, X_min, X_max = normalize_data(X_all)

# --- Subset of features just for plotting (6D) ---
X_plot = np.vstack([
    data_obj.mean_motion,
    data_obj.ecc,
    data_obj.inc,
    data_obj.raan,
    data_obj.true_lat,
    data_obj.mag_obj
]).T

# --- DBSCAN parameters ---
eps_values = [0.2]
min_samples_values = [10]

for eps in eps_values:
    for min_samples in min_samples_values: 
        # --- Run clustering in 9D ---
        clustering, duration, n_clusters, pts_per_cluster, _ = run_clustering(
            dbscan_clustering, "DBSCAN", norm_X_all, X_min, X_max,
            eps=eps, min_samples=min_samples
        )
        labels = clustering.labels

        # --- Map features to LaTeX labels with units ---
        axis_labels = {
            "mean motion": r"$n$ [rev/day]",
            "ecc": r"$e$",
            "inc": r"$i$ [°]",
            "raan": r"$\Omega$ [°]",
            "arg lat": r"$\lambda$ [°]",
            "mag": r"$m$"
        }

        # --- Remap labels: Noise stays -1, clusters go 1..N ---
        unique_labels = sorted(set(labels) - {-1})  # all cluster labels except noise
        label_map = {old: i+1 for i, old in enumerate(unique_labels)}  # shift to 1..N
        label_map[-1] = "Noise"  # keep noise as string

        df = pd.DataFrame(X_plot, columns=[
            "mean motion", "ecc", "inc", "raan", "arg lat", "mag"
        ])
        df['cluster'] = [label_map[l] for l in labels]


        # --- Extreme contrast color scheme ---
        extreme_colors = [
            "#3DC53D",    # Pure Green  
            "#4F4FF3",    # Pure Blue
            '#FFFF00',    # Pure Yellow
            "#A148A1",    # Pure Magenta
            '#00FFFF',    # Pure Cyan
            '#FF8000',    # Bright Orange
            '#8000FF',    # Bright Purple
            "#F157A4",    # Hot Pink
            '#00FF80',    # Bright Teal
            '#804000',    # Dark Brown
            '#000000',    # Black
            '#808080',    # Medium Gray
            '#404040',    # Dark Gray
            "#FF000094",  # Pure Red
        ]

        sns.set_palette(extreme_colors)

        # --- Pairplot with extreme contrast colors ---
            # --- Pairplot with extreme contrast colors ---
        palette_map = {i+1: extreme_colors[i % len(extreme_colors)] 
                    for i in range(len(unique_labels))}
        if -1 in labels:
            palette_map["Noise"] = "red"


        pp = sns.pairplot(
            df, hue='cluster', diag_kind='kde',
            plot_kws={'alpha': 0.8, 's': 15, 'marker': 'o', 'edgecolor': None},
            corner=True,
            hue_order=[*range(1, len(unique_labels)+1)] + (["Noise"] if -1 in labels else []),
            palette=palette_map
        )

        # Replace axis labels with LaTeX
        for ax in pp.axes.flatten():
            if ax is not None:
                xlabel = ax.get_xlabel()
                ylabel = ax.get_ylabel()
                if xlabel in axis_labels:
                    ax.set_xlabel(axis_labels[xlabel])
                if ylabel in axis_labels:
                    ax.set_ylabel(axis_labels[ylabel])

        # Force labels also on diagonal plots (some are missing with corner=True)
        for i, var in enumerate(pp.x_vars):
            if i < len(pp.axes) and i < len(pp.axes[i]):
                ax = pp.axes[i, i]
                if ax is not None and var in axis_labels:
                    ax.set_xlabel(axis_labels[var])
                    ax.set_ylabel(axis_labels[var])

        # --- Enlarge and reposition legend ---
        for lh in pp._legend.legend_handles:
            lh.set_markersize(20)
            lh.set_alpha(1.0)
            lh.set_markeredgewidth(0)
        pp._legend.set_title("Cluster", prop={'size': 20})
        for text in pp._legend.get_texts():
            text.set_fontsize(20)
        pp._legend.set_bbox_to_anchor((0.9, 0.6))
        pp._legend.set_frame_on(True)     
        pp._legend.get_frame().set_edgecolor("grey")
        pp._legend.get_frame().set_linewidth(1.0)

        # Title
        """pp.fig.suptitle(
            f"DBSCAN 9D→6D Clusters {year_range} (eps={eps}, ms={min_samples})",
            y=1.02, fontsize=20
        )"""

        # Save
        fname = f"pairplot_{year_range}_eps{eps}_ms{min_samples}.svg"
        pp.savefig(os.path.join(plot_dir, fname), bbox_inches="tight", transparent=True)
        plt.close(pp.fig)


Runtime for dbscan_clustering: 0.329147 seconds


## DBSCAN 2D Plus PROOF data, remove the largest clusters

In [8]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import cluster_data_pca_08072025 as cdp
from cluster_data_pca_08072025 import run_clustering, normalize_data
from DBSCAN import dbscan_clustering
import cluster_plotter

# --- Style settings ---
plt.rcParams.update({
    "font.size": 16,
    "axes.labelsize": 20,
    "axes.titlesize": 18,
    "legend.fontsize": 14,
    "xtick.labelsize": 14,
    "ytick.labelsize": 14
})

year_range = "2005-2008"
data_set = {year_range: np.arange(2005, 2009)}
plot_dir = "Images/IAC_plots/DBSCAN_2Dplus_smallclusters"
os.makedirs(plot_dir, exist_ok=True)
cluster_plotter.clear_directory(plot_dir)

# --- Load and prepare data ---
binned = cdp.bin_data_for_clustering(data_set, print_res=False)
data_obj, _ = binned[0]

# --- Full feature set for clustering (9D) ---
X_all = np.vstack([
    data_obj.ecc,
    data_obj.sem_maj,
    data_obj.inc,
    data_obj.raan,
    data_obj.perigee,
    data_obj.true_lat,
    data_obj.mean_motion,
    data_obj.mag_obj,
    data_obj.diameter
]).T

norm_X_all, X_min, X_max = normalize_data(X_all)

# --- Subset of features just for plotting (6D) ---
X_plot = np.vstack([
    data_obj.mean_motion,
    data_obj.ecc,
    data_obj.inc,
    data_obj.raan,
    data_obj.true_lat,
    data_obj.mag_obj
]).T

# --- DBSCAN parameters ---
eps_values = [0.2]
min_samples_values = [10]

# --- Extreme contrast color scheme (original cluster order) ---
extreme_colors = [
    "#3DC53D",    # Cluster 1: Green
    "#4F4FF3",    # Cluster 2: Blue
    '#FFFF00',    # Cluster 3: Yellow
    "#A148A1",    # Cluster 4: Purple
    '#00FFFF',    # Cluster 5: Cyan
    '#FF8000',    # Cluster 6: Orange
    '#8000FF',    # Cluster 7: Bright Purple
    "#F157A4",    # Cluster 8: Pink
    '#00FF80',    # Cluster 9: Teal
    '#804000',    # Cluster 10: Brown
    '#000000',    # Cluster 11: Black
    '#808080',    # Cluster 12: Gray
    '#404040',    # Cluster 13: Dark Gray
    "#FF000094",  # Cluster 14: Red
]

for eps in eps_values:
    for min_samples in min_samples_values: 
        # --- Run clustering in 9D ---
        clustering, duration, n_clusters, pts_per_cluster, _ = run_clustering(
            dbscan_clustering, "DBSCAN", norm_X_all, X_min, X_max,
            eps=eps, min_samples=min_samples
        )
        labels = clustering.labels

        # --- Map features to LaTeX labels with units ---
        axis_labels = {
            "mean motion": r"$n$ [rev/day]",
            "ecc": r"$e$",
            "inc": r"$i$ [°]",
            "raan": r"$\Omega$ [°]",
            "arg lat": r"$\lambda$ [°]",
            "mag": r"$m$"
        }

        # --- Remap labels: Noise stays -1, clusters go 1..N ---
        unique_labels = sorted(set(labels) - {-1})
        label_map = {old: i+1 for i, old in enumerate(unique_labels)}  # clusters 1..N
        label_map[-1] = "Noise"  # keep noise as string

        df = pd.DataFrame(X_plot, columns=[
            "mean motion", "ecc", "inc", "raan", "arg lat", "mag"
        ])
        df['cluster'] = [label_map[l] for l in labels]

        # --- Remove largest cluster + noise ---
        cluster_sizes = df['cluster'].value_counts()

        # Identify largest non-noise cluster
        non_noise_clusters = cluster_sizes.drop(labels=["Noise"], errors="ignore")
        if not non_noise_clusters.empty:
            largest_cluster = non_noise_clusters.idxmax()
            clusters_to_remove = [largest_cluster]
        else:
            clusters_to_remove = []

        # Always remove noise if present
        if "Noise" in cluster_sizes.index:
            clusters_to_remove.append("Noise")

        # Print which clusters were removed and their sizes
        print(f"\nRemoved clusters for eps={eps}, min_samples={min_samples}:")
        for cl in clusters_to_remove:
            size = cluster_sizes[cl]
            print(f"  Cluster {cl}: {size} points")

        # Filter out those clusters
        df = df[~df['cluster'].isin(clusters_to_remove)]

        # --- Original palette mapping ---
        original_palette_map = {}
        for old_label, new_label in label_map.items():
            if old_label == -1:
                original_palette_map[new_label] = "red"
            else:
                original_palette_map[new_label] = extreme_colors[(old_label) % len(extreme_colors)]

        # --- Keep only remaining clusters in the palette (no shifting) ---
        remaining_clusters = df['cluster'].unique()
        palette_map = {cl: original_palette_map[cl] for cl in remaining_clusters}

        # --- Hue order: numeric clusters in order, then noise if present ---
        numeric_clusters = sorted([cl for cl in remaining_clusters if isinstance(cl, int)])
        hue_order = numeric_clusters
        if "Noise" in remaining_clusters:
            hue_order.append("Noise")

        # --- Pairplot ---
        pp = sns.pairplot(
            df, hue='cluster', diag_kind='kde',
            plot_kws={'alpha': 0.8, 's': 15, 'marker': 'o', 'edgecolor': None},
            corner=True,
            hue_order=hue_order,
            palette=palette_map
        )

        # Replace axis labels with LaTeX
        for ax in pp.axes.flatten():
            if ax is not None:
                xlabel = ax.get_xlabel()
                ylabel = ax.get_ylabel()
                if xlabel in axis_labels:
                    ax.set_xlabel(axis_labels[xlabel])
                if ylabel in axis_labels:
                    ax.set_ylabel(axis_labels[ylabel])

        # Force labels also on diagonal plots
        for i, var in enumerate(pp.x_vars):
            if i < len(pp.axes) and i < len(pp.axes[i]):
                ax = pp.axes[i, i]
                if ax is not None and var in axis_labels:
                    ax.set_xlabel(axis_labels[var])
                    ax.set_ylabel(axis_labels[var])

        # --- Enlarge and reposition legend ---
        for lh in pp._legend.legend_handles:
            lh.set_markersize(20)
            lh.set_alpha(1.0)
            lh.set_markeredgewidth(0)
        pp._legend.set_title("Cluster", prop={'size': 20})
        for text in pp._legend.get_texts():
            text.set_fontsize(20)
        pp._legend.set_bbox_to_anchor((0.9, 0.6))
        pp._legend.set_frame_on(True)     
        pp._legend.get_frame().set_edgecolor("grey")
        pp._legend.get_frame().set_linewidth(1.0)

        # --- Save ---
        fname = f"pairplot_{year_range}_eps{eps}_ms{min_samples}_smallclusters.svg"
        pp.savefig(os.path.join(plot_dir, fname), bbox_inches="tight", transparent=True)
        plt.close(pp.fig)


Runtime for dbscan_clustering: 0.333108 seconds

Removed clusters for eps=0.2, min_samples=10:
  Cluster 3: 4931 points
  Cluster Noise: 591 points


# Oberservation plots in 2D

## KMeans

In [9]:
import numpy as np
import os
import cluster_data
import cluster_plotter
from cluster_data import run_clustering, normalize_data, unnormalize
from cluster_plotter import ClusterPlotter
from kmeans import k_means

# Define observed files
uncorr_obs_files = {
    year: f"ogs{year}01_12_det.ele_ucorr" if year != 2002 else f"ogs{year}08_12_det.ele_ucorr"
    for year in range(2002, 2024) if year != 2018  # Exclude 2018 if missing
}

# Year ranges
IAC_range = {"2005-2008": np.arange(2005, 2009)}
year_ranges = IAC_range

# Bin observed data
binned_data = cluster_data.bin_observed_data(uncorr_obs_files, year_ranges, print_res=False)

# Output directory
plot_dir = "Images/IAC_plots/KMeans_2d_observed"
os.makedirs(plot_dir, exist_ok=True)
cluster_plotter.clear_directory(plot_dir)

# Run clustering and plot
for cluster_data, year_range in binned_data:
    print(f"\nRunning K-Means for Year Range: {year_range}")
    
    data_array = np.array([cluster_data.inc, cluster_data.raan]).T
    normalized_data, data_min, data_max = normalize_data(data_array)
    
    for k in [8]:
        result_kmeans, _, _, _, _ = run_clustering(
            k_means, "K-Means", normalized_data, data_min, data_max, k, plot=False
        )

        unnormalized_data, cluster_centers = unnormalize(
            result_kmeans.data, result_kmeans.cluster_centers, data_min, data_max
        )
        plotter = ClusterPlotter(unnormalized_data, result_kmeans.labels, cluster_centers)
        plotter.clusters_2d_plot(
            f"K-Means: years = {year_range}, k = {k}",
            os.path.join(plot_dir, f"kmeans_{year_range}_k{k}_observed.svg")
        )



Running K-Means for Year Range: 2005-2008
Runtime for k_means: 0.100489 seconds
Plot saved as: Images/IAC_plots/KMeans_2d_observed\kmeans_2005-2008_k8_observed.svg


## DBSCAN

In [10]:
import numpy as np
import os
import cluster_data
from cluster_data import normalize_data, unnormalize
from cluster_plotter import ClusterPlotter
from DBSCAN import dbscan_clustering  # Assuming you have a DBSCAN implementation

# Define observed files
uncorr_obs_files = {
    year: f"ogs{year}01_12_det.ele_ucorr" if year != 2002 else f"ogs{year}08_12_det.ele_ucorr"
    for year in range(2002, 2024) if year != 2018  # Exclude 2018 if missing
}

# Use running ranges
IAC_range = {"2005-2008": np.arange(2005, 2009)}
year_ranges = IAC_range

# Bin the observed data
binned_data = cluster_data.bin_observed_data(uncorr_obs_files, year_ranges, print_res=False)

# Output directory for plots
plot_dir = "Images/IAC_plots/DBSCAN_2d_observed"
os.makedirs(plot_dir, exist_ok=True)

for cluster_data, year_range in binned_data:
    data_array = np.array([cluster_data.inc, cluster_data.raan]).T
    normalized_data, data_min, data_max = normalize_data(data_array)
    
    eps_values = [0.015]
    min_samples_values = [15]
    
    for eps in eps_values:
        for min_samples in min_samples_values:
            result_dbscan = dbscan_clustering(
                normalized_data, eps=eps, min_samples=min_samples
            )

            unnormalized_data, cluster_centers = unnormalize(
                result_dbscan.data, result_dbscan.cluster_centers, data_min, data_max
            )
            
            plotter = ClusterPlotter(unnormalized_data, result_dbscan.labels, cluster_centers)
            plotter.clusters_2d_plot(
                f"DBSCAN: years = {year_range}, eps = {eps}, min_samples = {min_samples}",
                os.path.join(plot_dir, f"dbscan_{year_range}_eps{eps}_min{min_samples}_observed.svg")
            )


Plot saved as: Images/IAC_plots/DBSCAN_2d_observed\dbscan_2005-2008_eps0.015_min15_observed.svg


# Observed data > 2D

## KMeans

In [11]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import cluster_data_pca_08072025 as cdp
from cluster_data_pca_08072025 import run_clustering, normalize_data
from kmeans import k_means
import cluster_plotter

# --- Style settings ---
plt.rcParams.update({
    "font.size": 16,
    "axes.labelsize": 20,
    "axes.titlesize": 18,
    "legend.fontsize": 14,
    "xtick.labelsize": 14,
    "ytick.labelsize": 14
})

# --- Setup ---
year_range = "2005-2008"
plot_dir = "Images/IAC_plots/KMeans_2Dplus_observed"
os.makedirs(plot_dir, exist_ok=True)
cluster_plotter.clear_directory(plot_dir)

# --- Load and prepare observed data ---
uncorr_obs_files = {
    year: f"ogs{year}01_12_det.ele_ucorr" if year != 2002 else f"ogs{year}08_12_det.ele_ucorr"
    for year in range(2002, 2024) if year != 2018
}

year_ranges = {year_range: np.arange(2005, 2009)}
binned_data = cdp.bin_observed_data(uncorr_obs_files, year_ranges, print_res=False)

# --- K-Means parameters ---
k_values = [5]

# Extreme contrast color palette
extreme_colors = [
     "#3DC53D", "#4F4FF3", '#FFFF00', "#A148A1", '#00FFFF',
    '#FF8000', '#8000FF', "#F157A4", '#00FF80', '#804000',
    '#000000', "#FF000094", '#808080', '#404040'
]

# Map features to LaTeX labels with units
axis_labels = {
    "mean motion": r"$n$ [rev/day]",
    "ecc": r"$e$",
    "inc": r"$i$ [°]",
    "raan": r"$\Omega$ [°]",
    "arg lat": r"$\lambda$ [°]",
    "mag": r"$m$"
}

# --- Run clustering and create pairplots ---
for cluster_data, year_range in binned_data:
    print(f"\nRunning 9D K-Means for Year Range: {year_range}")

    # --- Full feature set for clustering (6D in your case) ---
    X_all = np.vstack([
        cluster_data.sem_maj,
        cluster_data.inc,
        cluster_data.raan,
        cluster_data.true_lat,
        cluster_data.mean_motion,
        cluster_data.mag_obj,
    ]).T

    norm_X_all, X_min, X_max = normalize_data(X_all)

    # --- Subset of features just for plotting (5D) ---
    X_plot = np.vstack([
        cluster_data.mean_motion,
        cluster_data.inc,
        cluster_data.raan,
        cluster_data.true_lat,
        cluster_data.mag_obj
    ]).T

    print("True latitude sample:", cluster_data.true_lat[:10])

    for k in k_values:
        # --- Run K-Means in 6D ---
        clustering, duration, n_clusters, pts_per_cluster, _ = run_clustering(
            k_means, f"K-Means (k={k})",
            norm_X_all, X_min, X_max, k, init='kmeans++'
        )
        labels = clustering.labels
        unique_labels = sorted(set(labels))

        # --- Build dataframe for pairplot ---
        df = pd.DataFrame(X_plot, columns=[
            "mean motion", "inc", "raan", "arg lat", "mag"
        ])
        df['cluster'] = labels.astype(int)

        # --- Palette mapping ---
        unique_labels = sorted(set(labels))
        palette_map = {lab: extreme_colors[i % len(extreme_colors)]
                    for i, lab in enumerate(unique_labels)}

        legend_order = [lab for lab in unique_labels if lab != -1]  # Keep original labels (0, 1, 2, ...)
        if -1 in unique_labels:
            legend_order.append("Noise")

        # --- Pairplot ---
        pp = sns.pairplot(
            df, hue='cluster', diag_kind='kde',
            plot_kws={'alpha': 0.8, 's': 15, 'marker': 'o', 'edgecolor': None},
            corner=True,
            hue_order=legend_order, 
            palette=palette_map
        )

        # After creating the plot, modify the legend labels
        for i, text in enumerate(pp._legend.get_texts()):
            if text.get_text() != "Noise":
                # Convert from "0", "1", "2" to "1", "2", "3"
                text.set_text(str(int(text.get_text()) + 1))

        # Replace axis labels with LaTeX
        for ax in pp.axes.flatten():
            if ax is not None:
                xlabel = ax.get_xlabel()
                ylabel = ax.get_ylabel()
                if xlabel in axis_labels:
                    ax.set_xlabel(axis_labels[xlabel])
                if ylabel in axis_labels:
                    ax.set_ylabel(axis_labels[ylabel])

        # Force labels on diagonal plots
        for i in range(len(pp.diag_axes)):
            ax = pp.diag_axes[i]
            var = pp.x_vars[i]
            if var in axis_labels:
                ax.set_xlabel(axis_labels[var])
                ax.set_ylabel(axis_labels[var])

        # Fix the first diagonal label explicitly
        if len(pp.diag_axes) > 0:
            first_diag_ax = pp.diag_axes[0]
            first_var = pp.x_vars[0]
            if first_var in axis_labels:
                first_diag_ax.set_ylabel(axis_labels[first_var])

        # Enlarge legend & markers
        for lh in pp._legend.legend_handles:
            lh.set_markersize(20)
            lh.set_alpha(1.0)
            lh.set_markeredgewidth(0)

        pp._legend.set_title("Cluster", prop={'size': 17})
        for text in pp._legend.get_texts():
            text.set_fontsize(17)
        pp._legend.set_bbox_to_anchor((0.9, 0.6))
        pp._legend.set_frame_on(True)     
        pp._legend.get_frame().set_edgecolor("grey")
        pp._legend.get_frame().set_linewidth(1.0)

        # Title
        """pp.fig.suptitle(
            f"K-Means 6D→5D Clusters {year_range} (k={k})",
            y=1.02, fontsize=20
        )"""

        # Save figure
        fname = f"pairplot_{year_range}_k{k}_observed.svg"
        pp.savefig(os.path.join(plot_dir, fname), bbox_inches="tight", transparent=True)
        plt.close(pp.fig)



Running 9D K-Means for Year Range: 2005-2008
True latitude sample: [0, 233.1312, 318.0453, 241.329, 198.4953, 20.4137, 247.8178, 126.3279, 223.0666, 242.9949]
Runtime for k_means: 0.042718 seconds


## DBSCAN

In [15]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import cluster_data_pca_08072025 as cdp
from cluster_data_pca_08072025 import run_clustering, normalize_data
from DBSCAN import dbscan_clustering
import cluster_plotter

# --- Style settings ---
plt.rcParams.update({
    "font.size": 16,
    "axes.labelsize": 20,
    "axes.titlesize": 18,
    "legend.fontsize": 14,
    "xtick.labelsize": 14,
    "ytick.labelsize": 14
})

# --- Setup ---
year_range = "2005-2008"
plot_dir = "Images/IAC_plots/DBSCAN_2Dplus_observed"
os.makedirs(plot_dir, exist_ok=True)
cluster_plotter.clear_directory(plot_dir)

# --- Load observed data ---
uncorr_obs_files = {
    year: f"ogs{year}01_12_det.ele_ucorr" if year != 2002 else f"ogs{year}08_12_det.ele_ucorr"
    for year in range(2002, 2024) if year != 2018
}
year_ranges = {year_range: np.arange(2005, 2009)}
binned_data = cdp.bin_observed_data(uncorr_obs_files, year_ranges, print_res=False)

# --- DBSCAN parameters ---
eps_values = [0.11]
min_samples_values = [12]

# --- Color palette ---
extreme_colors = [
    "#3DC53D", "#4F4FF3", '#FFFF00', "#A148A1", '#00FFFF',
    '#FF8000', '#8000FF', "#F157A4", '#00FF80', '#804000', '#000000',
    '#808080', '#404040', "#FF000094"
]

# --- Axis labels ---
"""axis_labels = {
    "ecc": r"$e$",
    "sem_maj": r"$a$ [km]", 
    "inc": r"$i$ [°]", 
    "raan": r"$\Omega$ [°]",
    "perigee": r"\omega [°]",
    "arg lat": r"$\lambda$ [°]",
    "mean motion": r"$n$ [rev/day]", 
    "mag_obj": r"$m$", 
    "diameter": r"$d$ [m]"
}"""

axis_labels = {
    "mean motion": r"$n$ [rev/day]",
    "ecc": r"$e$",
    "inc": r"$i$ [°]",
    "raan": r"$\Omega$ [°]",
    "arg lat": r"$\lambda$ [°]",
    "mag": r"$m$"
}

# --- Run DBSCAN ---
for cluster_data, year_range in binned_data:
    print(f"\nRunning DBSCAN 9D for Year Range: {year_range}")

    # --- Full feature set (9D) ---
    X_all = np.vstack([
        cluster_data.sem_maj,
        cluster_data.inc,
        cluster_data.raan,
        cluster_data.true_lat,
        cluster_data.mean_motion,
        cluster_data.mag_obj,
    ]).T

    X_plot = np.vstack([
        cluster_data.mean_motion,
        cluster_data.inc,
        cluster_data.raan,
        cluster_data.true_lat,
        cluster_data.mag_obj
    ]).T

    norm_X_all, X_min, X_max = normalize_data(X_all)

    for eps in eps_values:
        for min_samples in min_samples_values:
            # --- Run DBSCAN ---
            clustering, duration, n_clusters, pts_per_cluster, _ = run_clustering(
                dbscan_clustering, "DBSCAN", norm_X_all, X_min, X_max,
                eps=eps, min_samples=min_samples
            )
            labels = clustering.labels

            # --- Remap cluster labels ---
            unique_labels = sorted(set(labels) - {-1})
            label_map = {old: i+1 for i, old in enumerate(unique_labels)}
            label_map[-1] = "Noise"
            mapped_labels = [label_map[l] for l in labels]

            # --- Build dataframe ---
            """df = pd.DataFrame(X_plot, columns=[
                "ecc", "sem_maj", "inc", "raan", "perigee", "arg lat",
                "mean motion", "mag_obj", "diameter"
            ])"""
            df = pd.DataFrame(X_plot, columns=[
                "mean motion", "inc", "raan", "arg lat", "mag"
            ])
            df['cluster'] = mapped_labels

            # --- Pairplot ---
                # --- Pairplot with extreme contrast colors ---
            palette_map = {i+1: extreme_colors[i % len(extreme_colors)] 
               for i in range(len(unique_labels))}
            if -1 in labels:
                palette_map["Noise"] = "red"


            pp = sns.pairplot(
                df, hue='cluster', diag_kind='kde',
                plot_kws={'alpha': 0.8, 's': 15, 'marker': 'o', 'edgecolor': None},
                corner=True,
                hue_order=[*range(1, len(unique_labels)+1)] + (["Noise"] if -1 in labels else []),
                palette=palette_map
            )


            # --- Set LaTeX labels ---
            for ax in pp.axes.flatten():
                if ax is not None:
                    xlabel = ax.get_xlabel()
                    ylabel = ax.get_ylabel()
                    if xlabel in axis_labels:
                        ax.set_xlabel(axis_labels[xlabel])
                    if ylabel in axis_labels:
                        ax.set_ylabel(axis_labels[ylabel])

            # Diagonal labels
            for i, var in enumerate(pp.x_vars):
                ax = pp.axes[i, i]
                if ax is not None and var in axis_labels:
                    ax.set_xlabel(axis_labels[var])
                    ax.set_ylabel(axis_labels[var])

            # --- Legend ---
            for lh in pp._legend.legend_handles:
                lh.set_markersize(20)
                lh.set_alpha(1.0)
                lh.set_markeredgewidth(0)
            pp._legend.set_title("Cluster", prop={'size': 17})
            for text in pp._legend.get_texts():
                text.set_fontsize(17)
            pp._legend.set_bbox_to_anchor((0.87, 0.6))
            pp._legend.set_frame_on(True)     
            pp._legend.get_frame().set_edgecolor("grey")
            pp._legend.get_frame().set_linewidth(1.0)

            # --- Title & save ---
            """pp.fig.suptitle(
                f"DBSCAN 6D→5D Clusters {year_range} (eps={eps}, ms={min_samples})",
                y=1.02, fontsize=20
            )"""

            fname = f"pairplot_{year_range}_eps{eps}_ms{min_samples}_observed.svg"
            pp.savefig(os.path.join(plot_dir, fname), bbox_inches="tight", transparent=True)
            plt.close(pp.fig)

            # --- Additional 2D semi-major axis distribution plot ---
            plt.figure(figsize=(8, 5))
            sns.kdeplot(cluster_data.sem_maj, fill=True, color="skyblue", linewidth=2)
            plt.xlabel(r"$a$ [km]", fontsize=16)
            plt.ylabel("Density", fontsize=16)
            plt.title(f"Semi-Major Axis Distribution {year_range} (eps={eps}, ms={min_samples})", fontsize=14)
            fname_sma = f"sma_distribution_{year_range}_eps{eps}_ms{min_samples}.svg"
            plt.tight_layout()
            plt.savefig(os.path.join(plot_dir, fname_sma), transparent=True)
            plt.close()


  """axis_labels = {



Running DBSCAN 9D for Year Range: 2005-2008
Runtime for dbscan_clustering: 0.172649 seconds


## DBSCAN 2D Plus observed data, remove the largest clusters

In [13]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import cluster_data_pca_08072025 as cdp
from cluster_data_pca_08072025 import run_clustering, normalize_data
from DBSCAN import dbscan_clustering
import cluster_plotter

# --- Style settings ---
plt.rcParams.update({
    "font.size": 16,
    "axes.labelsize": 20,
    "axes.titlesize": 18,
    "legend.fontsize": 14,
    "xtick.labelsize": 14,
    "ytick.labelsize": 14
})

# --- Setup ---
year_range = "2005-2008"
plot_dir = "Images/IAC_plots/DBSCAN_2Dplus_observed_smallclusters"
os.makedirs(plot_dir, exist_ok=True)
cluster_plotter.clear_directory(plot_dir)

# --- Load observed data ---
uncorr_obs_files = {
    year: f"ogs{year}01_12_det.ele_ucorr" if year != 2002 else f"ogs{year}08_12_det.ele_ucorr"
    for year in range(2002, 2024) if year != 2018
}
year_ranges = {year_range: np.arange(2005, 2009)}
binned_data = cdp.bin_observed_data(uncorr_obs_files, year_ranges, print_res=False)

# --- DBSCAN parameters ---
eps_values = [0.11]
min_samples_values = [12]

# --- Extreme contrast color scheme (original cluster order) ---
extreme_colors = [
    "#3DC53D", "#4F4FF3", '#FFFF00', "#A148A1", '#00FFFF',
    '#FF8000', '#8000FF', "#F157A4", '#00FF80', '#804000', '#000000',
    '#808080', '#404040', "#FF000094"
]

# --- Axis labels ---
axis_labels = {
    "mean motion": r"$n$ [rev/day]",
    "ecc": r"$e$",
    "inc": r"$i$ [°]",
    "raan": r"$\Omega$ [°]",
    "arg lat": r"$\lambda$ [°]",
    "mag": r"$m$"
}

# --- Run DBSCAN ---
for cluster_data, year_range in binned_data:
    print(f"\nRunning DBSCAN 9D for Year Range: {year_range}")

    # --- Full feature set (9D) ---
    X_all = np.vstack([
        cluster_data.sem_maj,
        cluster_data.inc,
        cluster_data.raan,
        cluster_data.true_lat,
        cluster_data.mean_motion,
        cluster_data.mag_obj
    ]).T

    # --- Subset for plotting (5D) ---
    X_plot = np.vstack([
        cluster_data.mean_motion,
        cluster_data.inc,
        cluster_data.raan,
        cluster_data.true_lat,
        cluster_data.mag_obj
    ]).T

    norm_X_all, X_min, X_max = normalize_data(X_all)

    for eps in eps_values:
        for min_samples in min_samples_values:
            # --- Run clustering ---
            clustering, duration, n_clusters, pts_per_cluster, _ = run_clustering(
                dbscan_clustering, "DBSCAN", norm_X_all, X_min, X_max,
                eps=eps, min_samples=min_samples
            )
            labels = clustering.labels

            # --- Remap clusters to 1..N, Noise stays -1 ---
            unique_labels = sorted(set(labels) - {-1})
            label_map = {old: i+1 for i, old in enumerate(unique_labels)}
            label_map[-1] = "Noise"
            mapped_labels = [label_map[l] for l in labels]

            # --- Build DataFrame ---
            df = pd.DataFrame(X_plot, columns=[
                "mean motion", "inc", "raan", "arg lat", "mag"
            ])
            df['cluster'] = mapped_labels

            # --- Remove largest cluster + Noise ---
            cluster_sizes = df['cluster'].value_counts()
            non_noise_clusters = cluster_sizes.drop(labels=["Noise"], errors="ignore")
            clusters_to_remove = []
            if not non_noise_clusters.empty:
                largest_cluster = non_noise_clusters.idxmax()
                clusters_to_remove.append(largest_cluster)
            if "Noise" in cluster_sizes.index:
                clusters_to_remove.append("Noise")

            print(f"\nRemoved clusters for eps={eps}, min_samples={min_samples}:")
            for cl in clusters_to_remove:
                print(f"  Cluster {cl}: {cluster_sizes[cl]} points")

            df = df[~df['cluster'].isin(clusters_to_remove)]

            # --- Original palette mapping ---
            original_palette_map = {i+1: extreme_colors[i % len(extreme_colors)] for i in range(len(extreme_colors))}
            if "Noise" in df['cluster'].unique():
                original_palette_map["Noise"] = "red"

            # --- Keep only remaining clusters in the palette ---
            remaining_clusters = df['cluster'].unique()
            palette_map = {cl: original_palette_map[cl] for cl in remaining_clusters}

            # --- Hue order: numeric clusters in order, then Noise if present ---
            numeric_clusters = sorted([cl for cl in remaining_clusters if isinstance(cl, int)])
            hue_order = numeric_clusters
            if "Noise" in remaining_clusters:
                hue_order.append("Noise")

            # --- Pairplot ---
            pp = sns.pairplot(
                df, hue='cluster', diag_kind='kde',
                plot_kws={'alpha': 0.8, 's': 15, 'marker': 'o', 'edgecolor': None},
                corner=True,
                hue_order=hue_order,
                palette=palette_map
            )

            # --- LaTeX labels ---
            for ax in pp.axes.flatten():
                if ax is not None:
                    xlabel = ax.get_xlabel()
                    ylabel = ax.get_ylabel()
                    if xlabel in axis_labels:
                        ax.set_xlabel(axis_labels[xlabel])
                    if ylabel in axis_labels:
                        ax.set_ylabel(axis_labels[ylabel])

            # Diagonal labels
            for i, var in enumerate(pp.x_vars):
                ax = pp.axes[i, i]
                if ax is not None and var in axis_labels:
                    ax.set_xlabel(axis_labels[var])
                    ax.set_ylabel(axis_labels[var])

            # --- Legend ---
            for lh in pp._legend.legend_handles:
                lh.set_markersize(20)
                lh.set_alpha(1.0)
                lh.set_markeredgewidth(0)
            pp._legend.set_title("Cluster", prop={'size': 17})
            for text in pp._legend.get_texts():
                text.set_fontsize(17)
            pp._legend.set_bbox_to_anchor((0.9, 0.6))
            pp._legend.set_frame_on(True)     
            pp._legend.get_frame().set_edgecolor("grey")
            pp._legend.get_frame().set_linewidth(1.0)

            # --- Save ---
            fname = f"pairplot_{year_range}_eps{eps}_ms{min_samples}_observed_smallclusters.svg"
            pp.savefig(os.path.join(plot_dir, fname), transparent=True)
            plt.close(pp.fig)



Running DBSCAN 9D for Year Range: 2005-2008
Runtime for dbscan_clustering: 0.146692 seconds

Removed clusters for eps=0.11, min_samples=12:
  Cluster 1: 4461 points
  Cluster Noise: 1507 points
