## Tasks after the IAC paper has been finished, given 17.09.2025 by Nicola

1) Side activity: Filter observed data by magnitudes (only keep objects with mag 14.5 to 19)  
2) Observed data: Run DBSCAN on other time windows than 2005-2008 (shift by one year, near the original window), produce 2D plots and pairplots  
3) Correlate objects in clusters with DISCOS fragmentation events, consider the elapsed time and the typical evolution path, find consistencies  

## 1) Side activity: Filter observed data by magnitudes (only keep objects with mag 14.5 to 19)  

### Observation plots in 2D, DBSCAN

In [4]:
import numpy as np
import os
import cluster_data
from cluster_data import normalize_data, unnormalize
from cluster_plotter import ClusterPlotter
from DBSCAN import dbscan_clustering 

# --- Observed input files ---
# One uncorrelated-detection file name per year. 2002 uses the "08_12" file
# name instead of "01_12", and 2018 is left out of the mapping.
uncorr_obs_files = {
    year: f"ogs{year}01_12_det.ele_ucorr" if year != 2002 else f"ogs{year}08_12_det.ele_ucorr"
    for year in range(2002, 2024) if year != 2018  # Exclude 2018 if missing
}

# --- Time window ---
# Only the window used for the IAC paper is processed in this cell.
IAC_range = {"2005-2008": np.arange(2005, 2009)}
year_ranges = IAC_range

# --- Bin the observed data per window ---
# NOTE(review): the section heading asks for a magnitude cut (14.5 to 19), but no
# such filter is applied in this cell -- presumably it is done inside
# bin_observed_data; confirm.
binned_data = cluster_data.bin_observed_data(uncorr_obs_files, year_ranges, print_res=False)

# --- Output directory for plots ---
plot_dir = "Images/IAC_plots_followup/DBSCAN_2d_observed"
os.makedirs(plot_dir, exist_ok=True)

# --- DBSCAN parameters (applied to min-max normalised data) ---
# Defined once, outside the window loop, because they do not depend on the window.
eps_values = [0.015]
min_samples_values = [15]

# The loop variable is deliberately NOT called `cluster_data`: that name is the
# imported module, and rebinding it would break later `cluster_data.<func>` calls.
for window_data, year_range in binned_data:
    # Cluster in the (inclination, RAAN) plane only.
    data_array = np.array([window_data.inc, window_data.raan]).T
    normalized_data, data_min, data_max = normalize_data(data_array)

    for eps in eps_values:
        for min_samples in min_samples_values:
            result_dbscan = dbscan_clustering(
                normalized_data, eps=eps, min_samples=min_samples
            )

            # Map points and cluster centres back to physical units for plotting.
            unnormalized_data, cluster_centers = unnormalize(
                result_dbscan.data, result_dbscan.cluster_centers, data_min, data_max
            )

            plotter = ClusterPlotter(unnormalized_data, result_dbscan.labels, cluster_centers)
            plotter.clusters_2d_plot(
                f"DBSCAN: years = {year_range}, eps = {eps}, min_samples = {min_samples}",
                os.path.join(plot_dir, f"dbscan_{year_range}_eps{eps}_min{min_samples}_observed.png")
            )


Plot saved as: Images/IAC_plots_followup/DBSCAN_2d_observed\dbscan_2005-2008_eps0.015_min15_observed_2.png


### Observation plots in 2D+, DBSCAN

In [2]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import cluster_data_pca_08072025 as cdp
from cluster_data_pca_08072025 import run_clustering, normalize_data
from DBSCAN import dbscan_clustering
import cluster_plotter

# --- Style settings ---
plt.rcParams.update({
    "font.size": 16,
    "axes.labelsize": 20,
    "axes.titlesize": 18,
    "legend.fontsize": 14,
    "xtick.labelsize": 14,
    "ytick.labelsize": 14
})

# --- Setup ---
year_range = "2005-2008"
plot_dir = "Images/IAC_plots_followup/DBSCAN_2Dplus_observed"
os.makedirs(plot_dir, exist_ok=True)
# NOTE(review): clear_directory presumably empties plot_dir before new plots are
# written. The sliding-window 2D+ cell in section 2 uses the same plot_dir and
# clears it too, so running both cells discards this cell's figures -- confirm
# that this is intended.
cluster_plotter.clear_directory(plot_dir)

# --- Load observed data ---
# One file name per year; 2002 uses the "08_12" file name, 2018 is left out.
uncorr_obs_files = {
    year: f"ogs{year}01_12_det.ele_ucorr" if year != 2002 else f"ogs{year}08_12_det.ele_ucorr"
    for year in range(2002, 2024) if year != 2018
}
year_ranges = {year_range: np.arange(2005, 2009)}
binned_data = cdp.bin_observed_data(uncorr_obs_files, year_ranges, print_res=False)

# --- DBSCAN parameters (applied to normalised features) ---
eps_values = [0.11]
min_samples_values = [12]

# --- Color palette ---
# Cluster k (1-based) gets extreme_colors[(k - 1) % len(extreme_colors)].
extreme_colors = [
    "#3DC53D", "#4F4FF3", '#FFFF00', "#A148A1", '#00FFFF',
    '#FF8000', '#8000FF', "#F157A4", '#00FF80', '#804000', '#000000',
    '#808080', '#404040', "#FF000094"
]

# --- Axis labels ---
# Maps DataFrame column names to the LaTeX labels shown on the pairplot axes.
axis_labels = {
    "mean motion": r"$n$ [rev/day]",
    "ecc": r"$e$",
    "inc": r"$i$ [°]",
    "raan": r"$\Omega$ [°]",
    "arg lat": r"$\lambda$ [°]",
    "mag": r"$m$"
}

# Column names of the plotted feature subset (same order as the rows of X_plot).
plot_columns = ["mean motion", "inc", "raan", "arg lat", "mag"]

# --- Run DBSCAN ---
# The loop variable is named `window_data` so that it does not clobber the
# `cluster_data` module name used by other cells of this notebook.
for window_data, year_range in binned_data:
    # --- Clustering feature set: one row per object, one column per feature ---
    # NOTE(review): sem_maj and mean_motion are both included; if mean motion is
    # derived from the semi-major axis this dimension is weighted twice -- confirm.
    X_all = np.vstack([
        window_data.sem_maj,
        window_data.inc,
        window_data.raan,
        window_data.true_lat,
        window_data.mean_motion,
        window_data.mag_obj,
    ]).T

    # Report the real dimensionality instead of a hard-coded (and wrong) "9D".
    print(f"\nRunning DBSCAN {X_all.shape[1]}D for Year Range: {year_range}")

    # --- Subset shown in the pairplot (semi-major axis is left out) ---
    X_plot = np.vstack([
        window_data.mean_motion,
        window_data.inc,
        window_data.raan,
        window_data.true_lat,
        window_data.mag_obj
    ]).T

    norm_X_all, X_min, X_max = normalize_data(X_all)

    for eps in eps_values:
        for min_samples in min_samples_values:
            # --- Run DBSCAN ---
            clustering, duration, n_clusters, pts_per_cluster, _ = run_clustering(
                dbscan_clustering, "DBSCAN", norm_X_all, X_min, X_max,
                eps=eps, min_samples=min_samples
            )
            labels = clustering.labels
            has_noise = -1 in labels

            # --- Remap cluster labels to 1..N; the DBSCAN label -1 becomes "Noise" ---
            unique_labels = sorted(set(labels) - {-1})
            label_map = {old: i + 1 for i, old in enumerate(unique_labels)}
            label_map[-1] = "Noise"
            mapped_labels = [label_map[l] for l in labels]

            # --- Build dataframe ---
            df = pd.DataFrame(X_plot, columns=plot_columns)
            df['cluster'] = mapped_labels

            # --- Palette and legend order: clusters 1..N first, then Noise ---
            palette_map = {
                i + 1: extreme_colors[i % len(extreme_colors)]
                for i in range(len(unique_labels))
            }
            hue_order = list(range(1, len(unique_labels) + 1))
            if has_noise:
                palette_map["Noise"] = "red"
                hue_order.append("Noise")

            pp = sns.pairplot(
                df, hue='cluster', diag_kind='kde',
                plot_kws={'alpha': 0.8, 's': 15, 'marker': 'o', 'edgecolor': None},
                corner=True,
                hue_order=hue_order,
                palette=palette_map
            )

            # --- Set LaTeX labels ---
            # With corner=True the upper-triangle entries of pp.axes are None.
            for ax in pp.axes.flatten():
                if ax is not None:
                    xlabel = ax.get_xlabel()
                    ylabel = ax.get_ylabel()
                    if xlabel in axis_labels:
                        ax.set_xlabel(axis_labels[xlabel])
                    if ylabel in axis_labels:
                        ax.set_ylabel(axis_labels[ylabel])

            # Diagonal labels
            for i, var in enumerate(pp.x_vars):
                ax = pp.axes[i, i]
                if ax is not None and var in axis_labels:
                    ax.set_xlabel(axis_labels[var])
                    ax.set_ylabel(axis_labels[var])

            # --- Legend ---
            # Use the public `legend` property; it is None when no legend was drawn.
            legend = pp.legend
            if legend is not None:
                for lh in legend.legend_handles:
                    lh.set_markersize(20)
                    lh.set_alpha(1.0)
                    lh.set_markeredgewidth(0)
                legend.set_title("Cluster", prop={'size': 17})
                for text in legend.get_texts():
                    text.set_fontsize(17)
                legend.set_bbox_to_anchor((0.87, 0.6))
                legend.set_frame_on(True)
                legend.get_frame().set_edgecolor("grey")
                legend.get_frame().set_linewidth(1.0)

            # --- Save pairplot ---
            fname = f"pairplot_{year_range}_eps{eps}_ms{min_samples}_observed.png"
            pp.savefig(os.path.join(plot_dir, fname), bbox_inches="tight", transparent=False)
            plt.close(pp.figure)

            # --- Additional semi-major axis distribution plot ---
            # The curve does not depend on eps/min_samples; they only appear in
            # the title and the file name.
            fig, ax = plt.subplots(figsize=(8, 5))
            sns.kdeplot(window_data.sem_maj, fill=True, color="skyblue", linewidth=2, ax=ax)
            ax.set_xlabel(r"$a$ [km]", fontsize=16)
            ax.set_ylabel("Density", fontsize=16)
            ax.set_title(f"Semi-Major Axis Distribution {year_range} (eps={eps}, ms={min_samples})", fontsize=14)
            fname_sma = f"sma_distribution_{year_range}_eps{eps}_ms{min_samples}.png"
            fig.tight_layout()
            fig.savefig(os.path.join(plot_dir, fname_sma), transparent=False)
            plt.close(fig)


  """axis_labels = {



Running DBSCAN 9D for Year Range: 2005-2008
Runtime for dbscan_clustering: 0.085649 seconds


### Observation plots in 2D+, DBSCAN, small clusters

In [3]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import cluster_data_pca_08072025 as cdp
from cluster_data_pca_08072025 import run_clustering, normalize_data
from DBSCAN import dbscan_clustering
import cluster_plotter

# --- Style settings ---
plt.rcParams.update({
    "font.size": 16,
    "axes.labelsize": 20,
    "axes.titlesize": 18,
    "legend.fontsize": 14,
    "xtick.labelsize": 14,
    "ytick.labelsize": 14
})

# --- Setup ---
year_range = "2005-2008"
plot_dir = "Images/IAC_plots_followup/DBSCAN_2Dplus_observed_smallclusters"
os.makedirs(plot_dir, exist_ok=True)
# NOTE(review): clear_directory presumably empties plot_dir before new plots are
# written. The sliding-window "small clusters" cell in section 2 uses the same
# plot_dir and clears it too, so running both cells discards this cell's
# figures -- confirm that this is intended.
cluster_plotter.clear_directory(plot_dir)

# --- Load observed data ---
# One file name per year; 2002 uses the "08_12" file name, 2018 is left out.
uncorr_obs_files = {
    year: f"ogs{year}01_12_det.ele_ucorr" if year != 2002 else f"ogs{year}08_12_det.ele_ucorr"
    for year in range(2002, 2024) if year != 2018
}
year_ranges = {year_range: np.arange(2005, 2009)}
binned_data = cdp.bin_observed_data(uncorr_obs_files, year_ranges, print_res=False)

# --- DBSCAN parameters (applied to normalised features) ---
eps_values = [0.11]
min_samples_values = [12]

# --- Extreme contrast color scheme (original cluster order) ---
# Cluster k (1-based) gets extreme_colors[(k - 1) % len(extreme_colors)], so a
# cluster keeps the colour it has in the full pairplot.
extreme_colors = [
    "#3DC53D", "#4F4FF3", '#FFFF00', "#A148A1", '#00FFFF',
    '#FF8000', '#8000FF', "#F157A4", '#00FF80', '#804000', '#000000',
    '#808080', '#404040', "#FF000094"
]

# --- Axis labels ---
# Maps DataFrame column names to the LaTeX labels shown on the pairplot axes.
axis_labels = {
    "mean motion": r"$n$ [rev/day]",
    "ecc": r"$e$",
    "inc": r"$i$ [°]",
    "raan": r"$\Omega$ [°]",
    "arg lat": r"$\lambda$ [°]",
    "mag": r"$m$"
}

# Column names of the plotted feature subset (same order as the rows of X_plot).
plot_columns = ["mean motion", "inc", "raan", "arg lat", "mag"]

# --- Run DBSCAN ---
# The loop variable is named `window_data` so that it does not clobber the
# `cluster_data` module name used by other cells of this notebook.
for window_data, year_range in binned_data:
    # --- Clustering feature set: one row per object, one column per feature ---
    X_all = np.vstack([
        window_data.sem_maj,
        window_data.inc,
        window_data.raan,
        window_data.true_lat,
        window_data.mean_motion,
        window_data.mag_obj
    ]).T

    # Report the real dimensionality instead of a hard-coded (and wrong) "9D".
    print(f"\nRunning DBSCAN {X_all.shape[1]}D for Year Range: {year_range}")

    # --- Subset for plotting (semi-major axis is left out) ---
    X_plot = np.vstack([
        window_data.mean_motion,
        window_data.inc,
        window_data.raan,
        window_data.true_lat,
        window_data.mag_obj
    ]).T

    norm_X_all, X_min, X_max = normalize_data(X_all)

    for eps in eps_values:
        for min_samples in min_samples_values:
            # --- Run clustering ---
            clustering, duration, n_clusters, pts_per_cluster, _ = run_clustering(
                dbscan_clustering, "DBSCAN", norm_X_all, X_min, X_max,
                eps=eps, min_samples=min_samples
            )
            labels = clustering.labels

            # --- Remap clusters to 1..N; the DBSCAN label -1 becomes "Noise" ---
            unique_labels = sorted(set(labels) - {-1})
            label_map = {old: i + 1 for i, old in enumerate(unique_labels)}
            label_map[-1] = "Noise"
            mapped_labels = [label_map[l] for l in labels]

            # --- Build DataFrame ---
            df = pd.DataFrame(X_plot, columns=plot_columns)
            df['cluster'] = mapped_labels

            # --- Remove largest cluster + Noise so the small clusters become visible ---
            cluster_sizes = df['cluster'].value_counts()
            non_noise_clusters = cluster_sizes.drop(labels=["Noise"], errors="ignore")
            clusters_to_remove = []
            if not non_noise_clusters.empty:
                clusters_to_remove.append(non_noise_clusters.idxmax())
            if "Noise" in cluster_sizes.index:
                clusters_to_remove.append("Noise")

            print(f"\nRemoved clusters for eps={eps}, min_samples={min_samples}:")
            for cl in clusters_to_remove:
                print(f"  Cluster {cl}: {cluster_sizes[cl]} points")

            df_small = df[~df['cluster'].isin(clusters_to_remove)]

            # Nothing to plot when DBSCAN found at most one cluster.
            if df_small.empty:
                print(f"  No small clusters left for {year_range}; skipping pairplot.")
                continue

            # --- Palette in original cluster order ---
            # Sized by the number of clusters found (with colour wrap-around), so
            # cluster ids above len(extreme_colors) do not raise a KeyError.
            original_palette_map = {
                i + 1: extreme_colors[i % len(extreme_colors)]
                for i in range(len(unique_labels))
            }

            # --- Remaining cluster ids, ascending ---
            # Accept numpy integers as well: the column holds them when no point
            # was labelled "Noise". Noise itself has been removed above.
            hue_order = sorted(
                int(cl) for cl in df_small['cluster'].unique()
                if isinstance(cl, (int, np.integer))
            )
            palette_map = {cl: original_palette_map[cl] for cl in hue_order}

            # --- Pairplot ---
            pp = sns.pairplot(
                df_small, hue='cluster', diag_kind='kde',
                plot_kws={'alpha': 0.8, 's': 15, 'marker': 'o', 'edgecolor': None},
                corner=True,
                hue_order=hue_order,
                palette=palette_map
            )

            # --- LaTeX labels ---
            # With corner=True the upper-triangle entries of pp.axes are None.
            for ax in pp.axes.flatten():
                if ax is not None:
                    xlabel = ax.get_xlabel()
                    ylabel = ax.get_ylabel()
                    if xlabel in axis_labels:
                        ax.set_xlabel(axis_labels[xlabel])
                    if ylabel in axis_labels:
                        ax.set_ylabel(axis_labels[ylabel])

            # Diagonal labels
            for i, var in enumerate(pp.x_vars):
                ax = pp.axes[i, i]
                if ax is not None and var in axis_labels:
                    ax.set_xlabel(axis_labels[var])
                    ax.set_ylabel(axis_labels[var])

            # --- Legend ---
            # Use the public `legend` property; it is None when no legend was drawn.
            legend = pp.legend
            if legend is not None:
                for lh in legend.legend_handles:
                    lh.set_markersize(20)
                    lh.set_alpha(1.0)
                    lh.set_markeredgewidth(0)
                legend.set_title("Cluster", prop={'size': 17})
                for text in legend.get_texts():
                    text.set_fontsize(17)
                legend.set_bbox_to_anchor((0.9, 0.6))
                legend.set_frame_on(True)
                legend.get_frame().set_edgecolor("grey")
                legend.get_frame().set_linewidth(1.0)

            # --- Save ---
            fname = f"pairplot_{year_range}_eps{eps}_ms{min_samples}_observed_smallclusters.png"
            pp.savefig(os.path.join(plot_dir, fname), transparent=False)
            plt.close(pp.figure)



Running DBSCAN 9D for Year Range: 2005-2008
Runtime for dbscan_clustering: 0.084356 seconds

Removed clusters for eps=0.11, min_samples=12:
  Cluster 1: 2175 points
  Cluster Noise: 1756 points


## 2) Observed data: Run DBSCAN on other time windows than 2005-2008 (shift by one year, near the original window), produce 2D plots and pairplots  

### Observation plots in 2D, DBSCAN

In [3]:
import numpy as np
import os
import cluster_data
from cluster_data import normalize_data, unnormalize
from cluster_plotter import ClusterPlotter
from DBSCAN import dbscan_clustering 

# --- Observed input files ---
# One uncorrelated-detection file name per year. 2002 uses the "08_12" file
# name instead of "01_12", and 2018 is left out of the mapping.
uncorr_obs_files = {
    year: f"ogs{year}01_12_det.ele_ucorr" if year != 2002 else f"ogs{year}08_12_det.ele_ucorr"
    for year in range(2002, 2024) if year != 2018  # Exclude 2018 if missing
}

# --- Sliding time windows ---
# 4-year windows shifted by one year (consecutive windows share three years):
# "2002-2005", "2003-2006", ..., "2019-2022".
# NOTE(review): the last start year is 2019, so the 2023 file is never used, and
# several windows span 2018, for which no file is listed -- confirm both are
# intended and that bin_observed_data copes with the missing year.
year_ranges = {
    f"{start}-{start+3}": np.arange(start, start+4)
    for start in range(2002, 2020)
}

# --- Bin the observed data per window ---
binned_data = cluster_data.bin_observed_data(uncorr_obs_files, year_ranges, print_res=False)

# --- Output directory for plots ---
plot_dir = "Images/IAC_plots_followup/DBSCAN_2d_observed"
os.makedirs(plot_dir, exist_ok=True)

# --- DBSCAN parameters (applied to min-max normalised data) ---
# Defined once, outside the window loop, because they do not depend on the window.
eps_values = [0.015]
min_samples_values = [15]

# The loop variable is deliberately NOT called `cluster_data`: that name is the
# imported module, and rebinding it would break later `cluster_data.<func>` calls.
for window_data, year_range in binned_data:
    # Cluster in the (inclination, RAAN) plane only.
    data_array = np.array([window_data.inc, window_data.raan]).T
    normalized_data, data_min, data_max = normalize_data(data_array)

    # File-name friendly window label (dash replaced by underscore).
    year_label = str(year_range).replace("-", "_")

    for eps in eps_values:
        for min_samples in min_samples_values:
            result_dbscan = dbscan_clustering(
                normalized_data, eps=eps, min_samples=min_samples
            )

            # Map points and cluster centres back to physical units for plotting.
            unnormalized_data, cluster_centers = unnormalize(
                result_dbscan.data, result_dbscan.cluster_centers, data_min, data_max
            )

            plotter = ClusterPlotter(unnormalized_data, result_dbscan.labels, cluster_centers)
            plotter.clusters_2d_plot(
                f"DBSCAN: years = {year_range}, eps = {eps}, min_samples = {min_samples}",
                os.path.join(plot_dir, f"dbscan_{year_label}_eps{eps}_min{min_samples}_observed.png")
            )


Plot saved as: Images/IAC_plots_followup/DBSCAN_2d_observed\dbscan_2002_2005_eps0.015_min15_observed_1.png
Plot saved as: Images/IAC_plots_followup/DBSCAN_2d_observed\dbscan_2003_2006_eps0.015_min15_observed_1.png
Plot saved as: Images/IAC_plots_followup/DBSCAN_2d_observed\dbscan_2004_2007_eps0.015_min15_observed_1.png
Plot saved as: Images/IAC_plots_followup/DBSCAN_2d_observed\dbscan_2005_2008_eps0.015_min15_observed_1.png
Plot saved as: Images/IAC_plots_followup/DBSCAN_2d_observed\dbscan_2006_2009_eps0.015_min15_observed_1.png
Plot saved as: Images/IAC_plots_followup/DBSCAN_2d_observed\dbscan_2007_2010_eps0.015_min15_observed_1.png
Plot saved as: Images/IAC_plots_followup/DBSCAN_2d_observed\dbscan_2008_2011_eps0.015_min15_observed_1.png
Plot saved as: Images/IAC_plots_followup/DBSCAN_2d_observed\dbscan_2009_2012_eps0.015_min15_observed_1.png
Plot saved as: Images/IAC_plots_followup/DBSCAN_2d_observed\dbscan_2010_2013_eps0.015_min15_observed.png
Plot saved as: Images/IAC_plots_followu

### Observations plots 2D+, DBSCAN

In [4]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import cluster_data_pca_08072025 as cdp
from cluster_data_pca_08072025 import run_clustering, normalize_data
from DBSCAN import dbscan_clustering
import cluster_plotter

# --- Style settings ---
plt.rcParams.update({
    "font.size": 16,
    "axes.labelsize": 20,
    "axes.titlesize": 18,
    "legend.fontsize": 14,
    "xtick.labelsize": 14,
    "ytick.labelsize": 14
})

# --- Setup ---
plot_dir = "Images/IAC_plots_followup/DBSCAN_2Dplus_observed"
os.makedirs(plot_dir, exist_ok=True)
# NOTE(review): clear_directory presumably empties plot_dir before new plots are
# written. The single-window (2005-2008) 2D+ cell in section 1 writes to the same
# plot_dir, so its figures are discarded when this cell runs -- confirm that
# this is intended.
cluster_plotter.clear_directory(plot_dir)

# --- Load observed data ---
# One file name per year; 2002 uses the "08_12" file name, 2018 is left out.
uncorr_obs_files = {
    year: f"ogs{year}01_12_det.ele_ucorr" if year != 2002 else f"ogs{year}08_12_det.ele_ucorr"
    for year in range(2002, 2024) if year != 2018
}

# --- Sliding time windows ---
# 4-year windows shifted by one year: "2002-2005", ..., "2019-2022".
# NOTE(review): the 2023 file is never used and several windows span 2018, for
# which no file is listed -- confirm both are intended.
year_ranges = {
    f"{start}-{start+3}": np.arange(start, start+4)
    for start in range(2002, 2020)
}

binned_data = cdp.bin_observed_data(uncorr_obs_files, year_ranges, print_res=False)

# --- DBSCAN parameters (applied to normalised features) ---
eps_values = [0.11]
min_samples_values = [12]

# --- Color palette ---
# Cluster k (1-based) gets extreme_colors[(k - 1) % len(extreme_colors)].
extreme_colors = [
    "#3DC53D", "#4F4FF3", '#FFFF00', "#A148A1", '#00FFFF',
    '#FF8000', '#8000FF', "#F157A4", '#00FF80', '#804000', '#000000',
    '#808080', '#404040', "#FF000094"
]

# --- Axis labels ---
# Maps DataFrame column names to the LaTeX labels shown on the pairplot axes.
axis_labels = {
    "mean motion": r"$n$ [rev/day]",
    "ecc": r"$e$",
    "inc": r"$i$ [°]",
    "raan": r"$\Omega$ [°]",
    "arg lat": r"$\lambda$ [°]",
    "mag": r"$m$"
}

# Column names of the plotted feature subset (same order as the rows of X_plot).
plot_columns = ["mean motion", "inc", "raan", "arg lat", "mag"]

# --- Run DBSCAN over all year ranges ---
# The loop variable is named `window_data` so that it does not clobber the
# `cluster_data` module name used by other cells of this notebook.
for window_data, year_range in binned_data:
    # --- Clustering feature set: one row per object, one column per feature ---
    X_all = np.vstack([
        window_data.sem_maj,
        window_data.inc,
        window_data.raan,
        window_data.true_lat,
        window_data.mean_motion,
        window_data.mag_obj,
    ]).T

    # Report the real dimensionality instead of a hard-coded (and wrong) "9D".
    print(f"\nRunning DBSCAN {X_all.shape[1]}D for Year Range: {year_range}")

    # --- Subset shown in the pairplot (semi-major axis is left out) ---
    X_plot = np.vstack([
        window_data.mean_motion,
        window_data.inc,
        window_data.raan,
        window_data.true_lat,
        window_data.mag_obj
    ]).T

    norm_X_all, X_min, X_max = normalize_data(X_all)

    # File-name friendly window label (dash replaced by underscore).
    year_label = str(year_range).replace("-", "_")

    for eps in eps_values:
        for min_samples in min_samples_values:
            # --- Run DBSCAN ---
            clustering, duration, n_clusters, pts_per_cluster, _ = run_clustering(
                dbscan_clustering, "DBSCAN", norm_X_all, X_min, X_max,
                eps=eps, min_samples=min_samples
            )
            labels = clustering.labels
            has_noise = -1 in labels

            # --- Remap cluster labels to 1..N; the DBSCAN label -1 becomes "Noise" ---
            unique_labels = sorted(set(labels) - {-1})
            label_map = {old: i + 1 for i, old in enumerate(unique_labels)}
            label_map[-1] = "Noise"
            mapped_labels = [label_map[l] for l in labels]

            # --- Build dataframe ---
            df = pd.DataFrame(X_plot, columns=plot_columns)
            df['cluster'] = mapped_labels

            # --- Palette and legend order: clusters 1..N first, then Noise ---
            palette_map = {
                i + 1: extreme_colors[i % len(extreme_colors)]
                for i in range(len(unique_labels))
            }
            hue_order = list(range(1, len(unique_labels) + 1))
            if has_noise:
                palette_map["Noise"] = "red"
                hue_order.append("Noise")

            pp = sns.pairplot(
                df, hue='cluster', diag_kind='kde',
                plot_kws={'alpha': 0.8, 's': 15, 'marker': 'o', 'edgecolor': None},
                corner=True,
                hue_order=hue_order,
                palette=palette_map
            )

            # --- Set LaTeX labels ---
            # With corner=True the upper-triangle entries of pp.axes are None.
            for ax in pp.axes.flatten():
                if ax is not None:
                    xlabel = ax.get_xlabel()
                    ylabel = ax.get_ylabel()
                    if xlabel in axis_labels:
                        ax.set_xlabel(axis_labels[xlabel])
                    if ylabel in axis_labels:
                        ax.set_ylabel(axis_labels[ylabel])

            # Diagonal labels
            for i, var in enumerate(pp.x_vars):
                ax = pp.axes[i, i]
                if ax is not None and var in axis_labels:
                    ax.set_xlabel(axis_labels[var])
                    ax.set_ylabel(axis_labels[var])

            # --- Legend formatting ---
            # Use the public `legend` property; it is None when no legend was drawn.
            legend = pp.legend
            if legend is not None:
                for lh in legend.legend_handles:
                    lh.set_markersize(20)
                    lh.set_alpha(1.0)
                    lh.set_markeredgewidth(0)
                legend.set_title("Cluster", prop={'size': 17})
                for text in legend.get_texts():
                    text.set_fontsize(17)
                legend.set_bbox_to_anchor((0.87, 0.6))
                legend.set_frame_on(True)
                legend.get_frame().set_edgecolor("grey")
                legend.get_frame().set_linewidth(1.0)

            # --- Save pairplot ---
            fname = f"pairplot_{year_label}_eps{eps}_ms{min_samples}_observed.png"
            pp.savefig(os.path.join(plot_dir, fname), bbox_inches="tight", transparent=False)
            plt.close(pp.figure)

            # --- Semi-major axis distribution ---
            # The curve does not depend on eps/min_samples; they only appear in
            # the title and the file name.
            fig, ax = plt.subplots(figsize=(8, 5))
            sns.kdeplot(window_data.sem_maj, fill=True, color="skyblue", linewidth=2, ax=ax)
            ax.set_xlabel(r"$a$ [km]", fontsize=16)
            ax.set_ylabel("Density", fontsize=16)
            ax.set_title(f"Semi-Major Axis Distribution {year_range} (eps={eps}, ms={min_samples})", fontsize=14)
            fname_sma = f"sma_distribution_{year_label}_eps{eps}_ms{min_samples}.png"
            fig.tight_layout()
            fig.savefig(os.path.join(plot_dir, fname_sma), transparent=False)
            plt.close(fig)


  mm = np.sqrt(mu/(sem_maj*1000)**3) #convert semi major from km to m



Running DBSCAN 9D for Year Range: 2002-2005
Runtime for dbscan_clustering: 0.165776 seconds

Running DBSCAN 9D for Year Range: 2003-2006
Runtime for dbscan_clustering: 0.220740 seconds

Running DBSCAN 9D for Year Range: 2004-2007
Runtime for dbscan_clustering: 0.170042 seconds

Running DBSCAN 9D for Year Range: 2005-2008
Runtime for dbscan_clustering: 0.156841 seconds

Running DBSCAN 9D for Year Range: 2006-2009
Runtime for dbscan_clustering: 0.162911 seconds

Running DBSCAN 9D for Year Range: 2007-2010
Runtime for dbscan_clustering: 0.127922 seconds

Running DBSCAN 9D for Year Range: 2008-2011
Runtime for dbscan_clustering: 0.131074 seconds

Running DBSCAN 9D for Year Range: 2009-2012
Runtime for dbscan_clustering: 0.066947 seconds

Running DBSCAN 9D for Year Range: 2010-2013
Runtime for dbscan_clustering: 0.094511 seconds

Running DBSCAN 9D for Year Range: 2011-2014
Runtime for dbscan_clustering: 0.047018 seconds

Running DBSCAN 9D for Year Range: 2012-2015
Runtime for dbscan_cluste

### Observations plots 2D+, DBSCAN, small clusters

In [5]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import cluster_data_pca_08072025 as cdp
from cluster_data_pca_08072025 import run_clustering, normalize_data
from DBSCAN import dbscan_clustering
import cluster_plotter

# --- Style settings ---
plt.rcParams.update({
    "font.size": 16,
    "axes.labelsize": 20,
    "axes.titlesize": 18,
    "legend.fontsize": 14,
    "xtick.labelsize": 14,
    "ytick.labelsize": 14
})

# --- Setup ---
plot_dir = "Images/IAC_plots_followup/DBSCAN_2Dplus_observed_smallclusters"
os.makedirs(plot_dir, exist_ok=True)
# NOTE(review): clear_directory presumably empties plot_dir before new plots are
# written. The single-window (2005-2008) "small clusters" cell in section 1
# writes to the same plot_dir, so its figure is discarded when this cell runs --
# confirm that this is intended.
cluster_plotter.clear_directory(plot_dir)

# --- Load observed data ---
# One file name per year; 2002 uses the "08_12" file name, 2018 is left out.
uncorr_obs_files = {
    year: f"ogs{year}01_12_det.ele_ucorr" if year != 2002 else f"ogs{year}08_12_det.ele_ucorr"
    for year in range(2002, 2024) if year != 2018
}

# --- Sliding time windows ---
# 4-year windows shifted by one year: "2002-2005", ..., "2019-2022".
# NOTE(review): the 2023 file is never used and several windows span 2018, for
# which no file is listed -- confirm both are intended.
year_ranges = {
    f"{start}-{start+3}": np.arange(start, start+4)
    for start in range(2002, 2020)
}

binned_data = cdp.bin_observed_data(uncorr_obs_files, year_ranges, print_res=False)

# --- DBSCAN parameters (applied to normalised features) ---
eps_values = [0.11]
min_samples_values = [12]

# --- Extreme contrast color scheme ---
# Cluster k (1-based) gets extreme_colors[(k - 1) % len(extreme_colors)], so a
# cluster keeps the colour it has in the full pairplot.
extreme_colors = [
    "#3DC53D", "#4F4FF3", '#FFFF00', "#A148A1", '#00FFFF',
    '#FF8000', '#8000FF', "#F157A4", '#00FF80', '#804000', '#000000',
    '#808080', '#404040', "#FF000094"
]

# --- Axis labels ---
# Maps DataFrame column names to the LaTeX labels shown on the pairplot axes.
axis_labels = {
    "mean motion": r"$n$ [rev/day]",
    "ecc": r"$e$",
    "inc": r"$i$ [°]",
    "raan": r"$\Omega$ [°]",
    "arg lat": r"$\lambda$ [°]",
    "mag": r"$m$"
}

# Column names of the plotted feature subset (same order as the rows of X_plot).
plot_columns = ["mean motion", "inc", "raan", "arg lat", "mag"]

# --- Run DBSCAN over all ranges ---
# The loop variable is named `window_data` so that it does not clobber the
# `cluster_data` module name used by other cells of this notebook.
for window_data, year_range in binned_data:
    # --- Clustering feature set: one row per object, one column per feature ---
    X_all = np.vstack([
        window_data.sem_maj,
        window_data.inc,
        window_data.raan,
        window_data.true_lat,
        window_data.mean_motion,
        window_data.mag_obj
    ]).T

    # Report the real dimensionality instead of a hard-coded (and wrong) "9D".
    print(f"\nRunning DBSCAN {X_all.shape[1]}D for Year Range: {year_range}")

    # --- Subset for plotting (semi-major axis is left out) ---
    X_plot = np.vstack([
        window_data.mean_motion,
        window_data.inc,
        window_data.raan,
        window_data.true_lat,
        window_data.mag_obj
    ]).T

    norm_X_all, X_min, X_max = normalize_data(X_all)

    # File-name friendly window label (dash replaced by underscore).
    year_label = str(year_range).replace("-", "_")

    for eps in eps_values:
        for min_samples in min_samples_values:
            # --- Run clustering ---
            clustering, duration, n_clusters, pts_per_cluster, _ = run_clustering(
                dbscan_clustering, "DBSCAN", norm_X_all, X_min, X_max,
                eps=eps, min_samples=min_samples
            )
            labels = clustering.labels

            # --- Remap clusters to 1..N; the DBSCAN label -1 becomes "Noise" ---
            unique_labels = sorted(set(labels) - {-1})
            label_map = {old: i + 1 for i, old in enumerate(unique_labels)}
            label_map[-1] = "Noise"
            mapped_labels = [label_map[l] for l in labels]

            # --- Build DataFrame ---
            df = pd.DataFrame(X_plot, columns=plot_columns)
            df['cluster'] = mapped_labels

            # --- Remove largest cluster + Noise so the small clusters become visible ---
            cluster_sizes = df['cluster'].value_counts()
            non_noise_clusters = cluster_sizes.drop(labels=["Noise"], errors="ignore")
            clusters_to_remove = []
            if not non_noise_clusters.empty:
                clusters_to_remove.append(non_noise_clusters.idxmax())
            if "Noise" in cluster_sizes.index:
                clusters_to_remove.append("Noise")

            print(f"\nRemoved clusters for eps={eps}, min_samples={min_samples}:")
            for cl in clusters_to_remove:
                print(f"  Cluster {cl}: {cluster_sizes[cl]} points")

            df_small = df[~df['cluster'].isin(clusters_to_remove)]

            # Nothing to plot when DBSCAN found at most one cluster in this window.
            if df_small.empty:
                print(f"  No small clusters left for {year_range}; skipping pairplot.")
                continue

            # --- Palette in original cluster order ---
            # Sized by the number of clusters found (with colour wrap-around), so
            # cluster ids above len(extreme_colors) do not raise a KeyError.
            original_palette_map = {
                i + 1: extreme_colors[i % len(extreme_colors)]
                for i in range(len(unique_labels))
            }

            # --- Remaining cluster ids, ascending ---
            # Accept numpy integers as well: the column holds them when no point
            # was labelled "Noise". Noise itself has been removed above.
            hue_order = sorted(
                int(cl) for cl in df_small['cluster'].unique()
                if isinstance(cl, (int, np.integer))
            )
            palette_map = {cl: original_palette_map[cl] for cl in hue_order}

            # --- Pairplot ---
            pp = sns.pairplot(
                df_small, hue='cluster', diag_kind='kde',
                plot_kws={'alpha': 0.8, 's': 15, 'marker': 'o', 'edgecolor': None},
                corner=True,
                hue_order=hue_order,
                palette=palette_map
            )

            # --- LaTeX labels ---
            # With corner=True the upper-triangle entries of pp.axes are None.
            for ax in pp.axes.flatten():
                if ax is not None:
                    xlabel = ax.get_xlabel()
                    ylabel = ax.get_ylabel()
                    if xlabel in axis_labels:
                        ax.set_xlabel(axis_labels[xlabel])
                    if ylabel in axis_labels:
                        ax.set_ylabel(axis_labels[ylabel])

            # Diagonal labels
            for i, var in enumerate(pp.x_vars):
                ax = pp.axes[i, i]
                if ax is not None and var in axis_labels:
                    ax.set_xlabel(axis_labels[var])
                    ax.set_ylabel(axis_labels[var])

            # --- Legend formatting ---
            # Use the public `legend` property; it is None when no legend was drawn.
            legend = pp.legend
            if legend is not None:
                for lh in legend.legend_handles:
                    lh.set_markersize(20)
                    lh.set_alpha(1.0)
                    lh.set_markeredgewidth(0)
                legend.set_title("Cluster", prop={'size': 17})
                for text in legend.get_texts():
                    text.set_fontsize(17)
                legend.set_bbox_to_anchor((0.9, 0.6))
                legend.set_frame_on(True)
                legend.get_frame().set_edgecolor("grey")
                legend.get_frame().set_linewidth(1.0)

            # --- Save ---
            fname = f"pairplot_{year_label}_eps{eps}_ms{min_samples}_observed_smallclusters.png"
            pp.savefig(os.path.join(plot_dir, fname), transparent=False)
            plt.close(pp.figure)


  mm = np.sqrt(mu/(sem_maj*1000)**3) #convert semi major from km to m



Running DBSCAN 9D for Year Range: 2002-2005
Runtime for dbscan_clustering: 0.384275 seconds

Removed clusters for eps=0.11, min_samples=12:
  Cluster 1: 4647 points
  Cluster Noise: 1417 points

Running DBSCAN 9D for Year Range: 2003-2006
Runtime for dbscan_clustering: 0.557396 seconds

Removed clusters for eps=0.11, min_samples=12:
  Cluster 1: 5377 points
  Cluster Noise: 1598 points

Running DBSCAN 9D for Year Range: 2004-2007
Runtime for dbscan_clustering: 0.170432 seconds

Removed clusters for eps=0.11, min_samples=12:
  Cluster 1: 4791 points
  Cluster Noise: 1543 points

Running DBSCAN 9D for Year Range: 2005-2008
Runtime for dbscan_clustering: 0.147256 seconds

Removed clusters for eps=0.11, min_samples=12:
  Cluster 1: 4461 points
  Cluster Noise: 1507 points

Running DBSCAN 9D for Year Range: 2006-2009
Runtime for dbscan_clustering: 0.163638 seconds

Removed clusters for eps=0.11, min_samples=12:
  Cluster 1: 4299 points
  Cluster Noise: 1196 points

Running DBSCAN 9D for Ye