# Plan after meeting with Nicola on 08.07.2025  
1) Choose a set of data, 1 year, with the highest number of data  
2) 7x7 scatter pairplot, no clustering  
The 7 dimensions are: 6 orbital elements, mean motion   
3) Apply PCA on that dataset:   
-Create a ranking of the features based on PCA for that year  
-Create a ranking of the features based on PCA for all years, compare different strategies for obtaining the rankings  
4) Clustering in 4D using KMeans and create 4x4 pairplot  
-Use i, a, e, Omega  
-Use 4 top ranked features from PCA  
5) Make clustering in 9D: 6 orbital elements, mean motion, magnitude, size  
Make a pairplot, 6x6 with the 6 top ranked features, try again both ranking methods  

This is no longer the plan, Alessandro changed it. New plan:   

I had a follow-up meeting with Alessandro and, having taken a deeper look at the pairplot _k3, we came up with the following tasks (which will replace the ones mentioned in the meeting):

1) Fix a set of data (you can choose the same time interval you showed to me, 2005-2008) and do the same analysis for k > 3. This should be done up to a value of k from which you don't notice a significant difference with previous values of k;   
2) For the same dataset, do the clustering (for different values of k) with all the features (6 orbital elements, mean motion, magnitude, diameter) and identify which of these features had the major contributions to the clusters identification.

# Step 1: Find the year with the largest amount of data
Result: 2006

In [None]:
import cluster_data_pca_08072025
from cluster_data_pca_08072025 import run_clustering, normalize_data, unnormalize
import numpy as np
import pandas as pd
import os
from DBSCAN import dbscan_clustering
import cluster_plotter
import high_dim_analysis
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import scores

# Bin the data into one-year windows (2002 ... 2023) and report which window
# contains the most entries.
running_ranges = cluster_data_pca_08072025.generate_running_year_ranges(2002, 2023, 1)
bins = cluster_data_pca_08072025.bin_data_for_clustering(running_ranges, print_res=False)

# Build the path with os.path.join so the directory is nested correctly on
# every OS (a raw string with backslashes only works as a path on Windows).
plot_dir = os.path.join("Images", "oca_and_clustering_08072025")
os.makedirs(plot_dir, exist_ok=True)
# NOTE(review): presumably removes plots left over from a previous run - confirm
# against cluster_plotter.clear_directory.
cluster_plotter.clear_directory(plot_dir)

# Track the window with the largest number of entries. The strict ">" keeps
# the earliest window when two windows tie.
max_count = 0
max_year = None

for cluster_data, year in bins:
    # The length of the eccentricity array is used as the entry count of the bin.
    count = len(cluster_data.ecc)
    print(year, count)
    if count > max_count:
        max_count = count
        max_year = year

print(f"The year with the most data is {max_year} with {max_count} entries.")


2002-2002 1750
2003-2003 2487
2004-2004 1649
2005-2005 2151
2006-2006 2592
2007-2007 2157
2008-2008 721
2009-2009 2009
2010-2010 799
2011-2011 535
2012-2012 217
2013-2013 28
2014-2014 86
2015-2015 539
2016-2016 674
2017-2017 598
2018-2018 831
2019-2019 607
2020-2020 1255
2021-2021 749
2022-2022 998
2023-2023 1370
The year with the most data is 2006-2006 with 2592 entries.


## 9D clustering and pairplot, plus contributions of features using PCA loadings
Test on only one dataset, 2005 - 2008  
for k = 3 to k = 12
Compute the relevance/importance of the features (using PCA loadings), this is independent of k (PCA has nothing to do with clustering)

In [1]:
import cluster_data_pca_08072025
from cluster_data_pca_08072025 import run_clustering, normalize_data
import numpy as np
import pandas as pd
import os
from kmeans import k_means
from clustering_utils_pca_08072025 import ClusterData
import cluster_plotter
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# --- Setup ---
# One combined bin covering 2005-2008; further ranges can be added to the dict
# and each one gets its own pairplots and its own PCA ranking.
data_set = {"2005-2008": np.arange(2005, 2009)}
binned_data = cluster_data_pca_08072025.bin_data_for_clustering(data_set, print_res=False)

# os.path.join keeps the output directory nested correctly on every OS.
plot_dir = os.path.join("Images", "tasks_10072025_kmeans_and_pairplots")
os.makedirs(plot_dir, exist_ok=True)
# NOTE(review): presumably removes plots left over from a previous run - confirm
# against cluster_plotter.clear_directory.
cluster_plotter.clear_directory(plot_dir)

# Column labels, in the same order as the rows stacked into X below.
feature_names = [
    "Eccentricity e", "Semi major axis [km]", "Inclination [°]",
    "RAAN [°]", "Perigee [°]", "True latitude [°]",
    "Mean Motion [rev/day]", "Magnitude [mag]", "Diameter [m]"
]
k_values = list(range(3, 13))  # k = 3 ... 12

# --- Loop over year ranges ---
for cluster_data, year_range in binned_data:
    print(f"\nRunning K-Means for Year Range: {year_range}")

    # 9D feature matrix: one row per object, one column per feature.
    X = np.vstack([
        cluster_data.ecc, cluster_data.sem_maj,
        cluster_data.inc, cluster_data.raan,
        cluster_data.perigee, cluster_data.true_lat,
        cluster_data.mean_motion, cluster_data.mag_obj,
        cluster_data.diameter
    ]).T

    normalized_data, data_min, data_max = normalize_data(X)

    # The raw (un-normalized) features are what gets plotted; build the frame
    # once per year range instead of once per k.
    features_df = pd.DataFrame(X, columns=feature_names)

    for k in k_values:
        result, t, n_clusters, pts_per_cluster, _ = run_clustering(
            k_means, f"K-means (k={k})",
            normalized_data, data_min, data_max, k, init='kmeans++'
        )
        labels = result.labels

        # --- Pairplot for this k ---
        # Labels are cast to str so seaborn treats them as categories.
        dfp = features_df.assign(cluster=labels.astype(str))
        pp = sns.pairplot(dfp, hue='cluster', diag_kind='kde',
                          plot_kws={'alpha': 0.6, 's': 8}, corner=True)
        pp.fig.suptitle(f"K-Means Clusters {year_range} (k={k})", y=1.02)
        pp.savefig(os.path.join(plot_dir, f"pairplot_{year_range}_k{k}.png"))
        plt.close(pp.fig)

    # --- PCA contribution ---
    # PCA is independent of k, so it runs once per year range, outside the k
    # loop. It stays inside the year-range loop so that every range is ranked
    # on its own X rather than only the last one.
    # Standardize raw X for PCA.
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    pca = PCA(n_components=2)
    pca.fit(X_scaled)

    loadings = pca.components_.T  # shape: (n_features, n_components)

    # PC1 only: absolute loadings normalized so that they sum to 100 %.
    pc1_loadings = np.abs(loadings[:, 0])
    pc1_contrib = pc1_loadings / pc1_loadings.sum() * 100

    # Rank features from largest to smallest PC1 contribution.
    sorted_idx = np.argsort(pc1_contrib)[::-1]
    sorted_features = np.array(feature_names)[sorted_idx]
    sorted_importance = pc1_contrib[sorted_idx]

    # Plot PC1 loadings
    plt.figure(figsize=(8, 6))
    plt.barh(sorted_features, sorted_importance)
    plt.xlabel("PC1 Loading Contribution [%]")
    plt.title(f"PCA Feature Contribution (PC1) for {year_range} (independent of k)")
    plt.tight_layout()
    plt.savefig(os.path.join(plot_dir, f"pca_feature_importance_{year_range}.png"))
    plt.close()

    print(f"\nFeature ranking by PC1 loading contribution for {year_range}:")
    for feat, imp in zip(sorted_features, sorted_importance):
        print(f"{feat}: {imp:.2f}%")



Running K-Means for Year Range: 2005-2008
Runtime for k_means: 0.016351 seconds
Runtime for k_means: 0.033661 seconds
Runtime for k_means: 0.050035 seconds
Runtime for k_means: 0.065413 seconds
Runtime for k_means: 0.114370 seconds
Runtime for k_means: 0.281544 seconds
Runtime for k_means: 0.555828 seconds
Runtime for k_means: 0.171348 seconds
Runtime for k_means: 0.291775 seconds
Runtime for k_means: 0.262312 seconds

Feature ranking by PC1 loading contribution for 2005-2008:
Semi major axis [km]: 21.57%
Eccentricity e: 19.95%
Mean Motion [rev/day]: 19.39%
Inclination [°]: 14.94%
Diameter [m]: 8.30%
RAAN [°]: 5.43%
Perigee [°]: 4.11%
Magnitude [mag]: 3.46%
True latitude [°]: 2.86%


# Task from Alessandro: test dbscan with different tuning parameters with dbcv score
In 2D and 3D, tuning was easier because the results could be verified by visual inspection. In higher dimensions, we need a score. The only score that works is the DBCV score, which is computationally very expensive (despite being implemented in Rust). 

In [2]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from math import sqrt
import cluster_data_pca_08072025 as cdp
from cluster_data_pca_08072025 import run_clustering, normalize_data
from DBSCAN import dbscan_clustering
import cluster_plotter
from scores import DBCV_score_rust

# Grid-search DBSCAN (eps x min_samples) on the 9D feature set for one year
# range, score every run with DBCV and store one pairplot per parameter pair.
year_range = "2005-2008"
data_set = {year_range: np.arange(2005, 2009)}
plot_dir = r"Images/dbscan_9d_tuning_dbcv_score_"
os.makedirs(plot_dir, exist_ok=True)
# NOTE(review): presumably removes plots left over from a previous run - confirm
# against cluster_plotter.clear_directory.
cluster_plotter.clear_directory(plot_dir)

binned = cdp.bin_data_for_clustering(data_set, print_res=False)
data_obj, _ = binned[0]  # data_set holds a single range; only the first bin is used

# 9D feature matrix: one row per object, one column per feature.
X = np.vstack([
    data_obj.ecc,
    data_obj.sem_maj,
    data_obj.inc,
    data_obj.raan,
    data_obj.perigee,
    data_obj.true_lat,
    data_obj.mean_motion,
    data_obj.mag_obj,
    data_obj.diameter
]).T

norm_X, X_min, X_max = normalize_data(X)

# eps candidates taken from the 2D tuning. A rescaling to 9D of the form
# eps_9d = eps_2d * sqrt(9/2) was considered, but it is NOT applied here:
# eps_factor is 1, so the 2D values are used unchanged.
eps_2d_list = [0.08, 0.1, 0.15, 0.2, 0.25]  # example 2D eps candidates
eps_factor = 1
eps_vals = [round(e * eps_factor, 3) for e in eps_2d_list]

n_dim = X.shape[1]  # number of features; not used further in this cell
ms_vals = [4, 7, 10, 20]

# --- Run DBSCAN for each combination and record scores ---
results = []
for eps in eps_vals:
    for ms in ms_vals:
        print(f"Running DBSCAN: eps={eps}, min_samples={ms}")
        clustering, duration, n_clusters, pts_per_cluster, _ = run_clustering(
            dbscan_clustering, "DBSCAN", norm_X, X_min, X_max,
            eps=eps, min_samples=ms
        )
        labels = clustering.labels
        score = DBCV_score_rust(clustering)
        print(f"DBCV Score: {score:.3f}")
        results.append({
            "eps": eps,
            "min_samples": ms,
            "n_clusters": n_clusters,
            "DBCV_score": score,
            "labels": labels
        })

# --- Find best params ---
# The run with the highest DBCV score wins.
best = max(results, key=lambda r: r["DBCV_score"])
best_eps, best_ms = best["eps"], best["min_samples"]
print(f"\nBest parameters: eps={best_eps}, min_samples={best_ms}, score={best['DBCV_score']:.3f}\n")

# --- Generate one pairplot per parameter combination ---
# The raw feature columns are identical for every run, so the base frame is
# built once and only the cluster column changes per run.
features_df = pd.DataFrame(X, columns=[
    "ecc", "a", "i", "raan", "perigee", "lat", "mn", "mag", "diam"
])

for res in results:
    eps, ms, lbls = res["eps"], res["min_samples"], res["labels"]
    # Labels are cast to str so seaborn treats them as categories.
    df = features_df.assign(cluster=lbls.astype(str))
    pp = sns.pairplot(df, hue='cluster', diag_kind='kde', plot_kws={'alpha':0.6, 's':8}, corner=True)
    pp.fig.suptitle(f"DBSCAN 9D Clusters {year_range} (eps={eps}, ms={ms})", y=1.02)
    fname = f"pairplot_{year_range}_eps{eps}_ms{ms}.png"
    pp.savefig(os.path.join(plot_dir, fname))
    plt.close(pp.fig)
    print(f"Stored plot for eps = {eps}, min_samples = {ms}.")

print("All pairplots saved to:", plot_dir)

Running DBSCAN: eps=0.08, min_samples=4
Runtime for dbscan_clustering: 0.177648 seconds
DBCV Score: -0.366
Running DBSCAN: eps=0.08, min_samples=7
Runtime for dbscan_clustering: 0.438085 seconds
DBCV Score: -0.570
Running DBSCAN: eps=0.08, min_samples=10
Runtime for dbscan_clustering: 0.411318 seconds
DBCV Score: -0.636
Running DBSCAN: eps=0.08, min_samples=20
Runtime for dbscan_clustering: 0.398099 seconds
DBCV Score: -0.686
Running DBSCAN: eps=0.1, min_samples=4
Runtime for dbscan_clustering: 0.471691 seconds
DBCV Score: -0.438
Running DBSCAN: eps=0.1, min_samples=7
Runtime for dbscan_clustering: 0.178404 seconds
DBCV Score: -0.647
Running DBSCAN: eps=0.1, min_samples=10
Runtime for dbscan_clustering: 0.166043 seconds
DBCV Score: -0.674
Running DBSCAN: eps=0.1, min_samples=20
Runtime for dbscan_clustering: 0.203169 seconds
DBCV Score: -0.704
Running DBSCAN: eps=0.15, min_samples=4
Runtime for dbscan_clustering: 0.298523 seconds
DBCV Score: -0.505
Running DBSCAN: eps=0.15, min_samples