In [4]:
from pathlib import Path
import joblib
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

import sys
from pathlib import Path

# Put the project root (the parent of the current working directory) on
# sys.path so that the `src` package imported further down can be resolved.
project_root = Path("..").resolve()
_root_entry = str(project_root)
if _root_entry not in sys.path:
    # Appended only once, so re-running the cell does not add duplicates.
    sys.path.append(_root_entry)

print(project_root)

from src.preprocessing import (
    load_clean_data,
    get_classification_feature_columns,
    prepare_classification_data,
    train_test_split_classification,
)
from src.features import build_preprocessor

# Paths
# Both locations are relative, so they resolve against the kernel's current
# working directory.
DB_PATH = Path("../data/processed/satellites.db")  # the notebook lives in /notebooks
MODELS_DIR = Path("../models/sklearn")
# Create the model output directory (and any missing parents); no error if it
# already exists.
MODELS_DIR.mkdir(parents=True, exist_ok=True)

# 1) Load data
# Load the "satellites_clean" table through the project helper.
# NOTE(review): presumably returns a pandas DataFrame read from the database at
# DB_PATH — confirm against src.preprocessing.load_clean_data.
df = load_clean_data(DB_PATH, table_name="satellites_clean")
# Bare last expression: preview the first rows in the cell output.
df.head()

C:\Users\Alessio\Desktop\Portfolio_data_analysis\satellite_data_analysis


Unnamed: 0,norad_id,name,object_type,satellite_constellation,altitude_km,altitude_category,orbital_band,congestion_risk,inclination,eccentricity,launch_year_estimate,days_in_orbit_estimate,orbit_lifetime_category,mean_motion,epoch,data_source,snapshot_date,country,last_seen,period_minutes
0,900,CALSPHERE 1,PAYLOAD,Other,976.884937,Low LEO,LEO-Polar,LOW,90.2215,0.002694,2023,0,<1yr,13.763434,2025-12-01 05:13:21.035712,celestrak,2025-12-01,US,2025-12-01,104.625052
1,902,CALSPHERE 2,PAYLOAD,Other,1061.676342,Mid LEO,LEO-Polar,LOW,90.2363,0.002049,2023,0,<1yr,13.528813,2025-12-01 08:07:51.479328,celestrak,2025-12-01,US,2025-12-01,106.439491
2,1361,LCS 1,PAYLOAD,Other,2787.875054,High LEO,MEO,LOW,32.1433,0.001342,2023,0,<1yr,9.893094,2025-12-01 06:07:31.789920,celestrak,2025-12-01,US,2025-12-01,145.556085
3,1512,TEMPSAT 1,PAYLOAD,Other,1133.287003,Mid LEO,LEO-Polar,HIGH,89.9889,0.007145,2023,0,<1yr,13.335808,2025-12-01 10:59:03.479424,celestrak,2025-12-01,US,2025-12-01,107.979956
4,1520,CALSPHERE 4A,PAYLOAD,Other,1123.333009,Mid LEO,LEO-Polar,HIGH,89.909,0.006829,2023,0,<1yr,13.362361,2025-12-01 11:13:16.955040,celestrak,2025-12-01,US,2025-12-01,107.765388


In [10]:
# Derive the feature matrix and target from the loaded table, then fetch the
# numeric / categorical column name lists from the project helper.
X, y = prepare_classification_data(df)
num_cols, cat_cols = get_classification_feature_columns()

# Sanity report: dimensions of both objects, followed by the relative
# frequency of each target class.
for caption, obj in (("Shape X:", X), ("Shape y:", y)):
    print(caption, obj.shape)
print("Distribuzione target:")
print(y.value_counts(normalize=True))

Shape X: (13088, 9)
Shape y: (13088,)
Distribuzione target:
LEO-Inclined      0.446898
LEO-Equatorial    0.261996
LEO-Polar         0.228606
GEO               0.034765
MEO               0.020171
GEO-Inclined      0.004814
HEO               0.002139
LEO-Retrograde    0.000611
Name: orbital_band, dtype: float64


In [6]:
# Partition features and target into train / test subsets via the project helper.
# NOTE(review): the split ratio, random seed and stratification are decided
# inside train_test_split_classification and are not visible here. With target
# classes as rare as LEO-Retrograde (~0.06% of rows) confirm the split is
# stratified and seeded.
X_train, X_test, y_train, y_test = train_test_split_classification(X, y)

# Bare last expression: show the shapes of the two feature partitions.
X_train.shape, X_test.shape

((10470, 9), (2618, 9))

In [7]:
# Build the preprocessing transformer from the numeric and categorical column
# lists obtained from get_classification_feature_columns().
preprocessor = build_preprocessor(num_cols, cat_cols)
# Bare last expression so the notebook renders the transformer's repr.
preprocessor

ColumnTransformer(transformers=[('num', StandardScaler(),
                                 ['inclination', 'eccentricity',
                                  'launch_year_estimate',
                                  'days_in_orbit_estimate']),
                                ('cat', OneHotEncoder(handle_unknown='ignore'),
                                 ['object_type', 'satellite_constellation',
                                  'congestion_risk', 'orbit_lifetime_category',
                                  'country'])])

In [8]:
# Seed shared by the estimators that accept a random_state.
RANDOM_STATE = 42

# Candidate classifiers, keyed by the short name that the training loop uses
# in its log lines and in the saved model filenames.
models = {
    "logreg": LogisticRegression(max_iter=1000),
    "rf": RandomForestClassifier(
        n_estimators=200, max_depth=None, n_jobs=-1, random_state=RANDOM_STATE
    ),
    "gb": GradientBoostingClassifier(random_state=RANDOM_STATE),
}

# Filled by the training loop: model name -> dict of metrics.
results = dict()

In [9]:
# 6) Training and evaluation
#
# For every candidate classifier: chain it after the preprocessing step in a
# Pipeline, fit on the training split, print the per-class report and the
# confusion matrix for the test split, record the accuracy in `results`, and
# persist the fitted pipeline under MODELS_DIR.
#
# NOTE(review): every pipeline is handed the same `preprocessor` instance, so
# it is presumably refit on each iteration; wrap it in sklearn.base.clone(...)
# if the fitted pipelines ever need to coexist in memory — TODO confirm.

for name, clf in models.items():
    print(f"\n=== Training {name} ===")
    pipe = Pipeline(steps=[
        ("preprocess", preprocessor),
        ("model", clf),
    ])

    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    # zero_division=0 reports 0.00 for classes that are never predicted (the
    # same value the default setting falls back to) without emitting an
    # UndefinedMetricWarning for each affected metric.
    print(classification_report(y_test, y_pred, zero_division=0))
    print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))

    # Compare against a plain array so the element-wise check is explicitly
    # positional (y_test is presumably a pandas Series with its own index),
    # and store a built-in float rather than a NumPy scalar.
    acc = float(np.mean(y_pred == np.asarray(y_test)))
    results[name] = {"accuracy": acc}

    # Persist the fitted pipeline (preprocessing + model) for later reuse
    model_path = MODELS_DIR / f"classification_{name}.joblib"
    joblib.dump(pipe, model_path)
    print("Model saved to:", model_path)

results


=== Training logreg ===


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                precision    recall  f1-score   support

           GEO       0.81      1.00      0.90        91
  GEO-Inclined       0.50      0.08      0.13        13
           HEO       0.57      0.67      0.62         6
LEO-Equatorial       0.99      0.98      0.99       686
  LEO-Inclined       0.99      0.99      0.99      1170
     LEO-Polar       1.00      1.00      1.00       598
LEO-Retrograde       0.00      0.00      0.00         1
           MEO       0.70      0.57      0.62        53

      accuracy                           0.98      2618
     macro avg       0.69      0.66      0.66      2618
  weighted avg       0.98      0.98      0.98      2618

Confusion matrix:
 [[  91    0    0    0    0    0    0    0]
 [   8    1    0    2    0    0    0    2]
 [   0    0    4    0    0    0    0    2]
 [   2    0    0  675    3    0    0    6]
 [   0    0    0    5 1162    0    0    3]
 [   0    0    0    0    0  598    0    0]
 [   0    0    0    0    0    1    0    0]
 [  1

{'logreg': {'accuracy': 0.9782276546982429},
 'rf': {'accuracy': 0.9912146676852559},
 'gb': {'accuracy': 0.9900687547746372}}