In [8]:
from pathlib import Path
import joblib

from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

import sys
# Resolve the parent of the current working directory and make sure it is
# importable, so the `src.*` imports further down can be found.
ROOT = Path("..").resolve()
if sys.path.count(str(ROOT)) == 0:
    # Prepend so this checkout takes priority over any other `src` package.
    sys.path[:0] = [str(ROOT)]

print("Project root in sys.path:", ROOT)


from src.preprocessing import (
    load_clean_data,
    get_regression_feature_columns,
    prepare_regression_data,
    train_test_split_regression,
)
from src.features import build_preprocessor

# Filesystem locations (relative to the notebook's working directory).
# MODELS_DIR receives the fitted pipelines; DB_PATH is the input database.
MODELS_DIR = Path("../models/sklearn")
DB_PATH = Path("../data/processed/satellites.db")

# Create the output folder up front; harmless if it already exists.
MODELS_DIR.mkdir(exist_ok=True, parents=True)

# 1) Load data: read the "satellites_clean" table through the project helper
#    and preview the first rows as the cell output.
df = load_clean_data(DB_PATH, table_name="satellites_clean")
df.head()

Project root in sys.path: C:\Users\Alessio\Desktop\Portfolio_data_analysis\satellite_data_analysis


Unnamed: 0,norad_id,name,object_type,satellite_constellation,altitude_km,altitude_category,orbital_band,congestion_risk,inclination,eccentricity,launch_year_estimate,days_in_orbit_estimate,orbit_lifetime_category,mean_motion,epoch,data_source,snapshot_date,country,last_seen,period_minutes
0,900,CALSPHERE 1,PAYLOAD,Other,976.884937,Low LEO,LEO-Polar,LOW,90.2215,0.002694,2023,0,<1yr,13.763434,2025-12-01 05:13:21.035712,celestrak,2025-12-01,US,2025-12-01,104.625052
1,902,CALSPHERE 2,PAYLOAD,Other,1061.676342,Mid LEO,LEO-Polar,LOW,90.2363,0.002049,2023,0,<1yr,13.528813,2025-12-01 08:07:51.479328,celestrak,2025-12-01,US,2025-12-01,106.439491
2,1361,LCS 1,PAYLOAD,Other,2787.875054,High LEO,MEO,LOW,32.1433,0.001342,2023,0,<1yr,9.893094,2025-12-01 06:07:31.789920,celestrak,2025-12-01,US,2025-12-01,145.556085
3,1512,TEMPSAT 1,PAYLOAD,Other,1133.287003,Mid LEO,LEO-Polar,HIGH,89.9889,0.007145,2023,0,<1yr,13.335808,2025-12-01 10:59:03.479424,celestrak,2025-12-01,US,2025-12-01,107.979956
4,1520,CALSPHERE 4A,PAYLOAD,Other,1123.333009,Mid LEO,LEO-Polar,HIGH,89.909,0.006829,2023,0,<1yr,13.362361,2025-12-01 11:13:16.955040,celestrak,2025-12-01,US,2025-12-01,107.765388


In [9]:
# 2) Prepare X, y for regression
#
# `prepare_regression_data` yields the feature frame and the target series;
# `get_regression_feature_columns` yields the numeric / categorical column
# name lists that the preprocessor is built from in a later cell.
X, y = prepare_regression_data(df)
num_cols, cat_cols = get_regression_feature_columns()

# Quick sanity report: feature matrix shape and the target's distribution.
print("Shape X:", X.shape)
print("Target stats:", y.describe(), sep="\n")

Shape X: (13088, 10)
Target stats:
count    13088.000000
mean       168.594141
std        305.456864
min         87.218358
25%         94.105928
50%         95.437705
75%         96.106283
max       5070.396648
Name: period_minutes, dtype: float64


In [11]:
# 3) Train/test split
#
# Delegates the split to the project helper, called here with its default
# settings. The recorded output shows 10470 training rows and 2618 test rows.
# NOTE(review): test fraction, shuffling and seed live inside the helper and
# are not visible here; confirm the split is seeded for reproducibility.

X_train, X_test, y_train, y_test = train_test_split_regression(X, y)
# Bare tuple as the last expression so the notebook displays both shapes.
X_train.shape, X_test.shape

((10470, 10), (2618, 10))

In [12]:
# 4) Preprocessor
#
# Builds the feature transformer from the numeric and categorical column
# lists. Per the recorded output this is a ColumnTransformer that applies
# StandardScaler to the numeric columns and
# OneHotEncoder(handle_unknown='ignore') to the categorical ones.
# It is still unfitted here; fitting happens inside the training pipelines.

preprocessor = build_preprocessor(num_cols, cat_cols)
# Bare expression so the notebook renders the transformer's repr.
preprocessor

ColumnTransformer(transformers=[('num', StandardScaler(),
                                 ['inclination', 'eccentricity',
                                  'launch_year_estimate',
                                  'days_in_orbit_estimate']),
                                ('cat', OneHotEncoder(handle_unknown='ignore'),
                                 ['object_type', 'satellite_constellation',
                                  'congestion_risk', 'orbit_lifetime_category',
                                  'country', 'orbital_band'])])

In [13]:
# 5) Regression models
#
# Candidate estimators keyed by a short name; the key is reused later as the
# results key and in the saved model's file name. Insertion order is the
# training order.
models = {}
models["linreg"] = LinearRegression()
models["rf_reg"] = RandomForestRegressor(random_state=42, n_jobs=-1, n_estimators=200)
models["gb_reg"] = GradientBoostingRegressor(random_state=42)

# Metrics per model name, filled in by the training cell.
results = dict()

In [14]:
# 6) Training + evaluation
#
# For every candidate in `models`:
#   1. wrap the shared preprocessor and the estimator in a Pipeline,
#   2. fit on the training split and predict on the held-out test split,
#   3. report MAE / RMSE / R² and store them in `results[name]`,
#   4. persist the fitted pipeline to MODELS_DIR as regression_<name>.joblib.

for name, reg in models.items():
    print(f"\n=== Training {name} ===")
    # NOTE: with no `memory` configured, Pipeline fits its steps in place, so
    # the single `preprocessor` object (and each estimator in `models`) is
    # refit/mutated by this loop. Each joblib dump below is a snapshot taken
    # right after its own fit, so the saved files are independent.
    pipe = Pipeline(steps=[
        ("preprocess", preprocessor),
        ("model", reg),
    ])

    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    # `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6.
    # Taking the square root of the MSE gives the same RMSE for this
    # single-target problem and works on every scikit-learn version.
    rmse = mean_squared_error(y_test, y_pred) ** 0.5
    r2 = r2_score(y_test, y_pred)

    print(f"MAE:  {mae:.3f}")
    print(f"RMSE: {rmse:.3f}")
    print(f"R²:   {r2:.3f}")

    results[name] = {"mae": mae, "rmse": rmse, "r2": r2}

    model_path = MODELS_DIR / f"regression_{name}.joblib"
    joblib.dump(pipe, model_path)
    print("Model saved to:", model_path)

# Display the collected metrics as the cell output.
results


=== Training linreg ===
MAE:  14.466
RMSE: 76.209
R²:   0.932
Model saved to: ..\models\sklearn\regression_linreg.joblib

=== Training rf_reg ===
MAE:  1.462
RMSE: 28.812
R²:   0.990
Model saved to: ..\models\sklearn\regression_rf_reg.joblib

=== Training gb_reg ===
MAE:  3.135
RMSE: 20.276
R²:   0.995
Model saved to: ..\models\sklearn\regression_gb_reg.joblib


{'linreg': {'mae': 14.466410327225068,
  'rmse': 76.2091334608134,
  'r2': 0.9316582631154785},
 'rf_reg': {'mae': 1.4621378778877456,
  'rmse': 28.812369911570904,
  'r2': 0.9902314539340794},
 'gb_reg': {'mae': 3.1352898364244584,
  'rmse': 20.27553565202388,
  'r2': 0.995162548341253}}