In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

import os
import sys
sys.path.append(os.path.abspath('../configs'))
from config import *

%load_ext autoreload
%autoreload 2

  from scipy.sparse import csr_matrix, issparse


In [3]:
# Step 1: Load the CSV as-is; the timestamp column is parsed separately below.
df = pd.read_csv('CompiledDataSet/INMS_V1.csv')

# Step 2: Parse 'sclk' (year, day-of-year, 'T', time with fractional seconds) into 'TIME'.
# errors='coerce' turns values that do not match the format into NaT instead of raising.
df['TIME'] = pd.to_datetime(df['sclk'], format='%Y-%jT%H:%M:%S.%f', errors='coerce')

# Step 3: Surface coerced failures so malformed timestamps do not go unnoticed.
n_unparsed = int((df['TIME'].isna() & df['sclk'].notna()).sum())
if n_unparsed:
    print(f"Warning: {n_unparsed} of {len(df)} 'sclk' values did not match the expected format and were set to NaT")

In [45]:
# Toggle: True keeps only rows flagged TRAINING_SUBSET (the 10min interval around the plume),
# False keeps every row of df.
use_training_subset = True

if not use_training_subset:
    df_ml = df.copy()
    print(f"Using full data: {df_ml.shape[0]} rows")
else:
    in_subset = df['TRAINING_SUBSET'].eq(True)
    df_ml = df.loc[in_subset].copy()
    print(f"Using training subset: {df_ml.shape[0]} rows")

Using training subset: 241247 rows


In [None]:
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix

class ModelBenchmark:
    """Tune, evaluate and compare several classifiers on one stratified hold-out split.

    The constructor splits the data once; ``run_all`` then runs a randomized
    hyperparameter search per model on the training part, evaluates the best
    estimator on the test part and remembers the model with the highest
    cross-validation score. ``summary`` prints a comparison table.
    """

    # Keys under which the positive class may appear in
    # classification_report(..., output_dict=True), depending on whether the
    # target labels are ints, floats or booleans.
    _POSITIVE_LABEL_KEYS = ("1", "1.0", "True")

    def __init__(self, models, param_grids, df, feature_names, feature_cols, target_col,
                 sample_frac=1.0, test_size=0.3, random_state=42):
        """
        models: dict of form {"rf": RandomForestClassifier(), "lr": LogisticRegression(), ...}
        param_grids: dict of form {"rf": {...}, "lr": {...}}; keys match ``models``
        df: DataFrame holding both the feature columns and the target column
        feature_names: list of names used to label feature importances in ``evaluate``
        feature_cols: list of columns of ``df`` used as model inputs
        target_col: name of the target column in ``df``
        sample_frac: fraction of rows to use; values below 1.0 draw a random sample
            for fast testing, values >= 1.0 use all rows
        test_size: fraction of the (sampled) rows held out as the test set
        random_state: seed for the row sampling, the split and the randomized search

        Raises ValueError if ``sample_frac`` is not positive.
        """
        if sample_frac <= 0:
            raise ValueError(f"sample_frac must be > 0, got {sample_frac!r}")

        self.models = models
        self.param_grids = param_grids
        self.df = df
        self.random_state = random_state

        # Sample fraction for fast testing. No defensive copy of the full frame:
        # selecting the feature columns below already builds a new frame.
        if sample_frac < 1.0:
            df_ml = df.sample(frac=sample_frac, random_state=random_state)
        else:
            df_ml = df
        X = df_ml[feature_cols]
        y = df_ml[target_col]

        # Single stratified hold-out split shared by every model.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state, stratify=y
        )

        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test
        self.feature_names = feature_names
        self.results = []

        # BEST MODEL TRACKING (by cross-validation score, see run_all)
        self.best_score = -float('inf')
        self.best_model = None
        self.best_params = None
        self.best_name = None

    def hypertune(self, model_key, n_iter=20, scoring='f1', cv=5, n_jobs=10):
        """Run a randomized hyperparameter search for one model on the training split.

        model_key: key into ``self.models`` / ``self.param_grids``
        n_iter, scoring, cv, n_jobs: forwarded to RandomizedSearchCV

        Returns (best_estimator, best_params, best_cv_score).
        """
        searcher = RandomizedSearchCV(
            self.models[model_key],
            # A model without an entry in param_grids is searched over an empty space.
            param_distributions=self.param_grids.get(model_key, {}),
            n_iter=n_iter,
            scoring=scoring,
            cv=cv,
            verbose=0,
            n_jobs=n_jobs,
            random_state=self.random_state,
        )
        searcher.fit(self.X_train, self.y_train)
        return searcher.best_estimator_, searcher.best_params_, searcher.best_score_

    @staticmethod
    def _get_feature_importances(clf):
        """Return ``feature_importances_`` for ``clf`` or None if it has none.

        For objects exposing ``named_steps`` (pipelines) the step named "xgb" is
        used when present, otherwise the last step; plain estimators are
        queried directly.
        """
        named_steps = getattr(clf, "named_steps", None)
        if named_steps is None:
            return getattr(clf, "feature_importances_", None)
        if "xgb" in named_steps:
            return named_steps["xgb"].feature_importances_
        if not named_steps:
            return None
        final_step = list(named_steps.values())[-1]
        return getattr(final_step, "feature_importances_", None)

    @staticmethod
    def _positive_class_f1(report):
        """Return the positive-class F1 from a classification-report dict, or NaN if absent."""
        for key in ModelBenchmark._POSITIVE_LABEL_KEYS:
            if key in report:
                return report[key]["f1-score"]
        return float("nan")

    def evaluate(self, clf, mdl_name):
        """Score a fitted classifier on the test split, print a report and record it.

        clf: fitted estimator; must implement ``predict`` and ``predict_proba``
            (column 1 of the probabilities is used for ROC AUC)
        mdl_name: label stored with the result and shown in the printout

        Appends the result dict to ``self.results`` and returns it.
        """
        y_test_pred = clf.predict(self.X_test)
        y_test_prob = clf.predict_proba(self.X_test)[:, 1]
        test_report = classification_report(self.y_test, y_test_pred, digits=3, output_dict=True)

        # Access feature importances only if the estimator supports it.
        feature_importances = self._get_feature_importances(clf)

        result = {
            "model": mdl_name,
            "roc_auc_test": roc_auc_score(self.y_test, y_test_prob),
            "test_report": test_report,
            "conf_matrix_test": confusion_matrix(self.y_test, y_test_pred),
            "feature_importances": feature_importances
        }
        self.results.append(result)
        print(f"\n===== {mdl_name} Results =====")
        print("Test:")
        print(classification_report(self.y_test, y_test_pred, digits=3))
        if feature_importances is not None:
            print("Feature Importances:")
            if len(self.feature_names) == len(feature_importances):
                feat_imp_df = pd.DataFrame({
                    "Feature": self.feature_names,
                    "Importance": feature_importances
                }).sort_values("Importance", ascending=False)
                print(feat_imp_df)
            else:
                # Do not lose an expensive search result over a labelling mismatch.
                print(
                    f"(table skipped: {len(self.feature_names)} feature names "
                    f"but {len(feature_importances)} importances)"
                )
        return result

    def run_all(self):
        """Tune and evaluate every model in ``self.models``; track the best by CV score."""
        for key in self.models:
            print(f"\nHypertuning {key}...")
            best_clf, best_params, best_score = self.hypertune(key)
            print("Best Params:", best_params)
            print(f"Best CV Score: {best_score:.3f}")
            self.evaluate(best_clf, key)

            # Update best if this model is better
            if best_score > self.best_score:
                self.best_score = best_score
                self.best_model = best_clf
                self.best_params = best_params
                self.best_name = key

    def summary(self):
        """Print a per-model comparison (sorted by test ROC AUC) and the best model's details."""
        if not self.results:
            # An empty table has no 'roc_auc_test' column to sort by.
            print("\nNo models have been evaluated yet; call run_all() first.")
            return

        df_sum = pd.DataFrame([
            {
                "model": r["model"],
                "roc_auc_test": r["roc_auc_test"],
                "test_f1": self._positive_class_f1(r["test_report"])
            } for r in self.results
        ])
        print("\nModel Comparison Summary:")
        print(df_sum.sort_values("roc_auc_test", ascending=False))

        # EXTRA: Show Best Model Details
        print("\n=== Best Model Overall ===")
        print(f"Model: {self.best_name}")
        print(f"Best CV Score: {self.best_score:.3f}")
        print(f"Best Params: {self.best_params}")

In [66]:
from models.randomforest import get_model as get_rf, param_grid as rf_grid
from models.logisticregression import get_model as get_lr, param_grid as lr_grid
from models.xgboost import get_model as get_xg, param_grid as xg_grid
from models.knn import get_model as get_knn, param_grid as knn_grid

# Every available model: key -> (factory, hyperparameter search space).
# Factories are only called for the keys listed in enabled_models.
model_registry = {
    "rf": (get_rf, rf_grid),
    "lr": (get_lr, lr_grid),
    "xg": (get_xg, xg_grid),
    "knn": (get_knn, knn_grid),
}

# Keys to benchmark in this run; add "rf", "lr" or "knn" to include them.
enabled_models = ["xg"]

# Built from one list so models and param_grids can never get out of sync.
models = {key: model_registry[key][0]() for key in enabled_models}
param_grids = {key: model_registry[key][1] for key in enabled_models}

# Define features and target for ML (drop unnecessary columns)
target_col = 'PLUME'
exclude_cols = ['sclk', 'source', 'TIME', 'FLYBY', 'TRAINING_SUBSET', "TIME_FROM_CA"]  # "alt_t"
non_feature_cols = set(exclude_cols) | {target_col}
feature_cols = [col for col in df_ml.columns if col not in non_feature_cols]
# Alternative reduced feature set:
#feature_cols = ["c1counts", "c2counts", "mass_per_charge","velocity_comp" ]

# feature_cols doubles as the list of display names for feature importances.
benchmark = ModelBenchmark(models, param_grids, df_ml, feature_cols, feature_cols, target_col, sample_frac=1)

benchmark.run_all()
benchmark.summary()


Hypertuning xg...
Best Params: {'xgb__subsample': 0.8, 'xgb__scale_pos_weight': 1, 'xgb__n_estimators': 100, 'xgb__min_child_weight': 1, 'xgb__max_depth': 6, 'xgb__learning_rate': 0.2, 'xgb__gamma': 0.1, 'xgb__colsample_bytree': 0.8}
Best CV Score: 0.997

===== xg Results =====
Test:
              precision    recall  f1-score   support

           0      1.000     1.000     1.000     69225
           1      0.994     0.998     0.996      3150

    accuracy                          1.000     72375
   macro avg      0.997     0.999     0.998     72375
weighted avg      1.000     1.000     1.000     72375

Feature Importances:
            Feature  Importance
9        sc_pos_t_y    0.598685
6      view_dir_t_x    0.107074
7      view_dir_t_y    0.077200
5             alt_t    0.066374
1        targ_pos_y    0.062414
12         c1counts    0.027550
0        targ_pos_x    0.019871
10       sc_pos_t_z    0.018923
3     velocity_comp    0.006739
8      view_dir_t_z    0.005617
11       sc_ve

In [67]:
# Toggle: True keeps only rows flagged TRAINING_SUBSET (the 10min interval around the plume),
# False keeps every row of df.
use_training_subset = False

if not use_training_subset:
    df_ml = df.copy()
    print(f"Using full data: {df_ml.shape[0]} rows")
else:
    in_subset = df['TRAINING_SUBSET'].eq(True)
    df_ml = df.loc[in_subset].copy()
    print(f"Using training subset: {df_ml.shape[0]} rows")

Using full data: 1418837 rows


In [None]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Best hyperparameters per model, copied in by hand from earlier tuning results.
params_lr = {
    "solver": "newton-cholesky",
    "penalty": "l2",
    "max_iter": 2000,
    "class_weight": None,
    "C": 100.0
}
params_rf = {
    "n_estimators": 100,
    "min_samples_split": 5,
    "min_samples_leaf": 1,
    "max_features": "sqrt",
    "max_depth": 8,
    "class_weight": None,
    # Seeded like the CV splitter and XGBoost so the scores are reproducible.
    "random_state": 42
}
params_knn = {
    "n_neighbors": 9,
    "weights": "distance",
    "metric": "manhattan",
    "leaf_size": 50,
    "algorithm": "auto"
}
params_xgb = {
    "subsample": 0.8,
    "scale_pos_weight": 1,
    "n_estimators": 100,
    "min_child_weight": 1,
    "max_depth": 6,
    "learning_rate": 0.2,
    "gamma": 0.1,
    "colsample_bytree": 0.8,
    "random_state": 42,
    # NOTE(review): use_label_encoder is deprecated in newer xgboost releases;
    # confirm against the installed version before removing it.
    "use_label_encoder": False,
    "eval_metric": "logloss"
}


# Feature and target extraction
X = df_ml[feature_cols]
y = df_ml[target_col]

# Cross-validation strategy: 5 shuffled, stratified folds with a fixed seed.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# One pipeline per model; each pipeline is fitted inside every CV fold.
pipelines = {
    "Logistic Regression": Pipeline([
        ("scaler", StandardScaler()),
        ("clf", LogisticRegression(**params_lr))
    ]),
    "Random Forest": Pipeline([
        ("clf", RandomForestClassifier(**params_rf))
    ]),
    "KNN": Pipeline([
        ("scaler", StandardScaler()),
        ("clf", KNeighborsClassifier(**params_knn))
    ]),
    "XGBoost": Pipeline([
        ("scaler", StandardScaler()),
        ("clf", XGBClassifier(**params_xgb))
    ]),
}

results = {}

for name, pipeline in pipelines.items():
    print(f"\nRunning {name} on full data...")
    # perf_counter is monotonic, so the elapsed time cannot be skewed by clock adjustments.
    start = time.perf_counter()
    scores = cross_val_score(
        pipeline, X, y, cv=cv, scoring='f1', n_jobs=-1
    )
    comp_time = time.perf_counter() - start
    mean_f1 = np.mean(scores)
    std_f1 = np.std(scores)
    print(f"{name} CV F1 scores: {scores}")
    print(f"{name} Mean F1: {mean_f1:.3f} (+/- {std_f1:.3f})")
    print(f"{name} Total computation time: {comp_time:.1f} seconds")
    results[name] = {'scores': scores, 'mean': mean_f1, 'std': std_f1, 'time': comp_time}



Running Logistic Regression on full data...
Logistic Regression CV F1 scores: [0.90282277 0.90699285 0.91491815 0.91810144 0.91435185]
Logistic Regression Mean F1: 0.911 (+/- 0.006)
Logistic Regression Total elapsed time: 9.0 seconds

Running Random Forest on full data...
Random Forest CV F1 scores: [0.99976196 0.99952381 0.99976196 0.99856938 0.99976196]
Random Forest Mean F1: 0.999 (+/- 0.000)
Random Forest Total elapsed time: 130.0 seconds

Running KNN on full data...
KNN CV F1 scores: [0.88536585 0.87853659 0.89396887 0.88274174 0.88597768]
KNN Mean F1: 0.885 (+/- 0.005)
KNN Total elapsed time: 12.5 seconds

Running XGBoost on full data...
XGBoost CV F1 scores: [0.97366548 0.973659   0.98261491 0.97788462 0.97253403]
XGBoost Mean F1: 0.976 (+/- 0.004)
XGBoost Total elapsed time: 15.8 seconds
