## Data exploration

This script performs some data exploration to see what kind of data our data set contains.

In [2]:
import os
import pathlib
import sys

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

# Make the local `configs` directory importable so that `config` resolves below.
# The relative path is resolved against the current working directory, so the
# notebook has to be started from the directory that contains `configs/`.
sys.path.append(os.path.abspath('configs'))
# NOTE(review): the star import hides which names `config` contributes to this
# namespace (and it may shadow names defined here) — consider importing the
# required names explicitly once they are known.
from config import *
# Base directory for data files: the current working directory at kernel start.
path = pathlib.Path.cwd()

In [3]:
# Location of the compiled data set, relative to the working directory.
data_path = path.joinpath("CompiledDataSet", "Cassini_CDA_Count+Event_V3.csv")

# The first CSV column becomes the index; TIME is parsed into datetimes.
df = pd.read_csv(
    data_path,
    parse_dates=["TIME"],
    index_col=0,
)
df.head()

Unnamed: 0,TIME,COUNTER_0,COUNTER_1,COUNTER_2,COUNTER_3,COUNTER_4,COUNTER_5,COUNTER_6,COUNTER_7,COUNTER_8,...,SPACECRAFT_RA,SC_SIII_LONG,SC_SIII_LAT,SC_SAT_DIST,SC_X_VEL,SC_Y_VEL,SC_Z_VEL,DETECTOR_RA,DETECTOR_DEC,PLUME_LABEL
0,2005-01-01 00:00:34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,115.32,223.44,12.46,59.63,-8.38,-4.45,-1.08,67.33,59.25,0
1,2005-01-01 00:01:38,0.0,0.0,0.0,0.0,0.0,0.046875,0.0,0.0,0.0,...,115.32,223.44,12.46,59.63,-8.38,-4.45,-1.08,67.33,59.25,0
2,2005-01-01 00:02:42,0.0,0.0,0.0,0.0,0.0,0.03125,0.0,0.0,0.0,...,115.32,223.44,12.46,59.63,-8.38,-4.45,-1.08,67.33,59.25,0
3,2005-01-01 00:03:46,0.0,0.0,0.0,0.0,0.0,0.015625,0.0,0.0,0.0,...,115.32,223.44,12.46,59.63,-8.38,-4.45,-1.08,67.33,59.25,0
4,2005-01-01 00:04:50,0.0,0.0,0.0,0.0,0.0,0.015625,0.0,0.0,0.0,...,115.32,223.44,12.46,59.63,-8.38,-4.45,-1.08,67.6,59.23,0


## Random forest

In [5]:
# Grid-search a class-weighted random forest on the numeric, non-metadata columns.
# All sklearn names used here (including GridSearchCV) come from the notebook's
# import cell at the top.

# Columns that must never be used as features: the timestamp, the label itself
# and the SOURCE_* bookkeeping columns.
exclude_cols = ['TIME', 'PLUME_LABEL', 'SOURCE_VOLUME', 'SOURCE_TYPE', 'SOURCE_FILE']
feature_cols = [col for col in df.columns
                if col not in exclude_cols
                and pd.api.types.is_numeric_dtype(df[col])]
X = df[feature_cols]
y = df['PLUME_LABEL'].astype(int)

# Train/test split, stratified on the label so both parts keep the class ratio
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# Fill missing values with per-feature means computed on the training part only,
# so no information from the held-out rows reaches the model. This is a no-op
# when the data contains no NaNs.
train_means = X_train.mean()
X_train = X_train.fillna(train_means)
X_test = X_test.fillna(train_means)

# Hyperparameter grid for RF (2 * 4 * 2 = 16 candidates)
param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [None, 3, 5, 10],
    'min_samples_split': [2, 5]
}

# Keep the forest itself single-threaded: GridSearchCV below already spreads the
# candidate/fold fits over all cores, and parallelising at both levels
# oversubscribes the CPU. n_jobs does not influence the fitted model.
clf = RandomForestClassifier(class_weight='balanced', n_jobs=1, random_state=42)

grid_search = GridSearchCV(clf, param_grid,
                           cv=5,
                           scoring='f1',
                           verbose=2,
                           n_jobs=-1)

grid_search.fit(X_train, y_train)
print("Best params:", grid_search.best_params_)
# With the default refit=True this is the best candidate refitted on all of X_train
best_clf = grid_search.best_estimator_

# Evaluate on the held-out split
y_pred = best_clf.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Feature importance, most important first
importances = best_clf.feature_importances_
feature_importance = pd.Series(importances, index=feature_cols).sort_values(ascending=False)
print("Random Forest feature importances:")
print(feature_importance)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


KeyboardInterrupt: 

In [7]:
# Baseline random forest with fixed hyperparameters (no grid search).
#
# Select features – every numeric column except the label. TIME is not numeric
# after loading, so the dtype filter drops it without naming it here.
# NOTE(review): the grid-search cell also excludes SOURCE_VOLUME, SOURCE_TYPE and
# SOURCE_FILE by name; here they are only dropped if they are non-numeric — confirm
# that this is intended.
#
# Alternative exclusion list that was tried (kept for reference):
# exclude_cols = ['TIME', 'PLUME_LABEL', 'SOURCE_VOLUME', 'SOURCE_TYPE',
#                 'SOURCE_FILE',"SC_SAT_DIST","SC_SIII_LAT",
#                 "SPACECRAFT_RA","SC_SIII_LONG","SC_X_VEL","SC_Y_VEL", "SC_Z_VEL" ]

exclude_cols = ["PLUME_LABEL"]

feature_cols = [col for col in df.columns
                if col not in exclude_cols
                and pd.api.types.is_numeric_dtype(df[col])]
X = df[feature_cols]
y = df['PLUME_LABEL'].astype(int)

# Stratified train-test split (preserves plume/background ratio)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# Impute missing values with feature means. The means are taken from the training
# part only and applied to both parts; computing them on the full data set before
# splitting would leak information from the test rows into training.
train_means = X_train.mean()
X_train = X_train.fillna(train_means)
X_test = X_test.fillna(train_means)

clf = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced', n_jobs=-1)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Importances are reported in the order of feature_cols; label and sort them
importances = clf.feature_importances_
feature_importance = pd.Series(importances, index=feature_cols).sort_values(ascending=False)

print("Random Forest Feature Importances:")
print(feature_importance)

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     90158
           1       0.76      0.76      0.76        25

    accuracy                           1.00     90183
   macro avg       0.88      0.88      0.88     90183
weighted avg       1.00      1.00      1.00     90183

Confusion Matrix:
[[90152     6]
 [    6    19]]
Random Forest Feature Importances:
DETECTOR_DEC     1.340994e-01
DETECTOR_RA      1.003406e-01
QI_AMPLITUDE     9.416116e-02
QT_AMPLITUDE     9.205554e-02
SC_Z_VEL         8.805843e-02
SC_SAT_DIST      7.049358e-02
SC_X_VEL         6.759229e-02
SPACECRAFT_RA    6.643488e-02
SC_SIII_LONG     6.249259e-02
SC_Y_VEL         6.000541e-02
QI_RISE_TIME     4.477808e-02
SC_SIII_LAT      3.897051e-02
QC_AMPLITUDE     3.343788e-02
QT_RISE_TIME     2.712911e-02
QC_RISE_TIME     1.654121e-02
QP_AMPLITUDE     1.755729e-03
COUNTER_19       7.783232e-04
COUNTER_14       4.635681e-04
COUNTER_16       1.364523e

## Logistic regression