In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GroupKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

# --- Load data -------------------------------------------------------------
# Read the within-subject feature table and keep every row whose
# SedationLevel is not 4 (level 4 is the recovery state, which is dropped).
df_metrics = (
    pd.read_csv("../data/data_derivatives/features_within_subject.csv")
    .loc[lambda frame: frame["SedationLevel"] != 4]  # drop recovery-state
    .copy()
)

# Label column to predict.
target = "SedationLevel"

# Predictors: every column whose name ends in "_ratio".
features = [name for name in df_metrics.columns if name.endswith("_ratio")]

# Alternative, hand-picked feature set (currently disabled):
# features = [
#     'mean_degree', 'clustering', 'path_length', 'global_efficiency',
#     'local_efficiency', 'modularity', 'participation_coefficient',
#     'small_worldness',
#     'mean_degree_delta', 'clustering_delta', 'path_length_delta',
#     'global_efficiency_delta', 'local_efficiency_delta', 'modularity_delta',
#     'participation_coefficient_delta', 'small_worldness_delta',
# ]

# Design matrix (an independent copy), labels, and the per-row subject
# identifiers that are later handed to the group-aware cross-validation.
X = df_metrics.loc[:, features].copy()
y = df_metrics[target]
groups = df_metrics["Subject"]



In [None]:
# Group-aware cross-validated accuracy.
# GroupKFold keeps all rows that share a group label in the same fold; the
# labels themselves are supplied through `groups=` in the `fit` call below.
gkf = GroupKFold(n_splits=5)

# Define pipeline: scaling → PCA → RandomForest.
# Because the scaler and PCA live inside the pipeline, GridSearchCV re-fits
# them on the training portion of every split rather than on the full data.
pipe = Pipeline([
    ("scaler", StandardScaler()),
    # A float in (0, 1) asks PCA to keep as many components as are needed to
    # explain that fraction of the variance (here 95%).
    ("pca", PCA(n_components=0.95)),
    # n_jobs=1 on purpose: GridSearchCV below already parallelises across
    # candidates and folds with n_jobs=-1. Letting every worker also spawn an
    # all-core forest oversubscribes the CPU. With a fixed random_state the
    # fitted forest is the same regardless of n_jobs.
    ("rf", RandomForestClassifier(random_state=42, n_jobs=1)),
])

# Parameter grid: 3 * 3 * 4 * 3 = 108 candidates, i.e. 540 fits over 5 folds
# (plus the final refit). The PCA step can be tuned as well by adding a
# "pca__n_components" entry.
param_grid = {
    "rf__n_estimators": [200, 400, 600],
    "rf__max_depth": [3, 4, 5],
    "rf__min_samples_leaf": [3, 4, 5, 6],
    # Floats are interpreted as a fraction of the features reaching the
    # forest, which here are the PCA components.
    "rf__max_features": [0.4, 0.5, 0.6],
}

grid_search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=gkf,
    scoring="accuracy",
    n_jobs=-1,  # the single level of parallelism for the whole search
    verbose=2,
)

grid_search.fit(X, y, groups=groups)

# NOTE(review): best_score_ is the mean CV accuracy of the configuration that
# was selected using these same folds, so it is an optimistic estimate of
# generalisation performance.
print("Best parameters:", grid_search.best_params_)
print(f"Best cross-val accuracy: {grid_search.best_score_:.3f}")