In [4]:
import h5py
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report

# 1. Load the dataset
h5_path = '/Users/albertozanelli/Desktop/astrostatistics_bicocca_2025/working/sample_2e7_design_precessing_higherordermodes_3detectors.h5'
with h5py.File(h5_path, 'r') as f:
    # Assuming features are under 'features' group and label under 'det'
    features = f['features'][:]      # shape (N, F)
    labels = f['det'][:]             # shape (N,)
    feature_names = list(f['features'].attrs['names'])  # optional feature names

# Convert to DataFrame for convenience
df = pd.DataFrame(features, columns=feature_names)
df['label'] = labels

# 2. Train-validation split
X = df.drop('label', axis=1)
y = df['label']
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 3. Build a pipeline: scaling + classifier
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', RandomForestClassifier(random_state=42, n_jobs=-1))
])

# 4. Hyperparameter grid
param_grid = {
    'clf__n_estimators': [100, 200],
    'clf__max_depth': [None, 10, 20],
    'clf__min_samples_split': [2, 5],
}

# 5. Grid search
grid = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,
    scoring='recall',   # optimize for completeness
    verbose=2,
    n_jobs=-1
)

grid.fit(X_train, y_train)
print("Best parameters:", grid.best_params_)

# 6. Evaluate on validation set
y_pred = grid.predict(X_val)
cm = confusion_matrix(y_val, y_pred)
report = classification_report(y_val, y_pred, target_names=['undetected', 'detected'])

print("Confusion Matrix:")
print(cm)
print("Classification Report:")
print(report)

# 7. Compute completeness and contamination
# completeness = recall of detected class
tp = cm[1,1]
fn = cm[1,0]
fp = cm[0,1]

completeness = tp / (tp + fn)
contamination = fp / (fp + tp)

print(f"Completeness (recall): {completeness:.3f}")
print(f"Contamination (false positive rate): {contamination:.3f}")


KeyError: "Unable to synchronously open object (object 'features' doesn't exist)"