Selecting the optimal number of features dynamically

In [1]:
import pickle
import os
import time
from mrmr_dynamic_selection import MRMR

# Load the encoded split data.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load this file if it was produced by a trusted pipeline.
with open("Data/encoded_split_data.pkl", "rb") as f:
    split_data = pickle.load(f)

# Maps dataset name -> selection results, or {"error": <message>} when the
# selection for that dataset raised.
dynamic_feature_selection_results = {}

# Process each dataset independently so one failure does not stop the rest.
for name, data in split_data.items():
    print(f"🔍 Processing {name}...")
    X_train = data["X_train"]
    X_val = data["X_val"]
    X_test = data["X_test"]
    y_train = data["y_train"]
    y_val = data["y_val"]
    y_test = data["y_test"]

    # Flag only bool columns as discrete. This must be computed BEFORE the
    # cast below, which erases the bool dtype the flags are derived from.
    discrete_flags = [dtype == bool for dtype in X_train.dtypes]

    # Convert bool columns to int (0/1) in all splits (in place, per split).
    for df in [X_train, X_val, X_test]:
        bool_cols = df.select_dtypes(include='bool').columns
        df[bool_cols] = df[bool_cols].astype(int)

    try:
        # Run dynamic MRMR
        selector = MRMR(
            variables=X_train.columns.tolist(),
            method="MID",
            discrete_features=discrete_flags,
            regression=True,
            random_state=42
        )

        # perf_counter is monotonic and high-resolution, so sub-second fit
        # times are not distorted by system clock adjustments.
        start = time.perf_counter()
        selector.fit(X_train, y_train)
        elapsed = time.perf_counter() - start

        # Apply the selection fitted on the training split to every split.
        X_train_sel = selector.transform(X_train)
        X_val_sel = selector.transform(X_val)
        X_test_sel = selector.transform(X_test)
        n_selected = X_train_sel.shape[1]

        # Save everything to dictionary
        dynamic_feature_selection_results[name] = {
            "k": n_selected,
            "X_train": X_train_sel,
            "X_val": X_val_sel,
            "X_test": X_test_sel,
            "y_train": y_train,
            "y_val": y_val,
            "y_test": y_test,
            "time": elapsed
        }

        print(f"✅ {name}: Selected {n_selected} features in {elapsed:.2f} seconds.")

    except Exception as e:
        # Deliberate best-effort boundary: report the failure, record it under
        # the dataset name, and continue with the remaining datasets.
        print(f"❌ Error processing {name}: {e}")
        dynamic_feature_selection_results[name] = {
            "error": str(e)
        }

# Save the results
os.makedirs("Data", exist_ok=True)
with open("Data/dynamic_selected_features.pkl", "wb") as f:
    pickle.dump(dynamic_feature_selection_results, f)

print("\n🎯 All datasets processed. Results saved to 'Data/dynamic_selected_features.pkl'.")

🔍 Processing fri_c1_500_50...
✅ fri_c1_500_50: Selected 18 features in 2.84 seconds.
🔍 Processing fri_c3_1000_50...
✅ fri_c3_1000_50: Selected 19 features in 4.72 seconds.
🔍 Processing fri_c4_500_50...
✅ fri_c4_500_50: Selected 21 features in 2.33 seconds.
🔍 Processing fri_c4_1000_50...
✅ fri_c4_1000_50: Selected 20 features in 3.99 seconds.
🔍 Processing fri_c2_1000_25...
✅ fri_c2_1000_25: Selected 11 features in 1.15 seconds.
🔍 Processing fri_c1_1000_25...
✅ fri_c1_1000_25: Selected 9 features in 1.03 seconds.
🔍 Processing fri_c3_1000_25...
✅ fri_c3_1000_25: Selected 12 features in 1.24 seconds.
🔍 Processing BodyFat...
✅ BodyFat: Selected 6 features in 0.23 seconds.
🔍 Processing Forest_Fires...


  y = column_or_1d(y, warn=True)


✅ Forest_Fires: Selected 6 features in 0.93 seconds.
🔍 Processing Quakes...
✅ Quakes: Selected 1 features in 0.05 seconds.
🔍 Processing Servo...
✅ Servo: Selected 5 features in 0.10 seconds.
🔍 Processing auto93...


  y = column_or_1d(y, warn=True)


✅ auto93: Selected 28 features in 1.39 seconds.
🔍 Processing autoPrice...
✅ autoPrice: Selected 2 features in 0.05 seconds.
🔍 Processing autoMPG...
✅ autoMPG: Selected 4 features in 0.06 seconds.
🔍 Processing Concrete_Compressive_Strength...
✅ Concrete_Compressive_Strength: Selected 2 features in 0.08 seconds.
🔍 Processing Airfoil_Self_Noise...


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


✅ Airfoil_Self_Noise: Selected 2 features in 0.08 seconds.
🔍 Processing pyrim...
✅ pyrim: Selected 15 features in 0.28 seconds.
🔍 Processing boston...
✅ boston: Selected 7 features in 0.26 seconds.
🔍 Processing Wine_Quality...


  y = column_or_1d(y, warn=True)


✅ Wine_Quality: Selected 1 features in 0.57 seconds.
🔍 Processing California_Housing...
✅ California_Housing: Selected 2 features in 1.74 seconds.

🎯 All datasets processed. Results saved to 'Data/dynamic_selected_features.pkl'.
