Checking if there are any categorical variables in our data

In [1]:
import pickle

# Load the split_data dictionary
with open("Data/encoded_split_data.pkl", "rb") as f:
    split_data = pickle.load(f)

# Print variable types for X_train of each dataset
for name, data in split_data.items():
    X_train = data["X_train"]
    dtypes = X_train.dtypes.unique()
    print(f"\n{name} — Unique data types in X_train:")
    print(dtypes)


fri_c1_500_50 — Unique data types in X_train:
[dtype('float64')]

fri_c3_1000_50 — Unique data types in X_train:
[dtype('float64')]

fri_c4_500_50 — Unique data types in X_train:
[dtype('float64')]

fri_c4_1000_50 — Unique data types in X_train:
[dtype('float64')]

fri_c2_1000_25 — Unique data types in X_train:
[dtype('float64')]

fri_c1_1000_25 — Unique data types in X_train:
[dtype('float64')]

fri_c3_1000_25 — Unique data types in X_train:
[dtype('float64')]

BodyFat — Unique data types in X_train:
[dtype('float64') dtype('int64')]

Forest_Fires — Unique data types in X_train:
[dtype('int64') dtype('float64') dtype('bool')]

Quakes — Unique data types in X_train:
[dtype('int64') dtype('float64')]

Servo — Unique data types in X_train:
[dtype('int64') dtype('bool')]

auto93 — Unique data types in X_train:
[dtype('uint8') dtype('float64') dtype('bool')]

autoPrice — Unique data types in X_train:
[dtype('float64') dtype('uint8')]

autoMPG — Unique data types in X_train:
[dtype('float6

Doing the FS

In [2]:
import pandas as pd
from feature_engine.selection import MRMR
import os
import time

# Step 1: Load split_data.pkl
with open("Data/encoded_split_data.pkl", "rb") as f:
    split_data = pickle.load(f)

# Step 2: Initialize output dictionary
selected_feature_data = {}

# Step 3: Loop through each dataset
for name, data in split_data.items():
    print(f"Processing {name}...")
    X_train = data["X_train"]
    X_val = data["X_val"]
    X_test = data["X_test"]
    y_train = data["y_train"]
    y_val = data["y_val"]
    y_test = data["y_test"]

    variables = X_train.columns.tolist()

    # Determine which features are discrete (flag bools as discrete)
    discrete_flags = [
        True if dtype == bool else False  # Flag only bools as discrete
        for dtype in X_train.dtypes
    ]

    # Convert bool columns to int (0/1) in all splits
    for df in [X_train, X_val, X_test]:
        bool_cols = df.select_dtypes(include='bool').columns
        df[bool_cols] = df[bool_cols].astype(int)

    # Store results for this dataset
    selected_feature_data[name] = {}

    # Step 4: Loop from 1 to number of variables
    for k in range( 2, len(variables) + 1):
        try:
            if k == len(variables):
                # Skip MRMR – use original data as-is
                selected_feature_data[name][k] = {
                    "X_train": X_train,
                    "X_val": X_val,
                    "X_test": X_test,
                    "y_train": y_train,
                    "y_val": y_val,
                    "y_test": y_test,
                    "time": 0.0
                }
                continue
    
            sel = MRMR(
                variables=variables,
                method="MID",
                max_features=k,
                discrete_features=discrete_flags,
                regression=True,
                random_state=42
            )
            start = time.time()
            sel.fit(X_train, y_train)
            elapsed = time.time() - start

            # Transform splits
            X_train_sel = sel.transform(X_train)
            X_val_sel = sel.transform(X_val)
            X_test_sel = sel.transform(X_test)

            # Save results under this k
            selected_feature_data[name][k] = {
                "X_train": X_train_sel,
                "X_val": X_val_sel,
                "X_test": X_test_sel,
                "y_train": y_train,
                "y_val": y_val,
                "y_test": y_test,
                "time": elapsed
            }

        except Exception as e:
            print('It Hit Here')
            selected_feature_data[name][k] = {"error": str(e)}

# Step 5: Save to pickle file
os.makedirs("Data", exist_ok=True)
with open("Data/selected_feature_data.pkl", "wb") as f:
    pickle.dump(selected_feature_data, f)

print("✅ Feature selection completed and saved to 'Data/selected_feature_data.pkl'")


Processing fri_c1_500_50...
Processing fri_c3_1000_50...
Processing fri_c4_500_50...
Processing fri_c4_1000_50...
Processing fri_c2_1000_25...
Processing fri_c1_1000_25...
Processing fri_c3_1000_25...
Processing BodyFat...
Processing Forest_Fires...


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Processing Quakes...
Processing Servo...


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Processing auto93...
Processing autoPrice...
Processing autoMPG...
Processing Concrete_Compressive_Strength...


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Processing Airfoil_Self_Noise...
Processing pyrim...
Processing boston...
Processing Wine_Quality...


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Processing California_Housing...
✅ Feature selection completed and saved to 'Data/selected_feature_data.pkl'


Doing the FS for simulated Dataset

In [2]:
import pandas as pd
from feature_engine.selection import MRMR
import os
import time
import pickle

# Step 1: Load split_data.pkl
with open("Data/encoded_split_data_simulated.pkl", "rb") as f:
    split_data = pickle.load(f)

# Step 2: Initialize output dictionary
selected_feature_data = {}

# Step 3: Loop through each dataset
for name, data in split_data.items():
    print(f"Processing {name}...")
    X_train = data["X_train"]
    X_val = data["X_val"]
    X_test = data["X_test"]
    y_train = data["y_train"]
    y_val = data["y_val"]
    y_test = data["y_test"]

    variables = X_train.columns.tolist()

    # # Determine which features are discrete (flag bools as discrete)
    # discrete_flags = [
    #     True if dtype == bool else False  # Flag only bools as discrete
    #     for dtype in X_train.dtypes
    # ]

    # # Convert bool columns to int (0/1) in all splits
    # for df in [X_train, X_val, X_test]:
    #     bool_cols = df.select_dtypes(include='bool').columns
    #     df[bool_cols] = df[bool_cols].astype(int)

    # Store results for this dataset
    selected_feature_data[name] = {}

    # Step 4: Loop from 1 to number of variables
    for k in range( 2, len(variables) + 1):
        try:
            if k == len(variables):
                # Skip MRMR – use original data as-is
                selected_feature_data[name][k] = {
                    "X_train": X_train,
                    "X_val": X_val,
                    "X_test": X_test,
                    "y_train": y_train,
                    "y_val": y_val,
                    "y_test": y_test,
                    "time": 0.0
                }
                continue
    
            sel = MRMR(
                # variables=variables,
                method="MID",
                max_features=k,
                # discrete_features=discrete_flags,
                regression=True,
                random_state=42
            )
            start = time.time()
            sel.fit(X_train, y_train)
            elapsed = time.time() - start

            # Transform splits
            X_train_sel = sel.transform(X_train)
            X_val_sel = sel.transform(X_val)
            X_test_sel = sel.transform(X_test)

            # Save results under this k
            selected_feature_data[name][k] = {
                "X_train": X_train_sel,
                "X_val": X_val_sel,
                "X_test": X_test_sel,
                "y_train": y_train,
                "y_val": y_val,
                "y_test": y_test,
                "time": elapsed
            }

        except Exception as e:
            print('It Hit Here')
            selected_feature_data[name][k] = {"error": str(e)}

# Step 5: Save to pickle file
os.makedirs("Data", exist_ok=True)
with open("Data/selected_feature_data_simulated.pkl", "wb") as f:
    pickle.dump(selected_feature_data, f)

print("✅ Feature selection completed and saved to 'Data/selected_feature_data_simulated.pkl'")

Processing sim_10...
Processing sim_20...
Processing sim_30...
Processing sim_40...
Processing sim_50...
Processing sim_60...
Processing sim_70...
Processing sim_80...
Processing sim_90...
Processing sim_100...
Processing sim_110...
Processing sim_120...
Processing sim_130...
Processing sim_140...
Processing sim_150...
✅ Feature selection completed and saved to 'Data/selected_feature_data_simulated.pkl'
