In [1]:
# -------------------------------------------
# 1. LOAD DATA, ENSURE MODULARITY
# -------------------------------------------


import pandas as pd
from pathlib import Path
import datetime

# Define this project's file locations.
# This notebook uses a centralized config.py file for all path management.

# Import config paths where the project's paths are defined in one location.
import sys
sys.path.append('..')
from config import CLEAN_DATA_DIR, FEATURE_SUBSETS_DIR

# Build every location from the shared config: two input files that are read below,
# plus the directory where feature-subset files are written later in this notebook.
clean_data_path = CLEAN_DATA_DIR / "DryBean_clean.parquet"
diagnostic_path = CLEAN_DATA_DIR / "DryBean_feature_diagnostics.xlsx"
subset_dir = FEATURE_SUBSETS_DIR
# parents=True also creates any missing intermediate folders, so the notebook runs on a
# fresh checkout where the output tree does not exist yet (exist_ok=True alone would
# raise FileNotFoundError in that case).
subset_dir.mkdir(parents=True, exist_ok=True)



# Load data - again. Loading from persistent storage (e.g., hard/SSD drive) is intended to ensure modularity and reliability.
df = pd.read_parquet(clean_data_path)
# index_col=0 makes the first spreadsheet column the row index of the diagnostics table.
diagnostics = pd.read_excel(diagnostic_path, index_col=0)
# Every column except "target" counts as an available feature; errors="ignore" keeps
# this from failing when no column with that name exists.
all_features = list(df.columns.drop("target", errors="ignore"))  # Adjust target name if needed

In [2]:
# -------------------------------------------
# 2. SELECT FEATURES FOR MODEL TRAINING
# -------------------------------------------

# Hand-maintained list: add or remove names here to change the feature subset.
# Order is preserved when the list is written to disk later in the notebook.
selected_features = [
    "A", "P", "L", "l",
    "K", "Ec", "C", "Ed",
    "Ex", "S", "R", "CO",
    "SF1", "SF2", "SF3", "SF4",
]

In [None]:
# -------------------------------------------
# 3. DISPLAY POTENTIAL ISSUES WITH SELECTED FEATURES
# -------------------------------------------

# Confirm that all selected features actually exist in the dataset, and that there are no typos in the "selected" feature names.
# Stop execution if any selected feature is missing or has a typo.
missing = [f for f in selected_features if f not in all_features]
if missing:
    raise ValueError(f"The following selected features are not in dataset: {missing}")

# Pull the three warning-flag columns from the diagnostics table.
# A missing flag value is treated as "not flagged".
flag_columns = ["low_variance_flag", "importance_flag", "pct_missing_flag"]
flags = diagnostics[flag_columns].fillna(False)

# A feature can exist in the dataset yet have no row in the diagnostics table.
# Fail with a message that names the problem instead of letting .loc raise a bare
# "not in index" KeyError. The exception type is kept as KeyError.
undiagnosed = [f for f in selected_features if f not in flags.index]
if undiagnosed:
    raise KeyError(f"The following selected features have no row in the diagnostics table: {undiagnosed}")

# Create a summary table showing which selected features have at least one of the three warning flags.
warn_flags = flags.loc[selected_features]
warn_summary = warn_flags.any(axis=1)

# If at least one flag is True for a feature, show the feature in this summary.
if warn_summary.any():
    print("⚠️ The following selected features have at least one warning flag:")
    display(warn_flags[warn_summary])
else:
    print("All selected features passed diagnostics checks.")



Unnamed: 0,low_variance_flag,importance_flag,pct_missing_flag
Ec,True,False,False
Ex,True,False,False
S,True,False,False
R,True,False,False
CO,True,False,False
SF1,True,False,False
SF2,True,False,False
SF3,True,False,False
SF4,True,False,False


In [4]:
# -------------------------------------------
# 4. SAVE FEATURE SUBSET
# -------------------------------------------

from shutil import copyfile
# NOTE: this rebinds the name `datetime` from the module (imported in the load cell)
# to the class, so from here on `datetime.now()` works and `datetime.datetime` does not.
from datetime import datetime


# Save a timestamped version of the selected feature subset. This creates a historical
# record of every feature set created here, to keep track of changes over time.
# There is one feature per line in the text file.
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
archive_filename = f"DryBean_feature_subset_{timestamp}.txt"
archive_path = subset_dir / archive_filename

# Pin the encoding: without it, open() uses the platform's locale encoding, so the
# file handed to the next module could differ between machines if a feature name
# ever contains non-ASCII characters. For ASCII names the bytes written are unchanged.
with open(archive_path, "w", encoding="utf-8") as f:
    f.write("\n".join(selected_features))

print(f"Timestamped feature subset saved to: {archive_path}")

# Create a second file that always contains the latest feature subset.
# Overwrite this second file to always have an up-to-date version of the features selected.
# This second file will be the input to the next Python module.
latest_filename = "DryBean_feature_subset_latest.txt"
latest_path = subset_dir / latest_filename

# Copy contents to latest (overwriting if it exists). Copying the archive file
# guarantees both files hold exactly the same bytes.
copyfile(src=archive_path, dst=latest_path)

print(f"Latest feature subset overwritten at: {latest_path}")


Timestamped feature subset saved to: C:\Misc\ml_benchmark\outputs\feature_subsets\DryBean_feature_subset_2025-07-25_11-50-19.txt
Latest feature subset overwritten at: C:\Misc\ml_benchmark\outputs\feature_subsets\DryBean_feature_subset_latest.txt


In [5]:
# -------------------------------------------
# 5. DISPLAY THE FULL AND SELECTED FEATURES
# -------------------------------------------

# Candidate features are every available column except the label column.
# "Class" is the label name used here; change it to the actual target column name
# in the dataset being used.
non_target_features = []
for column_name in all_features:
    if column_name != "Class":
        non_target_features.append(column_name)

# Anything available but not selected counts as dropped; list those alphabetically.
dropped_features = sorted(set(non_target_features).difference(selected_features))

# Report the counts, then one dropped feature per line.
print(f"Total available features:  {len(non_target_features)}")
print(f"Features selected:         {len(selected_features)}")
print("Dropped features:")
for dropped_name in dropped_features:
    print(f"  - {dropped_name}")

Total available features:  16
Features selected:         16
Dropped features:
