In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.svm import SVC

from darwin.config import (
    PROCESSED_DATA_DIR, 
    RANDOM_STATE, 
    CRITERION, 
    MAX_DEPTH, 
    CLASS_WEIGHT, 
    FEATURE_NUM
)

[32m2025-03-28 10:20:31.644[0m | [1mINFO    [0m | [36mdarwin.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: /Users/eduardoduarte/Projects/refactor-darwin/darwin[0m


In [2]:
# Path to the preprocessed CSV, resolved against PROCESSED_DATA_DIR from darwin.config
input_path = PROCESSED_DATA_DIR / 'preprocessed_data.csv'

In [3]:
# Load the dataset; the selector functions below use its "class" column as the target
df = pd.read_csv(input_path)

In [4]:
def select_feature_imp(df: pd.DataFrame, n: int) -> list[str]:
    """
    Rank the features of a DataFrame by the feature_importances_ of a fitted
    RandomForestClassifier and return the names of the top n.
        df: pd.DataFrame
            Data holding the feature columns plus a "class" target column
        n: int
            Number of features to return
        return: list
            Names of the n highest-importance features, most important first
    """
    # Separate the target column from the predictors
    target = df["class"]
    features = df.drop("class", axis='columns')

    # Fit the forest with the project-wide hyper-parameters
    model = RandomForestClassifier(
        n_estimators=100,
        random_state=RANDOM_STATE,
        criterion=CRITERION,
        max_depth=MAX_DEPTH,
        class_weight=CLASS_WEIGHT,
    )
    model.fit(features, target)

    # Label each importance with its column name and order from high to low
    ranking = pd.Series(model.feature_importances_, index=features.columns)
    ranking = ranking.sort_values(ascending=False)

    return ranking.head(n).index.tolist()

In [5]:
def select_anova(df: pd.DataFrame, n: int) -> list[str]:
    """
    Given a DataFrame, returns the n most important features based on the 
    ANOVA F-Value from SelectKBest
        df: pd.DataFrame
            DataFrame with the data; must contain a "class" target column
        n: int
            Number of features to return
        return: list
            List with the n most important features, ordered from the
            highest F-value to the lowest
    """
    # Splits the target and the features
    X = df.drop("class", axis='columns')
    y = df["class"]

    # Fit the selector to the data
    selector = SelectKBest(f_classif, k=n)
    selector.fit(X, y)

    # f_classif yields NaN for constant features. np.argsort places NaN at
    # the end, so reversing the order would rank those features FIRST.
    # Mapping NaN to -inf keeps them at the bottom of the ranking.
    scores = np.where(np.isnan(selector.scores_), -np.inf, selector.scores_)

    # Get the selected features with highest f-value
    selected_indices = np.argsort(scores)[::-1][:n]
    selected_features = X.columns[selected_indices].tolist()
    
    return selected_features



In [6]:
def select_rfe(df: pd.DataFrame, n: int) -> list[str]:
    """
    Pick n features from a DataFrame using Recursive Feature Elimination
    driven by a linear-kernel Support Vector Classifier.
        df: pd.DataFrame
            Data holding the feature columns plus a "class" target column
        n: int
            Number of features to return
        return: list
            Names of the n features kept by RFE, in the column order of df
    """
    # Separate the target column from the predictors
    target = df["class"]
    features = df.drop("class", axis='columns')

    # Drop one feature per iteration until only n remain
    eliminator = RFE(SVC(kernel="linear"), n_features_to_select=n, step=1)
    eliminator.fit(features, target)

    # support_ flags the surviving columns
    return features.columns[eliminator.support_].tolist()

In [7]:
from collections import Counter

# Tally how many of the three selection methods picked each feature
counter = Counter(
    feature
    for selector in (select_anova, select_feature_imp, select_rfe)
    for feature in selector(df, FEATURE_NUM)
)
print(counter)

Counter({'total_time3': 2, 'total_time15': 2, 'air_time15': 2, 'gmrt_in_air7': 1, 'mean_gmrt7': 1, 'disp_index23': 1, 'mean_speed_in_air7': 1, 'paper_time9': 1, 'air_time16': 1, 'mean_gmrt17': 1, 'total_time9': 1, 'disp_index22': 1, 'total_time23': 1, 'air_time23': 1, 'air_time17': 1, 'total_time6': 1, 'total_time17': 1, 'paper_time17': 1, 'total_time8': 1, 'pressure_var5': 1, 'disp_index8': 1, 'mean_jerk_on_paper8': 1, 'paper_time12': 1, 'gmrt_in_air17': 1, 'max_y_extension19': 1, 'disp_index21': 1, 'air_time24': 1})


In [8]:
# Build one reduced DataFrame per selection method (same evaluation order as listed)
feature_imp, anova, rfe = (
    df[selector(df, FEATURE_NUM)]
    for selector in (select_feature_imp, select_anova, select_rfe)
)

In [9]:
# Show the columns kept by each selection method, one Index per line
print(feature_imp.columns, anova.columns, rfe.columns, sep="\n")

Index(['total_time23', 'total_time15', 'air_time15', 'air_time23',
       'air_time17', 'total_time6', 'total_time17', 'paper_time17',
       'total_time3', 'total_time8'],
      dtype='object')
Index(['gmrt_in_air7', 'mean_gmrt7', 'disp_index23', 'mean_speed_in_air7',
       'paper_time9', 'air_time16', 'mean_gmrt17', 'total_time9',
       'disp_index22', 'total_time3'],
      dtype='object')
Index(['pressure_var5', 'disp_index8', 'mean_jerk_on_paper8', 'paper_time12',
       'air_time15', 'total_time15', 'gmrt_in_air17', 'max_y_extension19',
       'disp_index21', 'air_time24'],
      dtype='object')


In [10]:
# Sanity check: every selector must have produced exactly FEATURE_NUM columns.
# The messages identify which selector failed and by how much.
assert feature_imp.shape[1] == FEATURE_NUM, (
    f"feature importance selection has {feature_imp.shape[1]} columns, expected {FEATURE_NUM}"
)
assert anova.shape[1] == FEATURE_NUM, (
    f"ANOVA selection has {anova.shape[1]} columns, expected {FEATURE_NUM}"
)
assert rfe.shape[1] == FEATURE_NUM, (
    f"RFE selection has {rfe.shape[1]} columns, expected {FEATURE_NUM}"
)