# ***Feature Selection for Classification Dataset***


In [94]:
import pandas as pd

# Load the body-performance dataset; the path is relative to the directory
# the notebook is run from.
df=pd.read_csv("data/bodyPerformance.csv")
# Preview the first rows to sanity-check the column layout.
df.head()

Unnamed: 0,age,gender,height_cm,weight_kg,body fat_%,diastolic,systolic,gripForce,sit and bend forward_cm,sit-ups counts,broad jump_cm,class
0,27.0,M,172.3,75.24,21.3,80.0,130.0,54.9,18.4,60.0,217.0,C
1,25.0,M,165.0,55.8,15.7,77.0,126.0,36.4,16.3,53.0,229.0,A
2,31.0,M,179.6,78.0,20.1,92.0,152.0,44.8,12.0,49.0,181.0,C
3,32.0,M,174.5,71.1,18.4,76.0,147.0,41.4,15.2,53.0,219.0,B
4,28.0,M,173.8,67.7,17.1,70.0,127.0,43.5,27.1,45.0,217.0,B


In [95]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13393 entries, 0 to 13392
Data columns (total 12 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   age                      13393 non-null  float64
 1   gender                   13393 non-null  object 
 2   height_cm                13393 non-null  float64
 3   weight_kg                13393 non-null  float64
 4   body fat_%               13393 non-null  float64
 5   diastolic                13393 non-null  float64
 6   systolic                 13393 non-null  float64
 7   gripForce                13393 non-null  float64
 8   sit and bend forward_cm  13393 non-null  float64
 9   sit-ups counts           13393 non-null  float64
 10  broad jump_cm            13393 non-null  float64
 11  class                    13393 non-null  object 
dtypes: float64(10), object(2)
memory usage: 1.2+ MB


## ***Encode categorical columns so that correlations can be calculated***

In [96]:
from sklearn.preprocessing import LabelEncoder

def encode_object_columns(df):
    """
    Label-encode every object-dtype column and return the encoded frame.

    The input frame is left untouched: encoding is applied to a copy, so the
    raw data stays available and re-running the calling cell is safe.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose object-dtype columns should be replaced by label codes.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which each object-dtype column holds the output
        of a freshly fitted ``LabelEncoder``; other columns are unchanged.
    """
    encoded = df.copy()
    obj_cols = encoded.select_dtypes(include=["object"]).columns
    for col in obj_cols:
        # One encoder per column so every column gets its own label mapping.
        encoded[col] = LabelEncoder().fit_transform(encoded[col])
    return encoded

# Replace the object columns (gender, class per df.info()) with label codes.
# NOTE: this rebinds the notebook-wide `df`; every later cell sees encoded data.
df = encode_object_columns(df)
# Show the last rows to confirm the object columns are now numeric codes.
df.tail()

Unnamed: 0,age,gender,height_cm,weight_kg,body fat_%,diastolic,systolic,gripForce,sit and bend forward_cm,sit-ups counts,broad jump_cm,class
13388,25.0,1,172.1,71.8,16.2,74.0,141.0,35.8,17.4,47.0,198.0,2
13389,21.0,1,179.7,63.9,12.1,74.0,128.0,33.0,1.1,48.0,167.0,3
13390,39.0,1,177.2,80.5,20.1,78.0,132.0,63.5,16.4,45.0,229.0,0
13391,64.0,0,146.1,57.7,40.4,68.0,121.0,19.3,9.2,0.0,75.0,3
13392,34.0,1,164.0,66.1,19.5,82.0,150.0,35.9,7.1,51.0,180.0,2


In [97]:
import numpy as np

def corr_matrix(df,target_col):
    """
    Rank all columns by the absolute value of their correlation with the target.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame on which ``DataFrame.corr()`` is evaluated.
    target_col : str
        Column whose correlations with every column (itself included) are returned.

    Returns
    -------
    pandas.Series
        Absolute correlations with ``target_col``, sorted in descending order.
    """
    target_corr = df.corr()[target_col]
    ranked = target_corr.abs().sort_values(ascending=False)
    return ranked

def remove_highly_correlated_features(df, target_col, threshold=0.8):
    """
    Drop feature columns that are strongly correlated with an earlier feature column.

    The target column is excluded from the correlation computation. For every
    pair of remaining columns only the later one (in column order) is a
    candidate for removal; it is dropped when the absolute correlation of the
    pair exceeds ``threshold``. The list of dropped columns is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is copied, not modified.
    target_col : str
        Column left out of the pairwise correlation check.
    threshold : float, default 0.8
        Absolute correlation above which a column is removed.

    Returns
    -------
    tuple
        ``(pruned_frame, dropped_columns)``; the pruned frame still contains
        ``target_col``.
    """
    working = df.copy()
    abs_corr = working.drop(columns=target_col).corr().abs()

    # Keep only the strict upper triangle so each pair is inspected once and
    # the diagonal (self-correlation) is ignored.
    pair_mask = np.triu(np.ones(abs_corr.shape, dtype=bool), k=1)
    upper = abs_corr.where(pair_mask)

    to_drop = []
    for column in upper.columns:
        if (upper[column] > threshold).any():
            to_drop.append(column)

    print("Columns removed due to high correlation with other features:")
    print(to_drop)

    return working.drop(columns=to_drop), to_drop


# Name of the label column; later cells read this variable.
target_col = "class"

# Absolute correlation of every column with the target, strongest first.
correlation_matrix = corr_matrix(df, target_col)

correlation_matrix

class                      1.000000
sit and bend forward_cm    0.588123
sit-ups counts             0.452832
body fat_%                 0.341956
broad jump_cm              0.262154
weight_kg                  0.214129
gripForce                  0.136088
gender                     0.075605
diastolic                  0.066761
age                        0.065612
height_cm                  0.037753
systolic                   0.035484
Name: class, dtype: float64

### ***Correlation between features - features too highly correlated with each other should be removed (if any exist)***

In [98]:
# Drop any feature whose absolute correlation with another feature exceeds 0.9
# (stricter than the helper's default of 0.8). The target column is kept.
# NOTE: this rebinds the notebook-wide `df` to the pruned copy.
df, removed_columns = remove_highly_correlated_features(df,target_col,threshold=0.9)
# Preview the pruned frame.
df.head()

Columns removed due to high correlation with other features:
[]


Unnamed: 0,age,gender,height_cm,weight_kg,body fat_%,diastolic,systolic,gripForce,sit and bend forward_cm,sit-ups counts,broad jump_cm,class
0,27.0,1,172.3,75.24,21.3,80.0,130.0,54.9,18.4,60.0,217.0,2
1,25.0,1,165.0,55.8,15.7,77.0,126.0,36.4,16.3,53.0,229.0,0
2,31.0,1,179.6,78.0,20.1,92.0,152.0,44.8,12.0,49.0,181.0,2
3,32.0,1,174.5,71.1,18.4,76.0,147.0,41.4,15.2,53.0,219.0,1
4,28.0,1,173.8,67.7,17.1,70.0,127.0,43.5,27.1,45.0,217.0,1


## ***Check and impute missing values***

In [99]:
# Per-column count of missing values. Printed explicitly because a bare
# expression is only rendered when it is the last statement of a cell, and
# this cell continues with a function definition.
print(df.isnull().sum())

# Use simple imputation of mean to fill missing values:
from sklearn.impute import SimpleImputer
def impute_missing_values(df):
    """
    Fill missing values in ``df`` with the mean of the respective column.

    Returns the result of ``SimpleImputer.fit_transform`` for ``df`` unchanged,
    i.e. the imputer's output is not wrapped back into a DataFrame here.
    """
    mean_imputer = SimpleImputer(strategy='mean')
    return mean_imputer.fit_transform(df)

### **Without feature selection**

In [100]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Baseline: train on every remaining feature, without any feature selection.
X=df.drop(columns=[target_col])
y=df[target_col]

# 80/20 hold-out split with a fixed seed (not stratified).
# X_train / X_test / y_train / y_test are reused by the ANOVASelector cell below.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
# Accuracy on the held-out 20 %.
y_pred = rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of Random Forest Classifier: {accuracy:.2f}")




Accuracy of Random Forest Classifier: 0.75


### **With feature selection - ANOVASelector**

In [101]:
from kydavra import ANOVASelector
from sklearn.preprocessing import StandardScaler

# Run feature selection on the training rows only. Selecting on the full
# frame would let the held-out test rows influence which columns are kept
# (data leakage), which makes the reported test accuracy optimistic.
anova_train_df = pd.concat([X_train, y_train], axis=1)

selector = ANOVASelector(significance_level=0.05, classification=True)
selected_cols = selector.select(anova_train_df, y_column=target_col)

# Restrict the existing train/test split to the selected columns.
X_train_selected = X_train[selected_cols]
X_test_selected = X_test[selected_cols]

# Fit the scaler on the training split only, then apply it to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_selected)
X_test_scaled = scaler.transform(X_test_selected)

# Same model settings as the baseline so the accuracies are comparable.
rf_scaled = RandomForestClassifier(n_estimators=100, random_state=42)
rf_scaled.fit(X_train_scaled, y_train)

y_pred_scaled = rf_scaled.predict(X_test_scaled)
accuracy_scaled = accuracy_score(y_test, y_pred_scaled)
print(f"Accuracy of Scaled Random Forest Classifier: {accuracy_scaled:.2f}")


Accuracy of Scaled Random Forest Classifier: 0.75


### ***Feature selection + Hyperparameters***

In [102]:
# Rebuild X / y from a fresh copy of the encoded, pruned frame so the next
# cell can create its own train/test split.
df_copy = df.copy()
X = df_copy.drop('class', axis=1)
y = df_copy['class']


In [None]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import pandas as pd

from kydavra import PCAReducer

# Stratified 80/20 split with a fixed seed. This overwrites the X_train /
# X_test / y_train / y_test globals created by the baseline cell.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# reducer.reduce() receives features and target in a single frame.
train_df = pd.concat([X_train, y_train], axis=1)

# reduce() is called on the training frame only; apply() is then called on
# the test frame. Presumably apply() reuses what reduce() learned — confirm
# against the kydavra documentation.
reducer = PCAReducer(min_corr=0.5, max_corr=0.8, correlation_type='pearson')
train_reduced_df = reducer.reduce(train_df, y_column='class')

test_df = pd.concat([X_test, y_test], axis=1)
test_reduced_df = reducer.apply(test_df)

# Separate the target column from the reduced frames again.
X_train_reduced = train_reduced_df.drop(columns=['class'])
X_test_reduced = test_reduced_df.drop(columns=['class'])

# Scaler is fitted on the training split only, then applied to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_reduced)
X_test_scaled = scaler.transform(X_test_reduced)

rf = RandomForestClassifier(random_state=42)

# Search space: 4 * 4 * 2 = 32 combinations, of which n_iter=10 are sampled.
param_dist = {
    'n_estimators': [50, 100, 150, 200],
    'max_depth': [None, 10, 20, 30],
    'max_features': ['sqrt', 'log2']
}

# Randomized search with 5-fold cross-validation on the training split,
# using all available cores (n_jobs=-1).
random_search = RandomizedSearchCV(
    rf,
    param_distributions=param_dist,
    n_iter=10,
    cv=5,
    scoring='accuracy',
    random_state=42,
    n_jobs=-1
)
random_search.fit(X_train_scaled, y_train)

# Evaluate the best estimator found by the search on the held-out test split.
best_rf = random_search.best_estimator_
y_pred = best_rf.predict(X_test_scaled)
test_accuracy = accuracy_score(y_test, y_pred)

# NOTE: per scikit-learn, best_score_ is the mean cross-validated score of the
# best candidate, i.e. computed on validation folds within the training split.
print(f"\nNew columns after PCAReducer: {list(X_train_reduced.columns)}")
print("\nBest params:", random_search.best_params_)
print("Best RandomizedSearchCV score (on training data):", random_search.best_score_)
print(f"\nFinal accuracy on test data: {test_accuracy:.4f}")



New columns after PCAReducer: ['age', 'gender', 'height_cm', 'weight_kg', 'body fat_%', 'diastolic', 'systolic', 'gripForce', 'sit and bend forward_cm', 'sit-ups counts', 'broad jump_cm']

Best params: {'n_estimators': 150, 'max_features': 'log2', 'max_depth': 30}
Best RandomizedSearchCV score (on training data): 0.7289517518004247

Final accuracy on test data: 0.7536
