In [2]:
# Day 83 - Feature Selection Methods

import pandas as pd
import numpy as np
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, chi2, RFE
from sklearn.metrics import accuracy_score

# Load dataset
data = load_wine()
X, y = data.data, data.target
feature_names = data.feature_names

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [3]:
# ---------------- Filter Method (Chi-Square) ----------------
print("---- Filter Method: Chi-Square ----")
chi_selector = SelectKBest(score_func=chi2, k=5)
X_kbest = chi_selector.fit_transform(abs(X_train_scaled), y_train)
selected_features = np.array(feature_names)[chi_selector.get_support()]
print("Selected Features:", selected_features)

---- Filter Method: Chi-Square ----
Selected Features: ['flavanoids' 'color_intensity' 'hue' 'od280/od315_of_diluted_wines'
 'proline']


In [4]:
# ---------------- Wrapper Method (RFE) ----------------
print("\n---- Wrapper Method: RFE ----")
model = LogisticRegression(max_iter=200)
rfe = RFE(model, n_features_to_select=5)
rfe.fit(X_train_scaled, y_train)
rfe_features = np.array(feature_names)[rfe.support_]
print("Selected Features:", rfe_features)


---- Wrapper Method: RFE ----
Selected Features: ['alcohol' 'flavanoids' 'color_intensity' 'hue' 'proline']


In [6]:
# ---------------- Embedded Method (Feature Importance) ----------------
print("\n---- Embedded Method: Random Forest Importance ----")
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_scaled, y_train)
importances = pd.Series(rf.feature_importances_, index=feature_names).sort_values(ascending=False)
print(importances.head(5))


---- Embedded Method: Random Forest Importance ----
flavanoids                      0.202293
color_intensity                 0.171202
proline                         0.139046
alcohol                         0.112398
od280/od315_of_diluted_wines    0.111564
dtype: float64


In [7]:

# ---------------- Model Comparison ----------------
# Logistic Regression without FS
lr = LogisticRegression(max_iter=200)
lr.fit(X_train_scaled, y_train)
y_pred1 = lr.predict(X_test_scaled)

# Logistic Regression with RFE features
X_train_rfe = rfe.transform(X_train_scaled)
X_test_rfe = rfe.transform(X_test_scaled)
lr.fit(X_train_rfe, y_train)
y_pred2 = lr.predict(X_test_rfe)

print("\nAccuracy without FS:", accuracy_score(y_test, y_pred1))
print("Accuracy with RFE FS:", accuracy_score(y_test, y_pred2))


Accuracy without FS: 1.0
Accuracy with RFE FS: 1.0
