In [3]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE, VarianceThreshold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Load the dataset and separate the feature columns from the 'target' label.
data = pd.read_csv('data/breast_cancer.csv')
X = data.drop(columns=['target'])
y = data['target']

# Final model: L1-penalised logistic regression (liblinear supports 'l1').
logreg = LogisticRegression(max_iter=1000, solver='liblinear', penalty='l1', C=0.1)

# Preprocessing and feature-selection steps.
# NOTE(review): the variance filter runs on the raw, unscaled columns, so the
# 0.01 cut-off depends on each feature's units -- confirm this is intended.
var_thresh = VarianceThreshold(threshold=0.01)
scaler = StandardScaler()
# RFE fits clones of the estimator it is given, so handing it the same
# `logreg` instance that ends the pipeline does not make the two interfere.
rfe = RFE(estimator=logreg, n_features_to_select=5)

# Declare every step up front, in execution order:
# variance filter -> scaling -> recursive feature elimination -> classifier.
# (Previously the RFE step was spliced in afterwards by mutating
# `pipeline.steps`, which hid the real pipeline layout from the reader.)
pipeline = Pipeline([
    ('variance', var_thresh),
    ('scaler', scaler),
    ('rfe', rfe),
    ('classifier', logreg),
])

# Fit the pipeline
pipeline.fit(X, y)

# Read both masks from the fitted pipeline rather than from the outer
# variables, so the lookup stays correct even if the pipeline holds fitted
# copies of its steps (e.g. when `memory=` caching is enabled).
variance_mask = pipeline.named_steps['variance'].get_support()
selected_mask = pipeline.named_steps['rfe'].support_

# Column names that survived the variance threshold. The scaler keeps the
# number and order of columns, so the RFE mask lines up with these names.
features_after_var = X.columns[variance_mask]

# Get the final selected feature names
selected_features = features_after_var[selected_mask]
print("Selected features:", list(selected_features))



Selected features: ['radius error', 'worst radius', 'worst texture', 'worst compactness', 'worst concavity']
