# Feature selection

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split

from bcd.utils.paths import data_interim_dir, data_processed_dir

## Carga de datos

In [3]:
df = pd.read_csv(data_interim_dir("dataset.csv"))
numeric_features = ['age', 'tumor_size_(cm)']
binary_features = ['menopause', 'inv-nodes', 'breast', 'metastasis', 'history']
categorical_features = ['breast_quadrant']

target = 'diagnosis_result'

display(df.head())

Unnamed: 0,s/n,year,age,menopause,tumor_size_(cm),inv-nodes,breast,metastasis,breast_quadrant,history,diagnosis_result
0,1,2019,40,1,2.0,0,Right,0,Upper inner,0,Benign
1,2,2019,39,1,2.0,0,Left,0,Upper outer,0,Benign
2,3,2019,45,0,4.0,0,Left,0,Lower outer,0,Benign
3,4,2019,26,1,3.0,0,Left,0,Lower inner,1,Benign
4,5,2019,21,1,1.0,0,Right,0,Upper outer,1,Benign


## Convertir 'diagnosis_result' (target) y 'breast' a binario

In [4]:
le_target = LabelEncoder()
df[target] = le_target.fit_transform(df[target])


print("Correspondencia de etiquetas:")
for i, label in enumerate(le_target.classes_):
    print(f"{label}: {i}")

Correspondencia de etiquetas:
Benign: 0
Malignant: 1


In [5]:
le_breast = LabelEncoder()
df['breast'] = le_breast.fit_transform(df['breast'])


print("Correspondencia de etiquetas:")
for i, label in enumerate(le_breast.classes_):
    print(f"{label}: {i}")

Correspondencia de etiquetas:
Left: 0
Right: 1


## Separar características y variable objetivo

In [6]:
# Place the (X, y) classification problem
X = df[numeric_features + binary_features + categorical_features]
y = df[target]

## Conjunto de entrenamiento y prueba

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

## Pipeline para el preprocesamiento

Nota: ya que en la selección de características vamos a usar un `RandomForestClassifier` no es necesarió escalar las variables numéricas. En caso que se quiera usar otro clasificador suceptible a la escala de la las variables, se suquiere tranformar los datos.

In [8]:
# Combine preprocessors
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numeric_features),
        ('bin', 'passthrough', binary_features),
        ('cat', OneHotEncoder(drop='first', sparse_output=False), categorical_features)
    ])

## Pipeline y ajuste del modelo

In [9]:
rf_selector = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('selector', SelectFromModel(RandomForestClassifier(
            n_estimators=100, random_state=42), threshold='median')),
    ]
)

rf_selector.fit(X_train, y_train)

## Selección de características

In [10]:
selected_features_mask = rf_selector.named_steps['selector'].get_support()
all_features = (
    numeric_features +
    binary_features +
    [f"{feat}_{val}" for feat in categorical_features
     for val in X[feat].unique() if val != X[feat].unique()[0]]
)
selected_features = [feature for feature, selected in zip(
    all_features, selected_features_mask) if selected]

print("Características seleccionadas:")
print(selected_features)

Características seleccionadas:
['age', 'tumor_size_(cm)', 'inv-nodes', 'metastasis', 'breast_quadrant_Lower inner']


## Transformación de los datos de entrenamiento y prueba

In [11]:
# Transformar los datos de entrenamiento y prueba
X_train_selected = rf_selector.transform(X_train)
X_test_selected = rf_selector.transform(X_test)

print(f"Forma de X_train_selected: {X_train_selected.shape}")
print(f"Forma de X_test_selected: {X_test_selected.shape}")

Forma de X_train_selected: (142, 5)
Forma de X_test_selected: (62, 5)


## Prueba de rendimiento del modelo con las características seleccionadas

In [12]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_selected, y_train)
accuracy = rf_model.score(X_test_selected, y_test)
print(f"Precisión del modelo con características seleccionadas: {accuracy:.4f}")

Precisión del modelo con características seleccionadas: 0.9032


In [13]:
df_train = pd.concat([pd.DataFrame(
    X_train_selected, columns=selected_features), y_train.reset_index(drop=True)], axis=1)
df_test = pd.concat([pd.DataFrame(
    X_test_selected, columns=selected_features), y_test.reset_index(drop=True)], axis=1)

## Guardar datos de entrenamiento y prueba

In [14]:
df_train.to_csv(data_processed_dir("train.csv"), index=False)
df_test.to_csv(data_processed_dir("test.csv"), index=False)