# Feature selection

In [1]:
import random

import numpy as np
from sklearn_genetic import GAFeatureSelectionCV
from xgboost import XGBClassifier

import afdetection.utils.paths as path
from afdetection.data.make_dataset import MakeDataset
from afdetection.features.build_features import BuildFeatures

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the raw CSV through the project's dataset helper.
make_data = MakeDataset()
dataset_DIR = path.data_raw_dir('dataset.csv')
dataset = make_data.read_from_csv(dataset_DIR)

# Split the loaded data into the feature matrix `X` and the target `y`.
# The target column is listed in `drop_cols` as well, alongside 'diagnosi'
# (presumably so neither ends up among the features -- confirm against
# BuildFeatures.features_target_split).
build_features = BuildFeatures()
target_col = 'ritmi'
X, y = build_features.features_target_split(
    dataset=dataset,
    target=target_col,
    drop_cols=['diagnosi', target_col],
)

## Genetic algorithm feature selection

In [3]:
# Genetic-algorithm feature selection wrapped around an XGBoost classifier.
#
# The search is stochastic, so the global RNGs are seeded first; without this
# every "Restart & Run All" can yield a different feature subset.
# NOTE(review): assumes the GA draws from Python's `random` and/or NumPy's
# global generator -- confirm against the installed sklearn-genetic-opt version.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Estimator
clf = XGBClassifier(random_state=SEED)

selector = GAFeatureSelectionCV(estimator=clf,
                                cv=2,
                                scoring="accuracy",
                                population_size=30,
                                generations=2,
                                n_jobs=-1,
                                verbose=True,
                                keep_top_k=1,
                                elitism=True)

selector.fit(X, y)

# Coerce the mask to a boolean ndarray: `~` on a plain Python list of bools
# raises TypeError, so this keeps the inversion below safe either way.
support = np.asarray(selector.best_features_, dtype=bool)

selected_ga = X.columns[support].to_list()
# The variable keeps its original (misspelled) name in case later cells use it;
# only the printed label is corrected.
discarted_ga = X.columns[~support].to_list()

print('Selected features:\n', selected_ga)
print('Discarded features:\n', discarted_ga)

gen	nevals	fitness 	fitness_std	fitness_max	fitness_min
0  	30    	0.770717	0.00793486 	0.78547    	0.754823   
1  	60    	0.77737 	0.00570283 	0.78547    	0.766024   
2  	60    	0.779506	0.00419313 	0.78547    	0.768046   
Selected features:
 ['I_cD1_energy', 'I_cD4_mean', 'I_cD3_mean', 'I_cD2_mean', 'I_cA5_varance', 'I_cD5_varance', 'I_cD4_varance', 'I_cD1_varance', 'I_cD5_std_dev', 'I_cD4_std_dev', 'I_cD3_std_dev', 'II_cA5_energy', 'II_cD5_energy', 'II_cD2_energy', 'II_cD4_mean', 'II_cD3_mean', 'II_cD2_mean', 'II_cA5_varance', 'II_cD3_varance', 'II_cD1_varance', 'II_cD5_std_dev', 'II_cD4_std_dev', 'II_cD2_std_dev', 'III_cA5_energy', 'III_cD1_energy', 'III_cD5_mean', 'III_cD4_mean', 'III_cD5_varance', 'III_cD2_varance', 'III_cD5_std_dev', 'aVF_cA5_energy', 'aVF_cD5_energy', 'aVF_cD4_energy', 'aVF_cD1_energy', 'aVF_cA5_mean', 'aVF_cD4_mean', 'aVF_cD3_mean', 'aVF_cD1_mean', 'aVF_cD3_varance', 'aVF_cD1_varance', 'aVF_cD5_std_dev', 'aVF_cD3_std_dev', 'aVF_cD2_std_dev', 'aVR_cA5_energy', 

In [None]:
# Selected features: hard-coded copy of the list printed by the genetic-algorithm
# selection cell above (that cell's saved output is cut off mid-list).
# NOTE(review): this is a bare list expression, not bound to any name, and it
# will not track a re-run of the selector -- confirm whether it is still needed.
['I_cD1_energy', 'I_cD4_mean', 'I_cD3_mean', 'I_cD2_mean', 'I_cA5_varance', 'I_cD5_varance', 'I_cD4_varance', 'I_cD1_varance', 'I_cD5_std_dev', 'I_cD4_std_dev', 'I_cD3_std_dev', 'II_cA5_energy', 'II_cD5_energy', 'II_cD2_energy', 'II_cD4_mean', 'II_cD3_mean', 'II_cD2_mean', 'II_cA5_varance', 'II_cD3_varance', 'II_cD1_varance', 'II_cD5_std_dev', 'II_cD4_std_dev', 'II_cD2_std_dev', 'III_cA5_energy', 'III_cD1_energy', 'III_cD5_mean', 'III_cD4_mean', 'III_cD5_varance', 'III_cD2_varance', 'III_cD5_std_dev', 'aVF_cA5_energy', 'aVF_cD5_energy', 'aVF_cD4_energy', 'aVF_cD1_energy', 'aVF_cA5_mean', 'aVF_cD4_mean', 'aVF_cD3_mean', 'aVF_cD1_mean', 'aVF_cD3_varance', 'aVF_cD1_varance', 'aVF_cD5_std_dev', 'aVF_cD3_std_dev', 'aVF_cD2_std_dev', 'aVR_cA5_energy', 'aVR_cD5_energy', 'aVR_cD3_energy', 'aVR_cA5_mean', 'aVR_cD4_mean', 'aVR_cD3_mean', 'aVR_cD2_mean', 'aVR_cD3_varance', 'aVR_cD1_varance', 'aVR_cD5_std_dev', 'aVR_cD4_std_dev', 'aVR_cD2_std_dev', 'aVR_cD1_std_dev', 'aVL_cA5_energy', 'aVL_cD2_energy', 'aVL_cA5_mean', 'aVL_cD4_mean', 'aVL_cD3_mean', 'aVL_cD1_mean', 'aVL_cD3_std_dev', 'aVL_cD1_std_dev', 'V1_cA5_energy', 'V1_cD3_energy', 'V1_cD1_energy', 'V1_cA5_mean', 'V1_cD5_mean', 'V1_cD3_mean', 'V1_cD4_varance', 'V1_cD2_varance', 'V1_cD1_varance', 'V1_cD5_std_dev', 'V1_cD3_std_dev', 'V1_cD1_std_dev', 'V2_cD4_energy', 'V2_cD3_energy', 'V2_cD2_energy', 'V2_cD1_energy', 'V2_cA5_mean', 'V2_cD4_mean', 'V2_cD3_mean', 'V2_cD1_mean', 'V2_cA5_varance', 'V2_cD4_varance', 'V2_cD1_varance', 'V2_cD5_std_dev', 'V2_cD4_std_dev', 'V2_cD3_std_dev', 'V2_cD2_std_dev', 'V3_cD5_energy', 'V3_cD4_energy', 'V3_cD3_energy', 'V3_cD1_energy', 'V3_cD4_mean', 'V3_cD3_mean', 'V3_cD1_mean', 'V3_cD4_varance', 'V3_cD3_varance', 'V3_cD3_std_dev', 'V4_cA5_energy', 'V4_cD5_energy', 'V4_cD1_energy', 'V4_cA5_mean', 'V4_cD3_mean', 'V4_cD2_mean', 'V4_cD1_mean', 'V4_cA5_varance', 'V4_cD5_varance', 'V4_cD3_varance', 'V4_cD2_varance', 'V4_cD1_std_dev', 'V5_cD3_energy', 'V5_cD1_energy', 'V5_cD5_mean', 'V5_cD4_mean', 
'V5_cD2_mean', 'V5_cD5_varance', 'V5_cD4_varance', 'V5_cD2_varance', 'V5_cD5_std_dev', 'V5_cD4_std_dev', 'V5_cD3_std_dev', 'V6_cA5_energy', 'V6_cD3_energy', 'V6_cD4_mean', 'V6_cD1_mean', 'V6_cA5_varance', 'V6_cD4_varance', 'V6_cD2_varance', 'V6_cD1_varance', 'V6_cD5_std_dev', 'V6_cD4_std_dev', 'V6_cD3_std_dev', 'V6_cD2_std_dev', 'V6_cD1_std_dev', 'aVL_dFreq', 'V1_dFreq', 'V2_dFreq', 'V4_dFreq', 'I_specEn', 'II_specEn', 'III_specEn', 'aVR_specEn', 'V1_specEn', 'V2_specEn', 'V6_specEn', 'age', 'weight']