### Отбор признаков: recursive feature selection

#### sklearn.featureselection.RFE

***in***:
- estimator - оценщик машинного обучения, который может выдать важность признаков за счет атрибутов coef или featureimportances attributes;
- nfeaturestoselect - количество признаков для выбора. Отбирает половину по умолчанию;
- step: int, (0, 1) - количество признаков для удаления (целое число, отношение).

***return***:
- ranking - ранжирование признаков;
- nfeatures - количество выбранных признаков;
- support - массив, указывающий, был ли выбран признак.

In [103]:
import os 
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RepeatedStratifiedKFold, StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import RFE, RFECV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
seed = 32

In [122]:
PATH_DIR = '../../bases'
FILE_NAME = 'Heart.csv'

PATH_FILE = os.path.join(PATH_DIR, FILE_NAME)
os.path.isfile(PATH_FILE)

True

In [123]:
df = pd.read_csv(PATH_FILE, index_col=0)
print('raw: ', df.shape)
display(df.head())
df.dropna(inplace=True)
print('dropped NaN', df.shape)

raw:  (303, 14)


Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal,AHD
1,63,1,typical,145,233,1,2,150,0,2.3,3,0.0,fixed,No
2,67,1,asymptomatic,160,286,0,2,108,1,1.5,2,3.0,normal,Yes
3,67,1,asymptomatic,120,229,0,2,129,1,2.6,2,2.0,reversable,Yes
4,37,1,nonanginal,130,250,0,0,187,0,3.5,3,0.0,normal,No
5,41,0,nontypical,130,204,0,2,172,0,1.4,1,0.0,normal,No


dropped NaN (297, 14)


In [124]:
df['AHD'].map({'No':0, 'Yes': 1})

obj_cols = np.where(df.dtypes == 'object')[0]
obj_cols = df.columns[obj_cols]

le = LabelEncoder()

for col in obj_cols:
    df[col] = le.fit_transform(df[col])
    
df.head() 

Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal,AHD
1,63,1,3,145,233,1,2,150,0,2.3,3,0.0,0,0
2,67,1,0,160,286,0,2,108,1,1.5,2,3.0,1,1
3,67,1,0,120,229,0,2,129,1,2.6,2,2.0,2,1
4,37,1,1,130,250,0,0,187,0,3.5,3,0.0,1,0
5,41,0,2,130,204,0,2,172,0,1.4,1,0.0,1,0


In [125]:
X = df.iloc[:, :-1].values
y = df.iloc[:, 1].values
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=seed)

#### Feature selection using Recursive Feature Elemination

In [85]:
# 1
esitmator = GradientBoostingClassifier()
selector = RFE(estimator=esitmator, n_features_to_select=6)
selector.fit(X_train, y_train)

res1_support = selector.support_
res1_ranking = selector.ranking_

print(res1_support)
print(res1_ranking)

[ True  True False  True  True False False  True False False False  True
 False]
[1 1 5 1 1 7 6 1 4 2 3 1 8]


In [86]:
# 
res1 = pd.DataFrame(np.c_[selector.support_, selector.ranking_],
             index=df.columns[:-1], 
             columns=['sup1', 'rank1'])
res1                    

Unnamed: 0,sup1,rank1
Age,1,1
Sex,1,1
ChestPain,0,5
RestBP,1,1
Chol,1,1
Fbs,0,7
RestECG,0,6
MaxHR,1,1
ExAng,0,4
Oldpeak,0,2


In [87]:
# 2

seed =32
esitmator = GradientBoostingClassifier(random_state=seed)
selector = RFE(estimator=esitmator, n_features_to_select=6)

pipe = Pipeline([
    ('feature_selection', selector),
    ('model', model)])

cv = RepeatedStratifiedKFold(n_splits=10,
                             n_repeats=5,
                             random_state=seed)
n_scores = cross_val_score(pipe, X_train, y_train, scoring='accuracy',
                           cv=cv, n_jobs=-1)

res = np.mean(n_scores).round(4)

print(res)

1.0


In [88]:
pipe.fit(X_train, y_train)
res2 = pd.DataFrame(np.c_[selector.support_, selector.ranking_],
             index=df.columns[:-1], 
             columns=['sup2', 'rank2'])
pd.concat([res1, res2], axis=1)

Unnamed: 0,sup1,rank1,sup2,rank2
Age,1,1,0,2
Sex,1,1,1,1
ChestPain,0,5,0,3
RestBP,1,1,1,1
Chol,1,1,1,1
Fbs,0,7,0,7
RestECG,0,6,0,8
MaxHR,1,1,1,1
ExAng,0,4,0,5
Oldpeak,0,2,1,1


#### Автоматический отбор признаков
sklearn.featureselection.RFECV

***in***:
- estimator - аналог класса RFE;
- min_features_to_select
- cv
***out***:
- nfeatures - оптимальное количество признаков;
- support - массив, содержащий информацию о выборе признака;
- ranking - ранжирование признаков;
- gridscores - оценка, полученная в результате кросс-валидации.


In [89]:
# 1
estimator = GradientBoostingClassifier(random_state=seed)
rfecv = RFECV(estimator=estimator)
pipeline = Pipeline([('feature_selection', rfecv), ('model', model)])
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=5, random_state=seed)
n_scores = cross_val_score(pipeline, X_train, y_train, scoring='accuracy')
np.mean(n_scores).round(4)

1.0

In [90]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
estimator = GradientBoostingClassifier(random_state=seed)
selector = RFECV(estimator=estimator, min_features_to_select=6, cv=skf, n_jobs=-1, scoring='accuracy')
selector.fit(X_train, y_train)

RFECV(cv=StratifiedKFold(n_splits=5, random_state=32, shuffle=True),
      estimator=GradientBoostingClassifier(random_state=32),
      min_features_to_select=6, n_jobs=-1, scoring='accuracy')