## **Feature Selection**

In [1]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn import metrics
import numpy as np
import pandas as pd

**Breast Cancer Wisconsin (Diagnostic) Dataset**

*   569 instances (212 Malignant, 357 Benign)
*   30 numerical features (computed from a digitized image of a breast mass)
*   2 classes (Malignant, Benign)

In [2]:
# Load the Breast Cancer Wisconsin (Diagnostic) data and split it 50/50 into
# train and test partitions; the fixed random_state keeps the split repeatable.
wisconsin = datasets.load_breast_cancer()
TrainX, TestX, TrainY, TestY = train_test_split(
    wisconsin.data,
    wisconsin.target,
    train_size=0.5,
    random_state=999,
)

Add random noise features (10 extra columns of Gaussian noise that carry no information about the label)

In [3]:
# Append 10 random values to every patient (simulating 10 useless features
# having been collected alongside the real ones).
# The noise comes from a seeded generator so that Restart & Run All reproduces
# the same augmented matrices; the unseeded global RNG gave different noise
# (and therefore different downstream scores) on every run.
NOISE_SEED = 999        # same value the train/test split uses for random_state
N_NOISE_FEATURES = 10   # number of uninformative columns to append
noise_rng = np.random.default_rng(NOISE_SEED)

# Train noise is drawn first, then test noise, so the draw order is fixed too.
rTrainX = np.concatenate(
    (TrainX, noise_rng.standard_normal((TrainX.shape[0], N_NOISE_FEATURES))),
    axis=1,
)
rTestX = np.concatenate(
    (TestX, noise_rng.standard_normal((TestX.shape[0], N_NOISE_FEATURES))),
    axis=1,
)
print(rTrainX.shape)

(284, 40)


In [4]:
# Baseline accuracy when all 40 features (original + noise columns) are used.
model = GaussianNB()
model.fit(rTrainX, TrainY)

# Predict both partitions first, then score each prediction against its labels.
tr_pred, ts_pred = model.predict(rTrainX), model.predict(rTestX)
tr_acc = metrics.accuracy_score(y_true=TrainY, y_pred=tr_pred)
ts_acc = metrics.accuracy_score(y_true=TestY, y_pred=ts_pred)

print('Training Accuracy : ', tr_acc)
print('Test Accuracy : ', ts_acc)

Training Accuracy :  0.9507042253521126
Test Accuracy :  0.9438596491228071


**Filter Methods: Select the best K features**

In [6]:
# Select only k of the 40 features.
# NOTE(review): chi2 is imported but not used in the cells visible here.
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, mutual_info_classif

In [7]:
# Use mutual_info_classif to keep only the 5 best-scoring features.
# Mutual information is an entropy-related score of how strongly a feature's
# distribution depends on the class label.
# mutual_info_classif is randomized (it accepts a random_state), so the seed is
# pinned here; without it the scores -- and possibly the 5 selected features --
# can change from one run to the next.
MI_SEED = 999  # same value the train/test split uses for random_state


def seeded_mutual_info(X, y):
    """Score every column of X against y with mutual_info_classif, using a fixed seed."""
    return mutual_info_classif(X, y, random_state=MI_SEED)


fs_kbest = SelectKBest(seeded_mutual_info, k=5)
fs_kbest.fit(rTrainX, TrainY)          # run score function on the data

SelectKBest(k=5, score_func=<function mutual_info_classif at 0x7fdef2709ae8>)

In [8]:
# Show the score the selector computed for each feature.
print(
    'Feature Score : ',
    fs_kbest.scores_,
)

Feature Score :  [0.34122313 0.07928106 0.36739094 0.34990094 0.07677956 0.20077904
 0.33149856 0.41843362 0.1134013  0.02360209 0.19226598 0.02155257
 0.24541379 0.31792553 0.02551535 0.06101159 0.099352   0.09301305
 0.04521892 0.03278512 0.39733898 0.12098763 0.42872919 0.42351673
 0.09789187 0.20582183 0.25953347 0.40387076 0.0522205  0.06886198
 0.         0.         0.         0.         0.         0.00589481
 0.         0.         0.02500263 0.        ]


In [9]:
# Rank features by score and keep the indices of the five highest.
scores = [score for score in fs_kbest.scores_]
# argsort yields feature indices in ascending-score order, not the sorted scores.
scores_idx = np.asarray(scores).argsort()
best5feats = scores_idx[-5:].copy()   # indices of the five largest scores
best5feats.sort()                     # list them in ascending index order
best5feats

# In the recorded run, none of the 10 random columns (features 31-40) were selected.

array([ 7, 20, 22, 23, 27])

In [10]:
# Print the 5 selected feature values for the sample at row index 1.
# NOTE(review): the original comment said "first" person, but index 1 is the
# second row -- confirm whether index 0 was intended.
print(rTrainX[1][best5feats])

[2.944e-02 1.566e+01 1.012e+02 7.500e+02 7.453e-02]


In [11]:
# Same result as the manual indexing above, but produced by the fitted
# selector: transform() keeps only the K best columns of each partition.
sTrainX, sTestX = (fs_kbest.transform(part) for part in (rTrainX, rTestX))

print(sTrainX[1, :])

[2.944e-02 1.566e+01 1.012e+02 7.500e+02 7.453e-02]


In [13]:
# Refit Gaussian Naive Bayes on the filter-selected columns only.
model_filt = GaussianNB()
model_filt.fit(sTrainX, TrainY)

# Predictions for both partitions, followed by their accuracies.
tr_pred2, ts_pred2 = model_filt.predict(sTrainX), model_filt.predict(sTestX)
tr_acc2 = metrics.accuracy_score(y_true=TrainY, y_pred=tr_pred2)
ts_acc2 = metrics.accuracy_score(y_true=TestY, y_pred=ts_pred2)

print('Filter Training Accuracy : ', tr_acc2)
print('Filter Test Accuracy : ', ts_acc2)

# Accuracy is almost the same whether 40 features or only 5 are used.

Filter Training Accuracy :  0.9577464788732394
Filter Test Accuracy :  0.9403508771929825


**Wrapper Methods**

In [None]:
# NOTE: mlxtend was reported as not working in the author's environment.
# NOTE(review): the wrapper-method cells below show no execution count, so they
# appear not to have been run -- confirm mlxtend is installed before relying on them.
from mlxtend.feature_selection import SequentialFeatureSelector as sf

Sequential Forward Selection (SFS)

In [None]:
# Sequential forward selection wrapped around Gaussian Naive Bayes:
# forward=True with floating=False, scored by accuracy under cv=5.
model_base = GaussianNB()

SFS = sf(
    model_base,
    k_features="best",
    forward=True,
    floating=False,
    scoring='accuracy',
    verbose=0,
    cv=5,
)
SFS.fit(rTrainX, TrainY)

print('SFS Selected Features : ', SFS.k_feature_idx_)

In [None]:
# Restrict both partitions to the columns chosen by SFS.
selected_feats = list(SFS.k_feature_idx_)
sfsTrainX, sfsTestX = rTrainX[:, selected_feats], rTestX[:, selected_feats]

# Fit a fresh Gaussian Naive Bayes model on the reduced feature set.
model_sfs = GaussianNB()
model_sfs.fit(sfsTrainX, TrainY)

# Predict both partitions, then measure accuracy on each.
tr_pred3, ts_pred3 = model_sfs.predict(sfsTrainX), model_sfs.predict(sfsTestX)
tr_acc3 = metrics.accuracy_score(y_true=TrainY, y_pred=tr_pred3)
ts_acc3 = metrics.accuracy_score(y_true=TestY, y_pred=ts_pred3)

print('SFS Training Accuracy : ', tr_acc3)
print('SFS Test Accuracy : ', ts_acc3)

Sequential Backward Selection (SBS)

In [None]:
# Sequential backward selection wrapped around Gaussian Naive Bayes:
# forward=False with floating=False, scored by accuracy under cv=5.
model_base = GaussianNB()

SBS = sf(
    model_base,
    k_features="best",
    forward=False,
    floating=False,
    scoring='accuracy',
    verbose=0,
    cv=5,
)
SBS.fit(rTrainX, TrainY)

print('SBS Selected Features : ', SBS.k_feature_idx_)

In [None]:
# Restrict both partitions to the columns chosen by SBS.
selected_feats = list(SBS.k_feature_idx_)
sbsTrainX, sbsTestX = rTrainX[:, selected_feats], rTestX[:, selected_feats]

# Fit a fresh Gaussian Naive Bayes model on the reduced feature set.
model_sbs = GaussianNB()
model_sbs.fit(sbsTrainX, TrainY)

# Predict both partitions, then measure accuracy on each.
tr_pred4, ts_pred4 = model_sbs.predict(sbsTrainX), model_sbs.predict(sbsTestX)
tr_acc4 = metrics.accuracy_score(y_true=TrainY, y_pred=tr_pred4)
ts_acc4 = metrics.accuracy_score(y_true=TestY, y_pred=ts_pred4)

print('SBS Training Accuracy : ', tr_acc4)
print('SBS Test Accuracy : ', ts_acc4)

Sequential Floating Forward Selection (SFFS)

In [None]:
# Sequential floating forward selection wrapped around Gaussian Naive Bayes:
# forward=True with floating=True, scored by accuracy under cv=5.
model_base = GaussianNB()

SFFS = sf(
    model_base,
    k_features="best",
    forward=True,
    floating=True,
    scoring='accuracy',
    verbose=0,
    cv=5,
)
SFFS.fit(rTrainX, TrainY)

print('SFFS Selected Features : ', SFFS.k_feature_idx_)

In [None]:
# Restrict both partitions to the columns chosen by SFFS.
selected_feats = list(SFFS.k_feature_idx_)
sffsTrainX, sffsTestX = rTrainX[:, selected_feats], rTestX[:, selected_feats]

# Fit a fresh Gaussian Naive Bayes model on the reduced feature set.
model_sffs = GaussianNB()
model_sffs.fit(sffsTrainX, TrainY)

# Predict both partitions, then measure accuracy on each.
tr_pred5, ts_pred5 = model_sffs.predict(sffsTrainX), model_sffs.predict(sffsTestX)
tr_acc5 = metrics.accuracy_score(y_true=TrainY, y_pred=tr_pred5)
ts_acc5 = metrics.accuracy_score(y_true=TestY, y_pred=ts_pred5)

print('SFFS Training Accuracy : ', tr_acc5)
print('SFFS Test Accuracy : ', ts_acc5)