## **Feature Selection**

In [None]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn import metrics
import numpy as np
import pandas as pd

**Breast Cancer Wisconsin (Diagnostic) Dataset**

*   569 instances (212 Malignant, 357 Benign)
*   30 numerical features (computed from a digitized image of a breast mass)
*   2 classes (Malignant, Benign)

In [None]:
wisconsin = datasets.load_breast_cancer()

# Split the samples 50/50 into train and test; the fixed random_state keeps
# the partition identical across runs.
TrainX, TestX, TrainY, TestY = train_test_split(
    wisconsin.data,
    wisconsin.target,
    train_size=0.5,
    random_state=999,
)

Add random noise

In [None]:
# Append 10 columns of standard-normal noise to every sample, simulating the
# case where 10 irrelevant features were collected alongside the real ones.
# A dedicated seeded generator is used so the noise (and therefore every
# result derived from rTrainX / rTestX) is identical on each full re-run.
noise_rng = np.random.default_rng(999)
n_noise = 10

rTrainX = np.concatenate(
    (TrainX, noise_rng.standard_normal((TrainX.shape[0], n_noise))), axis=1
)
rTestX = np.concatenate(
    (TestX, noise_rng.standard_normal((TestX.shape[0], n_noise))), axis=1
)
print(rTrainX.shape)

In [None]:
# Baseline: Gaussian naive Bayes trained on all 40 columns
# (the 30 original features plus the 10 noise columns).
model = GaussianNB().fit(rTrainX, TrainY)

# Predict both splits, then score each against its labels.
tr_pred = model.predict(rTrainX)
ts_pred = model.predict(rTestX)

tr_acc = metrics.accuracy_score(y_true=TrainY, y_pred=tr_pred)
ts_acc = metrics.accuracy_score(y_true=TestY, y_pred=ts_pred)

print('Training Accuracy : ', tr_acc)
print('Test Accuracy : ', ts_acc)

**Filter Methods: Select the best K features**

In [None]:
# Filter approach: keep only k of the 40 features.
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, mutual_info_classif

In [None]:
# Keep the 5 features with the highest mutual information with the label.
# Mutual information measures how much knowing a feature reduces the
# uncertainty (entropy) of the class, i.e. how strongly the feature's
# distribution depends on the class.
def seeded_mutual_info(X, y):
    """Score features with mutual_info_classif using a fixed random_state.

    mutual_info_classif is stochastic unless random_state is set, so without
    a seed the scores (and possibly the selected columns) change between runs.
    """
    return mutual_info_classif(X, y, random_state=999)


fs_kbest = SelectKBest(seeded_mutual_info, k=5)
fs_kbest.fit(rTrainX, TrainY)          # run score function on the data

In [None]:
# Show the score SelectKBest computed for each of the input columns.
print('Feature Score : ', fs_kbest.scores_)

In [None]:
# Recover the indices of the 5 best-scoring columns by hand.
scores = list(fs_kbest.scores_)
scores_idx = np.argsort(scores) # argsort returns column indices in ascending score order, not the sorted scores
# The last 5 entries are the highest-scoring columns; sort them into column order for display.
best5feats = np.sort(scores_idx[-5:])
best5feats

# Author's observation from the original run: none of the 10 random-noise
# columns (the 31st-40th features) were selected. NOTE(review): confirm
# against the printed indices after re-running.

In [None]:
# Show the 5 selected feature values for the sample at row index 1
# (i.e. the second training sample, since indexing is 0-based).
print(rTrainX[1,best5feats])

In [None]:
# Same selection done through the fitted selector; the printed row should
# match the manual indexing rTrainX[1, best5feats] — verify by comparing outputs.
sTrainX = fs_kbest.transform(rTrainX)  # select the best K features
sTestX = fs_kbest.transform(rTestX)

# Row index 1 of the reduced training matrix.
print(sTrainX[1,:])

In [None]:
# Gaussian naive Bayes trained only on the columns kept by SelectKBest.
model_filt = GaussianNB().fit(sTrainX, TrainY)

# Predictions for both splits.
tr_pred2, ts_pred2 = model_filt.predict(sTrainX), model_filt.predict(sTestX)

# Accuracy for both splits.
tr_acc2 = metrics.accuracy_score(y_true=TrainY, y_pred=tr_pred2)
ts_acc2 = metrics.accuracy_score(y_true=TestY, y_pred=ts_pred2)

print('Filter Training Accuracy : ', tr_acc2)
print('Filter Test Accuracy : ', ts_acc2)

# Author's observation: accuracy is almost the same whether 40 or 5 features are used.

**Wrapper Methods**

In [None]:
from sklearn.feature_selection import SequentialFeatureSelector as sf

Sequential Forward Selection (SFS) - Best k

In [None]:
# Forward sequential selection with scikit-learn's SequentialFeatureSelector,
# scored by 5-fold cross-validated accuracy of a Gaussian naive Bayes model
# and stopped at a fixed number of features.
num_feats_sfsk = 5
model_base = GaussianNB()

SFSk = sf(
    model_base,
    n_features_to_select=num_feats_sfsk,
    direction="forward",
    scoring="accuracy",
    cv=5,
)
SFSk.fit(rTrainX, TrainY)

In [None]:
# get_support() returns a boolean mask over the input columns (True = kept).
print('SFS Selected Features : ', SFSk.get_support())

In [None]:
# Reduce both splits to the columns chosen by the fitted forward selector.
sfskTrainX = SFSk.transform(rTrainX)
sfskTestX = SFSk.transform(rTestX)

# Sanity check: both matrices should have the same number of columns.
print(sfskTrainX.shape)
print(sfskTestX.shape)

In [None]:
# Refit a fresh Gaussian naive Bayes on the columns kept by forward selection.
model_sfsk = GaussianNB().fit(sfskTrainX, TrainY)

# Predictions first, scores second.
tr_pred3 = model_sfsk.predict(sfskTrainX)
ts_pred3 = model_sfsk.predict(sfskTestX)

tr_acc3 = metrics.accuracy_score(y_true=TrainY, y_pred=tr_pred3)
ts_acc3 = metrics.accuracy_score(y_true=TestY, y_pred=ts_pred3)

print('SFS-k Training Accuracy : ', tr_acc3)
print('SFS-k Test Accuracy : ', ts_acc3)

Sequential Backward Selection (SBS) - Best k

In [None]:
# Backward sequential selection with scikit-learn's SequentialFeatureSelector,
# scored by 5-fold cross-validated accuracy of a Gaussian naive Bayes model
# and stopped when 15 features remain.
num_feats_sbsk = 15
model_base = GaussianNB()

SBSk = sf(
    model_base,
    n_features_to_select=num_feats_sbsk,
    direction="backward",
    scoring="accuracy",
    cv=5,
)
SBSk.fit(rTrainX, TrainY)

In [None]:
# get_support() returns a boolean mask over the input columns (True = kept).
print('SBS Selected Features : ', SBSk.get_support())

In [None]:
# Reduce both splits to the columns chosen by the fitted backward selector.
sbskTrainX = SBSk.transform(rTrainX)
sbskTestX = SBSk.transform(rTestX)

# Sanity check: both matrices should have the same number of columns.
print(sbskTrainX.shape)
print(sbskTestX.shape)

In [None]:
# Refit a fresh Gaussian naive Bayes on the columns kept by backward selection.
model_sbsk = GaussianNB().fit(sbskTrainX, TrainY)

# Predictions for both splits, followed by their accuracies.
tr_pred4, ts_pred4 = model_sbsk.predict(sbskTrainX), model_sbsk.predict(sbskTestX)

tr_acc4 = metrics.accuracy_score(y_true=TrainY, y_pred=tr_pred4)
ts_acc4 = metrics.accuracy_score(y_true=TestY, y_pred=ts_pred4)

print('SBS-k Training Accuracy : ', tr_acc4)
print('SBS-k Test Accuracy : ', ts_acc4)

Sequential Forward Selection (SFS)

In [None]:
# Import mlxtend's SequentialFeatureSelector (aliased sf2 to keep it distinct
# from scikit-learn's selector, which is aliased sf).
import joblib
import sys
# Register the standalone joblib package under the legacy module path
# 'sklearn.externals.joblib' before importing mlxtend.
# NOTE(review): presumably a workaround for an mlxtend release that still
# imports joblib from that path — confirm it is needed with the installed version.
sys.modules['sklearn.externals.joblib'] = joblib
from mlxtend.feature_selection import SequentialFeatureSelector as sf2

In [None]:
# Forward sequential selection with mlxtend. With k_features="best" the
# subset size is not fixed up front; the selector reports the chosen
# indices in k_feature_idx_ after fitting.
model_base = GaussianNB()

SFS = sf2(
    model_base,
    k_features="best",
    forward=True,
    floating=False,
    scoring='accuracy',
    verbose=0,
    cv=5,
)
SFS.fit(rTrainX, TrainY)

# Number of columns the selector ended up keeping.
num_feats_sfs = len(SFS.k_feature_idx_)

print('SFS Selected Features : ', SFS.k_feature_idx_)
print('SFS Selected Number of Features : ', num_feats_sfs)

In [None]:
# Column indices chosen by the forward selector. Not read in this cell:
# the reduction below goes through SFS.transform instead.
selected_feats = list(SFS.k_feature_idx_)
sfsTrainX = SFS.transform(rTrainX)
sfsTestX = SFS.transform(rTestX)

# Sanity check: both matrices should have the same number of columns.
print(sfsTrainX.shape)
print(sfsTestX.shape)

In [None]:
# Refit a fresh Gaussian naive Bayes on the columns kept by mlxtend's forward selection.
model_sfs = GaussianNB().fit(sfsTrainX, TrainY)

# Predictions first, scores second.
tr_pred5 = model_sfs.predict(sfsTrainX)
ts_pred5 = model_sfs.predict(sfsTestX)

tr_acc5 = metrics.accuracy_score(y_true=TrainY, y_pred=tr_pred5)
ts_acc5 = metrics.accuracy_score(y_true=TestY, y_pred=ts_pred5)

print('SFS Training Accuracy : ', tr_acc5)
print('SFS Test Accuracy : ', ts_acc5)

Sequential Backward Selection (SBS)

In [None]:
# Backward sequential selection with mlxtend. With k_features="best" the
# subset size is not fixed up front; the selector reports the chosen
# indices in k_feature_idx_ after fitting.
model_base = GaussianNB()

SBS = sf2(
    model_base,
    k_features="best",
    forward=False,
    floating=False,
    scoring='accuracy',
    verbose=0,
    cv=5,
)
SBS.fit(rTrainX, TrainY)

# Number of columns the selector ended up keeping.
num_feats_sbs = len(SBS.k_feature_idx_)

print('SBS Selected Features : ', SBS.k_feature_idx_)
print('SBS Selected Number of Features : ', num_feats_sbs)

In [None]:
# Column indices chosen by the backward selector. Not read in this cell:
# the reduction below goes through SBS.transform instead.
selected_feats = list(SBS.k_feature_idx_)
sbsTrainX = SBS.transform(rTrainX)
sbsTestX = SBS.transform(rTestX)

# Sanity check: both matrices should have the same number of columns.
print(sbsTrainX.shape)
print(sbsTestX.shape)

In [None]:
# Refit a fresh Gaussian naive Bayes on the columns kept by mlxtend's backward selection.
model_sbs = GaussianNB().fit(sbsTrainX, TrainY)

# Predictions for both splits, followed by their accuracies.
tr_pred6, ts_pred6 = model_sbs.predict(sbsTrainX), model_sbs.predict(sbsTestX)

tr_acc6 = metrics.accuracy_score(y_true=TrainY, y_pred=tr_pred6)
ts_acc6 = metrics.accuracy_score(y_true=TestY, y_pred=ts_pred6)

print('SBS Training Accuracy : ', tr_acc6)
print('SBS Test Accuracy : ', ts_acc6)

Sequential Floating Forward Selection (SFFS)

In [None]:
# Same import cell as in the Sequential Forward Selection (SFS) section;
# re-running it is harmless. NOTE(review): presumably repeated so this
# section can be run on its own — consider keeping a single copy.
import joblib
import sys
# Register joblib under the legacy 'sklearn.externals.joblib' module path
# before importing mlxtend.
sys.modules['sklearn.externals.joblib'] = joblib
from mlxtend.feature_selection import SequentialFeatureSelector as sf2

In [None]:
# Floating forward selection with mlxtend: same configuration as the plain
# forward search except floating=True. The chosen indices are reported in
# k_feature_idx_ after fitting.
model_base = GaussianNB()

SFFS = sf2(
    model_base,
    k_features="best",
    forward=True,
    floating=True,
    scoring='accuracy',
    verbose=0,
    cv=5,
)
SFFS.fit(rTrainX, TrainY)

# Number of columns the selector ended up keeping.
num_feats_sffs = len(SFFS.k_feature_idx_)

print('SFFS Selected Features : ', SFFS.k_feature_idx_)
print('SFFS Selected Number of Features : ', num_feats_sffs)

In [None]:
# Slice out the columns chosen by the floating forward selector.
selected_feats = list(SFFS.k_feature_idx_)
sffsTrainX, sffsTestX = rTrainX[:, selected_feats], rTestX[:, selected_feats]

# Refit a fresh Gaussian naive Bayes on the reduced matrices.
model_sffs = GaussianNB().fit(sffsTrainX, TrainY)

# Predictions first, scores second.
tr_pred7 = model_sffs.predict(sffsTrainX)
ts_pred7 = model_sffs.predict(sffsTestX)

tr_acc7 = metrics.accuracy_score(y_true=TrainY, y_pred=tr_pred7)
ts_acc7 = metrics.accuracy_score(y_true=TestY, y_pred=ts_pred7)

print('SFFS Training Accuracy : ', tr_acc7)
print('SFFS Test Accuracy : ', ts_acc7)

Summary

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# One row per feature-selection strategy: train/test accuracy and the number
# of features the model was trained on.
# The first two feature counts are read from the matrices actually used
# (all columns, and the SelectKBest output) instead of being hard-coded, so
# the table stays correct if the noise width or k is changed upstream.
df = pd.DataFrame(
    {
        'Train': [tr_acc, tr_acc2, tr_acc3, tr_acc4, tr_acc5, tr_acc6, tr_acc7],
        'Test': [ts_acc, ts_acc2, ts_acc3, ts_acc4, ts_acc5, ts_acc6, ts_acc7],
        'N_feats': [
            rTrainX.shape[1],
            sTrainX.shape[1],
            num_feats_sfsk,
            num_feats_sbsk,
            num_feats_sfs,
            num_feats_sbs,
            num_feats_sffs,
        ],
    },
    index=['None', 'BestK', 'SFS-k', 'SBS-k', 'SFS', 'SBS', 'SFFS'],
)

print(df)

In [None]:
# Two side-by-side panels: accuracy per selection method (left) and the
# number of features each method used (right). Titles and axis labels are
# set so the figure can be read without the surrounding cells.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))

df[['Train', 'Test']].plot(ax=ax1)
ax1.set(title='Accuracy by selection method', xlabel='Selection method', ylabel='Accuracy')

df['N_feats'].plot(ax=ax2)
ax2.set(title='Features used by selection method', xlabel='Selection method', ylabel='Number of features')

fig.tight_layout()