## **Feature Selection**

In [None]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn import metrics
import numpy as np
import pandas as pd

**Breast Cancer Wisconsin (Diagnostic) Dataset**

*   569 instances (212 Malignant, 357 Benign)
*   30 numerical features (computed from a digitized image of a breast mass)
*   2 classes (Malignant, Benign)

In [None]:
# Load the Breast Cancer Wisconsin (Diagnostic) dataset that ships with scikit-learn.
wisconsin = datasets.load_breast_cancer()

# Use half of the samples for training and the rest for testing.
# The fixed random_state makes the split identical on every run.
TrainX, TestX, TrainY, TestY = train_test_split(
    wisconsin.data,
    wisconsin.target,
    train_size=0.5,
    random_state=999,
)

Add 10 columns of Gaussian random noise to both the training and test sets, to act as irrelevant features

In [None]:
# Append columns of standard-normal noise to each split. The noise is drawn
# independently of the labels, so it carries no class information and serves
# as a set of irrelevant features for the selectors below to reject.
#
# The generator is seeded so that the noise -- and therefore every accuracy
# and selected-feature list derived from it -- is identical after a kernel
# restart. The seed matches the random_state used for the train/test split.
noise_rng = np.random.default_rng(999)
n_noise_feats = 10

rTrainX = np.concatenate(
    (TrainX, noise_rng.standard_normal((TrainX.shape[0], n_noise_feats))),
    axis=1,
)
rTestX = np.concatenate(
    (TestX, noise_rng.standard_normal((TestX.shape[0], n_noise_feats))),
    axis=1,
)
print(rTrainX.shape)

In [None]:
# Baseline: Gaussian Naive Bayes trained on every column of the
# noise-augmented data, with no feature selection applied.
model = GaussianNB().fit(rTrainX, TrainY)

# Predict on both splits first, then score each against its true labels.
tr_pred = model.predict(rTrainX)
ts_pred = model.predict(rTestX)

tr_acc = metrics.accuracy_score(y_true=TrainY, y_pred=tr_pred)
ts_acc = metrics.accuracy_score(y_true=TestY, y_pred=ts_pred)

print('Training Accuracy : ', tr_acc)
print('Test Accuracy : ', ts_acc)

**Filter Methods: Select the best K features**

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, mutual_info_classif

In [None]:
from functools import partial

# Filter-style selection: score every column by its mutual information with
# the class label and keep the 5 highest-scoring columns.
#
# mutual_info_classif adds a small amount of random noise to continuous
# features, so without a fixed random_state the scores (and possibly the
# selected columns) change from run to run. Binding the seed with partial()
# keeps the score function's (X, y) call signature that SelectKBest expects.
fs_kbest = SelectKBest(partial(mutual_info_classif, random_state=999), k=5)
fs_kbest.fit(rTrainX, TrainY)          # run score function on the data

In [None]:
# Show the per-column scores that the fitted selector computed on rTrainX.
print('Feature Score : ', fs_kbest.scores_)

In [None]:
# Reproduce SelectKBest's choice by hand: rank the columns by score and keep
# the indices of the k best ones.
#
# k is read from the fitted selector (an integer here) instead of repeating
# the literal 5, so this cell stays in sync if k is changed above. A stable
# sort is requested so that ties are broken the same way SelectKBest breaks
# them internally.
scores = np.asarray(fs_kbest.scores_)
scores_idx = np.argsort(scores, kind="stable")    # ascending: best scores come last
best5feats = np.sort(scores_idx[-fs_kbest.k:])    # top-k indices, back in column order
best5feats

In [None]:
# Values of the hand-picked top-scoring columns for the training sample at row index 1.
print(rTrainX[1,best5feats])

In [None]:
# Keep only the columns chosen by the fitted selector, for both splits.
sTrainX, sTestX = (fs_kbest.transform(split) for split in (rTrainX, rTestX))

# Row index 1 of the reduced training matrix.
print(sTrainX[1])

In [None]:
# Gaussian Naive Bayes trained on the columns kept by the filter method.
model_filt = GaussianNB().fit(sTrainX, TrainY)

# Predict on both reduced splits, then score each against its true labels.
tr_pred2 = model_filt.predict(sTrainX)
ts_pred2 = model_filt.predict(sTestX)

tr_acc2 = metrics.accuracy_score(y_true=TrainY, y_pred=tr_pred2)
ts_acc2 = metrics.accuracy_score(y_true=TestY, y_pred=ts_pred2)

print('Filter Training Accuracy : ', tr_acc2)
print('Filter Test Accuracy : ', ts_acc2)

**Wrapper Methods**

In [None]:
from mlxtend.feature_selection import SequentialFeatureSelector as sf

Sequential Forward Selection (SFS)

In [None]:
# Wrapper-style selection, forward variant: candidate feature subsets are
# scored by 5-fold cross-validated accuracy of a Gaussian Naive Bayes model.
model_base = GaussianNB()

SFS = sf(
    model_base,
    k_features="best",    # keep the subset size with the best CV score
    forward=True,         # forward selection
    floating=False,       # plain (non-floating) variant
    scoring='accuracy',
    verbose=0,
    cv=5,
)
SFS.fit(rTrainX, TrainY)

print('SFS Selected Features : ', SFS.k_feature_idx_)

In [None]:
# Restrict both splits to the columns picked by SFS, then refit and score.
selected_feats = list(SFS.k_feature_idx_)
sfsTrainX, sfsTestX = rTrainX[:, selected_feats], rTestX[:, selected_feats]

model_sfs = GaussianNB().fit(sfsTrainX, TrainY)

# Predict on both reduced splits, then score each against its true labels.
tr_pred3 = model_sfs.predict(sfsTrainX)
ts_pred3 = model_sfs.predict(sfsTestX)

tr_acc3 = metrics.accuracy_score(y_true=TrainY, y_pred=tr_pred3)
ts_acc3 = metrics.accuracy_score(y_true=TestY, y_pred=ts_pred3)

print('SFS Training Accuracy : ', tr_acc3)
print('SFS Test Accuracy : ', ts_acc3)

Sequential Backward Selection (SBS)

In [None]:
# Wrapper-style selection, backward variant: candidate feature subsets are
# scored by 5-fold cross-validated accuracy of a Gaussian Naive Bayes model.
model_base = GaussianNB()

SBS = sf(
    model_base,
    k_features="best",    # keep the subset size with the best CV score
    forward=False,        # backward selection
    floating=False,       # plain (non-floating) variant
    scoring='accuracy',
    verbose=0,
    cv=5,
)
SBS.fit(rTrainX, TrainY)

print('SBS Selected Features : ', SBS.k_feature_idx_)

In [None]:
# Restrict both splits to the columns picked by SBS, then refit and score.
selected_feats = list(SBS.k_feature_idx_)
sbsTrainX, sbsTestX = rTrainX[:, selected_feats], rTestX[:, selected_feats]

model_sbs = GaussianNB().fit(sbsTrainX, TrainY)

# Predict on both reduced splits, then score each against its true labels.
tr_pred4 = model_sbs.predict(sbsTrainX)
ts_pred4 = model_sbs.predict(sbsTestX)

tr_acc4 = metrics.accuracy_score(y_true=TrainY, y_pred=tr_pred4)
ts_acc4 = metrics.accuracy_score(y_true=TestY, y_pred=ts_pred4)

print('SBS Training Accuracy : ', tr_acc4)
print('SBS Test Accuracy : ', ts_acc4)

Sequential Floating Forward Selection (SFFS)

In [None]:
# Wrapper-style selection, floating forward variant: candidate feature subsets
# are scored by 5-fold cross-validated accuracy of a Gaussian Naive Bayes model.
model_base = GaussianNB()

SFFS = sf(
    model_base,
    k_features="best",    # keep the subset size with the best CV score
    forward=True,         # forward selection
    floating=True,        # floating variant
    scoring='accuracy',
    verbose=0,
    cv=5,
)
SFFS.fit(rTrainX, TrainY)

print('SFFS Selected Features : ', SFFS.k_feature_idx_)

In [None]:
# Restrict both splits to the columns picked by SFFS, then refit and score.
selected_feats = list(SFFS.k_feature_idx_)
sffsTrainX, sffsTestX = rTrainX[:, selected_feats], rTestX[:, selected_feats]

model_sffs = GaussianNB().fit(sffsTrainX, TrainY)

# Predict on both reduced splits, then score each against its true labels.
tr_pred5 = model_sffs.predict(sffsTrainX)
ts_pred5 = model_sffs.predict(sffsTestX)

tr_acc5 = metrics.accuracy_score(y_true=TrainY, y_pred=tr_pred5)
ts_acc5 = metrics.accuracy_score(y_true=TestY, y_pred=ts_pred5)

print('SFFS Training Accuracy : ', tr_acc5)
print('SFFS Test Accuracy : ', ts_acc5)