# Feature Selection for Machine Learning

#### Filter methods: removing low-variance features

In [2]:
# VarianceThreshold removes every feature whose variance falls below a threshold.
# With no argument it only removes zero-variance features, i.e. features that
# have the same value in all samples. Here the threshold is raised to 0.2, so
# low-variance columns are removed as well.
from sklearn.feature_selection import VarianceThreshold

X = [
    [0, 0, 1],
    [0, 1, 0],
    [1, 0, 0],
    [0, 1, 1],
    [0, 1, 0],
    [0, 1, 1],
]
sel = VarianceThreshold(threshold=0.2)
# Only the learned mask is inspected below, so fit() is enough; the transformed
# matrix that fit_transform() would also build was never used.
sel.fit(X)
# Boolean mask over the columns of X: True marks a feature that is kept.
# The first column is 1 in a single sample out of six (variance ~0.14 < 0.2),
# so it is the one expected to be dropped.
sel.get_support()

array([False,  True,  True])

#### Univariate feature selection

In [13]:
# Import here so this cell also works on a fresh kernel: it is the first cell
# that uses load_iris, and the import previously lived only in a later cell.
from sklearn.datasets import load_iris

# return_X_y=True yields the (data, target) pair instead of a Bunch object.
features, labels = load_iris(return_X_y=True)
# Only the last expression of a cell is displayed, so the former bare
# `features` line showed nothing; the class labels are the output of this cell.
labels


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [14]:

# Score every feature against the target with the chi-squared test of
# independence. chi2 returns two arrays: the test statistic per feature and
# the matching p-value per feature.
from sklearn.datasets import load_iris
from sklearn.feature_selection import chi2

features, labels = load_iris(return_X_y=True)
chi2_scores, chi2_pvalues = chi2(features, labels)
# Display both arrays together, as the (statistics, p-values) pair.
(chi2_scores, chi2_pvalues)

(array([ 10.81782088,   3.7107283 , 116.31261309,  67.0483602 ]),
 array([4.47651499e-03, 1.56395980e-01, 5.53397228e-26, 2.75824965e-15]))

In [17]:
# SelectKBest keeps the k features with the highest scores. The scoring
# function (chi2 here) is no longer called directly; it is handed to
# SelectKBest as a parameter.
from sklearn.feature_selection import SelectKBest, chi2

features, labels = load_iris(return_X_y=True)
selector = SelectKBest(score_func=chi2, k=3)
# Fit the scorer on (features, labels) and keep only the 3 best columns.
features_new = selector.fit_transform(features, labels)
features_new

array([[5.1, 1.4, 0.2],
       [4.9, 1.4, 0.2],
       [4.7, 1.3, 0.2],
       [4.6, 1.5, 0.2],
       [5. , 1.4, 0.2],
       [5.4, 1.7, 0.4],
       [4.6, 1.4, 0.3],
       [5. , 1.5, 0.2],
       [4.4, 1.4, 0.2],
       [4.9, 1.5, 0.1],
       [5.4, 1.5, 0.2],
       [4.8, 1.6, 0.2],
       [4.8, 1.4, 0.1],
       [4.3, 1.1, 0.1],
       [5.8, 1.2, 0.2],
       [5.7, 1.5, 0.4],
       [5.4, 1.3, 0.4],
       [5.1, 1.4, 0.3],
       [5.7, 1.7, 0.3],
       [5.1, 1.5, 0.3],
       [5.4, 1.7, 0.2],
       [5.1, 1.5, 0.4],
       [4.6, 1. , 0.2],
       [5.1, 1.7, 0.5],
       [4.8, 1.9, 0.2],
       [5. , 1.6, 0.2],
       [5. , 1.6, 0.4],
       [5.2, 1.5, 0.2],
       [5.2, 1.4, 0.2],
       [4.7, 1.6, 0.2],
       [4.8, 1.6, 0.2],
       [5.4, 1.5, 0.4],
       [5.2, 1.5, 0.1],
       [5.5, 1.4, 0.2],
       [4.9, 1.5, 0.2],
       [5. , 1.2, 0.2],
       [5.5, 1.3, 0.2],
       [4.9, 1.4, 0.1],
       [4.4, 1.3, 0.2],
       [5.1, 1.5, 0.2],
       [5. , 1.3, 0.3],
       [4.5, 1.3

In [19]:
# f_classif scores each feature with the analysis-of-variance (ANOVA) F-value
# between that feature and the target. It returns a pair of arrays:
# (F-statistics, p-values), one entry per feature.
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest, chi2, f_classif, mutual_info_classif

features, labels = load_iris(return_X_y=True)

print(features.shape)
# Call the scorer this cell describes. The cell previously called
# mutual_info_classif instead, which returns a single array of estimates
# (no p-values) and can vary between runs unless random_state is fixed.
# Like chi2, f_classif can also be passed to SelectKBest as the score function.
f_classif(features, labels)

(150, 4)


(array([ 119.26450218,   49.16004009, 1180.16118225,  960.0071468 ]),
 array([1.66966919e-31, 4.49201713e-17, 2.85677661e-91, 4.16944584e-85]))

In [6]:
# Load the iris data and check its dimensions before any feature selection.
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest, f_classif

iris = load_iris()
X, y = iris.data, iris.target
# (n_samples, n_features) of the unreduced feature matrix.
X.shape
# Disabled follow-up step, kept for reference: select the 2 best features
# by ANOVA F-value and inspect the reduced shape.
# X_new = SelectKBest(f_classif, k=2).fit_transform(X, y)
# X_new.shape

(150, 4)