# Dimensionality Reduction Using Feature Selection
Difference between feature extraction and feature selection:  
feature extraction creates new features that retain a similar ability to train quality models but with significantly fewer dimensions, while feature selection keeps the high-quality, informative features and drops the less useful ones.

In [1]:
from sklearn import datasets
from sklearn.feature_selection import VarianceThreshold

## Thresholding Numerical Feature Variance

In [2]:
# Load the iris dataset and split it into the feature matrix and the labels.
iris = datasets.load_iris()
features, target = iris.data, iris.target

# Build a selector with a variance threshold of 0.5 and apply it to the
# feature matrix.
thresholder = VarianceThreshold(threshold=0.5)
features_high_variance = thresholder.fit_transform(features)

# Preview the first three observations of the reduced feature matrix.
features_high_variance[:3]

array([[5.1, 1.4, 0.2],
       [4.9, 1.4, 0.2],
       [4.7, 1.3, 0.2]])

In [4]:
# Fit the selector on the current feature matrix, then display the
# per-feature variances it computed.
thresholder.fit(features)
thresholder.variances_

array([0.68112222, 0.18871289, 3.09550267, 0.57713289])

## Thresholding Binary Feature Variance

In [5]:
# Small binary feature matrix: one row per observation, one column per feature.
features = [
    [0, 1, 0],
    [0, 1, 1],
    [0, 1, 0],
    [0, 1, 1],
    [1, 0, 0],
]

# A binary (Bernoulli) feature has variance p * (1 - p); the threshold below is
# that expression evaluated at p = 0.75.
thresholder = VarianceThreshold(threshold=0.75 * (1 - 0.75))
thresholder.fit_transform(features)

array([[0],
       [1],
       [0],
       [1],
       [0]])

## Handling Highly Correlated Features

In [7]:
import numpy as np
import pandas as pd
# Toy feature matrix: column 1 closely tracks column 0, while column 2 is an
# alternating 1/0 flag.
features = np.array([[1, 1, 1],
                     [2, 2, 0],
                     [3, 3, 1],
                     [4, 4, 0],
                     [5, 5, 1],
                     [6, 6, 0],
                     [7, 7, 1],
                     [8, 7, 0],
                     [9, 7, 1]])
dataframe = pd.DataFrame(features)

# Absolute pairwise correlations: only the strength of the relationship
# matters here, not its sign.
corr_matrix = dataframe.corr().abs()

# Keep only the strict upper triangle (k=1 excludes the diagonal) so each pair
# of columns is examined once and no column is compared with itself.
# The mask is built with the builtin `bool`: the `np.bool` alias was deprecated
# in NumPy 1.20 and removed in NumPy 1.24, where it raises AttributeError.
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(upper_mask)

# A column is dropped when it correlates above 0.95 with any earlier column.
# Masked-out entries are NaN, and NaN > 0.95 evaluates to False.
to_drop = [column for column in upper.columns if (upper[column] > 0.95).any()]

# Drop by label: `to_drop` holds column labels, so indexing
# `dataframe.columns` with them positionally would only work while the labels
# happen to be 0..n-1.
dataframe.drop(columns=to_drop).head(3)

Unnamed: 0,0,2
0,1,1
1,2,0
2,3,1


## Removing Irrelevant Features for Classification

In [8]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_classif
# Start again from the iris data; the features are cast to integers before
# being scored with the chi-squared statistic.
# NOTE(review): presumably the cast is meant to treat the measurements as
# categorical values for chi2 -- confirm this truncation is intended.
features = iris.data.astype(int)
target = iris.target

# Keep the two features with the highest chi-squared scores.
chi2_selector = SelectKBest(score_func=chi2, k=2)
features_kbest = chi2_selector.fit_transform(features, target)

# Number of feature columns before and after selection.
(features.shape[1], features_kbest.shape[1])

(4, 2)

In [9]:
# Keep the two features with the highest ANOVA F-values.
# `features` and `target` are whatever the preceding cells left in scope.
fvalue_selector = SelectKBest(score_func=f_classif, k=2)
features_kbest = fvalue_selector.fit_transform(features, target)

# Number of feature columns before and after selection.
(features.shape[1], features_kbest.shape[1])

(4, 2)

In [10]:
from sklearn.feature_selection import SelectPercentile
# Select by percentile of F-value (percentile=75) instead of a fixed count.
fvalue_selector = SelectPercentile(score_func=f_classif, percentile=75)
features_kbest = fvalue_selector.fit_transform(features, target)

# Number of feature columns before and after selection.
(features.shape[1], features_kbest.shape[1])

(4, 3)

## Recursively Eliminating Features

In [11]:
import warnings
from sklearn.datasets import make_regression
from sklearn.feature_selection import RFECV
from sklearn import linear_model
# Ignore warnings raised from scipy modules whose message starts with
# "internal gelsd".
warnings.filterwarnings("ignore", message="^internal gelsd", module="scipy")

# Synthetic regression problem: 10,000 samples and 100 features, of which
# only 2 are informative. The fixed random_state makes the data reproducible.
features, target = make_regression(
    n_samples=10000,
    n_features=100,
    n_informative=2,
    random_state=1,
)

# Recursive feature elimination with cross-validation around a plain linear
# regression, removing one feature per step and scoring each candidate subset
# by negative mean squared error.
ols = linear_model.LinearRegression()
rfecv = RFECV(
    estimator=ols,
    step=1,
    scoring="neg_mean_squared_error",
)
rfecv.fit(features, target)

# Reduce the feature matrix to the columns that were selected.
rfecv.transform(features)

array([[ 0.00850799, -0.28547464,  0.7031277 ],
       [-1.07500204, -0.8689623 ,  2.56148527],
       [ 1.37940721, -0.14714771, -1.77039484],
       ...,
       [-0.80331656, -1.030216  , -1.60648007],
       [ 0.39508844, -0.91553464, -1.34564911],
       [-0.55383035, -0.69804472,  0.82880112]])

In [12]:
# Number of features the fitted RFECV selector kept.
rfecv.n_features_

3

In [13]:
# Boolean mask over the input columns; True marks a feature that was kept.
rfecv.support_

array([False, False, False, False, False,  True, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False,  True, False, False, False,
       False, False, False,  True, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False])

In [14]:
# Rank assigned to every input feature; the selected features have rank 1.
rfecv.ranking_

array([89, 47, 32, 73, 22,  1, 70, 10, 71, 26, 90, 62, 61, 39, 48, 34, 25,
       86, 43,  9, 68, 46, 77,  4, 56, 37, 16, 88, 81, 52,  7, 41,  1, 15,
       55, 87, 23,  6, 38,  1, 42, 94, 69, 54, 66, 11, 76, 58, 98, 53, 29,
       93, 44, 97, 80, 27, 84, 18, 24, 13, 33, 19, 65,  2, 82, 20, 91, 67,
       51, 72, 74,  8, 75, 83, 50, 95, 49, 57, 92, 40, 59, 36, 28, 30, 60,
       17,  3, 85, 64, 14, 63, 79,  5, 45, 31, 21, 35, 78, 12, 96])