# Dimensionality reduction by feature selection

In [1]:
from sklearn import datasets
from sklearn.feature_selection import VarianceThreshold

In [2]:
# Load the iris dataset and split it into the feature matrix and class labels.
iris = datasets.load_iris()
features, target = iris.data, iris.target

# Selector that keeps only the features whose variance is above 0.5.
thresholder = VarianceThreshold(threshold=0.5)

In [3]:
# Learn the per-feature variances, then keep only the high-variance columns.
features_high_variance = thresholder.fit(features).transform(features)

# Show the first three observations of the reduced matrix.
features_high_variance[:3]

array([[5.1, 1.4, 0.2],
       [4.9, 1.4, 0.2],
       [4.7, 1.3, 0.2]])

In [4]:
# Binary toy matrix: feature 0 is 80% zeros, feature 1 is 80% ones,
# feature 2 is 60% zeros.
features = [
    [0, 1, 0],
    [0, 1, 1],
    [0, 1, 0],
    [0, 1, 1],
    [1, 0, 0],
]

In [5]:
# For a binary (Bernoulli) feature the variance is p * (1 - p), where p is the
# share of observations in class 1. With p = 0.75 the threshold is 0.1875, so
# only features whose variance exceeds that value are kept.
# The selector name is spelled `thresholder`, matching the name used for the
# iris selector earlier in the notebook.
thresholder = VarianceThreshold(threshold=(.75 * (1 - .75)))
thresholder.fit_transform(features)

array([[0],
       [1],
       [0],
       [1],
       [0]])

Для бинарного (бернуллиевского) признака дисперсия равна $\operatorname{Var}(x) = p(1 - p)$,
где $p$ — доля наблюдений класса 1.

In [6]:
import pandas as pd
import numpy as np

In [7]:
# Toy matrix with nine observations and three features. Columns 0 and 1 are
# nearly identical (they differ only in the last two rows), while column 2
# simply alternates between 1 and 0.
features = np.array(
    [
        [1, 1, 1],
        [2, 2, 0],
        [3, 3, 1],
        [4, 4, 0],
        [5, 5, 1],
        [6, 6, 0],
        [7, 7, 1],
        [8, 7, 0],
        [9, 7, 1],
    ]
)

In [8]:
# Absolute pairwise correlations between the feature columns.
dataframe = pd.DataFrame(features)
corr_matrix = dataframe.corr().abs()

# Keep only the strict upper triangle (k=1 excludes the diagonal) so that each
# pair of columns is inspected once and no column is compared with itself.
# The builtin `bool` is used instead of the `np.bool` alias, which NumPy
# deprecated in 1.20 and removed in 1.24 (AttributeError on current releases).
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(upper_mask)

# A column is marked for removal when its correlation with any earlier column
# is above 0.95. Masked (NaN) cells compare as False and are ignored.
to_drop = [column for column in upper.columns if (upper[column] > 0.95).any()]

# `to_drop` holds column labels, so drop by label rather than using the labels
# as positions into `dataframe.columns` (that only works for a 0..n-1 index).
dataframe.drop(columns=to_drop).head(3)

Unnamed: 0,0,2
0,1,1
1,2,0
2,3,1


In [9]:
# Full signed correlation matrix, shown for reference next to the filtered
# frame; "pearson" is the method pandas uses when none is given.
dataframe.corr(method="pearson")

Unnamed: 0,0,1,2
0,1.0,0.976103,0.0
1,0.976103,1.0,-0.034503
2,0.0,-0.034503,1.0


In [10]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_classif

In [11]:
# Reload iris so this cell does not depend on the toy matrices defined above.
iris = datasets.load_iris()

# Cast the measurements to integers (the fractional part is discarded).
features = iris.data.astype(int)
target = iris.target

In [12]:
# Score every feature against the target with chi2 and keep the best two.
chi2_selector = SelectKBest(score_func=chi2, k=2)
features_kbest = chi2_selector.fit(features, target).transform(features)

In [13]:
# Compare the number of columns before and after selection.
n_original = features.shape[1]
n_selected = features_kbest.shape[1]
print("Исходное количество признаков:", n_original)
print("Сокращенное количество признаков:", n_selected)

Исходное количество признаков: 4
Сокращенное количество признаков: 2


In [14]:
# Same selection as above, but scored with f_classif instead of chi2.
fvalue_selector = SelectKBest(score_func=f_classif, k=2)
features_kbest = fvalue_selector.fit(features, target).transform(features)

In [15]:
# Compare the number of columns before and after selection.
n_original = features.shape[1]
n_selected = features_kbest.shape[1]
print("Исходное количество признаков:", n_original)
print("Сокращенное количество признаков:", n_selected)

Исходное количество признаков: 4
Сокращенное количество признаков: 2


In [16]:
from sklearn.feature_selection import SelectPercentile

In [17]:
# Keep the top 75 percent of features by f_classif score rather than a fixed
# count. The result is still stored in `features_kbest` because the next cell
# reads that name.
fvalue_selector = SelectPercentile(score_func=f_classif, percentile=75)
features_kbest = fvalue_selector.fit(features, target).transform(features)

In [18]:
# Compare the number of columns before and after selection.
n_original = features.shape[1]
n_selected = features_kbest.shape[1]
print("Исходное количество признаков:", n_original)
print("Сокращенное количество признаков:", n_selected)

Исходное количество признаков: 4
Сокращенное количество признаков: 3


In [19]:
import warnings
from sklearn.datasets import make_regression
from sklearn.feature_selection import RFECV
from sklearn import datasets, linear_model

In [20]:
# NOTE: no category or module filter is given, so this silences every warning
# for the remainder of the session.
warnings.filterwarnings(action="ignore")

# Synthetic regression problem: 10,000 samples and 100 features, generated
# with n_informative=2 and a fixed random_state so the data is reproducible.
features, target = make_regression(
    n_samples=10_000,
    n_features=100,
    n_informative=2,
    random_state=1,
)

In [21]:
# Recursive feature elimination with cross-validation around a plain linear
# regression: one feature is removed per step and candidates are scored by
# negative mean squared error.
ols = linear_model.LinearRegression()
rfecv = RFECV(ols, step=1, scoring="neg_mean_squared_error")

# Fit the selector, then display the feature matrix reduced to the kept columns.
rfecv.fit(features, target).transform(features)

array([[ 0.00850799,  0.7031277 , -0.09009697],
       [-1.07500204,  2.56148527, -0.83288561],
       [ 1.37940721, -1.77039484, -0.19323117],
       ...,
       [-0.80331656, -1.60648007,  0.16783823],
       [ 0.39508844, -1.34564911, -0.23055945],
       [-0.55383035,  0.82880112,  0.05952898]])

In [22]:
rfecv.n_features_  # number of features kept by the fitted selector

3

In [23]:
rfecv.support_  # boolean mask over all features: True marks a kept feature

array([False, False, False, False, False,  True, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False,  True, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False,  True, False,
       False])

In [24]:
rfecv.ranking_  # rank of each feature: 1 = kept, larger values = eliminated earlier

array([ 7, 13, 94, 19, 60,  1, 84, 41, 79, 44, 66, 73, 32, 62, 37, 48, 22,
       50, 55,  4, 40, 87, 46, 20,  9, 30, 10, 23, 28, 96, 26, 12,  6, 82,
       78, 85, 90, 76, 21,  1, 91, 15, 25, 14, 24, 31, 33, 57, 89, 63,  3,
       59, 80,  5, 83, 51, 61, 93, 71, 17,  2, 49, 11, 88, 38, 36, 35, 97,
       39, 74, 77, 58, 43, 16, 56, 65, 92, 27, 86, 75, 45, 34, 18, 54, 64,
       95, 98, 29, 68, 70, 67, 53, 42, 72,  8, 81, 52,  1, 47, 69])