# Feature selection  


## Example from user [guid](https://scikit-learn.org/stable/modules/feature_selection.html#univariate-feature-selection).

In [4]:
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
iris = load_iris()
X, y = iris.data, iris.target
X.shape

(150, 4)

In [5]:
X_tmp = SelectKBest(chi2, k=2)
X_new = X_tmp.fit_transform(X, y)
X_new.shape

(150, 2)

In [6]:
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
iris = load_iris()
X, y = iris.data, iris.target

X_tmp = SelectKBest(chi2, k=2).fit(X, y)
print("Point all feateres: {}".format(X_tmp.scores_))  # shape without target value 4
X_new = X_tmp.fit(X, y)
# print("K highest scoring features: {}".format(X_new.transform(X)))
# print("K highest scoring features: {}".format(iris.DESCR))

Point all feateres: [ 10.81782088   3.7107283  116.31261309  67.0483602 ]


Лучшие признаки `petal length` и `petal width`.

In [7]:
from sklearn.datasets import load_iris
data = load_iris()
data.target[[10, 25, 50]]
list(data.target_names)

['setosa', 'versicolor', 'virginica']

In [8]:
print(__doc__)


# Code source: Gaël Varoquaux
# Modified for documentation by Jaques Grobler
# License: BSD 3 clause

import numpy as np
import matplotlib.pyplot as plt
# Though the following import is not directly being used, it is required
# for 3D projection to work
from mpl_toolkits.mplot3d import Axes3D

from sklearn.cluster import KMeans
from sklearn import datasets

np.random.seed(5)

iris = datasets.load_iris()
X = iris.data
y = iris.target

estimators = [('k_means_iris_8', KMeans(n_clusters=8)),
              ('k_means_iris_3', KMeans(n_clusters=3)),
              ('k_means_iris_bad_init', KMeans(n_clusters=3, n_init=1,
                                               init='random'))]

fignum = 1
titles = ['8 clusters', '3 clusters', '3 clusters, bad initialization']
for name, est in estimators:
    fig = plt.figure(fignum, figsize=(4, 3))
    ax = Axes3D(fig, rect=[0, 0, .95, 1], elev=48, azim=134)
    est.fit(X)
    labels = est.labels_

    ax.scatter(X[:, 3], X[:, 0], X[:, 2],
               c=labels.astype(np.float), edgecolor='k')

    ax.w_xaxis.set_ticklabels([])
    ax.w_yaxis.set_ticklabels([])
    ax.w_zaxis.set_ticklabels([])
    ax.set_xlabel('Petal width')
    ax.set_ylabel('Sepal length')
    ax.set_zlabel('Petal length')
    ax.set_title(titles[fignum - 1])
    ax.dist = 12
    fignum = fignum + 1

# Plot the ground truth
fig = plt.figure(fignum, figsize=(4, 3))
ax = Axes3D(fig, rect=[0, 0, .95, 1], elev=48, azim=134)

for name, label in [('Setosa', 0),
                    ('Versicolour', 1),
                    ('Virginica', 2)]:
    ax.text3D(X[y == label, 3].mean(),
              X[y == label, 0].mean(),
              X[y == label, 2].mean() + 2, name,
              horizontalalignment='center',
              bbox=dict(alpha=.2, edgecolor='w', facecolor='w'))
# Reorder the labels to have colors matching the cluster results
y = np.choose(y, [1, 2, 0]).astype(np.float)
ax.scatter(X[:, 3], X[:, 0], X[:, 2], c=y, edgecolor='k')

ax.w_xaxis.set_ticklabels([])
ax.w_yaxis.set_ticklabels([])
ax.w_zaxis.set_ticklabels([])
ax.set_xlabel('Petal width')
ax.set_ylabel('Sepal length')
ax.set_zlabel('Petal length')
ax.set_title('Ground Truth')
ax.dist = 12

Automatically created module for IPython interactive environment


## chi-squared test [article](http://datareview.info/article/otbor-priznakov-dlya-mashinnogo-obucheniya-na-python/)

In [9]:
# Feature Extraction with Univariate Statistical Tests (Chi-squared for classification)
import os
import pandas as pd
import numpy
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# load data
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
if os.path.exists("./data/diabetes.csv"):
    dataframe = pd.read_csv("./data/diabetes.csv", names=names)
else:
    print("Put diabetes.csv in folder 'data'")
    exit(1)
    
array = dataframe[1:].astype('float64')
X = array.iloc[:,0:8]
Y = array.iloc[:,8]

# feature extraction
test = SelectKBest(score_func=chi2, k=4)
fit = test.fit(X, Y)

# summarize scores
numpy.set_printoptions(precision=3)  # Number of digits of precision for floating point output (8 to 3)
print(pd.DataFrame({"Features": names[:8], "Values": fit.scores_}))
features = fit.transform(X)
# summarize selected features
print(f"new shape: {features.shape}")

  Features       Values
0     preg   111.519691
1     plas  1411.887041
2     pres    17.605373
3     skin    53.108040
4     test  2175.565273
5     mass   127.669343
6     pedi     5.392682
7      age   181.303689
new shape: (768, 4)


In [10]:
dataframe[dataframe.columns[fit.get_support(indices=True)]]

Unnamed: 0,plas,test,mass,age
0,Glucose,Insulin,BMI,Age
1,148,0,33.6,50
2,85,0,26.6,31
3,183,0,23.3,32
4,89,94,28.1,21
5,137,168,43.1,33
6,116,0,25.6,30
7,78,88,31,26
8,115,0,35.3,29
9,197,543,30.5,53


## Removing features with low variance [User Guide](https://scikit-learn.org/stable/modules/feature_selection.html#univariate-feature-selection)

In [11]:
from sklearn.feature_selection import VarianceThreshold
X = [[0, 0, 1], [0, 1, 0], [1, 0, 0], [0, 1, 1], [0, 1, 0], [0, 1, 1]]
sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
sel.fit_transform(X)

array([[0, 1],
       [1, 0],
       [0, 0],
       [1, 1],
       [1, 0],
       [1, 1]])

In [12]:
sel = VarianceThreshold(threshold=(.9 * (1 - .9)))
sel.fit_transform(X)

array([[0, 0, 1],
       [0, 1, 0],
       [1, 0, 0],
       [0, 1, 1],
       [0, 1, 0],
       [0, 1, 1]])

In [13]:
X = [[0, 0, 1], [0, 1, 0], [0, 0, 0], [0, 1, 1], [0, 1, 0], [0, 1, 1]]
sel = VarianceThreshold(threshold=(.95 * (1 - .95)))
sel.fit_transform(X)

array([[0, 1],
       [1, 0],
       [0, 0],
       [1, 1],
       [1, 0],
       [1, 1]])

## Recursive feature elimination [artical](http://datareview.info/article/otbor-priznakov-dlya-mashinnogo-obucheniya-na-python/)
Метод рекурсивного исключения признаков (recursive feature elimination, RFE) реализует следующий алгоритм: модель обучается на исходном наборе признаков и оценивает их значимость, затем исключается один или несколько наименее значимых признаков, модель обучается на оставшихся признаках, и так далее, пока не останется заданное количество лучших признаков. В документации scikit-learn вы можете подробнее прочитать о классе [RFE](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFE.html#sklearn.feature_selection.RFE).

In [14]:
# Feature Extraction with RFE
from pandas import read_csv
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# load data
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
if os.path.exists("./data/diabetes.csv"):
    dataframe = pd.read_csv("./data/diabetes.csv", names=names)
else:
    print("Put diabetes.csv in folder 'data'")
    exit(1)
array = dataframe.values
    
array = dataframe[1:].astype('float64')
X = array.iloc[:,0:8]
Y = array.iloc[:,8]

# feature extraction
model = LogisticRegression()
rfe = RFE(model, 3)
fit = rfe.fit(X, Y)

print("Num Features: {}".format(fit.n_features_))
print("Selected Features: {}".format(fit.support_))
print("Feature Ranking: {}".format(fit.ranking_))

Num Features: 3
Selected Features: [ True False False False False  True  True False]
Feature Ranking: [1 2 3 5 6 1 1 4]




## Principal component analysis [artical](http://datareview.info/article/otbor-priznakov-dlya-mashinnogo-obucheniya-na-python/)
Метод главных компонент (principal component analysis, PCA) позволяет уменьшить размерность данных с помощью преобразования на основе линейной алгебры. Пользователь может задать требуемое количество измерений (главных компонент) в результирующих данных.
Подробная информация о классе PCA доступна в [документации](https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html) scikit-learn.

In [21]:
# Feature Extraction with PCA
import os
import numpy
import pandas as pd
from pandas import read_csv
from sklearn.decomposition import PCA

# load data
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
if os.path.exists("./data/diabetes.csv"):
    dataframe = pd.read_csv("./data/diabetes.csv", names=names)
else:
    print("Put diabetes.csv in folder 'data'")
    exit(1)
array = dataframe.values

array = dataframe[1:].astype('float64')
X = array.iloc[:,0:8]
Y = array.iloc[:,8]

# feature extraction
pca = PCA(n_components=3)
fit = pca.fit(X)
features = fit.transform(X)

# summarize components
print(f"Explained Variance: {fit.explained_variance_ratio_}")
print(features[0:5,:])

Explained Variance: [0.889 0.062 0.026]
[[-7.571e+01 -3.595e+01 -7.261e+00]
 [-8.236e+01  2.891e+01 -5.497e+00]
 [-7.463e+01 -6.791e+01  1.946e+01]
 [ 1.108e+01  3.490e+01 -5.302e-02]
 [ 8.974e+01 -2.747e+00  2.521e+01]]


Сжымает в пространство меньшей размерности (TODO не понятно пока что для чего и зачем использовать)

## [Отбор на основе важности признаков](http://datareview.info/article/otbor-priznakov-dlya-mashinnogo-obucheniya-na-python/)

Ансамблевые алгоритмы на основе деревьев решений, такие как случайный лес (random forest), позволяют оценить важность признаков.

В представленном ниже примере мы обучаем классификатор ExtraTreesClassifier, чтобы с его помощью определить важность признаков. Подробнее о классе [ExtraTreesClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html) можно узнать из документации scikit-learn

In [24]:
# Feature Importance with Extra Trees Classifier
from pandas import read_csv
from sklearn.ensemble import ExtraTreesClassifier

# load data
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
if os.path.exists("./data/diabetes.csv"):
    dataframe = pd.read_csv("./data/diabetes.csv", names=names)
else:
    print("Put diabetes.csv in folder 'data'")
    exit(1)
array = dataframe.values

array = dataframe[1:].astype('float64')
X = array.iloc[:,0:8]
Y = array.iloc[:,8]

# feature extraction
model = ExtraTreesClassifier()
model.fit(X, Y)

# plas, age, mass
print(pd.DataFrame({"Features": names[:8], "Values": model.feature_importances_}))

[0.115 0.254 0.093 0.084 0.07  0.141 0.117 0.125]
  Features    Values
0     preg  0.115487
1     plas  0.253578
2     pres  0.093278
3     skin  0.083807
4     test  0.070073
5     mass  0.141367
6     pedi  0.117317
7      age  0.125093


