# Feature selection  
## chi-squared test [article](http://datareview.info/article/otbor-priznakov-dlya-mashinnogo-obucheniya-na-python/)

In [34]:
# Feature selection with univariate statistical tests (chi-squared for classification)
import os
import pandas as pd
import numpy
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Load the diabetes data set.
# NOTE: `names=` is passed without `header=0`, so the file's own header line is
# read as data row 0 of `dataframe`. It is skipped below before the float cast.
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
data_path = "./data/diabetes.csv"
if not os.path.exists(data_path):
    # Raise instead of print + exit(1): in a notebook, exit() targets the kernel
    # rather than reporting a clear error for this cell.
    raise FileNotFoundError(f"{data_path} not found. Put diabetes.csv in folder 'data'")
dataframe = pd.read_csv(data_path, names=names)

# Drop row 0 (the header text) and convert everything to floats.
array = dataframe[1:].astype('float64')
X = array.iloc[:, 0:8]  # the eight feature columns
Y = array.iloc[:, 8]    # the 'class' target column

# Feature selection: keep the 4 features with the highest chi-squared score.
test = SelectKBest(score_func=chi2, k=4)
fit = test.fit(X, Y)

# Summarize scores.
# set_printoptions only changes how NumPy arrays are printed from here on;
# the DataFrame printed below keeps pandas' own float formatting.
numpy.set_printoptions(precision=3)
print(pd.DataFrame({"Features": names[:8], "Values": fit.scores_}))

# Summarize selected features.
features = fit.transform(X)
print(f"new shape: {features.shape}")

  Features       Values
0     preg   111.519691
1     plas  1411.887041
2     pres    17.605373
3     skin    53.108040
4     test  2175.565273
5     mass   127.669343
6     pedi     5.392682
7      age   181.303689
new shape: (768, 4)


In [12]:
# Show only the columns retained by the fitted selector (picked by position).
dataframe.iloc[:, fit.get_support(indices=True)]

Unnamed: 0,plas,test,mass,age
0,Glucose,Insulin,BMI,Age
1,148,0,33.6,50
2,85,0,26.6,31
3,183,0,23.3,32
4,89,94,28.1,21
5,137,168,43.1,33
6,116,0,25.6,30
7,78,88,31,26
8,115,0,35.3,29
9,197,543,30.5,53


Example from the scikit-learn user guide.

In [35]:
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
# Load the iris data set and split it into the feature matrix and the target.
iris = load_iris()
X = iris.data
y = iris.target
X.shape

(150, 4)

In [36]:
# Keep the two features with the highest chi-squared score.
X_tmp = SelectKBest(score_func=chi2, k=2)
X_new = X_tmp.fit(X, y).transform(X)
X_new.shape

(150, 2)

In [38]:
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
iris = load_iris()
X = iris.data
y = iris.target

# Fit only (no transform) so the chi-squared scores can be inspected.
X_tmp = SelectKBest(score_func=chi2, k=2)
X_tmp.fit(X, y)
print(X_tmp.scores_)  # one chi-squared score per input feature

[ 10.818   3.711 116.313  67.048]


## Removing features with low variance [User Guide](https://scikit-learn.org/stable/modules/feature_selection.html#removing-features-with-low-variance)

In [14]:
from sklearn.feature_selection import VarianceThreshold
# Toy boolean data set: six samples, three features.
X = [
    [0, 0, 1],
    [0, 1, 0],
    [1, 0, 0],
    [0, 1, 1],
    [0, 1, 0],
    [0, 1, 1],
]
# Threshold written as p * (1 - p) with p = 0.8, i.e. the variance of a
# Bernoulli variable; columns whose variance is not above it are dropped.
sel = VarianceThreshold(threshold=(0.8 * (1 - 0.8)))
sel.fit_transform(X)

array([[0, 1],
       [1, 0],
       [0, 0],
       [1, 1],
       [1, 0],
       [1, 1]])