# Feature Selection


A seleção de recursos é um processo que seleciona automaticamente, nos seus dados, os recursos que mais contribuem para a variável de predição ou para a saída em que você está interessado.

Ter muitos recursos irrelevantes nos dados pode diminuir a precisão dos modelos.  
Três benefícios de executar a seleção de recursos antes da modelagem dos dados:

- ##### Reduz o Overfitting:  
Dados menos redundantes significam menos oportunidades de tomar decisões baseadas em ruído.
  
- ##### Melhora a Accuracy:  
Dados sem redundância melhoram os resultados de precisão.  

- ##### Reduz o tempo de treinamento:  
Com a remoção de alguns dados, o treinamento se torna mais rápido.

### Recursive Feature Elimination - *RFE*

In [None]:
from sklearn import datasets
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Load the iris data and keep the column names for readable output.
dataset = datasets.load_iris()
features = dataset.feature_names

model = LogisticRegression()

# Create the RFE model and select 3 attributes.
# `n_features_to_select` is passed by keyword: recent scikit-learn releases
# reject it as a positional argument, while the keyword form works everywhere.
rfe = RFE(model, n_features_to_select=3, verbose=1)
rfe = rfe.fit(dataset.data, dataset.target)

# ranking_ holds one rank per input column, in the same order as `features`.
print('Melhores selecionados recebem ranking 1: ')
for rank, name in zip(rfe.ranking_, features):
    print([rank, name])

print()

# support_ is the per-column mask; print only the columns that are flagged.
print('Features usadas:')
for name, selected in zip(features, rfe.support_):
    if selected:
        print(name)

### Recursive Feature Elimination Cross-Validated - *RFECV*

In [None]:
from sklearn import datasets
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression

# Load the iris data and keep the column names for readable output.
dataset = datasets.load_iris()
features = dataset.feature_names

# Same estimator as the plain RFE example, but wrapped in RFECV with
# 4 cross-validation folds (cv=4) instead of a fixed feature count.
model = LogisticRegression()
rfe = RFECV(model, cv=4, verbose=1).fit(dataset.data, dataset.target)

# ranking_ holds one rank per input column, in the same order as `features`.
print('Melhores selecionados recebem ranking 1: ')
for name, rank in zip(features, rfe.ranking_):
    print([rank, name])

print()

# support_ is the per-column mask; print only the columns that are flagged.
print('Features usadas:')
for name, selected in zip(features, rfe.support_):
    if selected:
        print(name)

### VarianceThreshold
  
Podemos remover recursos nos quais a grande maioria das observações tem o mesmo valor (ou seja, recursos com baixa variância).

##### Para features binárias

In [126]:
from sklearn.feature_selection import VarianceThreshold

# Five samples of three binary features:
#   feature 0 -> 80% of the values are 0
#   feature 1 -> 80% of the values are 1
#   feature 2 -> 60% of the values are 0, 40% are 1
X = [[0, 1, 0],
     [0, 1, 1],
     [0, 1, 0],
     [0, 1, 1],
     [1, 0, 0]]


def _filter_by_bernoulli_variance(samples, p):
    """Fit a VarianceThreshold at p * (1 - p) and return the transformed data.

    p * (1 - p) is the variance of a binary feature whose majority value
    occurs with probability p.
    """
    return VarianceThreshold(threshold=(p * (1 - p))).fit_transform(samples)


p1 = .70  # majority share of 70%
print('Variância em t1: ')
print(_filter_by_bernoulli_variance(X, p1))

print()

p2 = .80  # majority share of 80%
print('Variância em t2: ')
print(_filter_by_bernoulli_variance(X, p2))

Variância em t1: 
[[0]
 [1]
 [0]
 [1]
 [0]]

Variância em t2: 
[[0 1 0]
 [0 1 1]
 [0 1 0]
 [0 1 1]
 [1 0 0]]


##### Para features de qualquer valor

In [125]:
from sklearn import datasets
from sklearn.feature_selection import VarianceThreshold

iris = datasets.load_iris()
X, y = iris.data, iris.target

# For non-binary features the threshold is an absolute variance value
# (0.5 here), not a percentage.
t = .5
thresholder = VarianceThreshold(threshold=t)
thresholder.fit(X)
X_high_variance = thresholder.transform(X)

# Preview the first five rows of the reduced matrix.
X_high_variance[:5]

array([[5.1, 1.4, 0.2],
       [4.9, 1.4, 0.2],
       [4.7, 1.3, 0.2],
       [4.6, 1.5, 0.2],
       [5. , 1.4, 0.2]])

### SelectKBest

In [124]:
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest, chi2, f_classif, mutual_info_classif, f_regression

# NOTE: mutual_info_regression(X, y) estimates mutual information for a
# continuous target variable; it is not used here.

X, y = load_iris(return_X_y=True)

# Statistical scoring functions to compare.
functions = [chi2, f_classif, mutual_info_classif, f_regression]

# One fitted selector per scoring function, each configured with k=2.
selectors = [SelectKBest(score_func, k=2).fit(X, y) for score_func in functions]

# Preview the untouched input matrix for comparison with the reduced ones.
pd.DataFrame(X).head()

Unnamed: 0,0,1,2,3
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [119]:
# Show the repr of every fitted selector, one per line.
for fitted_selector in selectors:
    print(fitted_selector)

SelectKBest(k=2, score_func=<function chi2 at 0x7faaf7910e18>)
SelectKBest(k=2, score_func=<function f_classif at 0x7faaf7910d08>)
SelectKBest(k=2, score_func=<function mutual_info_classif at 0x7faaf6b471e0>)
SelectKBest(k=2, score_func=<function f_regression at 0x7faaf7910ea0>)


In [120]:
# For each fitted selector, preview the first rows of the reduced matrix
# under the name of the scoring function that produced it.
for fitted_selector in selectors:
    preview = pd.DataFrame(fitted_selector.transform(X)).head()
    score_name = fitted_selector.score_func.__name__
    print('Usando ', score_name)
    print(preview)

Usando  chi2
     0    1
0  1.4  0.2
1  1.4  0.2
2  1.3  0.2
3  1.5  0.2
4  1.4  0.2
Usando  f_classif
     0    1
0  1.4  0.2
1  1.4  0.2
2  1.3  0.2
3  1.5  0.2
4  1.4  0.2
Usando  mutual_info_classif
     0    1
0  1.4  0.2
1  1.4  0.2
2  1.3  0.2
3  1.5  0.2
4  1.4  0.2
Usando  f_regression
     0    1
0  1.4  0.2
1  1.4  0.2
2  1.3  0.2
3  1.5  0.2
4  1.4  0.2


In [121]:
# Print the per-feature scores computed by each scoring function,
# followed by a blank separator line.
for fitted_selector in selectors:
    print('Score of: ', fitted_selector.score_func.__name__)
    print(fitted_selector.scores_)
    print()

Score of:  chi2
[ 10.81782088   3.59449902 116.16984746  67.24482759]

Score of:  f_classif
[ 119.26450218   47.3644614  1179.0343277   959.32440573]

Score of:  mutual_info_classif
[0.45328809 0.31382581 0.98841858 0.99631433]

Score of:  f_regression
[ 233.8389959    31.59750825 1342.15918918 1589.55920433]



### SelectPercentile

In [25]:
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectPercentile, chi2, f_classif, mutual_info_classif, f_regression

X, y = load_iris(return_X_y=True)

# Statistical scoring functions to compare.
functions = [chi2, f_classif, mutual_info_classif, f_regression]

# One fitted selector per scoring function, each configured with percentile=10.
selectors = [
    SelectPercentile(score_func, percentile=10).fit(X, y)
    for score_func in functions
]

# Preview the untouched input matrix for comparison with the reduced ones.
pd.DataFrame(X).head()

Unnamed: 0,0,1,2,3
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [27]:
# For each fitted selector, preview the first rows of the reduced matrix
# under the name of the scoring function that produced it.
for fitted_selector in selectors:
    preview = pd.DataFrame(fitted_selector.transform(X)).head()
    score_name = fitted_selector.score_func.__name__
    print('Usando ', score_name)
    print(preview)

Usando  chi2
     0
0  1.4
1  1.4
2  1.3
3  1.5
4  1.4
Usando  f_classif
     0
0  1.4
1  1.4
2  1.3
3  1.5
4  1.4
Usando  mutual_info_classif
     0
0  0.2
1  0.2
2  0.2
3  0.2
4  0.2
Usando  f_regression
     0
0  0.2
1  0.2
2  0.2
3  0.2
4  0.2


In [28]:
# Print the per-feature scores computed by each scoring function,
# followed by a blank separator line.
for fitted_selector in selectors:
    print('Score of: ', fitted_selector.score_func.__name__)
    print(fitted_selector.scores_)
    print()

Score of:  chi2
[ 10.81782088   3.59449902 116.16984746  67.24482759]

Score of:  f_classif
[ 119.26450218   47.3644614  1179.0343277   959.32440573]

Score of:  mutual_info_classif
[0.50037996 0.25925553 0.98426092 0.98989393]

Score of:  f_regression
[ 233.8389959    31.59750825 1342.15918918 1589.55920433]



### SelectFromModel

In [16]:
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

X, y = load_iris(return_X_y=True)

model = LogisticRegression()

# Fit once with an initial importance threshold of 0.5.
sel = SelectFromModel(model, threshold=.5).fit(X, y)

# Transform before the loop so X_transform is always defined, even when the
# initial threshold already leaves two or fewer features and the loop body
# never runs.
X_transform = sel.transform(X)
n_features = X_transform.shape[1]

# Raise the threshold in steps of 0.1 and re-apply the already fitted
# selector until at most two features remain.
while n_features > 2:
    sel.threshold += 0.1
    X_transform = sel.transform(X)
    n_features = X_transform.shape[1]

X_transform.shape

(150, 2)

### GenericUnivariateSelect

In [31]:
from sklearn.datasets import load_iris
from sklearn.feature_selection import GenericUnivariateSelect, chi2

X, y = load_iris(return_X_y=True)

# Available modes: 'percentile', 'k_best', 'fpr', 'fdr', 'fwe'.
# Here: 'k_best' with param=2, scored with chi2.
selector = GenericUnivariateSelect(chi2, mode='k_best', param=2)
X_transform = selector.fit_transform(X, y)
X_transform.shape

(150, 2)

### SelectFpr