# Feature Selection


A seleção de recursos é um processo que seleciona automaticamente os recursos dos seus dados que mais contribuem para a variável de predição ou para a saída em que você está interessado.

Ter muitos recursos irrelevantes nos dados pode diminuir a precisão dos modelos.  
Três benefícios de executar a seleção de recursos antes da modelagem dos dados:

- ##### Reduz o Overfitting:  
Menos dados redundantes significam menos oportunidades de tomar decisões com base em ruído.
  
- ##### Melhora a Accuracy:  
Dados sem redundância melhoram os resultados de precisão.  

- ##### Reduz o tempo de treinamento:  
Com a remoção de alguns dados, o treinamento se torna mais rápido.

### Recursive Feature Elimination - *RFE*

In [None]:
from sklearn import datasets
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Load the iris data set; its feature names label the printed results.
dataset = datasets.load_iris()
features = dataset.feature_names
verb = 0  # forwarded to RFE's `verbose` argument

model = LogisticRegression()

# Create the RFE model and select 3 attributes.
# `n_features_to_select` is passed by keyword: current scikit-learn releases
# only accept `estimator` positionally and reject `RFE(model, 3)`.
rfe = RFE(model, n_features_to_select=3, verbose=verb)
rfe = rfe.fit(dataset.data, dataset.target)

# ranking_ holds one rank per feature, in the same order as `features`.
print('Melhores selecionados recebem ranking 1: ')
for name, rank in zip(features, rfe.ranking_):
    print([rank, name])

print()

# support_ is a boolean mask over the features: True means "kept".
print('Features usadas:')
for name, kept in zip(features, rfe.support_):
    if kept:
        print(name)

### Recursive Feature Elimination Cross-Validated - *RFECV*

In [None]:
from sklearn import datasets
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression

# Data set under study; its feature names label the printed results.
# Alternative data set: datasets.load_breast_cancer()
dataset = datasets.load_iris()
features = dataset.feature_names
verb = 0  # forwarded to RFECV's `verbose` argument

model = LogisticRegression()

# Cross-validated recursive elimination: cv=4 and step=1 are passed through
# to RFECV; fit() returns the fitted selector itself.
rfe = RFECV(model, cv=4, verbose=verb, step=1).fit(dataset.data, dataset.target)

# One rank per feature, paired with the feature's name.
print('Melhores selecionados recebem ranking 1: ')
for name, rank in zip(features, rfe.ranking_):
    print([rank, name])

print()

# Names of the features whose support flag is set.
print('Features usadas:')
for name, kept in zip(features, rfe.support_):
    if kept:
        print(name)

### VarianceThreshold
  
Podemos remover recursos em que a grande maioria das observações tem o mesmo valor, isto é, recursos com baixa variância.

##### Para features binárias

In [None]:
from sklearn.feature_selection import VarianceThreshold

# Toy binary data, one column per feature:
#   feature 0 -> 80% zeros
#   feature 1 -> 80% ones
#   feature 2 -> 60% zeros, 40% ones
X = [
    [0, 1, 0],
    [0, 1, 1],
    [0, 1, 0],
    [0, 1, 1],
    [1, 0, 0],
]


def _select_by_binary_variance(samples, p):
    """Run VarianceThreshold on `samples` with the cut-off p * (1 - p)."""
    cutoff = p * (1 - p)  # VAR = p(1 - p)
    return VarianceThreshold(threshold=cutoff).fit_transform(samples)


# First cut-off: p = 70%.
reduced = _select_by_binary_variance(X, .70)
print('Variância em t1: ')
print(reduced)

print()

# Second cut-off: p = 80%.
reduced = _select_by_binary_variance(X, .80)
print('Variância em t2: ')
print(reduced)

##### Para features de qualquer valor

In [None]:
from sklearn import datasets
from sklearn.feature_selection import VarianceThreshold

# Iris measurements (X) and class labels (y).
iris = datasets.load_iris()
X, y = iris.data, iris.target

# Variance cut-off handed to VarianceThreshold.
t = .18
thresholder = VarianceThreshold(threshold=t).fit(X)
X_high_variance = thresholder.transform(X)

# Preview the first five rows of the reduced matrix.
X_high_variance[:5]

### SelectKBest

In [None]:
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest, chi2, f_classif, mutual_info_classif, f_regression

# mutual_info_regression(X, y) -> Estimate mutual information for a continuous target variable.

# Iris features and labels.
X, y = load_iris(return_X_y=True)

# Statistical scoring functions to compare.
functions = [chi2, f_classif, mutual_info_classif, f_regression]

# One fitted SelectKBest (k=2) per scoring function, in the same order.
selectors = [SelectKBest(score_fn, k=2).fit(X, y) for score_fn in functions]

# Preview the untransformed data for comparison.
pd.DataFrame(X).head()

In [None]:
# Show the repr of every fitted selector.
for fitted in selectors:
    print(fitted)

In [None]:
# Apply each fitted selector to X and preview the columns it keeps.
for selector in selectors:
    preview = pd.DataFrame(selector.transform(X)).head()
    print('Usando ', selector.score_func.__name__)
    print(preview)

In [None]:
# Print the per-feature scores computed by each scoring function,
# followed by a blank line.
for selector in selectors:
    print('Score of: ', selector.score_func.__name__)
    print(selector.scores_)
    print()

### SelectPercentile

In [None]:
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.feature_selection import SelectPercentile, chi2, f_classif, mutual_info_classif, f_regression

# Statistical scoring functions to compare.
functions = [chi2, f_classif, mutual_info_classif, f_regression]

# Breast-cancer features and labels.
X, y = load_breast_cancer(return_X_y=True)

# One fitted SelectPercentile (percentile=80) per scoring function.
selectors = [
    SelectPercentile(score_fn, percentile=80).fit(X, y)
    for score_fn in functions
]

# Preview the untransformed data for comparison.
pd.DataFrame(X).head()

In [None]:
# Apply each fitted selector to X and preview the columns it keeps.
for selector in selectors:
    preview = pd.DataFrame(selector.transform(X)).head()
    print('Usando ', selector.score_func.__name__)
    print(preview)

In [None]:
# For each scoring function: the shape of the reduced matrix, then the
# per-feature scores, then a blank line.
for selector in selectors:
    print('Score of: ', selector.score_func.__name__)
    print(selector.transform(X).shape)
    print(selector.scores_)
    print()

### SelectFromModel

In [None]:
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

# Iris features and labels.
X, y = load_iris(return_X_y=True)

model = LogisticRegression()

# Fit the selector with an initial threshold of 0.5.
sel = SelectFromModel(model, threshold=.5).fit(X, y)

# Transform once up front so X_transform is defined even when the initial
# selection already has two or fewer features and the loop below never runs.
X_transform = sel.transform(X)
n_features = X_transform.shape[1]

# Raise the threshold in steps of 0.1 until at most two features remain.
while n_features > 2:
    sel.threshold += 0.1
    X_transform = sel.transform(X)
    n_features = X_transform.shape[1]

X_transform.shape

### GenericUnivariateSelect

Usa os seletores Select* vistos anteriormente, escolhidos por meio do parâmetro `mode`.

In [None]:
from sklearn.datasets import load_iris
from sklearn.feature_selection import GenericUnivariateSelect, chi2

# Iris features and labels.
X, y = load_iris(return_X_y=True)

# mode is one of: 'percentile', 'k_best', 'fpr', 'fdr', 'fwe'.
selector = GenericUnivariateSelect(chi2, mode='k_best', param=2)
X_transform = selector.fit_transform(X, y)
X_transform.shape

### SelectFpr, SelectFdr e SelectFwe

Ver mais: [sklearn - Feature Selection](http://scikit-learn.org/stable/modules/classes.html#module-sklearn.feature_selection)