## Seleção de atributos com SelectFdr: qui-quadrado e ANOVA

In [3]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectFdr, chi2
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

In [4]:
# Read the raw data file. header=None: the file has no header row, so pandas
# labels the columns with integers starting at 0.
dataset = pd.read_csv(filepath_or_buffer='../0_datasets/ad.data', header=None)
# Preview the first five rows.
dataset.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1549,1550,1551,1552,1553,1554,1555,1556,1557,1558
0,125,125,1.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ad.
1,57,468,8.2105,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ad.
2,33,230,6.9696,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ad.
3,60,468,7.8,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ad.
4,60,468,7.8,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ad.


In [5]:
# Feature matrix: the first 1558 columns (positions 0..1557) as a NumPy array.
# Column 1558 holds the class label and is extracted separately.
X = dataset.iloc[:, :1558].to_numpy()
X

array([[125.    , 125.    ,   1.    , ...,   0.    ,   0.    ,   0.    ],
       [ 57.    , 468.    ,   8.2105, ...,   0.    ,   0.    ,   0.    ],
       [ 33.    , 230.    ,   6.9696, ...,   0.    ,   0.    ,   0.    ],
       ...,
       [ 23.    , 120.    ,   5.2173, ...,   0.    ,   0.    ,   0.    ],
       [  0.    ,   0.    ,   0.    , ...,   0.    ,   0.    ,   0.    ],
       [ 40.    ,  40.    ,   1.    , ...,   0.    ,   0.    ,   0.    ]])

In [6]:
# Target vector: the column labelled 1558 (class label). Because the file was
# read with header=None, integer labels coincide with column positions.
y = dataset.loc[:, 1558]
y

0          ad.
1          ad.
2          ad.
3          ad.
4          ad.
         ...  
3274    nonad.
3275    nonad.
3276    nonad.
3277    nonad.
3278    nonad.
Name: 1558, Length: 3279, dtype: object

In [7]:
# Distinct class labels and how many rows carry each one (class balance check).
np.unique(y.to_numpy(), return_counts=True)

(array(['ad.', 'nonad.'], dtype=object), array([ 459, 2820], dtype=int64))

In [8]:
# Baseline: Gaussian Naive Bayes fitted on every feature in X.
# NOTE: accuracy is measured on the same rows used for fitting, so it is a
# training accuracy, not an estimate of generalisation.
naive1 = GaussianNB().fit(X, y)
previsoes1 = naive1.predict(X)
accuracy_score(y_true=y, y_pred=previsoes1)

0.7813357731015553

In [9]:
# Chi-squared feature selection with false-discovery-rate control at alpha=0.01.
selecao = SelectFdr(score_func=chi2, alpha=0.01)
# Fit the selector on the full data, then keep only the selected columns.
selecao.fit(X, y)
X_novo = selecao.transform(X)

In [10]:
# Compare dimensions before and after the chi2 selection.
(X.shape, X_novo.shape)

((3279, 1558), (3279, 433))

In [11]:
# Per-feature p-values computed by the chi2 selector, and how many there are
# (one per input column).
(selecao.pvalues_, selecao.pvalues_.shape[0])

(array([2.14710304e-268, 0.00000000e+000, 8.98165813e-150, ...,
        6.03353380e-041, 5.63437216e-012, 9.37945775e-002]),
 1558)

In [12]:
# Number of features whose raw p-value is at or below 0.01.
# This count can exceed the number of columns SelectFdr keeps (476 vs 433 in
# the recorded outputs): SelectFdr applies the Benjamini-Hochberg procedure
# rather than a plain per-feature cutoff.
(selecao.pvalues_ <= 0.01).sum()

476

In [13]:
# Boolean mask over the input columns: True where the chi2 selector kept the feature.
colunas = selecao.get_support(indices=False)
colunas

array([ True,  True,  True, ...,  True,  True, False])

In [14]:
# Positions of the selected columns. The mask is already boolean, so it is
# passed directly; the result is a 1-tuple holding the index array, exactly as
# the single-argument np.where form returns.
indices = np.nonzero(colunas)
indices

(array([   0,    1,    2,    9,   11,   14,   20,   21,   26,   31,   34,
          36,   49,   58,   59,   64,   65,   69,   70,   86,   91,   95,
          96,  102,  104,  106,  113,  133,  134,  139,  155,  158,  163,
         167,  172,  175,  180,  181,  183,  185,  186,  189,  190,  192,
         193,  224,  242,  246,  248,  249,  251,  259,  264,  265,  266,
         267,  268,  269,  270,  274,  276,  278,  287,  290,  293,  304,
         307,  310,  312,  317,  321,  329,  330,  336,  341,  345,  346,
         350,  351,  355,  356,  357,  359,  366,  367,  370,  372,  381,
         386,  388,  389,  398,  405,  418,  420,  426,  427,  429,  430,
         432,  435,  440,  455,  457,  460,  465,  470,  472,  477,  478,
         482,  508,  511,  518,  528,  532,  533,  540,  542,  548,  551,
         556,  572,  573,  574,  576,  586,  622,  625,  627,  638,  643,
         648,  653,  658,  661,  663,  666,  683,  688,  694,  703,  704,
         709,  711,  720,  723,  729, 

In [15]:
# Gaussian Naive Bayes refitted on the chi2-selected features (X_novo).
# As with the baseline, accuracy is computed on the training rows themselves.
naive2 = GaussianNB().fit(X_novo, y)
previsoes2 = naive2.predict(X_novo)
accuracy_score(y_true=y, y_pred=previsoes2)

0.970722781335773

### ANOVA (f_classif)

In [17]:
from sklearn.feature_selection import f_classif

In [18]:
selecao3 = SelectFdr(f_classif, alpha=0.01)
X_final = selecao.fit_transform(X,y)

In [19]:
X.shape, X_novo.shape, X_final.shape

((3279, 1558), (3279, 433), (3279, 433))

In [21]:
selecao.pvalues_, np.sum(selecao.pvalues_ < 0.01)

(array([2.14710304e-268, 0.00000000e+000, 8.98165813e-150, ...,
        6.03353380e-041, 5.63437216e-012, 9.37945775e-002]),
 476)

In [22]:
# Gaussian Naive Bayes fitted on X_final, the feature matrix produced by the
# selection step of this section. Accuracy is again measured on the training rows.
naive3 = GaussianNB().fit(X_final, y)
previsoes3 = naive3.predict(X_final)
accuracy_score(y_true=y, y_pred=previsoes3)

0.970722781335773