# Introdução
Para esse módulo, irei utilizar a técnica do Naive Bayes que O Naive Bayes é um algoritmo de aprendizado de máquina baseado em probabilidade e no Teorema de Bayes. Ele assume independência entre os atributos, sendo usado para classificação com rapidez e simplicidade, especialmente em grandes conjuntos de dados. 

In [33]:
import pandas as pd
import numpy as np 
import sklearn

In [39]:
df = pd.read_csv('agaricus-lepiota.data')
df = df.rename(columns={
    'p': 'poisonous',
    'x': 'cap-shape',
    's': 'cap-surface',
    'n': 'cap-color',
    't': 'bruises',
    'p.1': 'odor',
    'f': 'gill-attachment',
    'c': 'gill-spacing',
    'n.1': 'gill-size',
    'k': 'gill-color',
    'e': 'stalk-shape',
    'e.1': 'stalk-root',
    's.1': 'stalk-surface-above-ring',
    's.2': 'stalk-surface-below-ring',
    'w': 'stalk-color-above-ring',
    'w.1': 'stalk-color-below-ring',
    'p.2': 'veil-type',
    'w.2': 'veil-color',
    'o': 'ring-number',
    'p.3': 'ring-type',
    'k.1': 'spore-print-color',
    's.3': 'population',
    'u': 'habitat'
})
# Exibindo completude da base
print(df.isnull().sum())
print(df.count())

poisonous                   0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64
poisonous                   8123
cap-shape                   8123
cap-surface                 8123
cap-color                   8123
bruises                     8123
odor                        8123
gill-attachment             8123
gill-spacing                8123
gill-size                   8123


In [35]:
# encondar os valores da base, para desencodarmos podemos usar o inverse_transform do SKLearn
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df_encodado  = df.apply(le.fit_transform)
df_encodado.head()

Unnamed: 0,poisonous,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
1,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
2,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
3,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1
4,0,5,3,9,1,0,1,0,0,5,...,2,7,7,0,2,1,4,2,2,1


In [36]:
# Separando a base em treino e teste
from sklearn.model_selection import train_test_split
X = df_encodado.drop('poisonous', axis=1)
y = df_encodado['poisonous']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Treinando o modelo Naive Bayes
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train, y_train)

# Testando o modelo
y_pred = gnb.predict(X_test)

# Avaliando o modelo
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

# Matriz de confusão e o valor predito
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

# Exibindo a matriz de confusão
pd.crosstab(y_test, y_pred, rownames=['Real'], colnames=['Predito'], margins=True)

0.9257283545342634


Predito,0,1,All
Real,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1166,81,1247
1,100,1090,1190
All,1266,1171,2437


In [37]:
# Mostrando a feature importance para cada um dos atributos
importances = gnb.theta_
importances = np.array(importances)
importances = importances[0]
importances = importances/sum(importances)
importances = np.array(importances)
importances = importances.reshape(1, 22)
importances = pd.DataFrame(importances, columns=X.columns)
importances = importances.T
importances = importances.rename(columns={0: 'importance'})
importances = importances.sort_values(by='importance', ascending=False)
print(importances)


                          importance
gill-color                  0.122430
stalk-color-above-ring      0.112622
stalk-color-below-ring      0.112190
cap-color                   0.084734
odor                        0.080011
population                  0.061331
cap-shape                   0.061100
spore-print-color           0.058973
ring-type                   0.056508
veil-color                  0.035839
stalk-surface-below-ring    0.033574
stalk-surface-above-ring    0.033018
cap-surface                 0.029896
stalk-root                  0.027813
habitat                     0.021294
ring-number                 0.020813
gill-attachment             0.017729
bruises                     0.012305
stalk-shape                 0.011385
gill-spacing                0.005117
gill-size                   0.001320
veil-type                   0.000000


# Resutaldo

Foram utilizadas 8123 registros, sendo utilizado 30% para testes e 70% para validação. Ao treinar o modelo Naive Bayes, foi encontrado a acurácia de 92%, onde no livro foi encontrado uma acurácia de 97%.
