# Naive Byes Classifier

In [61]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [16]:
df = pd.read_csv('agaricus-lepiota.data',header= None)
df

df.shape

df.columns=['class','cap-shape','cap-surface','cap-color','bruises?','odor','gill-attachment','gill-spacing','gill-size','gill-color','stalk-shape','stalk-root','stalk-surface-above-ring', 'stalk-surface-below-ring','stalk-color-above-ring','stalk-color-below-ring','veil-type','veil-color','ring-number','ring-type','spore-print-color','population','habitat']
df.columns

df.dtypes

df['class'].value_counts()

e    4208
p    3916
Name: class, dtype: int64

In [21]:
df_encoded = pd.DataFrame()
enc = LabelEncoder()

#columns_to_encode = ['cap-shape','cap-surface','cap-color','bruises?','odor','gill-attachment','gill-spacing','gill-size','gill-color','stalk-shape','stalk-root','stalk-surface-above-ring', 'stalk-surface-below-ring','stalk-color-above-ring','stalk-color-below-ring','veil-type','veil-color','ring-number','ring-type','spore-print-color','population','habitat']

for col in df.columns :
    df_encoded[col] = enc.fit_transform(df[col])
df_encoded

df_encoded.dtypes

class                       int32
cap-shape                   int32
cap-surface                 int32
cap-color                   int32
bruises?                    int32
odor                        int32
gill-attachment             int32
gill-spacing                int32
gill-size                   int32
gill-color                  int32
stalk-shape                 int32
stalk-root                  int32
stalk-surface-above-ring    int32
stalk-surface-below-ring    int32
stalk-color-above-ring      int32
stalk-color-below-ring      int32
veil-type                   int32
veil-color                  int32
ring-number                 int32
ring-type                   int32
spore-print-color           int32
population                  int32
habitat                     int32
dtype: object

# Implementing feature selection using tree-> feature importances

In [46]:
X = df_encoded.drop(['class'],axis = 1)
y = df_encoded[['class']]

clf = DecisionTreeClassifier(criterion = 'entropy')
clf.fit(X,y)
clf.feature_importances_
clf.feature_names_in_

array(['cap-shape', 'cap-surface', 'cap-color', 'bruises?', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat'],
      dtype=object)

In [49]:
feature_scores= pd.DataFrame()
feature_scores['Name'] = clf.feature_names_in_
feature_scores['Importance'] = clf.feature_importances_
feature_scores

Unnamed: 0,Name,Importance
0,cap-shape,0.0
1,cap-surface,0.0
2,cap-color,0.0
3,bruises?,0.027191
4,odor,0.088867
5,gill-attachment,0.0
6,gill-spacing,0.0
7,gill-size,0.150039
8,gill-color,0.26965
9,stalk-shape,0.0


# Creating Naive Byes Classifier Model

In [69]:
X_encoded = df_encoded[['odor','bruises?','gill-color','gill-size','stalk-root','stalk-surface-above-ring','spore-print-color',
                        'population','habitat']]
y_encoded = df_encoded['class']

In [70]:
X_train,X_test,y_train,y_test = train_test_split(X_encoded,y_encoded,test_size = 0.2,random_state = 42)

In [71]:
clf1 = GaussianNB()
clf1.fit(X_train, y_train)
y_pred = clf1.predict(X_test)

In [67]:
print(accuracy_score(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

0.9218461538461539
[[771  72]
 [ 55 727]]
              precision    recall  f1-score   support

           0       0.93      0.91      0.92       843
           1       0.91      0.93      0.92       782

    accuracy                           0.92      1625
   macro avg       0.92      0.92      0.92      1625
weighted avg       0.92      0.92      0.92      1625

