### Naive Bayes on the mushroom dataset

In [1]:
# Goal: predict the class of a mushroom from its features.

In [2]:
import numpy as np
import pandas as pd

#### load the dataset

In [3]:
df=pd.read_csv("mushrooms.csv")



In [4]:
df.head()

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [6]:
df.shape

(8124, 23)

####  encode the categorical data into numerical data

In [10]:
# One option is to iterate over the values and maintain a dictionary mapping each letter to a number (e.g. p: 1, ...),
# or use a library encoder instead.
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [16]:
ls=LabelEncoder()
# DataFrame.apply hands each column to fit_transform in turn, so every column is
# re-fitted and replaced by integer codes.
# NOTE(review): because one encoder is reused, `ls` presumably retains only the
# mapping of the last column it was fitted on -- confirm before using it to decode.
ds=df.apply(ls.fit_transform)

In [15]:
ds.head()

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


In [17]:
# convert to a NumPy array
data=ds.values

In [18]:
data.shape

(8124, 23)

In [20]:
print(data[:5,:])

[[1 5 2 4 1 6 1 0 1 4 0 3 2 2 7 7 0 2 1 4 2 3 5]
 [0 5 2 9 1 0 1 0 0 4 0 2 2 2 7 7 0 2 1 4 3 2 1]
 [0 0 2 8 1 3 1 0 0 5 0 2 2 2 7 7 0 2 1 4 3 2 3]
 [1 5 3 8 1 6 1 0 1 5 0 3 2 2 7 7 0 2 1 4 2 3 5]
 [0 5 2 3 0 5 1 1 0 4 1 3 2 2 7 7 0 2 1 0 3 0 1]]


In [21]:
# column 0 holds the encoded `type` label (the target); the remaining 22 columns are the features
datay=data[:,0]
datax=data[:,1:]

#####  split the data

In [25]:
# hold out 20% of the rows for testing; random_state pins the shuffle so the split is reproducible
x_train, x_test, y_train, y_test = train_test_split(datax,datay,test_size=0.2, random_state=2)

In [27]:
x_train.shape,x_test.shape, y_train.shape,y_test.shape

((6499, 22), (1625, 22), (6499,), (1625,))

In [28]:
# means we have only 2 classes of mushroom
np.unique(y_train)

array([0, 1])

###  building our classifier

In [38]:
# numpy shortcut
a=np.array([1,0,0,1,2,3,1,1,0])
np.sum(a==1) # total number of 1 in numpy array

4

In [35]:
def prior_prob(y_train,label):
    """Return the prior P(label): the fraction of entries in y_train equal to label.

    y_train is a NumPy array of class labels; label is the class of interest.
    """
    n_matching=np.sum(y_train==label)
    n_total=float(y_train.shape[0])
    return n_matching/n_total

In [53]:
def cond_prob(x_train,y_train,feature_col, feature_val, label):
    """Return the conditional probability P(feature == feature_val | label).

    Among the training rows whose class is `label`, this is the fraction whose
    value in column `feature_col` equals `feature_val`.
    """
    in_class=y_train==label
    # select the requested feature column for the rows of this class only
    class_column=x_train[in_class,feature_col]
    matches=np.sum(class_column==feature_val)
    class_size=float(np.sum(in_class))
    return matches/class_size

### compute posterior probability

In [40]:
np.unique(y_train)

array([0, 1])

In [56]:
def posterior_prob(x_train,y_train,x_test):
    """Predict the class label of a single test point with naive Bayes.

    For every class present in ``y_train`` the unnormalised posterior
    ``P(label) * prod_f P(x_test[f] | label)`` is computed, and the label with
    the largest score is returned.

    Parameters
    ----------
    x_train : array with one row per training example and one column per feature.
    y_train : array of class labels aligned with the rows of ``x_train``.
    x_test : the feature values of ONE test point, indexable by feature number.

    Returns
    -------
    The element of ``np.unique(y_train)`` with the highest posterior score
    (``np.argmax`` picks the first class on ties).
    """
    classes=np.unique(y_train)
    n_features=x_train.shape[1]

    pos_prob=[] # unnormalised posterior of every class for this test point

    for label in classes:
        # posterior is proportional to likelihood * prior
        liklihood=1.0
        for f in range(n_features):
            liklihood*=cond_prob(x_train,y_train,f,x_test[f],label)
        prior=prior_prob(y_train,label)
        pos_prob.append(liklihood*prior)

    # np.argmax gives a position inside `classes`, not a label; map it back so the
    # prediction is also correct when the labels are not exactly 0..k-1.
    return classes[np.argmax(pos_prob)]

In [64]:
output=posterior_prob(x_train,y_train,x_test[55])

In [65]:
print(output)
print(y_test[55])

0
0


In [74]:
def score(x_train,y_train,x_test,y_test):
    """Return the accuracy of the naive Bayes classifier on a test set.

    Each row of x_test is classified with posterior_prob using the training
    data; the result is the fraction of predictions that equal y_test.
    """
    n_samples=x_test.shape[0]
    predictions=np.array(
        [posterior_prob(x_train,y_train,x_test[row]) for row in range(n_samples)]
    )

    n_correct=np.sum(predictions==y_test)
    return n_correct/y_test.shape[0]

In [75]:
score(x_train,y_train,x_test,y_test)

0.9944615384615385