# The Naive Bayes classifier is a supervised learning algorithm based on Bayes' theorem of probability.

In [1]:
import numpy as np
import pandas as pd

In [2]:
# NOTE: this is an absolute, machine-specific Windows path; point it at your local copy of mushrooms.csv.
df = pd.read_csv('D:/LEARNING/DATA SCIENCE/MACHINE LEARNING/Naive Bayes Classifier/mushrooms.csv')

In [3]:
# Preview the first rows to confirm the columns loaded as expected.
df.head()

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


# This data is categorical, not numerical. We could convert it by mapping each value through a dictionary, but scikit-learn already provides a built-in tool for this: LabelEncoder.

In [4]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [5]:
# Create a LabelEncoder; it maps each distinct value of a column to an integer code.
le = LabelEncoder()

# DataFrame.apply calls le.fit_transform once per column. The same encoder object is
# re-fitted for every column, so afterwards `le` only holds the mapping of the last column.
ds = df.apply(le.fit_transform) # encode every column of df independently


In [6]:
# Preview the encoded frame: every categorical value is now an integer code.
ds.head()

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


In [7]:
# Convert the encoded pandas DataFrame into a plain NumPy array and report its shape and type.
data = ds.values
print(data.shape, type(data), sep="\n")

# Column 0 holds the class label (`type`); every remaining column is a feature.
data_y, data_x = data[:, 0], data[:, 1:]

(8124, 23)
<class 'numpy.ndarray'>


## Breaking the data into train and test

In [8]:
# Hold out 20% of the rows for testing and train on the remaining 80%.
# No random_state is passed, so the split (and every number reported below) changes from run to run.
x_train,x_test,y_train,y_test = train_test_split(data_x,data_y,test_size=0.2) # 80% for training and 20% for testing

In [9]:
# Sanity-check the four splits: train/test features first, then train/test labels.
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape, sep="\n")

(6499, 22)
(1625, 22)
(6499,)
(1625,)


In [10]:
# Check which class labels occur in the training split
np.unique(y_train)

array([0, 1])

### There are two classes of mushroom: 0 (`e`) and 1 (`p`)

# Building the classifier

In [13]:
# NumPy shortcut for counting a value: comparing an array with a scalar gives a boolean
# mask, and summing that mask counts the matches without an explicit Python loop.
a = np.array([0, 0, 0, 5, 5, 1, 1, 1, 0, 1])
print((a == 5).sum())
print((a == 1).sum())

2
4


In [14]:
def prior_prob(y_train,label):
    """Return the prior P(class = label): the fraction of entries in y_train equal to label."""
    # Boolean mask marking the entries that belong to the requested class.
    is_label = (y_train == label)
    n_samples = y_train.shape[0]
    return np.sum(is_label) / float(n_samples)

In [15]:
# Quick check: label 1 appears in 2 of the 10 entries, so the prior should be 0.2
y=np.array([0,0,0,0,4,4,4,4,1,1])
prior_prob(y,1)

0.2

In [17]:
def conditional_prob(x_train,y_train,feature_col,feature_val,label):
    """Estimate P(feature_col == feature_val | class == label) from the training data.

    Example: if a class contains 3 mushrooms, 2 green and 1 white, then
    P(color = green | class) is 2/3. Any feature column can be queried this way.
    """
    # Rows of the training set that belong to the requested class.
    in_class = (y_train == label)
    class_rows = x_train[in_class]
    class_size = np.sum(in_class)

    # Among those rows, count how many carry the requested value in the chosen column.
    matches = np.sum(class_rows[:, feature_col] == feature_val)
    return matches / float(class_size)

## Calculating the posterior probability for each example and making predictions

In [22]:
def predict(x_train,y_train,xtest):
    """Classify a single sample with categorical Naive Bayes.

    Parameters
    ----------
    x_train : 2-D array of training features, one row per sample.
    y_train : 1-D array of training labels, aligned with the rows of x_train.
    xtest : one testing point; a 1-D sequence with one value per column of x_train.

    Returns
    -------
    The class label (a value taken from y_train) whose unnormalised posterior,
    likelihood * prior, is largest. On a tie the smallest label wins, because
    np.unique returns the labels sorted and np.argmax picks the first maximum.
    """
    classes = np.unique(y_train)
    n_features = x_train.shape[1]

    # One unnormalised posterior per class, in the same order as `classes`.
    post_prob = []
    for label in classes:
        # Naive independence assumption: the likelihood is the product of the
        # per-feature conditional probabilities.
        likelihood = 1.0
        for f in range(n_features):
            likelihood *= conditional_prob(x_train, y_train, f, xtest[f], label)

        # posterior is proportional to likelihood * prior
        post_prob.append(likelihood * prior_prob(y_train, label))

    # np.argmax gives a position inside `classes`, not a label. Map it back so the
    # result is also correct when the labels are not exactly 0..k-1 (e.g. {0, 1, 4}).
    return classes[np.argmax(post_prob)]

In [27]:
# Spot-check two test samples: for each, print the predicted label followed by the true label
output = predict(x_train,y_train,x_test[2])
print(output)
print(y_test[2])

print()
output = predict(x_train,y_train,x_test[1])
print(output)
print(y_test[1])

1
1

0
0


In [28]:
def score(x_train,y_train,x_test,y_test):
    """Return the accuracy on a test set: the fraction of rows of x_test whose
    predicted label equals the corresponding entry of y_test."""
    n_test = x_test.shape[0]
    # Classify every test row independently against the full training set.
    predicted = np.array([predict(x_train, y_train, x_test[i]) for i in range(n_test)])
    n_correct = np.sum(predicted == y_test)
    return n_correct / y_test.shape[0]

In [29]:
# Accuracy of the classifier on the held-out test split
print(score(x_train,y_train,x_test,y_test))

0.9956923076923077


## The classifier achieves a good accuracy of about 99.6% on the test split