In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from collections import Counter
from tqdm import tqdm

In [2]:
data=pd.read_csv("mushroom.csv").values
x=data[:,1:]
y=data[:,0]
y                                                #e means edible mushroom, p means poisonous

array(['e', 'e', 'p', ..., 'e', 'p', 'e'], dtype=object)

In [3]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.33)
x_train.shape,x_test.shape,y_train.shape,y_test.shape

((5442, 22), (2681, 22), (5442,), (2681,))

In [4]:
x_train

array([['f', 'y', 'g', ..., 'k', 'v', 'd'],
       ['x', 'f', 'n', ..., 'k', 'y', 'd'],
       ['f', 'y', 'n', ..., 'w', 'v', 'l'],
       ...,
       ['f', 'f', 'e', ..., 'k', 'y', 'd'],
       ['x', 'y', 'n', ..., 'n', 'v', 'd'],
       ['x', 'f', 'y', ..., 'h', 'y', 'd']], dtype=object)

In [13]:
class NaiveBayes():
    
    def __init__(self):
        pass
    
    def prior_probability(self,y_train,label):
        
        c=Counter(y_train)
        return c[label]/y_train.shape[0]
    
    def conditional_probability(self,x_train,y_train,label,feature_no,feature_value):
        
        x_train_filtered=x_train[y_train==label]
        c=Counter(x_train_filtered[:,feature_no])
        
        return c[feature_value]/x_train_filtered.shape[0]
    
    def likelihood(self,x_train,y_train,label,x):                              #x is a single query point
        
        pl=1
        for j in range(x_train.shape[1]):
            
            cp=self.conditional_probability(x_train,y_train,label,j,x[j])
            pl*=cp
        
        return pl
            
    def predict(self,x_train,y_train,x_test):
        
        labels=np.unique(y_train)
        predictions=[]
        
        pp={}
        for label in labels:
            prior_prob=self.prior_probability(y_train,label)
            pp[label]=prior_prob
        
        for x in tqdm(x_test):
            
            max_prob=0
            prob_label=-1
            
            for label in labels:
                likelihood_prob=self.likelihood(x_train,y_train,label,x)
                post_prob=likelihood_prob*pp[label]
                
                if post_prob>max_prob:
                    prob_label=label
                    max_prob=post_prob
                
            predictions.append(prob_label)
            
        return predictions

In [14]:
clf=NaiveBayes()
clf

<__main__.NaiveBayes at 0x22a7dc6a470>

In [15]:
predictions=clf.predict(x_train,y_train,x_test)
print(predictions)

100%|██████████████████████████████████████████████████████████████████████████████| 2681/2681 [03:52<00:00, 11.54it/s]


['e', 'e', 'e', 'e', 'p', 'e', 'e', 'p', 'e', 'e', 'e', 'p', 'e', 'p', 'e', 'e', 'e', 'p', 'e', 'e', 'e', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'e', 'e', 'p', 'p', 'p', 'p', 'e', 'e', 'e', 'e', 'p', 'p', 'e', 'e', 'e', 'e', 'e', 'p', 'p', 'p', 'p', 'e', 'p', 'e', 'e', 'p', 'p', 'e', 'p', 'e', 'e', 'e', 'e', 'e', 'e', 'p', 'p', 'e', 'e', 'p', 'e', 'e', 'p', 'e', 'p', 'e', 'p', 'p', 'e', 'p', 'e', 'e', 'p', 'e', 'p', 'p', 'e', 'p', 'e', 'e', 'p', 'e', 'p', 'e', 'p', 'e', 'e', 'p', 'e', 'e', 'p', 'e', 'e', 'p', 'e', 'e', 'e', 'e', 'e', 'p', 'e', 'e', 'p', 'e', 'p', 'e', 'p', 'e', 'e', 'e', 'p', 'p', 'e', 'e', 'p', 'p', 'p', 'p', 'p', 'e', 'e', 'p', 'p', 'e', 'p', 'e', 'p', 'e', 'e', 'e', 'p', 'e', 'e', 'e', 'p', 'e', 'p', 'p', 'p', 'p', 'p', 'e', 'p', 'e', 'p', 'e', 'p', 'e', 'p', 'e', 'e', 'e', 'e', 'p', 'p', 'p', 'p', 'p', 'p', 'e', 'p', 'p', 'p', 'e', 'p', 'e', 'p', 'e', 'e', 'p', 'p', 'p', 'p', 'p', 'e', 'p', 'e', 'e', 'p', 'p', 'e', 'e', 'e', 'p', 'e', 'e', 'p', 'e', 'p', 'e', 'e', 'p',




In [16]:
#accuracy

Counter(predictions==y_test)[True]/y_test.shape[0]

0.9392017903767251