In [1]:
import numpy as np
import pandas as pd

In [2]:
# Use mushroom dataset

In [3]:
df = pd.read_csv('./datasets/mushrooms.csv')

In [4]:
df.head()

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [5]:
df.shape

(8124, 23)

In [6]:
# Convert to numeric data

In [7]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [8]:
# Label Encoder is used to convert text/characters into numbers
le = LabelEncoder()

In [9]:
# DataFrame.apply calls the given function once per column, so every column
# (including the `type` target) is replaced by integer codes.
# NOTE(review): the same `le` instance is re-fitted on each column, so after this
# line it only holds the classes of the last column processed; it cannot be used
# to inverse_transform the other columns. Keep one encoder per column if the
# original category letters are needed later.
ds = df.apply(func=le.fit_transform)

In [10]:
# The `type` column in the dataset is the target label y
ds.head()

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


In [11]:
data = ds.values
data

array([[1, 5, 2, ..., 2, 3, 5],
       [0, 5, 2, ..., 3, 2, 1],
       [0, 0, 2, ..., 3, 2, 3],
       ...,
       [0, 2, 2, ..., 0, 1, 2],
       [1, 3, 3, ..., 7, 4, 2],
       [0, 5, 2, ..., 4, 1, 2]])

In [12]:
# Column 0 of the encoded array is the `type` label (the target y);
# every remaining column is a feature.
X = data[:, 1:]
y = data[:, 0]

In [13]:
X.shape, y.shape

((8124, 22), (8124,))

In [14]:
# Hold out 33% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

In [15]:
from sklearn.naive_bayes import GaussianNB

In [16]:
# NOTE(review): GaussianNB models each feature as a continuous, normally
# distributed value, but the features here are label-encoded categories whose
# integer codes have no numeric meaning. Presumably used as a baseline; sklearn's
# CategoricalNB is the estimator that matches this kind of data -- confirm intent.
model = GaussianNB()

In [17]:
model.fit(X_train,y_train)

GaussianNB()

In [18]:
model.predict(X_test[:10])

array([0, 1, 1, 0, 1, 1, 1, 1, 0, 0])

In [19]:
y_test[:10]

array([0, 1, 1, 0, 1, 1, 1, 1, 0, 0])

In [20]:
model.score(X_test,y_test)

0.9261469600895188

## Custom Naive Bayes

In [21]:
class CustomNB:
    """Naive Bayes classifier for categorical (integer-coded) features.

    Probabilities are plain relative frequencies taken from the training data:

        P(y=c | x)  is proportional to  P(y=c) * prod_i P(x_i | y=c)

    No smoothing is applied, so a feature value that never occurs together with
    a class in the training data gives that class a posterior of exactly 0.
    """

    def fit(self, X, y):
        """Store the training data; all counting is done lazily at predict time.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training examples, one row per example.
        y : array-like of shape (n_samples,)
            Class label of each training example.

        Raises
        ------
        ValueError
            If X and y do not contain the same number of examples.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                "X and y must have the same number of examples, got "
                f"{X.shape[0]} and {y.shape[0]}"
            )
        self.X_train = X
        self.y_train = y

    def prior_prob(self, label):
        """Return P(y=label): the fraction of training examples with that label.

        Returns 0.0 when there is no training data instead of dividing by zero.
        """
        total = self.y_train.shape[0]
        if total == 0:
            return 0.0
        # Number of training examples belonging to the requested class.
        class_examples = np.sum(self.y_train == label)
        return class_examples / float(total)

    def conditional_prob(self, feature_col, feature_val, label):
        """Return the likelihood P(X[feature_col] == feature_val | y == label).

        Parameters
        ----------
        feature_col : int
            Index of the feature column.
        feature_val : scalar
            Value of that feature to look for.
        label : scalar
            Class to condition on.

        Returns
        -------
        float
            Fraction of the training examples of class `label` whose
            `feature_col` equals `feature_val`; 0.0 when the class does not
            occur in the training data.
        """
        # All training examples that belong to the requested class.
        X_filtered = self.X_train[self.y_train == label]
        denominator = len(X_filtered)
        if denominator == 0:
            # Unknown class: avoid 0/0 (NaN) and report "never observed".
            return 0.0
        # How many of those examples carry the requested feature value.
        numerator = np.sum(X_filtered[:, feature_col] == feature_val)
        return numerator / denominator

    def predict_point(self, X_test):
        """Predict the class label of a single example.

        Parameters
        ----------
        X_test : array-like of shape (n_features,)
            One example.

        Returns
        -------
        scalar
            The training label with the largest prior * likelihood. Ties are
            resolved in favour of the smallest label.

        Raises
        ------
        ValueError
            If the model holds no training data.
        """
        point = np.asarray(X_test)
        classes = np.unique(self.y_train)  # sorted distinct labels
        if classes.size == 0:
            raise ValueError("CustomNB has no training data; call fit() first")

        posteriors = []
        for label in classes:
            # Filter the class rows once, then get every per-feature likelihood
            # in a single vectorised comparison instead of one pass per feature.
            class_rows = self.X_train[self.y_train == label]
            per_feature = (class_rows == point).mean(axis=0)
            likelihood = per_feature.prod()
            posteriors.append(self.prior_prob(label) * likelihood)

        # argmax yields a position in `classes`; map it back to the real label
        # so labels that are not 0..k-1 are reported correctly.
        return classes[np.argmax(posteriors)]

    def predict(self, X_test):
        """Predict a class label for every row of X_test and return them as an array."""
        return np.array([self.predict_point(point) for point in X_test])

    def score(self, X_test, y_test):
        """Return the accuracy (fraction of correct predictions) on X_test / y_test."""
        return (self.predict(X_test) == np.asarray(y_test)).mean()

In [22]:
# NOTE: this rebinds `model`, replacing the GaussianNB instance fitted earlier;
# from here on `model` refers to the custom classifier.
model = CustomNB()

In [23]:
model.fit(X_train, y_train)

In [24]:
model.predict(X_test[:10])

array([0, 1, 1, 0, 1, 1, 1, 1, 0, 0], dtype=int64)

In [25]:
y_test[:10]

array([0, 1, 1, 0, 1, 1, 1, 1, 0, 0])

In [26]:
model.score(X_test, y_test)

0.9973890339425587

In [27]:
# Code to show how np.sum works in prior function

In [28]:
np.array([2,3,3,2,3]) == np.array([3,3,3,2,2])

array([False,  True,  True,  True, False])

In [29]:
np.sum(np.array([2,3,3,2,3]) == np.array([3,3,3,2,2]))

3

In [30]:
np.sum(np.array([2,3,2,3,2])==2)

3