In [21]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import LabelEncoder

from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB

In [2]:
# Load the mushroom table. The path is relative, so it resolves against the
# current working directory of the kernel.
mushroom = pd.read_csv("bayes/mushrooms.csv")

In [3]:
mushroom.head()

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [4]:
le = LabelEncoder()

In [5]:
# Integer-encode every column: DataFrame.apply calls le.fit_transform once per column.
# NOTE(review): the single encoder `le` is refit on each call, so afterwards it only
# holds the classes of the last column it processed and cannot inverse-transform the
# others -- keep one encoder per column if decoding is ever needed.
updated = mushroom.apply(le.fit_transform)

In [6]:
# Separate the encoded table into the target column and the feature matrix.
y = updated["type"]
X = updated.drop(columns=["type"])

In [7]:
# Hold out 33% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

In [8]:
# Sanity check (disabled): the shares of rows labelled 0 and 1 should add up to 1.
# len(X[y==0])/len(X) + len(X[y==1])/len(X)


In [11]:
# Scratch naive Bayes fit on the full data set:
#   prior[c]         -> share of all rows labelled c
#   model[c][col][v] -> share of the class-c rows whose column `col` equals v
prior = {}
model = {}
for klass in set(y) :
    rows = X[y == klass]
    n_rows = len(rows)
    prior[klass] = n_rows / len(X)
    model[klass] = {
        column: {
            value: np.sum(rows[column] == value) / n_rows
            for value in set(X[column])
        }
        for column in X.columns
    }

In [49]:
class NB :
    """Naive Bayes classifier for tables of categorical (e.g. label-encoded) features.

    After ``fit`` the instance holds:

    * ``self.prior[c]``          -- fraction of training rows labelled ``c``
    * ``self.model[c][col][v]``  -- fraction of the class-``c`` rows whose
      column ``col`` equals ``v``

    No smoothing is applied: a value that never occurs together with a class
    has probability 0 for that class.
    """

    @staticmethod
    def _safe_log(p) :
        """Return log(p), mapping a zero probability to -inf without a warning."""
        return np.log(p) if p > 0 else -np.inf

    def fit(self, X, y) :
        """Estimate class priors and per-class value frequencies.

        Parameters
        ----------
        X : pandas.DataFrame
            One column per categorical feature.
        y : pandas.Series or numpy.ndarray
            Class label of each row of ``X``; ``y == label`` is used as a
            boolean row mask, so it must line up with ``X``. Labels must be
            sortable.

        Returns
        -------
        None
            The estimates are stored on ``self.model`` and ``self.prior``.
        """
        # Sorting fixes the class order (and therefore tie-breaking in
        # predict_point) instead of relying on set iteration order.
        classes = sorted(set(y))
        n_total = len(X)
        # The distinct values of a column do not depend on the class: collect once.
        levels = {column: set(X[column]) for column in X.columns}
        model = {}
        prior = {}
        for klass in classes :
            selected = X[y == klass]
            n_class = len(selected)
            prior[klass] = n_class / n_total
            model[klass] = {
                column: {
                    value: np.sum(selected[column] == value) / n_class
                    for value in values
                }
                for column, values in levels.items()
            }
        self.model = model
        self.prior = prior

    def predict_point(self, point) :
        """Return the most probable class label for one observation.

        ``point`` is indexed by feature name (for example a row produced by
        ``DataFrame.iterrows``). Scores are accumulated as sums of logs so a
        long product of small probabilities cannot underflow. When several
        classes tie, the first one in fitted class order is returned.
        """
        classes = list(self.model)
        log_scores = []
        for klass in classes :
            score = self._safe_log(self.prior[klass])
            for column, frequencies in self.model[klass].items() :
                # A value never seen during fit gets probability 0 rather
                # than raising KeyError.
                score += self._safe_log(frequencies.get(point[column], 0.0))
            log_scores.append(score)
        # Translate the winning position back into the class label itself.
        return classes[int(np.argmax(log_scores))]

    def predict(self, X) :
        """Return an array with the predicted class label of every row of ``X``."""
        return np.array([self.predict_point(row) for _, row in X.iterrows()])

    def score(self, X, y) :
        """Return the accuracy: the fraction of rows of ``X`` predicted as ``y``."""
        return np.mean(self.predict(X) == np.array(y))
            

In [50]:
m = NB()

In [51]:
m.fit(X_train, y_train)

In [52]:
m.predict(X_test)

array([0, 1, 1, ..., 0, 0, 0])

In [53]:
m.score(X_test, y_test)

0.9973890339425587

In [54]:
m.score(X_train, y_train)

0.9970604446077531

In [19]:
# Plain-list copy of the encoded labels.
# NOTE(review): `y_new` is not referenced by any later cell visible here --
# candidate for removal; confirm nothing else depends on it.
y_new = list(y)

In [22]:
# Baseline: scikit-learn Bernoulli naive Bayes with default settings.
# NOTE(review): the parameters echoed after fit show binarize=0.0, i.e. every
# label-encoded feature is reduced to "code > 0" before fitting. That discards
# most category information and is presumably why this baseline scores well
# below the hand-written NB -- confirm.
bern = BernoulliNB()

In [23]:
bern.fit(X_train, y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [24]:
bern.predict(X_test)

array([0, 1, 1, ..., 0, 1, 0])

In [25]:
bern.score(X_test,y_test)

0.84520701230884

In [26]:
# Baseline: scikit-learn Gaussian naive Bayes with default settings.
# NOTE(review): GaussianNB models each feature as a per-class normal
# distribution, so the arbitrary integer category codes are treated as
# continuous magnitudes -- a modelling mismatch for this data; confirm.
gauss = GaussianNB()

In [27]:
gauss.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [29]:
gauss.predict(X_test)

array([0, 1, 1, ..., 0, 1, 0])

In [30]:
gauss.score(X_test,y_test)

0.9261469600895188

In [31]:
# Baseline: scikit-learn multinomial naive Bayes with default settings.
# NOTE(review): MultinomialNB interprets feature values as counts, whereas the
# inputs here are arbitrary label codes; one-hot encoding the categories (or a
# categorical NB variant) would presumably be the fairer comparison -- confirm.
mul = MultinomialNB()

In [32]:
mul.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [33]:
mul.predict(X_test)

array([0, 1, 1, ..., 0, 1, 0])

In [34]:
mul.score(X_test,y_test)

0.8008205893323387