In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
df = pd.read_csv("mushrooms.csv")

In [14]:
df.head(10)

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
5,e,x,y,y,t,a,f,c,b,n,...,s,w,w,p,w,o,p,k,n,g
6,e,b,s,w,t,a,f,c,b,g,...,s,w,w,p,w,o,p,k,n,m
7,e,b,y,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,s,m
8,p,x,y,w,t,p,f,c,n,p,...,s,w,w,p,w,o,p,k,v,g
9,e,b,s,y,t,a,f,c,b,g,...,s,w,w,p,w,o,p,k,s,m


In [15]:
df['cap_shape'].unique()

array(['x', 'b', 's', 'f', 'k', 'c'], dtype=object)

In [9]:
df.columns

Index(['type', 'cap_shape', 'cap_surface', 'cap_color', 'bruises', 'odor',
       'gill_attachment', 'gill_spacing', 'gill_size', 'gill_color',
       'stalk_shape', 'stalk_root', 'stalk_surface_above_ring',
       'stalk_surface_below_ring', 'stalk_color_above_ring',
       'stalk_color_below_ring', 'veil_type', 'veil_color', 'ring_number',
       'ring_type', 'spore_print_color', 'population', 'habitat'],
      dtype='object')

In [12]:
df['type'].value_counts()

e    4208
p    3916
Name: type, dtype: int64

In [13]:
df.shape

(8124, 23)

In [16]:
## LabelEncoding 
## OneHot Encoding

In [23]:
d = pd.DataFrame(["a","b","c","b","c","a","b"])


In [26]:
d.columns = [ "Features"]

In [28]:
d

Unnamed: 0,Features
0,a
1,b
2,c
3,b
4,c
5,a
6,b


In [32]:
pd.get_dummies(d, drop_first=True)

Unnamed: 0,Features_b,Features_c
0,0,0
1,1,0
2,0,1
3,1,0
4,0,1
5,0,0
6,1,0


In [33]:
from sklearn.preprocessing import OneHotEncoder

In [34]:
oht = OneHotEncoder()

In [40]:
oht.fit_transform(d).toarray()

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.]])

In [52]:
d['Features'].astype('category').cat.codes

0    0
1    1
2    2
3    1
4    2
5    0
6    1
dtype: int8

## Encode Categorical Data into Numeric

In [55]:
from sklearn.preprocessing import LabelEncoder

In [60]:
le = LabelEncoder()

In [66]:
# Integer-encode every column: DataFrame.apply hands each column to
# le.fit_transform in turn. The single `le` instance is refit on every call,
# so after this cell it only remembers the classes of the last column and
# cannot be used to decode the earlier ones.
df = df.apply(le.fit_transform)

In [90]:
# Convert the encoded frame to a plain NumPy array for positional slicing.
# np.asarray is used instead of `.values` so the cell stays re-runnable:
# once `df` is already an ndarray, `.values` raises AttributeError, whereas
# np.asarray simply returns the array unchanged.
df = np.asarray(df)

In [91]:
# Column 0 is the encoded 'type' column (the first entry of df.columns shown
# earlier) and serves as the target; every remaining column is a feature.
df_x = df[: , 1:]
df_y = df[: , 0]

In [92]:
from sklearn.model_selection import train_test_split

In [93]:
X_train, X_test, y_train, y_test = train_test_split(df_x, df_y, test_size=0.2, random_state=42)

In [94]:
X_train.shape, y_train.shape

((6499, 22), (6499,))

In [95]:
X_test.shape

(1625, 22)

In [96]:
X_train

array([[3, 2, 2, ..., 7, 4, 0],
       [5, 2, 4, ..., 7, 4, 4],
       [2, 3, 2, ..., 7, 4, 2],
       ...,
       [2, 3, 4, ..., 3, 5, 4],
       [3, 2, 2, ..., 7, 4, 4],
       [3, 0, 3, ..., 7, 2, 1]])

# Naive Bayes Classifier

In [104]:
def prior_prob(y_train, label):
    """Return the fraction of entries in ``y_train`` that equal ``label``.

    ``y_train`` is expected to be a NumPy array (it is compared element-wise
    and its ``shape`` is read).
    """
    is_label = y_train == label
    n_samples = y_train.shape[0]
    return is_label.sum() / n_samples

In [106]:
prior_prob(y_train, 1)

0.48222803508232037

In [118]:
def conditional_prob(x_train, y_train, label, feature_col, feature_val, alpha=0.0):
    """Estimate P(feature == feature_val | class == label) from training data.

    Parameters
    ----------
    x_train : 2-D NumPy array, one row per sample and one column per feature.
    y_train : 1-D NumPy array of class labels aligned with the rows of ``x_train``.
    label : class whose rows are used for the estimate.
    feature_col : integer index of the feature column to inspect.
    feature_val : value of that feature whose probability is wanted.
    alpha : additive (Laplace/Lidstone) smoothing strength. The default ``0.0``
        gives the raw relative frequency. A positive value returns
        ``(count + alpha) / (n_class + alpha * n_values)``, where ``n_values``
        is the number of distinct values the feature takes in ``x_train``, so a
        value never seen with this class no longer gets probability zero.

    Returns
    -------
    The estimated conditional probability as a float.

    Raises
    ------
    ValueError
        If ``alpha`` is negative.
    """
    if alpha < 0:
        raise ValueError("alpha must be non-negative")

    # Keep only the training rows that belong to the requested class.
    x_filtered = x_train[y_train == label]

    numerator = np.sum(x_filtered[:, feature_col] == feature_val)
    denominator = x_filtered.shape[0]

    if alpha == 0:
        # Unsmoothed relative frequency.
        return numerator / denominator

    # Smoothing spreads `alpha` pseudo-counts over every value this feature
    # is observed to take anywhere in the training set.
    n_values = np.unique(x_train[:, feature_col]).size
    return (numerator + alpha) / (denominator + alpha * n_values)

In [123]:
def predict(x_train, y_train, x_test):
    """Predict the class of one sample with a categorical naive Bayes rule.

    For every class present in ``y_train`` the unnormalised posterior
    ``prior * product over features of P(feature == x_test[f] | class)`` is
    built from ``prior_prob`` and ``conditional_prob``; the class with the
    largest value is returned.

    Parameters
    ----------
    x_train : 2-D NumPy array, one row per training sample.
    y_train : 1-D NumPy array of class labels aligned with ``x_train``.
    x_test : 1-D sequence holding one value per feature column of ``x_train``.

    Returns
    -------
    The predicted class label, i.e. an element of ``np.unique(y_train)``.
    On ties the first class in sorted order wins.
    """
    classes = np.unique(y_train)
    n_features = x_train.shape[1]
    post_prob = []

    for label in classes:
        likelihood = 1.0

        for f in range(n_features):
            likelihood *= conditional_prob(x_train, y_train, label, f, x_test[f])

        prior = prior_prob(y_train, label)
        post_prob.append(likelihood * prior)

    # argmax yields a position within `classes`, not a label. Map it back so
    # the result is also correct when labels are not exactly 0..k-1.
    return classes[np.argmax(post_prob)]

In [128]:
predict(X_train, y_train , X_test[11])

1

In [132]:
def score(x_train, y_train, x_test, y_test):
    """Return the accuracy of ``predict`` on a labelled test set.

    Each row of ``x_test`` is classified independently with ``predict`` and
    the result is the mean of the element-wise matches against ``y_test``.
    """
    predictions = np.array([predict(x_train, y_train, row) for row in x_test])
    return (predictions == y_test).mean()

In [133]:
score(X_train, y_train, X_test, y_test)

0.9963076923076923

In [135]:
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB

In [136]:
from sklearn.datasets import load_digits

In [139]:
digit = load_digits()

In [140]:
X = digit.data

In [142]:
y = digit.target

In [143]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [146]:
mnb = MultinomialNB()
gnb = GaussianNB()
bnb = BernoulliNB()

In [147]:
mnb.fit(X_train, y_train)
gnb.fit(X_train, y_train)
bnb.fit(X_train, y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [148]:
mnb.score(X_test, y_test)

0.9111111111111111

In [149]:
bnb.score(X_test, y_test)

0.8638888888888889

In [150]:
gnb.score(X_test, y_test)

0.8472222222222222