### Naive Bayes - Mushroom dataset
- The goal is to predict the class (`type`) of a mushroom from its categorical features. We will use a Naive Bayes model
for this classification.

#### Load the Dataset

In [167]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [168]:
# Read the mushroom data from a CSV file located in the current working directory.
df = pd.read_csv('mushrooms.csv')

In [169]:
df.head()

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [170]:
df.shape

(8124, 23)

#### Encode the Categorical data into Numerical data

In [171]:
# LabelEncoder (scikit-learn preprocessing) maps each distinct category to an integer code.
le = LabelEncoder()
# apply(..., axis=0) calls fit_transform once per column, so every column gets its
# own integer coding. The single `le` object is refit on each call, so afterwards
# it only holds the mapping of the last column processed.
ds = df.apply(le.fit_transform,axis=0)

In [172]:
ds.head()

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


In [173]:
# Separate the encoded frame into the label column ('type') and the feature columns.
data_y = ds['type']
data_x = ds.drop(labels=['type'],axis=1)

In [174]:
# Only the last expression of a cell is displayed, so a bare `data_x.head()` placed
# before the print was silently discarded; report just the feature-matrix shape.
print(data_x.shape)

(8124, 22)


In [175]:
# Convert the encoded DataFrame to a plain NumPy array. Column 0 is the encoded
# 'type' label (the first column of the frame); the remaining columns are the features.
data = ds.values

#### Break the data into train and test

In [176]:
# train_test_split(*arrays, test_size=..., random_state=...) shuffles the rows and
# returns a train part and a test part for every array passed in.

In [177]:
# Hold out 20% of the rows for testing. Column 0 of `data` is the encoded label and
# the remaining columns are the features. A fixed random_state makes the shuffle
# (and therefore every result computed from this split) reproducible across kernel restarts.
X_train,X_test,Y_train,Y_test = train_test_split(data[:,1:],data[:,0],test_size=0.2,random_state=42)


In [178]:
print(X_test.shape,Y_test.shape)

(1625, 22) (1625,)


In [179]:
print(X_train.shape,Y_train.shape)

(6499, 22) (6499,)


In [180]:
np.unique(Y_train)

array([0, 1])

### Building Our classifier

In [181]:
# pd.DataFrame(np.unique(X_train[:,2]))

In [182]:
def prior_probability(y_train,label):
    """Return the prior P(label): the fraction of entries of ``y_train`` equal to ``label``.

    y_train -- NumPy array of class labels (one entry per training example)
    label   -- the class value whose relative frequency is wanted
    """
    n_matching = (y_train == label).sum()
    n_total = len(y_train)
    return n_matching / float(n_total)

In [183]:
# Toy label vector for a quick sanity check: 4 of its 10 entries equal 1.
y = np.array([0,5,5,1,1,1,1,0,0,0])


In [184]:
prior_probability(y,1)

0.4

In [185]:
# Conditional probability  -->
def cond_prob(x_train,y_train,feature_col,feature_value,label):
    """Return P(feature == feature_value | class == label) estimated by counting.

    x_train       -- 2-D NumPy array of features (rows are examples)
    y_train       -- NumPy array with the class label of every row of ``x_train``
    feature_col   -- index of the feature column to inspect
    feature_value -- the value of that feature whose probability is wanted
    label         -- the class to condition on

    The result is the share of the rows labelled ``label`` whose
    ``feature_col`` entry equals ``feature_value``.
    """
    in_class = (y_train == label)
    # Values of the chosen feature, restricted to the rows of the requested class.
    feature_values = x_train[in_class][:,feature_col]
    n_matching = (feature_values == feature_value).sum()
    n_in_class = in_class.sum()
    return n_matching / float(n_in_class)

In [186]:
# Fraction of the class-1 training rows whose feature column 0 has the encoded value 2.
cond_prob(X_train,Y_train,0,2,1)

0.40151994933502216

#### Compute Posterior probability for each test example and make predictions

In [187]:
def predict(x_train,y_train,x_test):
    """Classify one example with categorical Naive Bayes and return its class label.

    x_train -- 2-D array of training features (rows are examples, columns are features)
    y_train -- array with the class label of every training row
    x_test  -- a single test point: one value per feature column of ``x_train``

    For every class found in ``y_train`` the unnormalised posterior
    ``prior * product over features of P(feature value | class)`` is computed,
    and the label of the class with the largest posterior is returned.
    No smoothing is applied, so a feature value never seen together with a
    class drives that class's posterior to 0. On ties (including the case
    where every posterior is 0) the smallest label wins, because ``np.unique``
    returns sorted labels and ``np.argmax`` picks the first maximum.
    """
    classes = np.unique(y_train)
    n_features = x_train.shape[1]
    post_probs = []
    # Compute posterior for each class
    for label in classes:
        # Post_c = likelihood * prior
        likelihood = 1.0
        for f in range(n_features):
            likelihood *= cond_prob(x_train,y_train,f,x_test[f],label)

        prior = prior_probability(y_train,label)
        post_probs.append(likelihood*prior)

    # np.argmax yields a position in `classes`, not a label; map it back so the
    # result is also correct when the labels are not exactly 0..k-1.
    return classes[np.argmax(post_probs)]

In [188]:
# Classify a single held-out example (the row at index 2 of X_test).
output = predict(X_train,Y_train,X_test[2])

In [189]:
output

0

In [190]:
print(Y_test[2])


0


In [191]:
def score(x_train,y_train,x_test,y_test):
    """Return the accuracy of ``predict`` on a test set.

    x_train, y_train -- training features and labels handed to ``predict``
    x_test           -- 2-D array of test features, one example per row
    y_test           -- array with the true label of every test row

    Every row of ``x_test`` is classified individually and the fraction of
    predictions equal to the matching entry of ``y_test`` is returned.
    """
    n_examples = x_test.shape[0]
    pred = np.array([predict(x_train,y_train,x_test[ex,:]) for ex in range(n_examples)])
    n_correct = np.sum(pred == y_test)
    return n_correct/y_test.shape[0]



In [192]:
# Accuracy on the held-out test split; each test row is classified using the full training set.
score(X_train,Y_train,X_test,Y_test)



0.9975384615384615