In [214]:
import numpy as np
import pandas as pd
import math
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
from matplotlib import pyplot as plt

In [215]:
#This defines a new class called NewModel and a method called train that takes two parameters:
#input_data (the training data) and labels (the corresponding labels for each data point). 
#The method initializes the class variables input_data, labels, classes, and parameters.
class NaiveBayes():
    def train(self, input_data, labels):
        self.input_data, self.labels = input_data, labels
        self.classes = np.unique(labels)
        self.parameters = []
        #This loops through each unique class in the training data and gets the subset of 
        #training data corresponding to that class. It then appends an empty list to the parameters 
        #list for that class.
        for i, c in enumerate(self.classes):
            data_for_c = input_data[np.where(labels == c)]
            self.parameters.append([])
            #This loops through each feature column in the subset of training data for the
            #current class and calculates the mean and variance for that feature. 
            #It then appends a dictionary containing the mean and variance to the parameters list for that class.
            for col in data_for_c.T:
                parameters = {"mean_val": col.mean(), "variance": col.var()}
                self.parameters[i].append(parameters)

    #This is a helper method that calculates the likelihood of a feature value x given the mean and 
    #variance of that feature for a given class.
    def _calculate_likelihood(self, mean_val, variance, x):
        eps = 1e-4 
        coeff = 1.0 / math.sqrt(2.0 * math.pi * variance + eps)
        exponent = math.exp(-(math.pow(x - mean_val, 2) / (2 * variance + eps)))
        return coeff * exponent

    #This is another helper method that calculates the prior probability of a given class c in the training data.
    def _calculate_prior(self, c):
        freq = np.mean(self.labels == c)
        return freq

    #This is a method that classifies a single sample based on the Naive Bayes model. 
    #It loops through each class in the training data and calculates the posterior probability 
    #of the sample belonging to that class. It does this by calculating the likelihood of each feature
    #value in the sample given the mean and variance of that feature for the current class,
    #and multiplying these likelihoods together with the prior probability of the class. 
    #It then appends the posterior probability to a list of posteriors.
    #Finally, it returns the class with the highest posterior probability.
    def _classify_sample(self, sample):
        posteriors = []
        for i, c in enumerate(self.classes):
            posterior = self._calculate_prior(c)
            for feature_value, params in zip(sample, self.parameters[i]):
                likelihood = self._calculate_likelihood(params["mean_val"], params["variance"], feature_value)
                posterior *= likelihood
            posteriors.append(posterior)
        return self.classes[np.argmax(posteriors)]

    #This method takes a set of input data and classifies each sample using the _classify_sample method. 
    #It returns a list of predicted classes for each input sample.
    def predict(self, input_data):
        predictions = [self._classify_sample(sample) for sample in input_data]
        return predictions


In [216]:
clm = [
    'class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises?', 'odor',
    'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color', 'stalk-shape',
    'stalk-root', 'stalk-surface-above-ring', 'stalk-surface-below-ring',
    'stalk-color-above-ring', 'stalk-color-below-ring', 'veil-type',
    'veil-color', 'ring-number', 'ring-type', 'spore-print-color', 'population',
    'habitat'
]

In [217]:
data = pd.read_csv("MushroomData_8000.txt", header=None, names=clm)
data.shape

(8000, 23)

In [218]:
data.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises?,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [219]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   class                     8000 non-null   object
 1   cap-shape                 8000 non-null   object
 2   cap-surface               8000 non-null   object
 3   cap-color                 8000 non-null   object
 4   bruises?                  8000 non-null   object
 5   odor                      8000 non-null   object
 6   gill-attachment           8000 non-null   object
 7   gill-spacing              8000 non-null   object
 8   gill-size                 8000 non-null   object
 9   gill-color                8000 non-null   object
 10  stalk-shape               8000 non-null   object
 11  stalk-root                8000 non-null   object
 12  stalk-surface-above-ring  8000 non-null   object
 13  stalk-surface-below-ring  8000 non-null   object
 14  stalk-color-above-ring  

In [220]:
categorical = [var for var in data.columns if data[var].dtype == 'O']

print('There are {} categorical variables\n'.format(len(categorical)))

print('The categorical variables are :\n\n', categorical)

There are 23 categorical variables

The categorical variables are :

 ['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises?', 'odor', 'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color', 'stalk-shape', 'stalk-root', 'stalk-surface-above-ring', 'stalk-surface-below-ring', 'stalk-color-above-ring', 'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number', 'ring-type', 'spore-print-color', 'population', 'habitat']


In [221]:
data[categorical].head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises?,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [222]:
data[categorical].isnull().sum()

class                       0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises?                    0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

In [223]:
for var in categorical:

    print(data[var].value_counts())

e    4141
p    3859
Name: class, dtype: int64
x    3634
f    3137
k     758
b     435
s      32
c       4
Name: cap-shape, dtype: int64
y    3211
s    2480
f    2305
g       4
Name: cap-surface, dtype: int64
n    2214
g    1831
e    1471
y    1072
w    1026
b     168
p     144
c      42
r      16
u      16
Name: cap-color, dtype: int64
f    4625
t    3375
Name: bruises?, dtype: int64
n    3461
f    2137
s     563
y     557
a     400
l     400
p     256
c     192
m      34
Name: odor, dtype: int64
f    7834
a     166
Name: gill-attachment, dtype: int64
c    6710
w    1290
Name: gill-spacing, dtype: int64
b    5543
n    2457
Name: gill-size, dtype: int64
b    1673
p    1487
w    1189
n    1032
g     746
h     732
u     492
k     408
e      96
y      69
o      52
r      24
Name: gill-color, dtype: int64
t    4553
e    3447
Name: stalk-shape, dtype: int64
b    3774
?    2360
e    1120
c     554
r     192
Name: stalk-root, dtype: int64
s    5095
k    2330
f     552
y      23
Name: stalk-sur

In [224]:
for var in categorical:

    print(data[var].value_counts() / float(len(data)))

e    0.517625
p    0.482375
Name: class, dtype: float64
x    0.454250
f    0.392125
k    0.094750
b    0.054375
s    0.004000
c    0.000500
Name: cap-shape, dtype: float64
y    0.401375
s    0.310000
f    0.288125
g    0.000500
Name: cap-surface, dtype: float64
n    0.276750
g    0.228875
e    0.183875
y    0.134000
w    0.128250
b    0.021000
p    0.018000
c    0.005250
r    0.002000
u    0.002000
Name: cap-color, dtype: float64
f    0.578125
t    0.421875
Name: bruises?, dtype: float64
n    0.432625
f    0.267125
s    0.070375
y    0.069625
a    0.050000
l    0.050000
p    0.032000
c    0.024000
m    0.004250
Name: odor, dtype: float64
f    0.97925
a    0.02075
Name: gill-attachment, dtype: float64
c    0.83875
w    0.16125
Name: gill-spacing, dtype: float64
b    0.692875
n    0.307125
Name: gill-size, dtype: float64
b    0.209125
p    0.185875
w    0.148625
n    0.129000
g    0.093250
h    0.091500
u    0.061500
k    0.051000
e    0.012000
y    0.008625
o    0.006500
r    0.003000
N

In [225]:
le = LabelEncoder()
le2 = LabelEncoder()
y = data['class']
X = data.drop(['class'], axis=1)
y = le.fit_transform(y)
X = X.apply(le2.fit_transform)

In [226]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=42)

In [227]:
X_test

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises?,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
2215,5,3,4,1,5,1,0,0,10,1,...,2,7,3,0,2,1,4,3,4,0
2582,2,0,2,1,5,1,0,0,5,1,...,2,7,3,0,2,1,4,3,4,0
1662,5,3,9,1,0,1,0,0,2,0,...,2,7,7,0,2,1,4,2,2,1
3027,5,0,3,0,2,1,0,0,2,0,...,1,4,0,0,2,1,2,1,5,0
4343,5,3,9,0,2,1,0,0,3,0,...,1,6,6,0,2,1,2,1,4,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7058,2,2,2,0,7,1,0,1,0,1,...,2,7,6,0,2,1,0,7,4,4
7312,3,3,2,0,8,1,0,1,0,1,...,2,7,6,0,2,1,0,7,4,2
1471,2,2,3,0,5,1,1,0,7,1,...,0,7,7,0,2,1,0,2,0,1
6260,5,2,2,0,2,1,0,1,0,1,...,1,6,6,0,2,1,0,7,4,4


In [228]:
y_test

array([0, 0, 0, ..., 0, 1, 1])

In [229]:
nb =NaiveBayes()
nb.train(X_train.values,y_train)

In [230]:
unknown_data = pd.read_csv('MushroomData_Unknwon_100.txt', names=clm[1:])
unknown_data = unknown_data.apply(le2.fit_transform)

In [231]:
y_pred = nb.predict(unknown_data.values)
print(y_pred)

[0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0]


In [238]:
le.inverse_transform(y_pred)

array(['e', 'e', 'e', ..., 'e', 'p', 'p'], dtype=object)

In [233]:
y_pred = nb.predict(X_test.values)
acc = metrics.accuracy_score(y_test, y_pred)

In [234]:
acc

0.9266666666666666