### Importing Libraries

In [111]:
import pandas as pd

In [112]:
import numpy as np

In [113]:
import matplotlib.pyplot as plt

### Importing the dataset

In [114]:
RawData = pd.read_csv("mushrooms.csv")

### Let's have a look at the dataset

In [115]:
RawData.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


As we can see the data is full of labels. So in each Feature we need to calculate the relative frequency of each label when the class is 'p' and 'e'.

In [116]:
print("Total no. of rows", RawData.shape[0])

Total no. of rows 8124


In [117]:
RawData.shape[0]*0.75

6093.0

In [118]:
(RawData.shape[0]*0.75)/2

3046.5

### Splitting the class p and class e seperately.

In [119]:
eTypeData = RawData[RawData['class'] == 'e']

In [120]:
print(eTypeData.shape)

(4208, 23)


In [121]:
pTypeData = RawData[RawData['class'] == 'p']

In [122]:
print(pTypeData.shape)

(3916, 23)


### Splitting the dataset in Training set and Test set

In [123]:
eTypeTrain = eTypeData.iloc[:3046,:]
eTypeTest = eTypeData.iloc[3046:,:]

In [124]:
pTypeTrain = pTypeData.iloc[:3046,:]
pTYpeTest = pTypeData.iloc[3046:,:]

In [125]:
eTypeTrainDict = {}

In [126]:
columns = eTypeTrain.columns[1:]
print(columns)

Index(['cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat'],
      dtype='object')


In [127]:
eTypeData['cap-shape'].unique()

array(['x', 'b', 's', 'f', 'k'], dtype=object)

**Making a dictionary of dictionary containing relative frequency of each value each label in its corresponding Feature.**

In [128]:
for i in columns:
    temp_dict = {}
    for j in RawData[i].unique():
        
        temp_dict[j] = np.mean((eTypeTrain[i] == j)*1)
        # Smoothing is done so that if there is a label with no occurence in training set then its value should be low but not zero
        if temp_dict[j] == 0:
            temp_dict[j] = 1/(3046+len(RawData[i].unique()))
            
        
    eTypeTrainDict[i] = temp_dict

In [129]:
eTypeTrainDict

{'bruises': {'f': 0.28365068942875904, 't': 0.716349310571241},
 'cap-color': {'b': 0.0003272251308900524,
  'c': 0.0003272251308900524,
  'e': 0.139198949441891,
  'g': 0.25344714379514116,
  'n': 0.29218647406434667,
  'p': 0.0003272251308900524,
  'r': 0.0003272251308900524,
  'u': 0.0003272251308900524,
  'w': 0.18384766907419567,
  'y': 0.1313197636244255},
 'cap-shape': {'b': 0.0840446487196323,
  'c': 0.000327653997378768,
  'f': 0.3670387393302692,
  'k': 0.000327653997378768,
  's': 0.010505581089954037,
  'x': 0.5384110308601444},
 'cap-surface': {'f': 0.41464215364412343,
  'g': 0.0003278688524590164,
  's': 0.22586999343401182,
  'y': 0.35948785292186475},
 'gill-attachment': {'a': 0.00032808398950131233, 'f': 1.0},
 'gill-color': {'b': 0.0003270111183780249,
  'e': 0.0003270111183780249,
  'g': 0.04990151017728168,
  'h': 0.06303348653972422,
  'k': 0.11293499671700591,
  'n': 0.25902823374917927,
  'o': 0.0003270111183780249,
  'p': 0.21831910702560736,
  'r': 0.000327011

In [130]:
pTypeTrainDict = {}

In [131]:
columns = pTypeData.columns[1:]
print(columns)

Index(['cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat'],
      dtype='object')


In [132]:
for i in columns:
    temp_dict = {}
    for j in RawData[i].unique():
        temp_dict[j] = np.mean((pTypeTrain[i] == j)*1)
        
# Smoothning is done to give the probability be very small but not zero.
        
    if temp_dict[j] == 0:
            temp_dict[j] = 1/(3046+len(pTypeData[i].unique()))
            
    pTypeTrainDict[i] = temp_dict

In [133]:
RawData['cap-shape'].unique()

array(['x', 'b', 's', 'f', 'k', 'c'], dtype=object)

In [134]:
pTypeTrainDict

{'bruises': {'f': 0.7951411687458962, 't': 0.20485883125410373},
 'cap-color': {'b': 0.03939592908732764,
  'c': 0.00032829940906106366,
  'e': 0.1421536441234406,
  'g': 0.26526592252133946,
  'n': 0.20091923834537098,
  'p': 0.028890347997373604,
  'r': 0.0003274394237066143,
  'u': 0.0,
  'w': 0.10505581089954039,
  'y': 0.2179908076165463},
 'cap-shape': {'b': 0.015101772816808929,
  'c': 0.0006565988181221273,
  'f': 0.44353250164149705,
  'k': 0.022652659225213395,
  's': 0.0,
  'x': 0.5180564674983585},
 'cap-surface': {'f': 0.2495075508864084,
  'g': 0.0013131976362442547,
  's': 0.3148391332895601,
  'y': 0.4343401181877873},
 'gill-attachment': {'a': 0.00032829940906106366, 'f': 0.9996717005909389},
 'gill-color': {'b': 0.29546946815495734,
  'e': 0.0,
  'g': 0.1654629021667761,
  'h': 0.17334208798424164,
  'k': 0.021011162179908074,
  'n': 0.036769533814839134,
  'o': 0.0003272251308900524,
  'p': 0.21011162179908077,
  'r': 0.007879185817465528,
  'u': 0.015758371634931056

In [135]:
pTYpeTest.shape

(870, 23)

In [136]:
eTypeTest.shape

(1162, 23)

### Finding the probability of class 'p' that on the given data of Feature.

In [145]:
def multivariteNaiveBayesClassifier(oneExample):
    mul1 = 1
    mul2 = 1
    
    for i in oneExample.keys():
        mul1 *= pTypeTrainDict[i][oneExample[i]]
        mul2 *= eTypeTrainDict[i][oneExample[i]] 
        
    #Here mul1 is Probability of given data when class in 'p'
    #Here mul2 is Probability of given data when class in 'e'
    
    priorOfeTYpe = 0.5
    priorOfpType = 0.5
    
    numerator = priorOfpType*mul1
    evidence = priorOfeTYpe*mul2 + numerator
    
    return numerator/evidence

In [146]:
multivariteNaiveBayesClassifier(eTypeTest.iloc[0,1:])

0.0

In [147]:
multivariteNaiveBayesClassifier(pTYpeTest.iloc[0,1:])

1.0

In [148]:
eTypeTest.iloc[0,0]

'e'

### We are going to test the model on eType Test set of mushrooms

In [174]:
truePositive = 0
trueNegative = 0
falsePositive = 0
falseNegative = 0

for i in range(eTypeTest.shape[0]):
    
    PmushroomIsPgivenOnFeatures = multivariteNaiveBayesClassifier(eTypeTest.iloc[i,1:])
    
    actualClass = eTypeTest.iloc[i,0]
    
    if PmushroomIsPgivenOnFeatures>0.5 and actualClass == 'p':
        truePositive +=1
    elif PmushroomIsPgivenOnFeatures <0.5 and actualClass == 'e':
        trueNegative +=1
    elif PmushroomIsPgivenOnFeatures>0.5 and actualClass == 'e':
        falsePositive +=1
    else:
        falseNegative +=1

This test data was all about eType mushroooms. So we should be concerned in finding out the trueNegative and falsePositive.

In [175]:
eTypeTest.shape[0]

1162

This is the data which was actually was 'e' and our model also predicted it 'e'.

In [176]:
trueNegative 

922

This is the data which was actually was 'e' and our model also predicted it 'p'.

In [177]:
falsePositive

240

In [179]:
accuracy = (truePositive + trueNegative )/eTypeTest.shape[0]
print(accuracy)

0.7934595524956971


### Now we are going to test our model on pType mushrooms.

In [180]:
truePositive = 0
trueNegative = 0
falsePositive = 0
falseNegative = 0

for i in range(pTYpeTest.shape[0]):
    
    PmushroomIsPgivenOnFeatures = multivariteNaiveBayesClassifier(pTYpeTest.iloc[i,1:])
    
    actualClass = pTYpeTest.iloc[i,0]
    
    if PmushroomIsPgivenOnFeatures>0.5 and actualClass == 'p':
        truePositive +=1
    elif PmushroomIsPgivenOnFeatures <0.5 and actualClass == 'e':
        trueNegative +=1
    elif PmushroomIsPgivenOnFeatures>0.5 and actualClass == 'e':
        falsePositive +=1
    else:
        falseNegative +=1

The Test Data is all about pType mushrooms so we should be concerned about truePositive and falseNegative.

In [181]:
pTYpeTest.shape[0]

870

The model predicted the data to be 'p' type and our data was actually 'p' type.

In [182]:
truePositive

848

The model predicted the data to be 'e' type but our data was of 'p' type.

In [183]:
falseNegative

22

In [185]:
accuracy = ((truePositive + trueNegative)/pTYpeTest.shape[0])*100

In [186]:
print(accuracy)

97.47126436781609
