In [None]:
# Data-loading cell: defines the notebook-level `data` (features) and `label`
# (class labels) that the Gaussian naive Bayes cells below are fitted on.
#
# The commented-out block is an earlier hand-written toy data set, kept for
# reference; each row is [feature_1, ..., feature_n, class_label] and the loop
# splits the last column off as the label.
# # # test data
# # Dataset = [[6,148,72,35,0,33.6,0.627,50,1],[1,85,66,29,0,26.6,0.351,31,0],[8,183,64,0,0,23.3,0.672,32,1],
# #            [1,89,66,23,94,28.1,0.167,21,0],[0,137,40,35,168,43.1,2.288,33,1]]
# Dataset = [[1,20,1], [2,21,0], [3,22,1], [4,22,0]]
# data = []
# label = []
# for line in Dataset:
#     data.append(line[:-1])
#     label.append(line[-1])

# print(data)
# print(label)
from sklearn import datasets
# Use scikit-learn's bundled Iris data set instead of the toy rows above.
iris = datasets.load_iris()
# Feature values, one row per sample.
data = iris.data
# Class label of each sample, in the same order as `data`.
label = iris.target

In [None]:
# naive bayes model with Gaussian distribution
class naive_bayes(object):
    """Gaussian naive Bayes classifier that scores classes in log space.

    ``fit`` estimates, for every class, a log prior and one
    (mean, standard deviation) pair per feature.  ``predict`` / ``predicts``
    return the class whose log prior plus summed per-feature Gaussian
    log-density is largest.
    """

    # Imported in the class body so the definition is self-contained; the
    # modules are reached through ``self.math`` / ``self.np`` in the methods.
    import math
    import numpy as np

    def __init__(self, priors=None):
        """Store ``priors``.

        The value is only kept on the instance; ``fit`` always estimates the
        class priors from the training labels.
        """
        self.priors = priors

    def fit(self, data, label):
        """Estimate per-class statistics from ``data`` and ``label``.

        Args:
            data: sequence (or numpy array) of feature vectors.
            label: sequence of class labels, one per feature vector.

        Returns:
            ``self``, so calls can be chained (``naive_bayes().fit(X, y)``).

        Raises:
            ValueError: if ``data`` and ``label`` differ in length.
        """
        if len(data) != len(label):
            # Raising (instead of printing and returning None) keeps chained
            # calls from failing later with an unrelated AttributeError.
            raise ValueError("You have inconsistent length between data and label")
        if isinstance(data, self.np.ndarray):
            data = data.tolist()
        self.data = data
        self.label = label
        self.summaries = self.class_statistic(data, label)
        print(self.summaries)
        return self

    def Separate_data_by_class(self, data, label):
        """Group the samples by class and record class counts and log priors.

        Side effects: sets ``self.count_per_class`` (samples per class) and
        ``self.class_priors`` (log of class count / total sample count).

        Returns:
            dict mapping each class label to the list of its feature vectors.
        """
        separated = {}
        for features, class_label in zip(data, label):
            separated.setdefault(class_label, []).append(features)

        self.count_per_class = {
            class_label: len(rows) for class_label, rows in separated.items()
        }

        total_samples = len(data)
        self.class_priors = {
            class_label: self.math.log(count / total_samples)
            for class_label, count in self.count_per_class.items()
        }
        return separated

    def mean(self, numbers):
        """Return the arithmetic mean of ``numbers``."""
        return sum(numbers) / float(len(numbers))

    def stdev(self, numbers):
        """Return the sample standard deviation of ``numbers``.

        Uses Bessel's correction (divides by ``len(numbers) - 1``), so it
        raises ZeroDivisionError when fewer than two values are given.
        """
        avg = self.mean(numbers)
        variance = sum(pow(x - avg, 2) for x in numbers) / float(len(numbers) - 1)
        return self.math.sqrt(variance)

    def calculate_statistic(self, inputData):
        """Return one ``(mean, stdev)`` tuple per feature column of ``inputData``."""
        return [
            (self.mean(feature), self.stdev(feature))
            for feature in zip(*inputData)
        ]

    def overall_statistic(self):
        """Return per-feature ``(mean, stdev)`` over all training samples.

        Uses the data stored by ``fit`` rather than a notebook-level global.
        """
        return self.calculate_statistic(self.data)

    def class_statistic(self, data, label):
        """Return ``{class label: [(mean, stdev), ...]}`` with one tuple per feature."""
        separated = self.Separate_data_by_class(data, label)
        return {
            class_label: self.calculate_statistic(rows)
            for class_label, rows in separated.items()
        }

    def GaussianProbability(self, x, mean, stdev):
        """Return the natural log of the Gaussian density of ``x``.

        The log-density is evaluated directly instead of as ``log(exp(...))``:
        for values far from the mean the exponential underflows to 0.0 and
        ``math.log(0)`` raises.

        Raises:
            ValueError: if ``stdev`` is not strictly positive.
        """
        if stdev <= 0:
            raise ValueError(
                "standard deviation must be positive to evaluate the Gaussian density"
            )
        z = (x - mean) / stdev
        return (
            -0.5 * z * z
            - self.math.log(stdev)
            - 0.5 * self.math.log(2 * self.math.pi)
        )

    def calculateClassProbabilities(self, inputVector):
        """Return ``{class label: unnormalised log posterior}`` for one sample.

        The score is the class log prior (added once) plus the sum of the
        per-feature Gaussian log-densities.
        """
        probabilities = {}
        for class_label, class_summaries in self.summaries.items():
            log_score = self.class_priors[class_label]
            for index, (mean, stdev) in enumerate(class_summaries):
                log_score += self.GaussianProbability(inputVector[index], mean, stdev)
            probabilities[class_label] = log_score
        return probabilities

    def predict(self, inputVector):
        """Return the class label with the highest log score for one sample.

        Returns None when the model holds no classes.
        """
        probabilities = self.calculateClassProbabilities(inputVector)
        if not probabilities:
            return None
        # max() keeps the first label on ties, like a strict ">" scan would.
        return max(probabilities, key=probabilities.get)

    def predicts(self, inputVector):
        """Return the predicted class label for every sample in ``inputVector``."""
        return [self.predict(vector) for vector in inputVector]

In [None]:
# test
# Fit the hand-written Gaussian naive Bayes on the notebook-level `data` /
# `label`; fit() also prints the per-class (mean, stdev) summaries.
clf = naive_bayes().fit(data, label)
# Prints the default object repr of the fitted classifier.
print(clf)
# Predict the classes of the last five samples (these were also used for training).
print(clf.predicts(data[-5:]))
# NOTE(review): the numbers below look like scores pasted from an earlier
# two-class experiment, not output of this cell - confirm or remove.
#1: 39.0991315159432, 0: 60.900868484056815

In [None]:
# Reference run with scikit-learn's GaussianNB on the same `data` / `label`,
# for comparison with the hand-written naive_bayes class.
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
# Peek at the first five feature rows.
print(data[:5])
# Fit, then print the per-class log probabilities of the last five samples.
print(gnb.fit(data, label).predict_log_proba(data[-5:]))
# Last expression of the cell: predicted labels for the same five samples.
gnb.predict(data[-5:])

# Multinomial naive Bayes

In [56]:
class MultinomialNB(object):
    """Multinomial naive Bayes classifier for count features.

    ``fit`` learns a log prior per class and a smoothed log conditional
    probability per (class, feature).  The prediction methods combine them
    with the feature counts of new samples.

    Attributes set by ``fit``:
        count_per_class: ``{class label: number of training samples}``.
        class_log_prior_: ``{class label: log prior}``.
        log_cond_prob_feature: ``{class label: [log P(feature | class), ...]}``.
        classes_: class labels in order of first appearance; this is the
            column order of ``predict_log_proba`` / ``predict_proba``.
    """

    # Imported in the class body so the definition does not depend on a
    # notebook-level ``import numpy as np`` having run first; the modules are
    # reached through ``self.np`` / ``self.math``.
    import numpy as np
    import math

    def __init__(self, alpha=1.0):
        # Additive smoothing parameter (1.0 is add-one smoothing).
        self.alpha = alpha

    def fit(self, data, label):
        """Estimate class log priors and smoothed feature log probabilities.

        Args:
            data: 2-D array-like of feature counts, one row per sample.
            label: sequence of class labels, one per row of ``data``.

        Returns:
            ``self``.

        Raises:
            ValueError: if ``data`` and ``label`` differ in length.
        """
        data = self.np.asarray(data)
        if len(data) != len(label):
            raise ValueError("data and label must have the same length")
        total_samples = len(data)

        # Group the sample rows by class label.
        separated = {}
        for row, class_label in zip(data, label):
            separated.setdefault(class_label, []).append(row)
        self.classes_ = list(separated)

        self.count_per_class = {
            class_label: len(rows) for class_label, rows in separated.items()
        }
        self.class_log_prior_ = {
            class_label: self.math.log(count / total_samples)
            for class_label, count in self.count_per_class.items()
        }

        # Total count of every feature per class (in text classification:
        # total term count per class), with and without smoothing.
        print("Sample with it's class label: ",separated)
        feature_sum = {}
        smooth_feature_sum = {}
        for class_label, rows in separated.items():
            class_feature_sum = [sum(column) for column in zip(*rows)]
            feature_sum[class_label] = class_feature_sum
            # Add alpha to every feature count so no estimate is zero.
            smooth_feature_sum[class_label] = [
                count + self.alpha for count in class_feature_sum
            ]
        print("No smoothing class based count: ", feature_sum)
        print("Smoothed class based count: ", smooth_feature_sum)

        # Log conditional probability of each feature given the class.
        probabilities = {}
        for class_label, smoothed_counts in smooth_feature_sum.items():
            # Loop invariant: the class total is needed once, not per feature.
            class_total = sum(smoothed_counts)
            cond_prob_list = []
            for count in smoothed_counts:
                print(count, " ", class_total, " " , count/class_total)
                cond_prob_list.append(self.np.log(count / class_total))
            probabilities[class_label] = cond_prob_list
        self.log_cond_prob_feature = probabilities
        return self

    def joint_log_likelihood(self, data):
        """Return one ``{class label: joint log likelihood}`` dict per sample.

        For each class the value is the sum over features of
        (feature count * log conditional probability) plus the class log prior.
        """
        if isinstance(data, self.np.ndarray):
            data = data.tolist()
        result_list = []
        for datapoint in data:
            result_list.append({
                class_label: sum(
                    log_prob * count
                    for log_prob, count in zip(feature_log_probs, datapoint)
                ) + self.class_log_prior_[class_label]
                for class_label, feature_log_probs in self.log_cond_prob_feature.items()
            })
        return result_list

    def dict_to_array(self, input_list):
        """Turn a list of per-class dicts into a list of value lists (dict order)."""
        return [list(point.values()) for point in input_list]

    def predict_log_proba(self, data):
        """Return normalised log class probabilities, one row per sample.

        Normalisation uses the log-sum-exp trick (subtracting each row's
        maximum before exponentiating) so strongly negative joint log
        likelihoods do not underflow to ``log(0)``.
        """
        n_classes = len(self.class_log_prior_)
        joint = self.np.asarray(
            self.dict_to_array(self.joint_log_likelihood(data)), dtype=float
        ).reshape(-1, n_classes)
        row_max = joint.max(axis=1, keepdims=True)
        log_norm = row_max + self.np.log(
            self.np.sum(self.np.exp(joint - row_max), axis=1, keepdims=True)
        )
        return joint - log_norm

    def predict_proba(self, data):
        """Return class probabilities, one row per sample (rows sum to 1)."""
        return self.np.exp(self.predict_log_proba(data))

    def predict(self, data):
        """Return the most likely class label for each sample.

        Ties keep the first class encountered; a model without classes
        yields None for every sample.
        """
        return [
            max(scores, key=scores.get, default=None)
            for scores in self.joint_log_likelihood(data)
        ]

Let $x = \{x_1,x_2,...,x_i\}$ denote a single input data point, where $x_1, x_2, ..., x_i$ are the features of that data point.

1. Bayes Theorem
\begin{equation}
P(y|x) = \frac{P(y)*P(x|y)}{P(x)}
\end{equation}

2. Bayes Theorem with independent ("naive") assumption (Each feature in data point is independent from each other)
\begin{equation}
P(y|x)=\frac{P(y)*\prod_{i=1}^{|x|}P(x_i|y)}{P(x)}
\end{equation}

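3. $P(x)$ does not depend on the class $y$, so it is the same constant for every class and can be dropped when comparing classes; the posterior is then proportional to
\begin{equation}
P(y|x) \propto P(y)*\prod_{i=1}^{|x|}P(x_i|y)
\end{equation}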

4. Prior probabilities $P(y)$
\begin{equation}
P(y) = \frac{\text{Class Sample Count}}{\text{Total Sample Count}}
\end{equation}

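5. Multinomial distribution. Here $t$ is a feature (a term, in text classification), $T$ is the set of all features, $x_{t, j}$ is the count of feature $t$ in sample $j$, and the sum over $j$ runs over the $D_y$ training samples of class $y$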
\begin{equation}
P(x_i|y) = P(t|y) = \frac{\sum_{j=0}^{D_{y}}x_{t, j}}{\sum_{t'\in T}\sum_{j=0}^{D_{y}}x_{t', j}}
\end{equation}

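6. Add-one smoothing: add 1 to every feature count so that no feature gets a zero probability estimate; the denominator grows by $|T|$, the number of features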
\begin{equation}
P(x_i|y) = P(t|y) = \frac{\sum_{j=0}^{D_{y}}x_{t, j} + 1}{\sum_{t'\in T}\sum_{j=0}^{D_{y}}x_{t', j} + |T|}
\end{equation}

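7. Apply log to avoid underflow errors; the constant is $-\log P(x)$, which is identical for every class and can be ignored when comparing classes
\begin{equation}
\log P(y|x_1,x_2,....,x_i) = \log (P(y)) + \sum_{i=1}^{|x|}\log(P(x_i|y)) + \text{const}
\end{equation}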

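8. For a new (test) sample, we compute the joint log likelihood of each class from the per-class statistics learned during training: multiply each feature's log conditional probability by that feature's count in the sample, sum over all features, and add the class log prior.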

In [57]:
# Demo of the hand-written MultinomialNB on a toy count matrix.
import numpy as np
# Training data: 4 samples x 6 count features.
X = np.array([
    [2,1,0,0,0,0],
    [2,0,1,0,0,0],
    [1,0,0,1,0,0],
    [1,0,0,0,1,1]
])
# Class labels: the first three samples are class 0, the last one is class 1.
y = np.array([0,0,0,1])
# Two unseen samples to classify.
X_test = np.array([[3,0,0,0,1,1],[0,1,1,0,1,1]])
# fit() also prints the intermediate per-class counts it computes.
nb = MultinomialNB().fit(X, y)
print("log prior prob: ",nb.class_log_prior_)
print("log prob", nb.log_cond_prob_feature)
print("Test samples: \n", X_test)
#print(nb.predict_proba(X_test))
# Unnormalised per-class scores, then normalised log probabilities,
# probabilities and the predicted labels.
print(nb.joint_log_likelihood(X_test))
print(nb.predict_log_proba(X_test))
print(nb.predict_proba(X_test))
print(nb.predict(X_test))

Sample with it's class label:  {0: [array([2, 1, 0, 0, 0, 0]), array([2, 0, 1, 0, 0, 0]), array([1, 0, 0, 1, 0, 0])], 1: [array([1, 0, 0, 0, 1, 1])]}
No smoothing class based count:  {0: [5, 1, 1, 1, 0, 0], 1: [1, 0, 0, 0, 1, 1]}
Smoothed class based count:  {0: [6, 2, 2, 2, 1, 1], 1: [2, 1, 1, 1, 2, 2]}
6   14   0.42857142857142855
2   14   0.14285714285714285
2   14   0.14285714285714285
2   14   0.14285714285714285
1   14   0.07142857142857142
1   14   0.07142857142857142
2   9   0.2222222222222222
1   9   0.1111111111111111
1   9   0.1111111111111111
1   9   0.1111111111111111
2   9   0.2222222222222222
2   9   0.2222222222222222
log prior prob:  {0: -0.2876820724517809, 1: -1.3862943611198906}
log prob {0: [-0.8472978603872037, -1.9459101490553135, -1.9459101490553135, -1.9459101490553135, -2.639057329615259, -2.639057329615259], 1: [-1.5040773967762742, -2.1972245773362196, -2.1972245773362196, -2.1972245773362196, -1.5040773967762742, -1.5040773967762742]}
Test samples: 
 [[3 0 

In [18]:
# Reference run with scikit-learn, for comparison with the hand-written class.
# The import is aliased so it does not rebind (and silently replace) the
# hand-written ``MultinomialNB`` class defined earlier in this notebook;
# otherwise re-running the demo cell would use scikit-learn's class instead.
from sklearn.naive_bayes import MultinomialNB as SklearnMultinomialNB
clf = SklearnMultinomialNB()
clf.fit(X, y)
print(clf.class_log_prior_)
print(clf.feature_log_prob_)

print("Test samples: \n", X_test)
# NOTE: _joint_log_likelihood is a private scikit-learn method and may change
# between versions; it is called once and reused for both prints.
sk_joint_log_likelihood = clf._joint_log_likelihood(X_test)
print(sk_joint_log_likelihood)
print(type(sk_joint_log_likelihood))

print(clf.predict_log_proba(X_test))
print(clf.predict_proba(X_test))
print(clf.predict(X_test))

[-0.28768207 -1.38629436]
[[-0.84729786 -1.94591015 -1.94591015 -1.94591015 -2.63905733 -2.63905733]
 [-1.5040774  -2.19722458 -2.19722458 -2.19722458 -1.5040774  -1.5040774 ]]
Test samples: 
 [[3 0 0 0 1 1]
 [0 1 1 0 1 1]]
[[-8.10769031 -8.90668135]
 [-9.45761703 -8.78889831]]
<class 'numpy.ndarray'>
[[-0.37141358 -1.17040461]
 [-1.08239313 -0.41367441]]
[[0.68975861 0.31024139]
 [0.3387838  0.6612162 ]]
[0 1]


In [None]:
%reset