# Implementation of Naive Bayes Algorithm (Gaussian)

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from scipy.stats import norm

In [2]:
class Bayes():
    """Gaussian Naive Bayes classifier.

    For every class, each feature is modelled as an independent normal distribution
    whose mean and sample standard deviation (ddof=1) are estimated from the training
    data. A sample is assigned to the class with the highest
    ``prior * product of per-feature pdf values``.

    Note: no variance smoothing is applied, so a class with fewer than two training
    samples (NaN standard deviation) or a feature that is constant within a class
    (zero standard deviation) produces NaN scores for that class.
    """

    def __init__(self, priors=None):
        """Create an unfitted classifier.

        Parameters
        ----------
        priors : dict or None
            Optional mapping ``class name -> prior probability``. When omitted or
            empty, the priors are estimated from class frequencies by ``fit``.
        """
        self.means_dict = {}  # 'class name' -> array with the mean of every feature for that class
        self.stds_dict = {}  # 'class name' -> array with the standard deviation of every feature for that class
        self.priors = priors  # dictionary with prior probabilities
        self.classes = []  # unique class names (a numpy array once fitted)
        self.scores = None  # scores computed by the most recent prediction
        self._estimated_priors = None  # the priors dict produced by `fit`, if any

    def _create_class_stats_dict(self, x_data, y_data):
        """Compute per-class feature means and standard deviations.

        The dictionaries are rebuilt from scratch so that classes seen only in a
        previous fit do not linger.
        """
        self.means_dict = {}
        self.stds_dict = {}
        for class_ in self.classes:
            class_rows = x_data[y_data == class_]
            self.means_dict[class_] = np.mean(class_rows, axis=0)
            self.stds_dict[class_] = np.std(class_rows, axis=0, ddof=1)

    def fit(self, x_data, y_data):
        """Estimate the model parameters from training data.

        Parameters
        ----------
        x_data : array-like of shape (n_samples, n_features)
        y_data : array-like of shape (n_samples,)
            Class label of every row of ``x_data``.

        Priors supplied by the user are kept untouched. Priors that were estimated
        by an earlier call to ``fit`` are re-estimated from the new data.
        """
        # Coercing to arrays lets DataFrames/Series be passed directly.
        x_data = np.asarray(x_data)
        y_data = np.asarray(y_data)

        self.classes, counts = np.unique(y_data, return_counts=True)
        self._create_class_stats_dict(x_data, y_data)

        # `self.priors` is overwritten only when it is missing or when it is the very
        # dict a previous fit produced; anything the user assigned is left alone.
        previous_estimate = getattr(self, "_estimated_priors", None)
        if not self.priors or self.priors is previous_estimate:
            self._estimated_priors = dict(zip(self.classes, counts / counts.sum()))
            self.priors = self._estimated_priors

    def _get_log_scores(self, data):
        """Return log(prior) + sum of per-feature log-pdf values, shape (n_samples, n_classes).

        Working with logarithms avoids the floating-point underflow that a product
        of many small pdf values runs into.
        """
        data = np.asarray(data, dtype=float)
        log_scores = np.empty(shape=(data.shape[0], len(self.classes)))
        for class_index, class_ in enumerate(self.classes):
            log_likelihood = norm.logpdf(data,
                                         loc=self.means_dict[class_],
                                         scale=self.stds_dict[class_]).sum(axis=1)
            # A prior of exactly zero legitimately maps to -inf; silence the warning.
            with np.errstate(divide="ignore"):
                log_prior = np.log(self.priors[class_])
            log_scores[:, class_index] = log_prior + log_likelihood
        return log_scores

    def _get_scores(self, data):
        """Calculate scores for the given data points and store them in ``self.scores``.

        Each score is the class prior multiplied by the product of the pdf values of
        all features, every pdf being a normal distribution with the mean and
        standard deviation estimated from the training data.

        ``data`` must be 2-D, shape (n_samples, n_features). Returns an array of
        shape (n_samples, n_classes).
        """
        self.scores = np.exp(self._get_log_scores(data))
        return self.scores

    def predict(self, data):
        """Predict the class of every row of ``data`` (shape (n_samples, n_features)).

        The decision is taken on log-scores, so it stays correct even when the plain
        scores kept in ``self.scores`` underflow to zero.
        """
        log_scores = self._get_log_scores(data)
        self.scores = np.exp(log_scores)
        return np.asarray(self.classes)[np.argmax(log_scores, axis=1)]

    def score(self, data, labels):
        """Return the mean accuracy on the given data and labels."""
        # `==` is used rather than np.equal because it also compares string arrays
        # on numpy versions whose `equal` ufunc has no string loop.
        hits = np.asarray(self.predict(data)) == np.asarray(labels)
        return np.sum(hits) / len(labels)

The above implementation is quite limited, but anyway let's test it on some data.

In [3]:
# Load the iris dataset through seaborn and preview its first three rows.
iris = sns.load_dataset('iris')
iris.head(3)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa


In [4]:
# Separate the target column from the features without mutating `iris`,
# so this cell can be re-run without a KeyError on 'species'.
y_iris = iris['species']
X_iris = iris.drop('species', axis=1)
# Hold out 30% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X_iris, y_iris, test_size=.3, random_state=42)

In [5]:
# Fit the hand-written classifier on the training split (as numpy arrays)
# and report its mean accuracy on the held-out test split.
model = Bayes()
model.fit(X_train.values, y_train.values)
model.score(X_test.values, y_test.values)

0.97777777777777775

Let's compare this accuracy score with the Gaussian Naive Bayes implementation from scikit-learn to check that I haven't made any conspicuous mistakes.

In [7]:
from sklearn.naive_bayes import GaussianNB

In [13]:
# Fit scikit-learn's GaussianNB on the same train/test split to obtain a reference accuracy.
sklearn_model = GaussianNB()
sklearn_model.fit(X_train, y_train)
sklearn_model.score(X_test, y_test)

0.97777777777777775

No complaints here ;)