In [2]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rcParams['text.color'], mpl.rcParams['axes.labelcolor'], mpl.rcParams['xtick.color'], mpl.rcParams['ytick.color'] = ['white']*4

In [313]:
class NaiveBayesClf:
    def __init__(self, algo='gaussian', alpha=1.0):
        '''
        algo:str - 'gaussian': For normal distribution, Used for classification.
                   'bernoulli': used when feature vectors are binary
                   'multinomial': For multinomially distributed data, if alpha=1, Laplace smoothing
                                                                      elif alpha < 1, Lidstone smoothing
                                                                      else alpha >= 0, prevents zero probabilities
                   'complement': Adaptation of the standard Multinomial Naive Bayes (MNB) algorithm 
                       that is particularly suited for imbalanced data sets wherein the algorithm 
                       uses statistics from the complement of each class to compute the modelâ€™s weight.
        alpha:float - 
        '''
        self.alpha = alpha
        self.algo = algo
        
    def train(self, X, y, use_pandas=False):
        '''
        X - (n_datapoints, n_features)
        y - (n_datapoints, 1)
        '''
        if X.ndim == 1:
            X = X.reshape(-1, 1)
        n_datapoints, n_features = X.shape
        assert len(y) == n_datapoints
        y = y.reshape(-1, 1)
        # https://towardsdatascience.com/how-to-impliment-a-gaussian-naive-bayes-classifier-in-python-from-scratch-11e0b80faf5a
        # Frequency Table / Prior probabilities
        classes = np.unique(y)
        if use_pandas:
            import pandas as pd
            f_table = pd.crosstab(X, [y]).values
        else:
            unique_X = np.unique(X)
            f_table = np.zeros((len(unique_X), len(classes)))
            for i in range(len(unique_X)):
                l, c = np.unique(y[X == unique_X[i]], return_counts=True)
                f_table[i, l] = c
        # Likelihood table
#         l_table = np.hstack([f_table, f_table.sum(axis=1, keepdims=True) / n_datapoints])
#         l_table = np.vstack((l_table, l_table.sum(axis=0, keepdims=True) / n_datapoints))
        self.priors = f_table.sum(axis=0, keepdims=True) / n_datapoints
        self.mu = f_table.mean(axis=0, keepdims=True)
        self.sigma = f_table.std(axis=0, keepdims=True)
    
    def predict(self, X, return_probs=False):
        '''
        https://wikimedia.org/api/rest_v1/media/math/render/svg/1eaed580cf7c29f044a9e517f1cd4a7dd69c4b1f
        '''
        # Creates a 3 dimensional array with X transposed and repeated on the 3rd dimension (n_unique_classes) times
        if self.algo == 'gaussian':
            # cond_prob - ((X.shape[1], X.shape[0], repeats))
            cond_prob = np.exp(-((np.tile(X.T, (1, 1, self.mu.shape[0])) - self.mu.T)**2 / 2 / self.sigma.T**2)) / (2*np.pi)**0.5 / self.sigma.T
        elif self.algo == '':
            f_table = f_table
        probs = self.priors.T * cond_prob.prod(axis=0)
        yp = probs.argmax(axis=0)
        if return_probs:
            return yp, probs
        return yp
        
        
            
        

In [319]:
clf_sk = GaussianNB()
clf_sk.fit(X, y)

y, clf_sk.predict(X)

(array([1, 1, 0, 1, 0, 0, 0, 1, 1]), array([1, 1, 0, 1, 1, 1, 0, 0, 1]))

In [318]:
clf = NaiveBayesClf()
clf.train(X, y)
yp, p = clf.predict(X, return_probs=True)
y, yp

(array([1, 1, 0, 1, 0, 0, 0, 1, 1]),
 array([0, 1, 1, 1, 1, 0, 1, 1, 0], dtype=int64))

## Gaussian naive Bayes
<img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/685339e22f57b18d804f2e0a9c507421da59e2ab">

## Multinomial naive Bayes
<img src="">

## Bernoulli naive Bayes
<img src="">

In [None]:
class GaussianNB:
    def __init__(self, ):
        super().__init__(algo='gaussian')
class BernoulliNB:
    def __init__(self, ):
        super().__init__(algo='bernoulli')
class MultinomialNB:
    def __init__(self, ):
        super().__init__(algo='multinomial')
class ComplementNB:
    def __init__(self, ):
        super().__init__(algo='complement')

# Comparing with sklearn module

### Helper function to compare and plot

In [315]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, plot_confusion_matrix
from sklearn.naive_bayes import GaussianNB
clf_sk = GaussianNB()
clf_sk.fit(X_train, y_train)

yp_sk = clf_sk.predict(X_test)

NameError: name 'X_train' is not defined

In [None]:
from sklearn.model_selection import train_test_split
def plot_pipe(X, y, models, feature=None, fit_intercept=True, normalize=False, norm_method="fro", penalty=0.1, test_size=0.3):
    plt.figure(figsize=(10,15))
    if X.ndim == 2:
        if X.shape[1] > 1:
            if feature is None:
                print("Supported only for 1 feature, ignoring other features")
                X = X[:, 0].reshape(-1, 1)
            else:
                X = X[:, feature].reshape(-1, 1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    plt.scatter(X_train, y_train, color='black')
    plt.scatter(X_test, y_test, color='blue')
    legend = []
    for k in models:
        if 'sk' in k:
            r = models[k](fit_intercept=fit_intercept, normalize=normalize)
            r.fit(X_train, y_train.reshape(-1, 1))
            cf = [r.intercept_, r.coef_]
        else:
            if normalize:
                X_train = Normalizer(method=norm_method).normalize(X_train)
                X_test = Normalizer(method=norm_method).normalize(X_test)
            r = models[k](fit_intercept=fit_intercept, penalty=penalty)
            r.train(X_train, y_train)
            cf = r.W

        y_pred = r.predict(X)
        print("-"*20, k, "-"*20)
        print("Coefficients - ", cf)
        print("sklearn R2 scores ---------")
        print("Test R2 score = ", r2_score(y_test, r.predict(X_test)))
        print("Whole dataset R2 score = ", r2_score(y, y_pred))
        print("My R2 scores ---------")
        print("Test R2 score = ", LinearRegression().r2_score(y_test, r.predict(X_test)))
        print("Whole dataset R2 score = ", LinearRegression().r2_score(y, y_pred))
        
        plt.plot([X.max(), X.min()], [y_pred.max(), y_pred.min()])
        
        legend.append(k)
    legend.extend(["Train", "Test"])
    plt.legend(legend)

In [317]:

X = np.array([1, 2, 5, 2, 2, 1, 5, 5, 1]).reshape(-1, 1)
y = np.array([1,1,0,1,0,0,0,1,1])
# models = {
#     'skLR': linear_model.LinearRegression,
#     'myLR': LinearRegression,
#     'skLasso': linear_model.Lasso,
#     'myLasso': Lasso,
#     'skRidge': linear_model.Ridge,
#     'myRidge': Ridge
# }
# plot_pipe(X, y, models, feature=2, fit_intercept=True, penalty=1)

In [None]:
from sklearn import datasets, linear_model
X, y = datasets.load_diabetes(return_X_y=True)