In [1]:
# Naive Bayes has this interesting relation with PCA.
# PCA makes sure that all the principal components be normal to each other, that means their correlation is zero.
# If the principal components are gaussian then naive assumption of Naive BAyes become true.
# although just to be clear, 0 correlation does not mean independence.

In [22]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
# from sklearn.naive_bayes import GaussianNB # doesn't have smoothing
from scipy.stats import norm
from scipy.stats import multivariate_normal as mvn
from sklearn.model_selection import train_test_split
from future.utils import iteritems

In [31]:
class GaussianNB(object):

    def fit(self, X, Y, smoothing=1e-2):
        self.gaussians = dict()
        self.priors = dict()

        # getting labels for y
        labels = set(Y)
        
        for c in labels:
            # for all the unique labels
            # get all the x for which label == c
            current_x = X[Y == c]
            # calculate and assign mean and variance for each class, for each feature (each class has vector of 784)
            # smoothing is added to avoid nan
            self.gaussians[c] = {
                'mean': current_x.mean(axis=0),
                'var': current_x.var(axis=0) + smoothing,
            }
            # calculating priors, just probability of a class
            self.priors[c] = float(len(Y[Y == c])) / len(Y)

    def score(self, X, Y):
        P = self.predict(X)
        return np.mean(P == Y)

    def predict(self, X):
        N, D = X.shape
        
        # length of class - number of classes
        K = len(self.gaussians)
        
        # probability matrix - for n numbers and for each class
        P = np.zeros((N, K))
        
        # for each gaussian or each class distribution, get the mean and variance 
        # explanation of below lines required....
        # we calculate log posterior probability for each class
        for c, g in iteritems(self.gaussians):
            mean, var = g['mean'], g['var']
            P[:,c] = mvn.logpdf(X, mean=mean, cov=var) + np.log(self.priors[c])
        # to get the prediction, we take the argmax of P - P contains 'c' columns 
        return np.argmax(P, axis=1)

In [16]:
data = pd.read_csv("../unsupervised_learning_clustering_part1/data/mnist_train.csv")

In [32]:
X = data.iloc[:, 1:].values
y = data.iloc[:, 0].values

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, stratify=y, shuffle=True)

In [34]:
# try NB by itself
model1 = GaussianNB()
model1.fit(X, y)
print("NB train score:", model1.score(X_train, y_train))
print("NB test score:", model1.score(X_test, y_test))

NB train score: 0.6151190476190476
NB test score: 0.6133888888888889


In [35]:
# try NB with PCA first
pca = PCA(n_components=50)
Z_train = pca.fit_transform(X_train)
Z_test = pca.transform(X_test)

In [36]:
model2 = GaussianNB()
model2.fit(Z_train, y_train)
print("NB+PCA train score:", model2.score(Z_train, y_train))
print("NB+PCA test score:", model2.score(Z_test, y_test))

NB+PCA train score: 0.870452380952381
NB+PCA test score: 0.8677777777777778


In [30]:
# we see, scores have been increased, so naive bayes works better here after we do PCA preprocessing

In [None]:
# full covariance vs diagonal covariance