In [1]:
import numpy as np
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score,confusion_matrix

In [2]:
# Read the label file of each split with np.genfromtxt (default float dtype)
# and record how many labels each split contains.
train_labels = np.genfromtxt('digits_data/traininglabels')
train_size = len(train_labels)

test_labels = np.genfromtxt('digits_data/testlabels')
test_size = len(test_labels)

In [3]:
def _load_ascii_images(path, n_images):
    """Read a text file of ASCII-art images into an (n_images, rows) string array.

    Every non-empty line of the file becomes one array element, with only the
    trailing line terminator removed so that leading/trailing spaces inside a
    row are preserved. The flat list of rows is then split evenly across
    ``n_images`` images.

    This replaces ``np.loadtxt(..., delimiter='\\n')``: recent NumPy releases
    reject a newline as the delimiter, which made the cell fail on a fresh
    environment.

    Raises:
        ValueError: if the number of rows is not divisible by ``n_images``
            (raised by ``reshape``).
    """
    with open(path) as handle:
        rows = [line.rstrip('\r\n') for line in handle]
    # Drop zero-length lines; lines made only of spaces are kept as image rows.
    rows = [row for row in rows if row]
    return np.array(rows, dtype='str').reshape(n_images, -1)


digit_images_train = _load_ascii_images('digits_data/trainingimages', train_size)
digit_images_test = _load_ascii_images('digits_data/testimages', test_size)

In [4]:
# Binarize the image
def add_data(source):
    """Convert ASCII-art images into 0/1 pixel vectors.

    Each element of ``source`` is a sequence of text rows. The rows are
    concatenated into one string and every character is mapped to 1 when it
    is '#' and to 0 otherwise.

    Returns:
        A list holding one integer NumPy array per element of ``source``.
    """
    return [
        np.where(np.array(list("".join(rows))) == '#', 1, 0)
        for rows in source
    ]

In [5]:
# Binarize both splits and stack the per-image vectors into arrays.
bin_data_train, bin_data_test = (
    np.array(add_data(images))
    for images in (digit_images_train, digit_images_test)
)

### Calculate priors

In [6]:
# Map each distinct training label to the number of times it occurs.
class_count = {
    label: count
    for label, count in zip(*np.unique(train_labels, return_counts=True))
}
# Prior of a class = its share of the training samples.
class_priors = np.array([count for count in class_count.values()]) / train_size

### Smoothing parameter

In [7]:
# Additive smoothing constant: added to the per-class pixel counts in
# bernoulli_train and passed as `alpha` to sklearn's BernoulliNB below.
k = 0.001

### Calculate conditional probabilities

In [8]:
# Calculate the conditional probabilities for feature_value = 1
# for all classes across all pixels
def bernoulli_train(train_data, labels=None, smoothing=None, n_classes=10):
    """Estimate P(feature = 1 | class) for every class and feature.

    Parameters:
        train_data: 2-D array-like of 0/1 features, one row per sample.
        labels: class label of each row of ``train_data``. When omitted, the
            module-level ``train_labels`` is used.
        smoothing: additive smoothing constant. When omitted, the module-level
            ``k`` is used.
        n_classes: number of classes; labels are matched against
            ``0 .. n_classes - 1``.

    Returns:
        Array of shape ``(n_classes, n_features)`` whose row ``c`` is
        ``(ones_in_class_c + smoothing) / (samples_in_class_c + 2 * smoothing)``.

    Raises:
        ValueError: if ``labels`` and ``train_data`` differ in length.
    """
    # Resolve the notebook-level defaults at call time so explicit arguments
    # never touch the globals.
    if labels is None:
        labels = train_labels
    if smoothing is None:
        smoothing = k

    # Use the data that was actually passed in (previously the argument was
    # ignored in favour of the global bin_data_train).
    train_data = np.asarray(train_data)
    labels = np.asarray(labels)
    if len(labels) != len(train_data):
        raise ValueError(
            "labels and train_data must have the same number of rows"
        )

    n_features = train_data.shape[1]
    feature_conditional_prob_one = np.zeros((n_classes, n_features))
    for num in range(n_classes):
        mask = (labels == num)
        ones_per_feature = train_data[mask].sum(axis=0)
        # Two outcomes per feature (0 or 1), hence 2 * smoothing in the denominator.
        feature_conditional_prob_one[num] = (
            (ones_per_feature + smoothing) / (2 * smoothing + mask.sum())
        )
    return feature_conditional_prob_one

In [9]:
def bernoulli_predict(x,class_priors,conditional_probabilities):
    """Predict the most probable class index for each binary feature vector.

    Parameters:
        x: 2-D array-like of 0/1 features, one row per sample.
        class_priors: prior probability of each class.
        conditional_probabilities: array of shape (n_classes, n_features)
            holding P(feature = 1 | class).

    Returns:
        A list with one entry per row of ``x``: the index of the class with
        the largest log-posterior, i.e. log prior plus the summed
        per-feature log-likelihoods.
    """
    cond = np.asarray(conditional_probabilities, dtype=float)

    # Work in log space to avoid underflow when multiplying many small
    # probabilities. These terms do not depend on the sample, so they are
    # computed once instead of once per row.
    log_prob_one = np.log(cond)
    log_prob_zero = np.log(1 - cond)
    log_priors = np.log(class_priors)

    pred = []
    for pixel_val in np.asarray(x):
        is_zero = (pixel_val == 0)
        # A feature coded 0 contributes log(1 - p); any other value contributes log(p).
        log_likelihood = np.where(is_zero, log_prob_zero, log_prob_one).sum(axis=1)
        pred.append(np.argmax(log_priors + log_likelihood))
    return pred

In [10]:
# Fit the per-class pixel probabilities on the binarized training images.
conditional_prob = bernoulli_train(train_data=bin_data_train)

In [11]:
# Classify every binarized test image with the hand-written model.
pred_coded = bernoulli_predict(
    x=bin_data_test,
    class_priors=class_priors,
    conditional_probabilities=conditional_prob,
)

In [12]:
# Fraction of test images the hand-written model labels correctly.
accuracy_score(y_true=test_labels, y_pred=pred_coded)

0.762

In [13]:
# Rows are true labels, columns are the hand-written model's predictions.
confusion_matrix(y_true=test_labels, y_pred=pred_coded)

array([[ 75,   0,   1,   1,   0,   7,   4,   0,   2,   0],
       [  0, 103,   0,   0,   0,   2,   1,   0,   2,   0],
       [  2,   6,  71,   6,   1,   1,   7,   2,   7,   0],
       [  0,   2,   1,  79,   0,   7,   1,   3,   2,   5],
       [  0,   0,   0,   0,  78,   1,   2,   1,   2,  23],
       [  1,   0,   1,   7,   4,  68,   1,   1,   3,   6],
       [  0,   4,   4,   0,   4,   7,  71,   0,   1,   0],
       [  0,   7,   3,   0,   5,   0,   0,  74,   2,  15],
       [  1,   5,   3,   8,   5,  11,   0,   1,  59,  10],
       [  1,   0,   0,   2,   8,   2,   0,   1,   2,  84]])

### Verify with sklearn

In [14]:
# Reference model: sklearn's BernoulliNB with the same smoothing constant,
# trained and evaluated on the same binarized arrays.
clf = BernoulliNB(alpha=k).fit(bin_data_train, train_labels)
pred_lib = clf.predict(bin_data_test)

In [15]:
# Accuracy of the sklearn reference model, for comparison with the value above.
accuracy_score(y_true=test_labels, y_pred=pred_lib)

0.762

In [16]:
# Rows are true labels, columns are the sklearn model's predictions.
confusion_matrix(y_true=test_labels, y_pred=pred_lib)

array([[ 75,   0,   1,   1,   0,   7,   4,   0,   2,   0],
       [  0, 103,   0,   0,   0,   2,   1,   0,   2,   0],
       [  2,   6,  71,   6,   1,   1,   7,   2,   7,   0],
       [  0,   2,   1,  79,   0,   7,   1,   3,   2,   5],
       [  0,   0,   0,   0,  78,   1,   2,   1,   2,  23],
       [  1,   0,   1,   7,   4,  68,   1,   1,   3,   6],
       [  0,   4,   4,   0,   4,   7,  71,   0,   1,   0],
       [  0,   7,   3,   0,   5,   0,   0,  74,   2,  15],
       [  1,   5,   3,   8,   5,  11,   0,   1,  59,  10],
       [  1,   0,   0,   2,   8,   2,   0,   1,   2,  84]])