In [1]:
import numpy as np 



In [2]:
x_train= np.array([
    [0,1,1],
    [0,0,1],
    [0,0,0],
    [1,1,0]
])
y_train= ['Y','N','Y','Y']
x_test= np.array([[1,1,0]])



In [4]:
def get_label_indices(labels):
    '''
    Group samples based on their labels and return
    indices 
    labels: lsit of labels
    return dict, {class1: [indices], class2: [indices]}
    '''

    from collections import defaultdict
    label_indices = defaultdict(list)
    for index, label in enumerate(labels):
        label_indices[label].append(index)
    return label_indices

label_indices = get_label_indices(y_train)
print ( f'label indices: \n {label_indices}')

label indices: 
 defaultdict(<class 'list'>, {'Y': [0, 2, 3], 'N': [1]})


In [5]:
def get_prior(label_indices):
    '''
    compute prior based on training samples
    label_indices: group sample indices by class 
    return: dictionary, with class label as key, 
    corresponding prior as the value. 
    '''
    prior = {label: len(indices) for label, indices in label_indices.items()}
    total_count = sum(prior.values())
    for label in prior:
        prior[label] /= total_count
    return prior
prior= get_prior(label_indices)
print(f'Prior: {prior}')
    

Prior: {'Y': 0.75, 'N': 0.25}


In [8]:
def get_likelihood( features, label_indices, smoothing=0):
    '''
    compute likelihood based on training samples
    features: matrix of features
    label_indices: group sample indices by class 
    smoothing: integer, addictive smoothing parameter
    return dictionary, with class as key, corresponding
    conditional probability P(feature|class) vector as value
    '''
    likelihood={}
    for label, indices in label_indices.items():
        likelihood[label] = features[indices, : ].sum(axis=0) + smoothing
        total_count=len(indices)
        likelihood[label]=likelihood[label]/(total_count + 2*smoothing)
        return likelihood
    
smoothing= 1
likelihood= get_likelihood(x_train, label_indices,smoothing)
print( f'Likelihood: \n {likelihood}')

Likelihood: 
 {'Y': array([0.4, 0.6, 0.4])}


In [11]:
def get_posterior(x,prior,likelihood):
    ''' compute the posterior of the testing samples, based on 
    prior and likelihood
    x:testing samples
    prior: dictionary, with class label key,
    corresponding prior as a value
    likelihood:dictionary, with class label as key,
     corresponding conditional probability
     vector as value
     return: dictionary with class label as key,corresponding posterior as value.
     '''
    posteriors=[]
    for t in x:
        posterior= prior.copy()
        for label,likelihood_label in likelihood.items():
            for index,bool_value in enumerate(t):
                posterior[label]= likelihood_label[index] if bool_value else(1-likelihood_label[index])
        #normalize so that it all adds up to one 
        sum_posterior=sum(posterior.values())
        for label in posterior:
            if posterior[label]==float('inf'):
                posterior[label]=1.0
            else: 
                posterior[label]/=sum_posterior
        posteriors.append(posterior.copy())

    return posteriors

posterior= get_posterior(x_test,prior, likelihood)
print(f'Posterior:\n {posterior}')

Posterior:
 [{'Y': 0.7058823529411765, 'N': 0.29411764705882354}]


In [13]:
from sklearn.naive_bayes import BernoulliNB
clf= BernoulliNB(alpha= 1.0, fit_prior = True)
clf.fit(x_train,y_train)

pred_prob= clf.predict_proba(x_test)
print(f'print([scikit-learn] Predicted probabilities:\n  {pred_prob}')

print([scikit-learn] Predicted probabilities:
  [[0.07896399 0.92103601]]


In [14]:
pred_test= clf.predict(x_test)
print(f'[scikit-learn] Prediction:\n {pred_test}')

[scikit-learn] Prediction:
 ['Y']
