In [8]:
import numpy as np
from numpy.random import default_rng
import pandas as pd

from sklearn.naive_bayes import BernoulliNB

# Table

In [9]:
# Dataset dimensions: n binary features per row, m sampled rows.
n, m = 5, 10
# Seeded generator so that every run reproduces the same table.
rng = default_rng(seed=58)

In [10]:
# Probability that a sampled row is assigned to class 1.
cd = 0.5

# fp[c][j] is the probability that feature j is True for a row of class c.
# For every feature the class-0 value is drawn before the class-1 value,
# so the seeded random stream is consumed in the same order as before.
fp = [[], []]
for _ in range(n):
    for per_class in fp:
        per_class.append(rng.random())

feature_names = [f"F{i}" for i in range(n)]

In [11]:
# D collects the boolean feature rows, y the matching class labels.
D, y = [], []
for _ in range(m):
    # The class is drawn first, then the n features conditioned on it;
    # this keeps the order in which the random stream is consumed.
    label = int(rng.random() < cd)
    y.append(label)
    D.append([rng.random() < fp[label][j] for j in range(n)])

In [12]:
# Tabular view of the sample: one column per feature, class label 'y' last.
df = pd.DataFrame(D, columns=feature_names).assign(y=y)
df

Unnamed: 0,F0,F1,F2,F3,F4,y
0,True,False,True,True,False,0
1,False,False,True,False,True,0
2,True,True,True,True,True,0
3,True,False,True,False,True,1
4,False,False,True,False,False,0
5,True,False,False,False,True,1
6,False,False,True,False,False,0
7,True,True,False,False,True,0
8,False,True,True,False,False,0
9,True,False,True,False,False,1


In [13]:
# Bernoulli naive Bayes classifier; alpha=1 is the additive smoothing
# applied to the counts (see the hand-computed smoothed table further down).
clf = BernoulliNB(alpha=1)

In [14]:
# Fit on the raw lists: D holds the boolean feature rows, y the class labels.
clf.fit(D, y)

BernoulliNB(alpha=1)

In [15]:
# Class labels known to the fitted classifier.
clf.classes_

array([0, 1])

In [16]:
# #(y): number of training rows observed for each class
pd.DataFrame({'Y': clf.class_count_})

Unnamed: 0,Y
0,7.0
1,3.0


In [17]:
# #(T, y): for each class (row) and feature (column), the number of
# training rows of that class in which the feature is True
pd.DataFrame(clf.feature_count_, columns = feature_names)

Unnamed: 0,F0,F1,F2,F3,F4
0,3.0,3.0,6.0,2.0,3.0
1,3.0,0.0,2.0,0.0,2.0


In [18]:
# P(Y): class priors, recovered by exponentiating the stored log-priors
pd.DataFrame({'Y': np.exp(clf.class_log_prior_)})

Unnamed: 0,Y
0,0.7
1,0.3


In [19]:
# P(True | y), not smoothed: raw True-counts divided by the per-class row count
# (the class counts are turned into a column so the division broadcasts per row)
pd.DataFrame(clf.feature_count_ / clf.class_count_[:, np.newaxis], columns=feature_names)

Unnamed: 0,F0,F1,F2,F3,F4
0,0.428571,0.428571,0.857143,0.285714,0.428571
1,1.0,0.0,0.666667,0.0,0.666667


In [21]:
# p(True | y), smoothed: exponentiate the per-class feature log-probabilities
# stored on the fitted classifier
pd.DataFrame(np.exp(clf.feature_log_prob_), columns = feature_names)

Unnamed: 0,F0,F1,F2,F3,F4
0,0.444444,0.444444,0.777778,0.333333,0.444444
1,0.8,0.2,0.6,0.2,0.6


In [22]:
# p(True | y), smoothed -- recomputed by hand from the raw counts.
# Each feature has two outcomes (True / False), so alpha is added to the
# numerator and 2 * alpha to the denominator. alpha is read from the
# classifier instead of being hard-coded as 1 and 2, so this cell stays
# correct if the smoothing passed to BernoulliNB is changed.
smoothed = (clf.feature_count_ + clf.alpha) / (clf.class_count_ + 2 * clf.alpha).reshape(-1, 1)
# The hand computation must reproduce the probabilities the classifier fitted.
np.testing.assert_allclose(smoothed, np.exp(clf.feature_log_prob_))
pd.DataFrame(smoothed, columns = feature_names)

Unnamed: 0,F0,F1,F2,F3,F4
0,0.444444,0.444444,0.777778,0.333333,0.444444
1,0.8,0.2,0.6,0.2,0.6


In [32]:
# ln(p(True | y)), smoothed: the log-probabilities exactly as stored on the
# fitted classifier (one row per class, one column per feature)
pd.DataFrame(clf.feature_log_prob_, columns = feature_names)

Unnamed: 0,F0,F1,F2,F3,F4
0,-0.81093,-0.81093,-0.251314,-1.098612,-0.81093
1,-0.223144,-1.609438,-0.510826,-1.609438,-0.510826


In [33]:
# A single query row (one True/False value per feature) that is scored by
# hand in the following cells.
d = [True, False, True, False, True]
# Reference value: the per-class log-probabilities scikit-learn predicts for d.
clf.predict_log_proba([d])

array([[-0.87056257, -0.5425157 ]])

In [35]:
# ln(p(False | y)) = ln(1 - p(True | y)).
# log1p(-p) is used instead of log(1 - p): when p is tiny, 1 - p rounds to
# exactly 1 and the plain form loses the value entirely.
false_log_probs = np.log1p(-np.exp(clf.feature_log_prob_))

In [36]:
# Unnormalized log-joint per class: ln P(y) + sum_i ln p(d_i | y).
lp = clf.class_log_prior_.copy()
for i in range(n):
    # Use the True-likelihood table when feature i is set in d,
    # otherwise the False-likelihood table.
    table = clf.feature_log_prob_ if d[i] else false_log_probs
    lp += table[:, i]

In [38]:
# Unnormalized log-joint per class; it differs from predict_log_proba above
# by the same constant in both entries (the normalizer that is removed below).
lp

array([-3.22310158, -2.89505471])

In [None]:
# Reference value: the per-class probabilities scikit-learn predicts for d.
clf.predict_proba([d])

In [None]:
# Normalize directly in probability space: exponentiate the log-joint once,
# then divide by the total so the entries sum to 1.
joint = np.exp(lp)
joint / joint.sum()

In [None]:
from scipy.special import logsumexp

In [None]:
# Same normalization carried out in log space: subtract ln(sum(exp(lp))).
lp-logsumexp(lp)

In [None]:
# Back to probabilities: exponentiate the log-space normalized values.
np.exp(lp-logsumexp(lp))

## Logsumexp

In [None]:
# log(a) and log(b): so negative that exp() of either underflows to 0.0
# in double precision.
loga, logb = -1000, -1001
# Need log(a+b): the direct formula ends up evaluating log(0.0 + 0.0).
np.log(np.sum(np.exp([loga, logb])))

In [None]:
# log(a+b) obtained from log(a) and log(b) through logsumexp rather than the
# direct exp/sum/log formula used in the previous cell.
lse=logsumexp([loga, logb])
lse

In [None]:
# log(a / (a+b))
loga-lse

In [None]:
# a / (a+b)
np.exp(loga-lse)

In [None]:
# log(b / (a+b))
logb-lse

In [None]:
# b / (a+b)
np.exp(logb-lse)