In [1]:
raw_docs = ['eat turkey on turkey day holiday',
          'i like to eat cake on holiday',
          'turkey trot race on thanksgiving holiday',
          'snail race the turtle',
          'time travel space race',
          'movie on thanksgiving',
          'movie at air and space museum is cool movie',
          'aspiring movie star']

docs = [d.split() for d in raw_docs]
docs

[['eat', 'turkey', 'on', 'turkey', 'day', 'holiday'],
 ['i', 'like', 'to', 'eat', 'cake', 'on', 'holiday'],
 ['turkey', 'trot', 'race', 'on', 'thanksgiving', 'holiday'],
 ['snail', 'race', 'the', 'turtle'],
 ['time', 'travel', 'space', 'race'],
 ['movie', 'on', 'thanksgiving'],
 ['movie', 'at', 'air', 'and', 'space', 'museum', 'is', 'cool', 'movie'],
 ['aspiring', 'movie', 'star']]

In [2]:
import itertools
vocab = list(dict.fromkeys(
    itertools.chain.from_iterable(docs)))

vocab

['eat',
 'turkey',
 'on',
 'day',
 'holiday',
 'i',
 'like',
 'to',
 'cake',
 'trot',
 'race',
 'thanksgiving',
 'snail',
 'the',
 'turtle',
 'time',
 'travel',
 'space',
 'movie',
 'at',
 'air',
 'and',
 'museum',
 'is',
 'cool',
 'aspiring',
 'star']

In [3]:
from collections import defaultdict

d = defaultdict(lambda: len(d))
docs = [[d[w] for w in doc] for doc in docs]
docs

[[0, 1, 2, 1, 3, 4],
 [5, 6, 7, 0, 8, 2, 4],
 [1, 9, 10, 2, 11, 4],
 [12, 10, 13, 14],
 [15, 16, 17, 10],
 [18, 2, 11],
 [18, 19, 20, 21, 17, 22, 23, 24, 18],
 [25, 18, 26]]

In [4]:
## PARAMETERS
K = 2 # number of topics

alpha = 1 # hyperparameter. single value indicates symmetric dirichlet prior. 
# higher=>scatters document clusters

eta = .001 # hyperparameter

iterations = 3 # iterations for collapsed gibbs sampling.  
# This should be a lot higher than 3 in practice.

In [5]:
import numpy as np

# word-topic count matrix. How many times a specific word appears in a topic
wt = np.zeros((K, len(vocab)))
wt

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [6]:
d = defaultdict(int)
ta = [[d[w] for w in doc] for doc in docs]
ta

[[0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0],
 [0, 0, 0, 0],
 [0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0]]

In [7]:
for d in range(len(docs)):
    for w in range(len(docs[d])):
        ta[d][w] = np.random.choice(range(K))
        wt[ta[d][w], docs[d][w]] += 1

In [8]:
wt

array([[1., 1., 1., 0., 1., 1., 0., 0., 1., 1., 2., 1., 1., 1., 1., 0.,
        0., 1., 3., 0., 0., 0., 1., 0., 1., 1., 1.],
       [1., 2., 3., 1., 2., 0., 1., 1., 0., 0., 1., 1., 0., 0., 0., 1.,
        1., 1., 1., 1., 1., 1., 0., 1., 0., 0., 0.]])

In [9]:
ta

[[0, 0, 1, 1, 1, 1],
 [0, 1, 1, 1, 0, 1, 0],
 [1, 0, 1, 1, 1, 1],
 [0, 0, 0, 0],
 [1, 1, 0, 0],
 [0, 0, 0],
 [1, 1, 1, 1, 1, 0, 1, 0, 0],
 [0, 0, 0]]

In [10]:
dt = np.zeros((len(docs), K))
dt

array([[0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.]])

In [11]:
for d in range(len(docs)):
    for t in range(K):
        dt[d, t] = np.sum(np.array(ta[d]) == t)

In [12]:
dt

array([[2., 4.],
       [3., 4.],
       [1., 5.],
       [4., 0.],
       [2., 2.],
       [3., 0.],
       [3., 6.],
       [3., 0.]])

In [13]:
docs

[[0, 1, 2, 1, 3, 4],
 [5, 6, 7, 0, 8, 2, 4],
 [1, 9, 10, 2, 11, 4],
 [12, 10, 13, 14],
 [15, 16, 17, 10],
 [18, 2, 11],
 [18, 19, 20, 21, 17, 22, 23, 24, 18],
 [25, 18, 26]]

In [14]:
for i in range(iterations):
    for d in range(len(docs)):

        for w in range(len(docs[d])):

            t0 = ta[d][w]
            wid = docs[d][w]
            
            print(t0)
            print(wid)

            
            dt[d, t0] = dt[d, t0] - 1
            wt[t0, wid] = wt[t0, wid] - 1
            
            denom_a = np.sum(dt[d]) + K * alpha
            denom_b = np.sum(wt, axis=1) + len(vocab) * eta
            
            p_z = (wt[:, wid] + eta)  / denom_b * (dt[d] + alpha) / denom_a
            t1 = np.random.choice(range(K), p=p_z/np.sum(p_z))
            
            ta[d][w] = t1

            dt[d, t0] = dt[d, t0] + 1
            wt[t0, wid] = wt[t0, wid] + 1
            

0
0
0
1
1
2
1
1
1
3
1
4
0
5
1
6
1
7
1
0
0
8
1
2
0
4
1
1
0
9
1
10
1
2
1
11
1
4
0
12
0
10
0
13
0
14
1
15
1
16
0
17
0
10
0
18
0
2
0
11
1
18
1
19
1
20
1
21
1
17
0
22
1
23
0
24
0
18
0
25
0
18
0
26
1
0
1
1
1
2
0
1
1
3
0
4
1
5


ValueError: probabilities are not non-negative