In [1]:
import itertools

import numpy as np
import pandas as pd

In [2]:
# REFERENCES: 
#   - https://ethen8181.github.io/machine-learning/clustering_old/topic_model/LDA.html


## 1. Most basic Assumption: Each document is a mix of topics.

## LDA -> 
##    1. learn the topic mixture (distribution) for each document,
##         - θ∼Dirichlet(α) - topic to document distribution
##    2. Learn the term distribution that are associated to a topic,
##         - ϕ∼Dirichlet(η) - term to topic distribution
##         - this allows us to know what a topic is all about (talking about). Provides context.


## GOAL: Estimate θ, ϕ.
    ## Which topics are important, for each document?    θ∼Dirichlet(α)
    ## Which words are important, for each topic?        ϕ∼Dirichlet(η)

## HOW TO ESTIMATE? GIBBS SAMPLING (one method for doing this).

In [3]:
def convert_t2w_to_pandas(t2w, vocab):
    df = pd.DataFrame(t2w)
    df.columns = vocab
    
    return df

def convert_d2t_to_pandas(d2t, topics):
    df = pd.DataFrame(d2t)
    df.columns = topics
    
    return df

In [4]:
np.random.seed(1234)

In [5]:
corpus = [
    'You do not want to use them. They are fine for many machine learning tasks, just not deep learning.',
    'It’s always a good idea to examine our data before we get started plotting.',
    'The problem is supervised text classification problem.',
    'Our goal is to investigate which supervised machine learning methods are best suited to solve it.'
]

stopwords = [ 
    'i', 'on', 'at', 'a', 'with', 
    'of', 'and', 'by', 'or', 'the', 
    'is', 'in', 'to', 'are' 
]

In [6]:
def parse_document(document):
    def parse_term(term):
        for char_to_replace in [ '.', ',' ]:
            term = term.replace(char_to_replace, '')
        return term
    
    return [ 
        parse_term(term) 
        for term in document.lower().split(' ') 
    ]

documents = [ 
    [ 
        term for term in parse_document(document) 
    ] 
    for document in corpus 
]

n_documents = len(documents)

In [7]:
[ doc[:5] for doc in documents ]

[['you', 'do', 'not', 'want', 'to'],
 ['it’s', 'always', 'a', 'good', 'idea'],
 ['the', 'problem', 'is', 'supervised', 'text'],
 ['our', 'goal', 'is', 'to', 'investigate']]

In [8]:
all_terms = []
for terms in [ parse_document(doc) for doc in corpus ]:
    all_terms.extend(terms)


BOW = dict([ 
    (k, len(list(g))) 
    for k, g in itertools.groupby(\
                    sorted(all_terms), key=lambda x: x) 
])


vocab = list(BOW.keys())
n_vocab = len(vocab)

doc_2_term_dict = [ 
    [ 
        vocab.index(term) 
        for term in parse_document(doc) 
    ] 
    for doc in corpus 
]

doc_2_term_dict

[[43, 8, 25, 40, 38, 39, 36, 37, 2, 10, 11, 23, 22, 21, 33, 20, 25, 7, 21],
 [19, 1, 0, 14, 15, 38, 9, 26, 6, 3, 41, 12, 30, 27],
 [35, 28, 17, 32, 34, 5, 28],
 [26, 13, 17, 38, 16, 42, 32, 22, 21, 24, 2, 4, 31, 38, 29, 18]]

In [9]:
## GLOBALS,

K = 3
TOPICS = [ i for i in range(K) ]

beta = .001
alpha = 1

n_iterations = 3

## END GLOBALS,

In [10]:
def setup_lda():
    ## tracks a terms topic assignment, each term in a document
    topic_assignments = [ 
        [ 0 for term in document ] for document in documents 
    ]

    ## tracks frequency of a word being assigned to a topic,
    t2w = np.zeros((K, n_vocab))

    p = [ 1/K for i in TOPICS ]

    ## topic to term frequency counts,
    for d_i in range(n_documents):
        for t_i in range(len(documents[d_i])):

            k = np.random.choice(TOPICS, p=p)
            topic_assignments[d_i][t_i] = k

            index = doc_2_term_dict[d_i][t_i]
            t2w[k, index] += 1

    ## setup document to topic frequency counts,
    d2t = np.zeros((n_documents, K))
    for d_i in range(n_documents):
        for k in TOPICS:
            d2t[d_i, k] = np.sum(np.array(topic_assignments[d_i]) == k)
            
    return topic_assignments, t2w, d2t

topic_assignments, t2w, d2t = setup_lda()

In [11]:
# initial view,
convert_t2w_to_pandas(t2w, vocab).head()

Unnamed: 0,a,always,are,before,best,classification,data,deep,do,examine,...,text,the,them,they,to,use,want,we,which,you
0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
1,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,3.0,0.0,1.0,0.0,0.0,0.0


In [12]:
# initial view,
convert_d2t_to_pandas(d2t, TOPICS)

Unnamed: 0,0,1,2
0,4.0,7.0,8.0
1,2.0,7.0,5.0
2,2.0,1.0,4.0
3,7.0,7.0,2.0


In [13]:
def gibbs_sampling(document_index, term_index):
    
    z0 = topic_assignments[document_index][term_index]
    word_index = doc_2_term_dict[document_index][term_index]
    
    ## remove current assignment
    d2t[document_index, z0] -= 1
    t2w[z0, word_index] -= 1
    topic_assignments[document_index][term_index] = -1
    
    a = np.sum(d2t[document_index]) + (K * alpha)
    b = np.sum(t2w, axis=1) + (n_vocab * beta)
    
    p = ( t2w[:, word_index] + beta ) / b * (d2t[document_index] + alpha) / a
    p = p / np.sum(p)

    z1 = np.random.choice(TOPICS, p=p)
    
    ## set "new" assignment   
    d2t[document_index, z1] += 1
    t2w[z1, word_index] += 1
    
    ## set the terms topic assignment.
    topic_assignments[document_index][term_index] = z1
    
    return z0, z1


In [14]:
topic_assignments, t2w, d2t = setup_lda()

history = [ 
    [ 
        [ 
            (
                document_index, 
                document_term_index, 
                gibbs_sampling(document_index, document_term_index)
            )
            
            ## per term in document
            for document_term_index 
                in range(len(documents[document_index])) 
        ] 
        ## per document
        for document_index in range(n_documents) 
    ] 
    ## per iteration
    for iteration in range(n_iterations) 
]

## view last iterations document #0 changes,
history[-1][0]

[(0, 0, (2, 0)),
 (0, 1, (2, 1)),
 (0, 2, (1, 1)),
 (0, 3, (0, 0)),
 (0, 4, (2, 2)),
 (0, 5, (2, 0)),
 (0, 6, (0, 0)),
 (0, 7, (2, 1)),
 (0, 8, (0, 0)),
 (0, 9, (2, 2)),
 (0, 10, (2, 0)),
 (0, 11, (2, 0)),
 (0, 12, (1, 1)),
 (0, 13, (0, 0)),
 (0, 14, (2, 2)),
 (0, 15, (2, 0)),
 (0, 16, (1, 0)),
 (0, 17, (0, 0)),
 (0, 18, (0, 0))]

In [15]:
# final view,
convert_t2w_to_pandas(t2w, vocab).head()

Unnamed: 0,a,always,are,before,best,classification,data,deep,do,examine,...,text,the,them,they,to,use,want,we,which,you
0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0
1,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,1.0,1.0,0.0,0.0,4.0,0.0,0.0,0.0,1.0,0.0


In [16]:
# final view,
convert_d2t_to_pandas(d2t, TOPICS)

Unnamed: 0,0,1,2
0,12.0,4.0,3.0
1,4.0,7.0,3.0
2,1.0,3.0,3.0
3,5.0,3.0,8.0


In [17]:
theta = ( (d2t + alpha).T / np.sum(d2t + alpha, axis=1) ).T

## document to topic distributions,
convert_d2t_to_pandas(theta, TOPICS).head()

Unnamed: 0,0,1,2
0,0.590909,0.227273,0.181818
1,0.294118,0.470588,0.235294
2,0.2,0.4,0.4
3,0.315789,0.210526,0.473684


In [18]:
phi = ( (t2w + beta).T / (np.sum(t2w, axis=1) + beta) ).T # topic probabilities per word

## topic to term distributions,
t2w_df = convert_t2w_to_pandas(phi, vocab)
t2w_df.head()

Unnamed: 0,a,always,are,before,best,classification,data,deep,do,examine,...,text,the,them,they,to,use,want,we,which,you
0,4.5e-05,0.045498,0.09095,4.5e-05,4.5e-05,4.5e-05,4.5e-05,0.045498,4.5e-05,4.5e-05,...,4.5e-05,4.5e-05,0.045498,4.5e-05,4.5e-05,0.045498,0.045498,0.045498,4.5e-05,0.045498
1,0.058879,5.9e-05,5.9e-05,0.058879,5.9e-05,0.058879,5.9e-05,5.9e-05,0.058879,5.9e-05,...,5.9e-05,5.9e-05,5.9e-05,0.058879,5.9e-05,5.9e-05,5.9e-05,5.9e-05,5.9e-05,5.9e-05
2,5.9e-05,5.9e-05,5.9e-05,5.9e-05,0.058879,5.9e-05,0.058879,5.9e-05,5.9e-05,0.058879,...,0.058879,0.058879,5.9e-05,5.9e-05,0.235339,5.9e-05,5.9e-05,5.9e-05,0.058879,5.9e-05


In [19]:
TOP_K = 8
top_words_per_topic = [
    (k, t2w_df.iloc[k][t2w_df.iloc[k] > .001].sort_values(ascending=False).index.values.tolist()[:TOP_K])
    for k in TOPICS
]

In [20]:
## top words per topic,
top_words_per_topic

[(0, ['learning', 'are', 'is', 'you', 'we', 'deep', 'for', 'good']),
 (1, ['problem', 'our', 'machine', 'they', 'started', 'not', 'it’s', 'it']),
 (2,
  ['to',
   'supervised',
   'which',
   'the',
   'text',
   'tasks',
   'suited',
   'investigate'])]