In [504]:
import itertools
from collections import defaultdict
import numpy as np
import pandas as pd
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from numba import njit, jit

In [522]:
# Load the first 20 rows of billboard.csv, discard every row that has a
# missing value in any column, and keep the 'Lyrics' column as a Python list.
raw_docs = (
    pd.read_csv("billboard.csv", nrows=20)
    .dropna()
    ['Lyrics']
    .tolist()
)

In [523]:
# Turn each raw document into a list of integer word ids:
#   lower-case -> tokenize on word characters -> drop stop words -> stem -> index.
tokenizer = RegexpTokenizer(r'\w+')

# English stop words list; the frozenset copy gives O(1) membership tests
# instead of scanning the whole list once per token.
en_stop = get_stop_words('en')
stop_lookup = frozenset(en_stop)

# Porter stemmer used to collapse inflected forms onto one vocabulary entry
p_stemmer = PorterStemmer()

# One list of stemmed, stop-word-free tokens per raw document
tokenized_docs = []
for raw_doc in raw_docs:
    tokens = tokenizer.tokenize(raw_doc.lower())
    tokenized_docs.append(
        [p_stemmer.stem(token) for token in tokens if token not in stop_lookup]
    )

# Assign ids in order of first appearance: an unseen word gets the current
# vocabulary size as its id.
vocab = defaultdict(lambda: len(vocab))
docs = [[vocab[w] for w in doc] for doc in tokenized_docs]

# Freeze the vocabulary: from here on an unknown word raises KeyError instead
# of silently adding an entry and changing len(vocab).
vocab.default_factory = None

In [528]:
K = 15  # number of topics
alpha = 1  # symmetric smoothing added to every document-topic count
eta = .001  # symmetric smoothing added to every topic-word count
iterations = 2000  # number of full Gibbs sweeps over the corpus

# The topic initialisation and the sampler both draw from numpy's global
# random state; seeding it here makes a fresh run of the notebook repeatable.
SEED = 0
np.random.seed(SEED)

In [529]:
# Initial state for the sampler: give every token a uniformly random topic and
# tally the assignments into the two count matrices in a single pass.
wt = np.zeros((K, len(vocab)))  # wt[k, v]: tokens of word id v assigned to topic k
dt = np.zeros((len(docs), K))   # dt[d, k]: tokens of document d assigned to topic k
ta = []                         # ta[d][n]: topic of the n-th token of document d

for doc_idx, doc in enumerate(docs):
    # One uniform draw from {0, ..., K-1} per token of this document
    doc_topics = np.random.choice(K, size=len(doc)).tolist()
    ta.append(doc_topics)
    for topic, word_id in zip(doc_topics, doc):
        wt[topic, word_id] += 1
        dt[doc_idx, topic] += 1

In [530]:
def lda(iterations, docs, wt, ta, dt, vocab_len, alpha=None, eta=None):
    """Run collapsed Gibbs sampling for LDA, updating the count state in place.

    Parameters
    ----------
    iterations : int
        Number of full sweeps over every token of every document.
    docs : list of list of int
        Documents as sequences of word ids; each id indexes a column of ``wt``.
    wt : numpy.ndarray, shape (n_topics, n_words)
        Topic-word counts. Modified in place. The number of topics is taken
        from ``wt.shape[0]``.
    ta : list of list of int
        Current topic of every token, parallel to ``docs``. Modified in place.
    dt : numpy.ndarray, shape (n_docs, n_topics)
        Document-topic counts. Modified in place.
    vocab_len : int
        Vocabulary size used to smooth the per-topic token totals.
    alpha, eta : float, optional
        Smoothing added to the document-topic and topic-word counts. When
        omitted, the module-level ``alpha`` / ``eta`` are used, so callers
        that relied on those globals keep working.

    Returns
    -------
    tuple
        ``(wt, ta, dt)`` -- the same objects that were passed in.
    """
    # Backward compatibility: fall back to the module-level priors.
    module_scope = globals()
    if alpha is None:
        alpha = module_scope["alpha"]
    if eta is None:
        eta = module_scope["eta"]

    n_topics = wt.shape[0]
    # Tokens per topic, kept up to date incrementally instead of re-summing
    # the whole topic-word matrix for every token.
    topic_totals = wt.sum(axis=1)
    word_smoothing = vocab_len * eta

    for i in range(iterations):  # for each pass through the corpus
        for di, doc in enumerate(docs):  # for each document
            doc_topics = ta[di]
            doc_counts = dt[di]  # row view: writes go straight into dt
            for wi, word_ID in enumerate(doc):
                t0 = doc_topics[wi]  # current topic of this token

                # Take the token out of all counts before sampling for it
                doc_counts[t0] -= 1
                wt[t0, word_ID] -= 1
                topic_totals[t0] -= 1

                # Unnormalised conditional over topics. The document-length
                # denominator is the same for every topic and cancels in the
                # normalisation below, so it is left out.
                p_z = (
                    (wt[:, word_ID] + eta)
                    / (topic_totals + word_smoothing)
                    * (doc_counts + alpha)
                )
                # No clamping: with consistent counts every entry is already
                # within [0, 1]; a violation means the counts are corrupted
                # and np.random.choice will raise rather than hide it.
                prob = p_z / p_z.sum()
                t1 = np.random.choice(n_topics, p=prob)  # draw the new topic

                # Put the token back under its newly sampled topic
                doc_topics[wi] = t1
                doc_counts[t1] += 1
                wt[t1, word_ID] += 1
                topic_totals[t1] += 1
        if i % 100 == 0:
            print(f"Finished {i}")
    return wt, ta, dt

In [531]:
# Run the sampler. lda() updates wt, ta and dt in place and returns those same
# objects, so executing this cell again continues from the current state
# instead of starting over.
wt, ta, dt = lda(iterations, docs, wt, ta, dt, vocab_len=len(vocab))

Finished 0
Finished 100
Finished 200
Finished 300
Finished 400
Finished 500
Finished 600
Finished 700
Finished 800
Finished 900
Finished 1000
Finished 1100
Finished 1200
Finished 1300
Finished 1400
Finished 1500
Finished 1600
Finished 1700
Finished 1800
Finished 1900


In [532]:
# Smooth the final counts and normalise every row so that it sums to 1.
doc_topic_smoothed = dt + alpha
theta = doc_topic_smoothed / doc_topic_smoothed.sum(axis=1, keepdims=True)  # one row per document: distribution over topics

topic_word_smoothed = wt + eta
phi = topic_word_smoothed / topic_word_smoothed.sum(axis=1, keepdims=True)  # one row per topic: distribution over words

In [534]:
# Long-form view of phi: one row per (word, topic) pair with its probability.
# The number of topics comes from phi itself rather than a hard-coded 15, and
# the frame is built column by column so that a vocabulary word spelled
# 'topic', 'variable' or 'value' cannot clash with a column name.
n_topics, n_words = phi.shape
words = list(vocab.keys())

# Row order matches pd.melt: all topics for the first word, then all topics
# for the second word, and so on.
df = pd.DataFrame({
    'topic': np.tile(np.arange(n_topics), n_words),
    'variable': [word for word in words for _ in range(n_topics)],
    'value': phi.ravel(order='F'),
})

In [535]:
# Ten most probable words under topic 0
(
    df.loc[df['topic'].eq(0)]
    .sort_values('value', ascending=False)
    .iloc[:10]
)

Unnamed: 0,topic,variable,value
1695,0,got,0.326094
6840,0,babe,0.073837
1725,0,whoaoh,0.061532
1710,0,troubl,0.049227
3810,0,wont,0.049227
1680,0,mind,0.049227
2010,0,walk,0.030769
1740,0,worri,0.030769
1755,0,wound,0.018464
1845,0,yeah,0.018464


In [537]:
# Ten most probable words under topic 4
(
    df.loc[df['topic'].eq(4)]
    .sort_values('value', ascending=False)
    .iloc[:10]
)

Unnamed: 0,topic,variable,value
4864,4,well,0.104398
2134,4,like,0.072278
2794,4,youv,0.064248
409,4,come,0.056218
1504,4,heart,0.056218
1054,4,eye,0.056218
7174,4,doesnt,0.056218
6229,4,brown,0.048188
2404,4,make,0.048188
1144,4,there,0.040158


In [541]:
# Display the unprocessed text of the document at index 1 for inspection.
raw_docs[1]

' sugar pie honey bunch you know that i love you i cant help myself i love you and nobody elsein and out my life you come and you go leaving just your picture behind and i kissed it a thousand timeswhen you snap your finger or wink your eye i come arunning to you im tied to your apron strings and theres nothing that i can docant help myself no i cant help myselfsugar pie honey bunch im weaker than a man should be i cant help myself im a fool in love you seewanna tell you i dont love you tell you that were through and ive tried but every time i see your face i get all choked up insidewhen i call your name girl it starts the flame burning in my heart tearing it all apart no matter how i try my love i cannot hidecause sugar pie honey bunch you know that im weak for you cant help myself i love you and nobody elsesugar pie honey bunch do anything you ask me to cant help myself i want you and nobody elsesugar pie honey bunch you know that i love you i cant help myself i cant help myself '