In [1]:
import numpy as np
import spacy
import random
import pandas as pd
from tqdm import tqdm
from collections import Counter

In [2]:
df = pd.read_csv("bible_asv.csv")

In [62]:
ALPHA = 0.1
BETA = 0.1
NUM_TOPICS = 40
sp = spacy.load("en_core_web_sm")

np.random.seed(42)
random.seed(42)

In [63]:
def generate_frequencies(data, max_docs=31102):
    freqs = Counter()
    all_stopwords = sp.Defaults.stop_words
    nr_tokens = 0

    for doc in data[:max_docs]:
        tokens = sp.tokenizer(doc)
        for token in tokens:
            token_text = token.text.lower()
            if token_text not in all_stopwords and token.is_alpha:
                nr_tokens += 1
                freqs[token_text] += 1

    return freqs
    

def get_vocab(freqs, freq_threshold=3):
    vocab = {}
    vocab_idx_str = {}
    vocab_idx = 0

    for word in freqs:
        if freqs[word] >= freq_threshold:
            vocab[word] = vocab_idx
            vocab_idx_str[vocab_idx] = word
            vocab_idx += 1

    return vocab, vocab_idx_str
    

def tokenize_dataset(data, vocab, max_docs=31102):
    nr_tokens = 0
    nr_docs = 0
    docs = []

    for doc in data[:max_docs]:
        tokens = sp.tokenizer(doc)

        if len(tokens) > 1:
            doc = []
            for token in tokens:
                token_text = token.text.lower()
                if token_text in vocab:
                    doc.append(token_text)
                    nr_tokens += 1
            nr_docs += 1
            docs.append(doc)

    print(f"Number of verses: {nr_docs}")
    print(f"Number of tokens: {nr_tokens}")

    corpus = []

    for doc in docs:
        corpus_d = []

        for token in doc:
            corpus_d.append(vocab[token])

        corpus.append(np.asarray(corpus_d))

    return docs, corpus


In [82]:
data = df['Text'].values

missing_values = df['Text'].isnull().sum()
print(f"Number of missing values in 'Text' column: {missing_values}")

if missing_values > 0:
  df.dropna(subset=['Text'], inplace=True)
  print(f"Dropped {missing_values} rows with missing values in 'Text' column.")

data = [str(x) for x in data]


Number of missing values in 'Text' column: 0


In [83]:
freqs = generate_frequencies(data)
vocab, vocab_idx_str = get_vocab(freqs)
docs, corpus = tokenize_dataset(data, vocab)
vocab_size = len(vocab)
print(f"Vocab size: {vocab_size}")

Number of verses: 31101
Number of tokens: 328640
Vocab size: 6684


In [88]:
def LDA_Collapsed_Gibbs(corpus, num_iter=400):
    Z = []
    num_docs = len(corpus)

    for doc_idx, doc in enumerate(corpus):
        Zd = np.random.randint(low=0, high=NUM_TOPICS, size=(len(doc)))
        Z.append(Zd)

    ndk = np.zeros((num_docs, NUM_TOPICS))
    for d in range(num_docs):
        for k in range(NUM_TOPICS):
            ndk[d, k] = np.sum(Z[d] == k)

    nkw = np.zeros((NUM_TOPICS, vocab_size))
    for doc_idx, doc in enumerate(corpus):
        for i, word in enumerate(doc):
            topic = Z[doc_idx][i]
            nkw[topic, word] += 1

    nk = np.sum(nkw, axis=1)
    topic_list = [i for i in range(NUM_TOPICS)]

    for _ in tqdm(range(num_iter)):
        for doc_idx, doc in enumerate(corpus):
            for i in range(len(doc)):
                word = doc[i]
                topic = Z[doc_idx][i]

                ndk[doc_idx, topic] -= 1
                nkw[topic, word] -= 1
                nk[topic] -= 1

                p_z = (ndk[doc_idx, :] + ALPHA) * (nkw[:, word] + BETA) / (nk[:] + BETA*vocab_size)
                topic = random.choices(topic_list, weights=p_z, k=1)[0]

                Z[doc_idx][i] = topic
                ndk[doc_idx, topic] += 1
                nkw[topic, word] += 1
                nk[topic] += 1

    return Z, ndk, nkw, nk

Z, ndk, nkw, nk = LDA_Collapsed_Gibbs(corpus)


100%|██████████| 400/400 [28:13<00:00,  4.23s/it]


In [89]:
phi = nkw / nk.reshape(NUM_TOPICS, 1)

num_words = 30

for k in range (NUM_TOPICS):
    most_common_words = np.argsort(phi[k])[::1][:num_words]
    print(f"Topic {k} most common words: ")

    for word in most_common_words:
        print(vocab_idx_str[word])

    print("\n")

Topic 0 most common words: 
beginning
conferred
shunammite
abishag
araunah
mercies
foes
gareb
ithrite
bani
maacathite
azmaveth
netophathite
maharai
anathothite
tekoite
ikkesh
helez
ariel
jeopardy
defended
plot
piped
hararite
pipes
uproar
fixed
costly
lover
ezrahite


Topic 1 most common words: 
beginning
prayer
telleth
leaping
shouting
gittite
cymbals
psalteries
fir
ahio
bridle
uzzah
perazim
eliphelet
eliada
ibhar
solomon
nathan
shobab
masons
carpenters
mulberry
hiram
hadadezer
succor
hadarezer
buttocks
beards
comforters
hanun


Topic 2 most common words: 
beginning
bloody
ira
dispersed
mound
bichri
cost
chimham
sustain
aged
gibeonites
perversely
comfortably
perceive
hatest
mourneth
weepeth
porter
apace
watchman
lifetime
boat
zeal
giant
sibbecai
teacheth
hinds
tried
haughty
recompensed


Topic 3 most common words: 
beginning
guests
pipes
piped
conferred
shunammite
abishag
araunah
foes
gareb
ithrite
bani
maacathite
azmaveth
netophathite
maharai
anathothite
tekoite
ikkesh
helez
ariel
jeo