In [1]:
'''This is were i am going to build my LDA Model for topic modeling from scratch'''


'This is were i am going to build my LDA Model for topic modeling from scratch'

In [2]:
import numpy as np
import spacy
import random
import pandas as pd
from tqdm import tqdm
from collections import Counter


In [3]:
df = pd.read_excel("Mentor_CSAT.xlsx")
df.head()

Unnamed: 0.1,Unnamed: 0,activityid,Lastdate,SentDate,StudentName,PMName,PMOperationsID,PathID,PipePathwayID,StudentType,...,"To help us better understand your experience, please explain the reasons for your satisfaction ratin",My Mentor shares useful information.,My Mentor responds to me within 24 hours.,My Mentor helps me get questions and issues resolved.,My Mentor cares about me and my success.,Sent Date,Semester,% Communicated,% Not Communicated,"Overall, how satisfied are you with BYU-Pathway Mentoring?"
0,0,F22B8E8C-476D-EE11-8DF0-00224808045A,2023-10-17 23:47:33,10/16/2023,Hnin Wai Oo,Maria Carmela Villenas Gaela,10260,22822379,22822379,Matriculation,...,She's really kind to me,Mostly true,Moderately true,Slightly true,Moderately true,2023-10-16,Fall 2023,Communicated,,Satisfied
1,1,C049BF3C-0F10-EE11-8F6D-0022480A40EB,2023-06-21 08:40:19,6/20/2023,Adrian James Oriewo,Susan Kapiya Chitembo,10121,22560197,22560197,Online Degree - BYU-Idaho,...,The Peer Mentor always encourages me to keep o...,Definitely true,Definitely true,Definitely true,Definitely true,2023-06-20,Spring 2023,Communicated,,Very satisfied
2,2,15D312C2-80F9-ED11-8849-0022480AE0F6,2023-05-23 15:44:56,5/22/2023,Marie Angelene Empleo Diaz,Julie Anne Gollayan Alberto Espino,10503,22525995,22525995,PathwayConnect,...,She replies whenever I need her help.,Moderately true,Moderately true,Moderately true,Moderately true,2023-05-22,Spring 2023,Communicated,,Very satisfied
3,3,8F082B81-0C25-EE11-9CBE-0022480AE0F6,2023-07-18 01:43:35,7/17/2023,Gabart Eugenio Jean Louis,Michael Aboagye Adu,10389,20062864,20062864,PathwayConnect,...,Best mentor ever thus far,Definitely true,Definitely true,Definitely true,Definitely true,2023-07-17,Spring 2023,Communicated,,Very satisfied
4,4,264787A9-EC98-EE11-BE37-0022480AE1C8,2023-12-12 12:47:48,12/11/2023,Nomathamsanqa Maho,Evans Nana Baah Ankomah,10243,13242085,13242085,Online Degree - Ensign College,...,,,,,,2023-12-11,Fall 2023,,Not Communicated,


In [4]:
#Dirichlet, Hyperparameter, tokenizer
ALPHA = 0.1 
BETA = 0.1
NUM_TOPICS = 10
sp = spacy.load("en_core_web_sm") #Tokenizer #Efficiency 
# sp = spacy.load("en_core_web_trf") accuracy

# reproducibility
np.random.seed(42)
random.seed(42)



In [5]:
def generate_frequencies(data, max_docs = 10000):
    freqs = Counter()
    all_stopwords = sp.Defaults.stop_words
    nr_tokens = 0

    for doc in data[:max_docs]:
        tokens = sp.tokenizer(str(doc))
        for token in tokens:
            token_text = token.text.lower()
            if token_text not in all_stopwords and token.is_alpha:
                nr_tokens += 1
                freqs[token_text] +=1
    
    return freqs

def get_vocab(freqs, freq_threshold=3):
    vocab = {}
    vocab_idx_str = {}
    vocab_idx = 0

    for word in freqs:
        if freqs[word] >= freq_threshold:
            vocab[word] = vocab_idx
            vocab_idx_str[vocab_idx] = word
            vocab_idx +=1
    
    return vocab, vocab_idx_str


def toeknize_dataset(data, vocab, max_docs=10000):
    nr_tokens = 0
    nr_docs = 0
    docs = []

    for doc in data[:max_docs]:
        tokens = sp.tokenizer(str(doc))

        if len(tokens) > 1:
            doc = []
            for token in tokens:
                token_text = token.text.lower()
                if token_text in vocab:
                    doc.append(token_text)
                    nr_tokens +=1
            nr_docs +=1
            docs.append(doc)
    
    print(f"Number of Reviews:{nr_docs}")
    print(f"Number of tokens: {nr_tokens}")


    #Numericalize
    corpus = []
    for doc in docs:
        corpus_d = []

        for token in doc:
            corpus_d.append(vocab[token])

        corpus.append(np.asarray(corpus_d))

    return docs, corpus



In [6]:
data = df['To help us better understand your experience, please explain the reasons for your satisfaction ratin'].sample(frac=0.5, random_state=42).values
freqs = generate_frequencies(data)
vocab, vocab_idx_str = get_vocab(freqs)
docs, corpus = toeknize_dataset(data, vocab)
vocab_size = len(vocab)
print(f"Vocab size: {vocab_size}")

Number of Reviews:8216
Number of tokens: 61339
Vocab size: 1972


In [7]:
def LDS_Collapsed_Gibbs(corpus, num_iter=200):
    #Initialize counts and Z
    Z = []
    num_docs = len(corpus)
    for _, doc in enumerate(corpus):
        Zd = np.random.randint(low=0, high=NUM_TOPICS, size=(len(doc)))
        Z.append(Zd)

    ndk = np.zeros((num_docs, NUM_TOPICS))
    for d in range(num_docs):
        for k in range(NUM_TOPICS):
            ndk[d, k] = np.sum(Z[d] == k)
    
    nkw = np.zeros((NUM_TOPICS, vocab_size))
    for doc_idx, doc in enumerate(corpus):
        for i, word in enumerate(doc):
            topic = Z[doc_idx][i]
            nkw[topic, word] += 1

    nk = np.sum(nkw, axis=1)
    topic_list = [i for i in range(NUM_TOPICS)]

    # loop
    for _ in tqdm(range(num_iter)):
        for doc_idx, doc in enumerate(corpus):
            for i in range(len(doc)):
                word = doc[i]
                topic = Z[doc_idx][i]

                # remove z_i because conditioned on z_(-i)
                ndk[doc_idx, topic] -= 1
                nkw[topic, word] -=1
                nk[topic] -= 1

                p_z = (ndk[doc_idx, : ] + ALPHA) * (nkw[:, word] + BETA) / (nk[:] + BETA*vocab_size)
                topic = random.choices(topic_list, weights=p_z, k=1)[0]

                #update n parameters
                Z[doc_idx][i] = topic
                ndk[doc_idx, topic] += 1
                nkw[topic, word] +=1
                nk[topic] += 1
    return Z, ndk, nkw, nk

Z, ndk, nkw, nk = LDS_Collapsed_Gibbs(corpus)


100%|██████████| 200/200 [01:41<00:00,  1.97it/s]


In [8]:
phi = nkw / nk.reshape(NUM_TOPICS, 1) # to get probability distribution

num_words = 10
for k in range(NUM_TOPICS):
    most_common_words = np.argsort(phi[k])[::-1][:num_words]
    print(f"Topic {k} most common words: ")

    for word in most_common_words:
        print(vocab_idx_str[word])

    print('\n')


Topic 0 most common words: 
mentor
peer
helpful
great
kind
good
feel
person
love
like


Topic 1 most common words: 
week
mentor
messages
help
scriptures
encouraging
encourages
studies
going
encourage


Topic 2 most common words: 
time
help
mentor
questions
messages
know
reply
email
message
respond


Topic 3 most common words: 
mentor
help
peer
like
program
need
think
helped
degree
semester


Topic 4 most common words: 
encouragement
messages
encouraging
good
communication
words
emails
sending
sends
great


Topic 5 most common words: 
mentor
help
challenges
peer
helps
problems
problem
issues
solve
encourages


Topic 6 most common words: 
questions
answer
answers
responds
help
concerns
respond
question
time
helpful


Topic 7 most common words: 
mentor
peer
support
information
mentoring
pathway
byu
satisfied
success
journey


Topic 8 most common words: 
help
need
mentor
available
willing
time
ready
needed
peer
support


Topic 9 most common words: 
mentor
pathway
peer
byu
help
good
things
