In [31]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import re
import tqdm


In [27]:
# Toy corpus: eight short COVID-related sentences, one string per document.
# Passed as the `docs` argument when the LDA model is constructed further down.
covid_sentences = [
    "the COVID-19 vaccine rollout has been effective in reducing the number of cases.",
    "New variants of the coronavirus are causing a lot of concern among health experts.",
    "Many people are hesitant about the COVID vaccine due to misinformation.",
    "Hospitals are overwhelmed due to the surge in COVID-19 cases.",
    "The pandemic has led to a significant increase in mental health issues worldwide.",
    "Researchers are studying the long-term effects of COVID-19 on the human body.",
    "Remote work has become the new norm since the COVID pandemic started.",
    "Many countries are implementing stricter lockdowns to control the spread of COVID-19.",
]


In [None]:
class LDA():
    """Latent Dirichlet Allocation trained with collapsed Gibbs sampling.

    Every in-vocabulary token of every document carries one topic assignment.
    A Gibbs sweep removes a token from the count tables, computes

        p(topic) ∝ (n_doc,topic + alpha) * (n_topic,word + beta)
                   / (n_topic + vocab_size * beta)

    draws a new topic from that distribution and adds the token back.

    Attributes set by ``__init__``:
        docs:               cleaned document strings (output of ``preprocess_tweets``).
        vocab / vocab_size: vocabulary learned by ``CountVectorizer``.
        word2id / id2word:  token <-> column-index mappings.
        doc_word_ids:       per document, the list of word ids of its in-vocabulary
                            tokens, in order of appearance.
        doc_topic_count:    (num_docs, k) tokens of each document assigned to each topic.
        topic_word_count:   (k, vocab_size) occurrences of each word in each topic.
        topic_total_counts: (k,) total tokens assigned to each topic.
        doc_lengths:        (num_docs,) number of in-vocabulary tokens per document.
        assigned_topics:    per document, the topic of each entry of ``doc_word_ids``
                            (filled by ``init_assignment``).
    """

    def __init__(self, docs, k, iterations, alpha, beta, seed=None):
        """Clean the corpus, build the vocabulary and allocate the count tables.

        Args:
            docs: iterable of raw document strings (a single string is treated
                as a one-document corpus).
            k: number of topics, must be >= 1.
            iterations: number of full Gibbs sweeps run by ``gibbs_sampling``.
            alpha: symmetric document-topic smoothing, must be > 0.
            beta: symmetric topic-word smoothing, must be > 0.
            seed: optional integer. When given, the sampler draws from its own
                ``np.random.RandomState(seed)``; when ``None`` it uses NumPy's
                global random state (so ``np.random.seed`` still applies).

        Raises:
            ValueError: if ``k`` < 1 or ``alpha``/``beta`` are not positive.
        """
        if k < 1:
            raise ValueError(f"k must be >= 1, got {k}")
        if alpha <= 0 or beta <= 0:
            # With a zero prior the sampling weights can all become 0 once the
            # current token is removed, which would yield NaN probabilities.
            raise ValueError("alpha and beta must be positive")

        self.k = k
        self.iterations = iterations
        self.alpha = alpha
        self.beta = beta
        self._rng = np.random if seed is None else np.random.RandomState(seed)

        self.docs = self.preprocess_tweets(docs)
        self.num_docs = len(self.docs)

        # Setting vocab and id mappings
        vectorizer = CountVectorizer(max_features=50000)
        vectorizer.fit(self.docs)
        self.vocab = vectorizer.get_feature_names_out()
        self.vocab_size = len(self.vocab)

        # Mapping from word to id and vice versa (used for updating counts)
        self.word2id = {word: int(idx) for word, idx in vectorizer.vocabulary_.items()}
        self.id2word = {idx: word for word, idx in self.word2id.items()}

        # Tokenise once and keep only tokens the vectorizer kept (it drops
        # one-character tokens and anything beyond max_features). Both the
        # initialisation and the sampler iterate over these lists, so token
        # positions and topic assignments can never get out of step.
        self.doc_word_ids = [
            [self.word2id[word] for word in doc.split() if word in self.word2id]
            for doc in self.docs
        ]

        # Counts needed for gibbs sampling
        self._reset_counts()

    def _reset_counts(self):
        """Zero every count table and clear the topic assignments."""
        self.doc_topic_count = np.zeros((self.num_docs, self.k))
        self.topic_word_count = np.zeros((self.k, self.vocab_size))
        self.topic_total_counts = np.zeros(self.k)
        self.doc_lengths = np.array(
            [len(word_ids) for word_ids in self.doc_word_ids], dtype=float
        )
        self.assigned_topics = []

    def preprocess_tweets(self, tweets):
        """Return one cleaned, lower-cased string per input document.

        Minimal stand-in for the project's external preprocessing (the original
        stub only said "use from other file" and returned ``None``). URLs and
        @mentions are removed, every remaining non-word character becomes a
        space and whitespace is collapsed, so ``str.split`` on the result yields
        tokens made only of word characters.

        Args:
            tweets: iterable of raw strings, a single string, or ``None``.

        Returns:
            list[str] with the same length and order as the input; a document
            may become the empty string.
        """
        if tweets is None:
            return []
        if isinstance(tweets, str):
            tweets = [tweets]

        cleaned = []
        for tweet in tweets:
            text = "" if tweet is None else str(tweet).lower()
            # URLs and mentions go first, before punctuation stripping would
            # break them into stray word fragments.
            text = re.sub(r"https?://\S+|www\.\S+", " ", text)
            text = re.sub(r"(?<!\w)@\w+", " ", text)
            text = re.sub(r"[^\w\s]", " ", text)
            cleaned.append(" ".join(text.split()))
        return cleaned

    def init_assignment(self):
        """Give every token a uniformly random topic and build the count tables.

        Safe to call repeatedly: counts and assignments are reset first.
        """
        self._reset_counts()

        for doc_idx, word_ids in enumerate(self.doc_word_ids):
            curr_doc_topic = []
            for word_id in word_ids:
                rand_topic = int(self._rng.randint(self.k))
                curr_doc_topic.append(rand_topic)
                self.count_modifier(doc_idx, word_id, rand_topic, 'increment')
            self.assigned_topics.append(curr_doc_topic)
        return

    def count_modifier(self, tweet_idx, word_idx, topic_idx, type : str):
        """Add or remove one token from all three count tables.

        Args:
            tweet_idx: document index.
            word_idx: vocabulary id of the token.
            topic_idx: topic the token is (or was) assigned to.
            type: ``'increment'`` or ``'decrement'``.

        Raises:
            ValueError: for any other ``type`` (previously a silent no-op,
                which would hide a typo and leave the counts inconsistent).
        """
        if type == 'increment':
            delta = 1
        elif type == 'decrement':
            delta = -1
        else:
            raise ValueError(
                f"type must be 'increment' or 'decrement', got {type!r}"
            )

        self.doc_topic_count[tweet_idx, topic_idx] += delta
        self.topic_word_count[topic_idx, word_idx] += delta
        self.topic_total_counts[topic_idx] += delta

    def _gibbs_sweep(self):
        """Resample the topic of every token of every document once."""
        beta_total = self.vocab_size * self.beta

        for doc_idx, word_ids in enumerate(self.doc_word_ids):
            doc_topics = self.assigned_topics[doc_idx]
            for pos, word_id in enumerate(word_ids):
                old_topic = doc_topics[pos]

                # Decrement counts so the current token is not considered
                # while choosing its new topic.
                self.count_modifier(doc_idx, word_id, old_topic, 'decrement')

                doc_top_frac = self.doc_topic_count[doc_idx] + self.alpha
                top_word_frac = self.topic_word_count[:, word_id] + self.beta
                new_prob = doc_top_frac * top_word_frac / (self.topic_total_counts + beta_total)
                new_prob /= np.sum(new_prob)

                new_topic = int(self._rng.choice(self.k, p=new_prob))

                self.count_modifier(doc_idx, word_id, new_topic, 'increment')
                doc_topics[pos] = new_topic

    def gibbs_sampling(self):
        """Run ``self.iterations`` Gibbs sweeps with a progress bar.

        If the topics have not been initialised yet, ``init_assignment`` is
        called first.
        """
        # Imported here under an alias because the notebook's top-level
        # `import tqdm` binds the *module*, which is not callable.
        from tqdm import tqdm as progress_bar

        if len(self.assigned_topics) != self.num_docs:
            self.init_assignment()

        for _ in progress_bar(range(self.iterations), desc='Gibbs sampling'):
            self._gibbs_sweep()
        print('Done.')
        return

    def pipeline(self):
        """Initialise the topic assignments, then run the Gibbs sampler."""
        self.init_assignment()
        self.gibbs_sampling()

# Hyperparameters of the demo run, named so they can be tuned in one place.
NUM_TOPICS = 7
NUM_ITERATIONS = 12000
ALPHA = 0.1  # passed as LDA's `alpha`
BETA = 0.1   # passed as LDA's `beta`
SEED = 42

# The sampler draws from NumPy's global random state, so seed it to make the
# topic assignments reproducible across kernel restarts.
np.random.seed(SEED)

lda_model = LDA(covid_sentences, NUM_TOPICS, NUM_ITERATIONS, ALPHA, BETA)
lda_model.pipeline()
print(f"Assigned Topics : {lda_model.assigned_topics}")
    

    