In [27]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import re

In [28]:
covid_sentences = [
    "the COVID-19 vaccine rollout has been effective in reducing the number of cases.",
    "New variants of the coronavirus are causing a lot of concern among health experts.",
    "Many people are hesitant about the COVID vaccine due to misinformation.",
    "Hospitals are overwhelmed due to the surge in COVID-19 cases.",
    "The pandemic has led to a significant increase in mental health issues worldwide.",
    "Researchers are studying the long-term effects of COVID-19 on the human body.",
    "Remote work has become the new norm since the COVID pandemic started.",
    "Many countries are implementing stricter lockdowns to control the spread of COVID-19.",
]


In [35]:
class LDA():

    def __init__(self, docs, k, iterations, alpha, beta):
        self.k = k
        self.iterations = iterations
        self.alpha = alpha
        self.beta = beta
        self.docs = self.preprocess_texts(docs)

        self.num_docs = len(docs)

        # Setting vocab and id mappings
        vectorizer = CountVectorizer()
        vectorizer.fit(self.docs)
        self.vocab = vectorizer.get_feature_names_out()
        self.word2id = vectorizer.vocabulary_
        print(self.word2id)
        self.id2word = {idx: word for word, idx in self.word2id.items()}
        print(self.id2word)
        
        # Counts needed for gibbs sampling
        self.doc_topic_count = np.zeros((self.num_docs, self.k))
        self.topic_word_count = np.zeros((self.k, len(self.vocab)))
        self.topic_total_counts = np.zeros(self.k)
        self.doc_lengths = np.zeros(self.num_docs)
        self.assigned_topics = []

    def preprocess_texts(self,texts):
        processed_texts = []
        for text in texts:
            text = text.lower()
            text = re.sub(r'[^\w\s_]', '', text)
            text = re.sub(r'\d+', '', text)
            text = re.sub(r'\s+', ' ', text).strip()
            processed_texts.append(text)
        return processed_texts

    def init_assignment(self):

        for t, tweet in enumerate(self.docs):
            words = tweet.split()
            curr_doc_topic = []

            for w, word in enumerate(words):
                try:
                    word = word.lower()
                    rand_topic = np.random.randint(self.k)
                    word_id = self.word2id[word]
                    curr_doc_topic.append(rand_topic)
                except Exception as e:
                    print(e)
                    continue
                
                self.doc_topic_count[t, rand_topic] += 1
                self.topic_word_count[rand_topic, word_id] += 1
                self.topic_total_counts[rand_topic] += 1

            self.assigned_topics.append(curr_doc_topic)
            
            

    def pipeline(self):
        self.init_assignment()

        


lda_model = LDA(covid_sentences, 3, 1000, 0.1, 0.1)
lda_model.pipeline()
print(f"Assigned Topics : {lda_model.assigned_topics}")
    

    

{'the': 52, 'covid': 12, 'vaccine': 54, 'rollout': 44, 'has': 17, 'been': 4, 'effective': 14, 'in': 23, 'reducing': 41, 'number': 35, 'of': 36, 'cases': 6, 'new': 33, 'variants': 55, 'coronavirus': 10, 'are': 2, 'causing': 7, 'lot': 29, 'concern': 8, 'among': 1, 'health': 18, 'experts': 16, 'many': 30, 'people': 40, 'hesitant': 19, 'about': 0, 'due': 13, 'to': 53, 'misinformation': 32, 'hospitals': 20, 'overwhelmed': 38, 'surge': 51, 'pandemic': 39, 'led': 26, 'significant': 45, 'increase': 24, 'mental': 31, 'issues': 25, 'worldwide': 57, 'researchers': 43, 'studying': 50, 'longterm': 28, 'effects': 15, 'on': 37, 'human': 21, 'body': 5, 'remote': 42, 'work': 56, 'become': 3, 'norm': 34, 'since': 46, 'started': 48, 'countries': 11, 'implementing': 22, 'stricter': 49, 'lockdowns': 27, 'control': 9, 'spread': 47}
{52: 'the', 12: 'covid', 54: 'vaccine', 44: 'rollout', 17: 'has', 4: 'been', 14: 'effective', 23: 'in', 41: 'reducing', 35: 'number', 36: 'of', 6: 'cases', 33: 'new', 55: 'varian