In [1]:
import os
import re

import gensim
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer

In [2]:
# English stopwords used to filter tokens in process_text.
# On an environment where the NLTK 'stopwords' corpus has not been installed
# yet, the lookup raises LookupError; download the corpus once and retry so
# the notebook survives a run on a fresh machine.
try:
    stop_list = stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    stop_list = stopwords.words('english')

In [22]:
def remove_hashtag_mentions_urls(text):
    """Return ``text`` with @mentions, #hashtags and http(s) URLs deleted.

    A match starts at ``@``, ``#``, ``http://`` or ``https://`` and extends
    over the following run of non-whitespace characters. A marker that is
    immediately followed by whitespace is left untouched, and the whitespace
    around a removed match is kept as is.
    """
    # To keep hashtags and strip only mentions and URLs, use the pattern
    # r"(?:\@|https?\://)\S+" instead.
    noise_pattern = re.compile(r"(?:\@|\#|https?\://)\S+")
    return noise_pattern.sub("", text)

def process_text(df, comment_header):
    """Clean and tokenize a text column, storing the result in ``df['Text Processed']``.

    Each entry of ``df[comment_header]`` goes through these steps:
    hashtags, mentions and URLs are stripped, the text is tokenized with
    NLTK's ``TweetTokenizer``, tokens are lowercased, only tokens consisting
    purely of the letters a-z are kept, and tokens present in the module-level
    ``stop_list`` are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the raw text. It is modified in place: the column
        ``'Text Processed'`` is added (or overwritten).
    comment_header : str
        Name of the column containing the raw text.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with one list of tokens per row in
        ``'Text Processed'``. Rows whose cell is not a string get an empty list.
    """
    # Loop-invariant helpers: build them once instead of once per row/token.
    tokenizer = TweetTokenizer()
    alpha_only = re.compile('^[a-z]+$')
    stop_words = set(stop_list)  # set lookup instead of scanning the list per token

    processed = []
    for text in df[comment_header]:
        if not isinstance(text, str):
            # A non-string cell (e.g. NaN for a missing value) would make
            # re.sub raise TypeError; such a row simply has no tokens.
            processed.append([])
            continue
        text = remove_hashtag_mentions_urls(text)
        tokens = [w.lower() for w in tokenizer.tokenize(text)]
        processed.append(
            [w for w in tokens if alpha_only.search(w) and w not in stop_words]
        )
    df['Text Processed'] = processed
    return df

def text2vecs(df, comment_header):
    """Build a gensim dictionary and bag-of-words vectors from one column.

    Parameters
    ----------
    df : mapping of column name to iterable (called with a DataFrame here)
        Container holding the documents.
    comment_header : str
        Name of the column whose entries are handed to gensim as documents.

    Returns
    -------
    tuple
        ``(dic, vecs)``: the ``gensim.corpora.Dictionary`` built from all
        documents, and a list with the ``doc2bow`` result for each document,
        in the same order as the column.
    """
    documents = list(df[comment_header])
    dictionary = gensim.corpora.Dictionary(documents)
    bow_corpus = []
    for document in documents:
        bow_corpus.append(dictionary.doc2bow(document))
    return dictionary, bow_corpus

In [4]:
# Location of the comments CSV. The default is the original machine-specific
# absolute path; set the THOUGHTFUL_COMMENTS_CSV environment variable to run
# the notebook against a copy of the file stored elsewhere.
COMMENTS_CSV = os.environ.get(
    'THOUGHTFUL_COMMENTS_CSV',
    '/Users/chenjianyu/Desktop/Y2S2/SMT203 Computational Social Sci/Covid-19-Singapore-Analysis/Data/Thoughtful Comments/thoughtful_comments_final.csv',
)
df = pd.read_csv(COMMENTS_CSV)

In [5]:
# Clean and tokenize the 'Comment' column; process_text stores the token
# lists in a 'Text Processed' column and returns the same frame.
df = process_text(df, comment_header='Comment')

In [23]:
# Build the gensim dictionary and the bag-of-words vectors from the
# 'Text Processed' column.
dic, vecs = text2vecs(df, comment_header='Text Processed')

In [25]:
# LDA training is stochastic: without a fixed seed every run yields different
# topics. Pin random_state so the result is reproducible across kernel
# restarts (the seed value itself is arbitrary).
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=vecs,
    id2word=dic,
    num_topics=10,
    random_state=42,
)

In [27]:
# Up to 10 topics with their 15 highest-weighted words each.
topics = lda_model.show_topics(num_topics=10, num_words=15)

# Iterate over what was actually returned rather than a hard-coded
# range(0, 10), so a model with fewer topics does not raise IndexError.
for topic in topics:
    print(topic)

(0, '0.011*"vaccine" + 0.007*"tt" + 0.007*"like" + 0.006*"covid" + 0.006*"people" + 0.006*"app" + 0.005*"get" + 0.005*"data" + 0.005*"would" + 0.005*"contact" + 0.004*"government" + 0.004*"think" + 0.004*"virus" + 0.004*"police" + 0.004*"phase"')
(1, '0.009*"vaccine" + 0.006*"breaker" + 0.006*"circuit" + 0.005*"like" + 0.005*"singapore" + 0.005*"know" + 0.005*"government" + 0.004*"people" + 0.004*"app" + 0.004*"phase" + 0.004*"think" + 0.004*"get" + 0.004*"point" + 0.004*"around" + 0.003*"make"')
(2, '0.011*"vaccine" + 0.009*"circuit" + 0.009*"breaker" + 0.008*"covid" + 0.006*"singapore" + 0.006*"get" + 0.005*"vaccines" + 0.005*"use" + 0.004*"one" + 0.004*"people" + 0.004*"like" + 0.004*"data" + 0.004*"right" + 0.004*"going" + 0.004*"well"')
(3, '0.007*"people" + 0.006*"government" + 0.006*"think" + 0.005*"deleted" + 0.005*"contact" + 0.005*"one" + 0.005*"tracing" + 0.004*"use" + 0.004*"tracetogether" + 0.004*"old" + 0.004*"also" + 0.004*"need" + 0.004*"go" + 0.004*"covid" + 0.004*"man