In [74]:
import pandas as pd
import re
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
import gensim
import pyLDAvis.gensim
from gensim.models import CoherenceModel

In [44]:
stop_list = stopwords.words('english')

In [45]:
def remove_hashtag_mentions_urls(text):
    # e.sub(r"(?:\@|https?\://)\S+", "", text) # remove mentions and url only
    return re.sub(r"(?:\@|\#|https?\://)\S+", "", text)

def process_text(df, comment_header):
    processed = []
    for text in df[comment_header]:
        text = remove_hashtag_mentions_urls(text)
        tokenized = TweetTokenizer().tokenize(text)
        lowercased = [w.lower() for w in tokenized]
        words_only = [w for w in lowercased if re.search('^[a-z]+$',w)]
        stop_removed = [w for w in words_only if w not in stop_list]
        processed.append(stop_removed)
    df['Text Processed'] = processed
    return df

def text2vecs(df, comment_header):
    texts = [text for text in df[comment_header]]
    dic = gensim.corpora.Dictionary(texts)
    vecs = [dic.doc2bow(text) for text in texts]
    return dic, vecs

In [67]:
df = pd.read_csv("C:/Users/user/Documents/GitHub/Covid-19-Singapore-Analysis/Data/Thoughtful Comments/thoughtful_comments_final(1).csv")

thoughtful_df = df[(df['Thoughtful?'] == 1) & (df['Topic'] == 'Circuit Breaker')]
thoughtful_df.head()

Unnamed: 0.1,Unnamed: 0,Comment,Topic,Label1,Label2,Label3,Thoughtful? (voting),Thoughtful?,Length,Length Category,Average Loglikelihood,Num Verbs,Num Discourse Relations,Num Pronouns,Relevance score,Relevance Score Category
10,10,"Even if CB is not extended, like what you said...",Circuit Breaker,1,1,1.0,1,1,55,3,-10.844451,12,2,4,6.867072,1
11,11,"Hi, so as long as it is cohabitation for the f...",Circuit Breaker,1,0,1.0,1,1,31,2,-10.29937,5,1,0,11.208939,2
15,15,What the hell is a circuit breaker? Just say l...,Circuit Breaker,0,0,1.0,0,1,13,1,-11.852054,3,0,0,16.235624,2
24,24,Why is TCM included in the first wave of thing...,Circuit Breaker,1,0,1.0,1,1,17,1,-10.509482,5,0,0,36.848291,4
28,28,How does it work for those who are staying tem...,Circuit Breaker,0,0,1.0,0,1,37,2,-11.16115,7,2,1,15.463359,2


In [61]:
df = process_text(df, 'Comment')

In [62]:
dic, vecs = text2vecs(df, 'Text Processed')

In [71]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=vecs, id2word=dic, num_topics=3)

In [72]:
topics = lda_model.show_topics(3, 10)

for i in range(0, 3):
    print(topics[i])

(0, '0.009*"vaccine" + 0.009*"people" + 0.007*"like" + 0.006*"app" + 0.005*"one" + 0.005*"get" + 0.005*"covid" + 0.005*"distancing" + 0.005*"social" + 0.005*"would"')
(1, '0.007*"people" + 0.006*"vaccine" + 0.006*"like" + 0.006*"singapore" + 0.005*"vaccines" + 0.005*"data" + 0.004*"new" + 0.004*"get" + 0.004*"privacy" + 0.004*"government"')
(2, '0.008*"vaccine" + 0.006*"covid" + 0.006*"singapore" + 0.006*"think" + 0.005*"vaccines" + 0.005*"people" + 0.004*"distancing" + 0.004*"good" + 0.004*"social" + 0.004*"token"')


## Visualisations

In [None]:
# !pip install pyldavis

In [73]:
pyLDAvis.enable_notebook()

visual= pyLDAvis.gensim.prepare(lda_model, vecs, dic)
pyLDAvis.save_html(visual, "thoughtful_test_viz.html")

## Evaluation: Coherence Score

In [77]:
coherence_model_lda = CoherenceModel(model=lda_model, texts= processed,dictionary=dic, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score DAModel: ', coherence_lda)

NameError: name 'processed' is not defined