In [27]:
# Import Libraries
# for text preprocessing
import re
# import spacy
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string
# import numpy for matrix operation
import numpy as np
# Importing Gensim
import gensim
from gensim import corpora  
import pandas as pd

In [28]:
import ast
dataframe = pd.read_csv("./data/preprocessed.csv")
clean_corpus = [ast.literal_eval(x) for x in dataframe["preprocessed"]]

In [29]:
# where every unique term is assigned an index.
dict_ = corpora.Dictionary(clean_corpus)

In [30]:
# Converting list of documents (corpus) into Document Term Matrix using the dictionary
doc_term_matrix = [dict_.doc2bow(i) for i in clean_corpus]

In [31]:
# Uncomment to retrain LDAModel
# %%script false
# Creating the object for LDA model using gensim library

n_topics = 6

Lda = gensim.models.ldamodel.LdaModel

# Running and Training LDA model on the document term matrix.
ldamodel = Lda(corpus=doc_term_matrix, num_topics=n_topics , id2word = dict_, passes=20, random_state=20, eval_every=None)

# Prints the topics with the indexes: 0,1,2 :
ldamodel.print_topics()
# we need to manually check whethere the topics are different from one another or not
print(ldamodel.print_topics(num_topics=n_topics , num_words=5))

# num_topics mean: how many topics want to extract
# num_words: the number of words that want per topic


[(0, '0.010*"price" + 0.008*"country" + 0.007*"rate" + 0.007*"like" + 0.006*"people"'), (1, '0.009*"golden" + 0.009*"peso" + 0.007*"haha" + 0.006*"daw" + 0.005*"bigas"'), (2, '0.007*"dont" + 0.007*"economy" + 0.006*"one" + 0.006*"people" + 0.005*"like"'), (3, '0.012*"wala" + 0.010*"bilihin" + 0.008*"sana" + 0.007*"gobyerno" + 0.006*"tapos"'), (4, '0.011*"share" + 0.008*"mb" + 0.007*"company" + 0.007*"ipo" + 0.006*"billion"'), (5, '0.011*"k" + 0.006*"covid" + 0.005*"kuryente" + 0.005*"mahal" + 0.005*"pbbm"')]


In [32]:
# # Comment to prevent loading the pretrained model
# # %%script false

# import pickle

# ldamodel = None
# with open('./out/lda_model.pkl', 'rb') as f:
#     ldamodel = pickle.load(f)

In [33]:
from gensim.models import CoherenceModel

coherence_model = CoherenceModel(model=ldamodel, texts=clean_corpus, dictionary=dict_, coherence="c_v")

coherence_model.get_coherence() 

0.4497714323589299

In [34]:
# NOTE: Gensim’s perplexity value is in logarithmic form. To compare with sklearn’s perplexity value np.exp(-1 *gensim.log_perplexity) is used

print(ldamodel.log_perplexity(doc_term_matrix ))
print(np.exp(-1 * ldamodel.log_perplexity(doc_term_matrix )))


-8.69750470740118
5987.942142462064


In [35]:
import numpy as np
all_rows = []



for index,doc_in_words in enumerate(clean_corpus):
    doc_in_words_as_string = ' '.join(doc_in_words)
    row_vals = [dataframe.loc[index,'platform'], dataframe.loc[index, 'raw'] ,doc_in_words_as_string]+list(np.zeros(n_topics))
    doc_topics = ldamodel.get_document_topics(dict_.doc2bow(doc_in_words))
    
    for doc_topic in doc_topics:
        row_vals[doc_topic[0]+3] = doc_topic[1]
    index_of_best_topic = np.argmax(row_vals[3:])
    
    row_vals.append(index_of_best_topic+1)
    all_rows.append(row_vals)


In [36]:
import pyLDAvis

import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()
%matplotlib inline

gensimvis.prepare(ldamodel ,doc_term_matrix , dict_)


  default_term_info = default_term_info.sort_values(


In [37]:
import pandas as pd
labelled_dataset = pd.DataFrame(all_rows, columns=["Platform","Raw","Text"]+[f"Topic {i+1}" for i in range(n_topics)]+["Best Topic"])
labelled_dataset

Unnamed: 0,Platform,Raw,Text,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Best Topic
0,Facebook,Why nowadays every thing seem to be increasin...,nowadays every thing seem increasing governanc...,0.473519,0.000000,0.373516,0.000000,0.000000,0.124946,1
1,Facebook,I will have to disagree.. we’re not that high!!,disagree high,0.056182,0.055559,0.056438,0.720307,0.055583,0.055932,4
2,Facebook,"Wag po tayong mag-alala. Naniniwala po ako, is...",tayong magalala naniniwala isusuprise sir bbm ...,0.010470,0.139012,0.314221,0.247253,0.010458,0.278586,3
3,Facebook,Ok lang yang lahat naman nang bansa ganyan. Sa...,ok yang nang bansa ganyan selfish fanatic blen...,0.018524,0.018563,0.018567,0.618592,0.018527,0.307227,4
4,Facebook,Sama-sama tayong BABAON muli.,samasama tayong babaon,0.041687,0.041688,0.041687,0.791544,0.041687,0.041706,4
...,...,...,...,...,...,...,...,...,...,...
4691,Youtube,mukhang nakashabu,mukhang nakashabu,0.055576,0.055577,0.055576,0.055722,0.055576,0.721973,6
4692,Youtube,Bbm is mixed up and confusing on economic terms,bbm mixed confusing economic term,0.386153,0.027944,0.502183,0.027899,0.027846,0.027975,3
4693,Youtube,Mr.utal utal,mrutal utal,0.055577,0.722116,0.055577,0.055577,0.055577,0.055577,2
4694,Youtube,Kung Si Leni Pa Yan \r\n\r\nLutang Na Tayo,leni lutang,0.055558,0.721804,0.055711,0.055653,0.055558,0.055715,2


In [38]:
labelled_dataset.to_csv("./data/labelled_dataset.csv",index=False)

In [39]:
# Dump model
# pickle.dump(ldamodel, open('./out/lda_model.pkl','wb'))