In [19]:
# Import Libraries
# for text preprocessing
import re
# import spacy
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string
# import numpy for matrix operation
import numpy as np
# Importing Gensim
import gensim
from gensim import corpora  
import pandas as pd

In [20]:
import ast

# Read the preprocessed dataset; each "preprocessed" cell holds the string form of a
# Python list, which is parsed back into a real list with ast.literal_eval.
dataframe = pd.read_csv("./data/preprocessed.csv")
clean_corpus = list(map(ast.literal_eval, dataframe["preprocessed"]))

In [21]:
# Build a gensim Dictionary from the tokenised corpus, where every unique term is assigned an index.
dict_ = corpora.Dictionary(clean_corpus)

In [22]:
# Turn the corpus into a document-term matrix: each tokenised document is encoded
# as a bag-of-words through the dictionary.
doc_term_matrix = list(map(dict_.doc2bow, clean_corpus))

In [23]:
# NOTE(review): `%%script false` is normally used to SKIP a cell, so enabling the line
# below would presumably disable retraining rather than trigger it — confirm the
# intended toggle before relying on it.
# %%script false
# Create and train the LDA topic model using the gensim library.

n_topics = 6

Lda = gensim.models.ldamodel.LdaModel

# Train the LDA model on the document-term matrix (20 passes, fixed random_state,
# eval_every=None).
ldamodel = Lda(corpus=doc_term_matrix, num_topics=n_topics , id2word = dict_, passes=20, random_state=20, eval_every=None)

# NOTE(review): the return value of this call is discarded and it is not the last
# expression of the cell, so it displays nothing in the notebook output.
ldamodel.print_topics()
# The topics need to be checked manually to judge whether they are distinct from one another.
print(ldamodel.print_topics(num_topics=n_topics , num_words=5))

# num_topics: how many topics to return
# num_words: how many top words to show per topic


[(0, '0.010*"price" + 0.008*"country" + 0.007*"rate" + 0.007*"like" + 0.006*"people"'), (1, '0.009*"golden" + 0.009*"peso" + 0.007*"haha" + 0.006*"daw" + 0.005*"bigas"'), (2, '0.007*"dont" + 0.007*"economy" + 0.006*"one" + 0.006*"people" + 0.005*"like"'), (3, '0.012*"wala" + 0.010*"bilihin" + 0.008*"sana" + 0.007*"gobyerno" + 0.006*"tapos"'), (4, '0.011*"share" + 0.008*"mb" + 0.007*"company" + 0.007*"ipo" + 0.006*"billion"'), (5, '0.011*"k" + 0.006*"covid" + 0.005*"kuryente" + 0.005*"mahal" + 0.005*"pbbm"')]


In [24]:
def common_member(a, b):
    """Return the elements that `a` and `b` have in common.

    Both arguments are converted to sets before being compared.

    Returns:
        The set of shared elements when there is at least one, otherwise the
        string "No common elements".
    """
    shared = set(a) & set(b)
    return shared if shared else "No common elements"

In [25]:
# Collect the top-100 terms of every topic as plain token lists.
# formatted=False makes gensim return (term, probability) pairs instead of a
# '0.010*"price" + ...' string, so the tokens are taken as-is rather than being
# recovered with a regex (which would mangle any token containing digits,
# underscores or non-ASCII letters).
# num_topics is passed explicitly because show_topics defaults to 10 topics and
# would silently drop the rest if the model had more than that.
x = ldamodel.show_topics(num_topics=ldamodel.num_topics, num_words=100, formatted=False)

# Maps topic id -> list of that topic's top terms (most probable first).
twords = {topic: [term for term, _ in term_probs] for topic, term_probs in x}
    
# print(twords)

In [26]:
# Walk over every ordered pair of topics, recording each "i:j" pair in count_.
# Self-pairs and pairs already visited in the opposite order are skipped, so the
# (currently disabled) overlap report would list each unordered pair only once.
count_ = {}
for i, words_in_list1 in twords.items():
    for j, words_in_list2 in twords.items():
        count_[f"{i}:{j}"] = True
        if i == j or f"{j}:{i}" in count_:
            continue
        # print(f'{i}:{j}: Common Member {common_member(words_in_list1, words_in_list2)}')
        # print(words_in_list1==words_in_list2)

In [27]:
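# # NOTE: this whole cell is commented out, so the pretrained model is not loaded.
# # To load it, uncomment the lines below but leave `%%script false` commented
# # (enabling `%%script false` is normally used to skip a cell).
# # %%script false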

# import pickle

# ldamodel = None
# with open('./out/lda_model.pkl', 'rb') as f:
#     ldamodel = pickle.load(f)

In [28]:
# from gensim.models import CoherenceModel

# coherence_model = CoherenceModel(model=ldamodel, texts=clean_corpus, dictionary=dict_, coherence="c_v")

# coherence_model.get_coherence() 

In [29]:
# NOTE: Gensim's perplexity value is in logarithmic form. To compare it with sklearn's
# perplexity value, np.exp(-1 * gensim.log_perplexity) is used.
# NOTE(review): log_perplexity is evaluated twice (once per print); the two
# evaluations may not be bit-identical if the underlying inference is stochastic —
# confirm before comparing the printed numbers exactly.

print(ldamodel.log_perplexity(doc_term_matrix ))
print(np.exp(-1 * ldamodel.log_perplexity(doc_term_matrix )))


-8.697503900538264
5987.938142420413


In [30]:
import numpy as np

# Build one record per document:
#   [platform, raw text, cleaned text, <n_topics topic probabilities>, best topic]
# The topic probabilities start at position 3; topics the model does not report
# for a document keep their initial 0. The best topic is stored 1-based.
all_rows = []
for row_idx, tokens in enumerate(clean_corpus):
    record = [
        dataframe.loc[row_idx, 'platform'],
        dataframe.loc[row_idx, 'raw'],
        ' '.join(tokens),
    ] + list(np.zeros(n_topics))

    for topic_id, probability in ldamodel.get_document_topics(dict_.doc2bow(tokens)):
        record[topic_id + 3] = probability

    record.append(np.argmax(record[3:]) + 1)
    all_rows.append(record)


In [31]:
# Set up pyLDAvis for notebook display and prepare the visualisation data for the
# trained LDA model.
import pyLDAvis

import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()
%matplotlib inline

# Prepared visualisation data converted to a plain dict
# (presumably keyed by table name, e.g. 'token.table' — TODO confirm).
gensimvis_dict = gensimvis.prepare(ldamodel ,doc_term_matrix , dict_).to_dict()


# gensimvis_dict

  by='saliency', ascending=False).head(R).drop('saliency', 1)


In [32]:
# Number of distinct tokens known to the dictionary.
len(dict_.token2id)

18666

In [33]:
# Build the term -> topic table: one row per (term, topic) pair whose probability
# is at least 0.00001, with topics numbered from 1.
# The rows are collected in a list and the DataFrame is created once; appending
# with `.loc[len(df)] = ...` inside the loop re-allocates the frame on every row
# and is very slow for tens of thousands of rows. The loop variable is named
# `token_id` so the builtin `id` is not shadowed in the notebook namespace.
term_topic_rows = [
    [term, int(topic_id) + 1, probability]
    for term, token_id in dict_.token2id.items()
    for topic_id, probability in ldamodel.get_term_topics(token_id, minimum_probability=0.00001)
]
term_to_topic_df = pd.DataFrame(term_topic_rows, columns=["Term", "Topic", "Frequency"])

term_to_topic_df

Unnamed: 0,Term,Topic,Frequency
0,bless,4,0.000899
1,bless,6,0.000845
2,control,1,0.001527
3,control,3,0.000254
4,country,1,0.007802
...,...,...,...
22295,mrutal,2,0.000311
22296,utal,2,0.000311
22297,aaralan,4,0.000197
22298,kakaumpisa,4,0.000197


In [35]:
# Save the term-to-topic table as CSV, without writing the row index.
term_to_topic_df.to_csv('./data/term_to_topic.csv', index=False)

In [None]:
# term_to_topic_df = pd.DataFrame(
#      columns=["Term", "Topic","Frequency"])

# topic_designations = gensimvis_dict['token.table']['Topic']
# terms = gensimvis_dict['token.table']['Term']
# term_frequency = gensimvis_dict['token.table']['Freq']
# term_to_topic_designation_dict = {}
# for n_topic,term,frequency in zip(topic_designations, terms, term_frequency):
#    # term_to_topic_designation_dict[term] = [frequency, n_topic]
#    # if term in term_to_topic_designation_dict.keys():
#    #    if frequency> term_to_topic_designation_dict[term][0]:
#    #       term_to_topic_designation_dict[term][1] = n_topic
#    #    else: 
#    #       pass
#    to_append = [term, n_topic, frequency]
#    term_to_topic_df.loc[len(term_to_topic_df )] = to_append

# term_to_topic_df.to_csv('./data/term_to_topic.csv', index=False)

In [36]:
# Quick size check: (number of rows, number of columns) of the term-to-topic table.
term_to_topic_df.shape

(22300, 3)

In [37]:
import pandas as pd

# Assemble the per-document records into a table: source columns, one probability
# column per topic ("Topic 1" .. "Topic n"), and the 1-based best topic.
topic_columns = [f"Topic {topic_no}" for topic_no in range(1, n_topics + 1)]
labelled_dataset = pd.DataFrame(
    all_rows,
    columns=["Platform", "Raw", "Text", *topic_columns, "Best Topic"],
)
labelled_dataset

Unnamed: 0,Platform,Raw,Text,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Best Topic
0,Facebook,Why nowadays every thing seem to be increasin...,nowadays every thing seem increasing governanc...,0.473528,0.000000,0.373507,0.000000,0.000000,0.124946,1
1,Facebook,I will have to disagree.. we’re not that high!!,disagree high,0.056182,0.055559,0.056438,0.720307,0.055583,0.055932,4
2,Facebook,"Wag po tayong mag-alala. Naniniwala po ako, is...",tayong magalala naniniwala isusuprise sir bbm ...,0.010470,0.139007,0.314224,0.247254,0.010458,0.278587,3
3,Facebook,Ok lang yang lahat naman nang bansa ganyan. Sa...,ok yang nang bansa ganyan selfish fanatic blen...,0.018524,0.018563,0.018567,0.618591,0.018527,0.307228,4
4,Facebook,Sama-sama tayong BABAON muli.,samasama tayong babaon,0.041687,0.041688,0.041687,0.791544,0.041687,0.041706,4
...,...,...,...,...,...,...,...,...,...,...
4691,Youtube,mukhang nakashabu,mukhang nakashabu,0.055576,0.055577,0.055576,0.055722,0.055576,0.721973,6
4692,Youtube,Bbm is mixed up and confusing on economic terms,bbm mixed confusing economic term,0.386169,0.027944,0.502167,0.027899,0.027846,0.027975,3
4693,Youtube,Mr.utal utal,mrutal utal,0.055577,0.722116,0.055577,0.055577,0.055577,0.055577,2
4694,Youtube,Kung Si Leni Pa Yan \r\n\r\nLutang Na Tayo,leni lutang,0.055558,0.721804,0.055711,0.055653,0.055558,0.055715,2


In [38]:
# Save the labelled dataset as CSV, without writing the row index.
labelled_dataset.to_csv("./data/labelled_dataset.csv",index=False)

In [None]:
# Dump model
# pickle.dump(ldamodel, open('./out/lda_model.pkl','wb'))