In [27]:
# Import Libraries
# for text preprocessing
import re
# import spacy
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string
# import numpy for matrix operation
import numpy as np
# Importing Gensim
import gensim
from gensim import corpora  
import pandas as pd

In [28]:
import ast

# Read the preprocessed dataset from disk.
dataframe = pd.read_csv("./data/preprocessed.csv")

# Each "preprocessed" cell is stored as the text of a Python literal
# (presumably a list of tokens -- TODO confirm against the preprocessing
# notebook); ast.literal_eval turns it back into the Python object.
clean_corpus = list(map(ast.literal_eval, dataframe["preprocessed"]))

In [29]:
# Build the gensim dictionary from the corpus: every unique term found in
# clean_corpus is assigned an integer index.
dict_ = corpora.Dictionary(documents=clean_corpus)

In [30]:
# Convert every document of the corpus into its bag-of-words form using the
# dictionary; together these form the document-term matrix.
doc_term_matrix = [dict_.doc2bow(tokens) for tokens in clean_corpus]

In [31]:
# Uncomment to retrain LDAModel
# %%script false

# Number of topics the LDA model is asked to extract.
n_topics = 3

# Alias for the gensim LDA model class.
Lda = gensim.models.ldamodel.LdaModel

# Run and train the LDA model on the document-term matrix.
# random_state is pinned so repeated runs start from the same seed;
# eval_every=None is expected to switch off the periodic perplexity
# evaluation during training (per the gensim docs -- TODO confirm for the
# installed version).
ldamodel = Lda(
    corpus=doc_term_matrix,
    num_topics=n_topics,
    id2word=dict_,
    passes=20,
    random_state=20,
    eval_every=None,
)

# Print the topics (indexes 0, 1, 2) with their top words.
#   num_topics: how many topics to show
#   num_words:  how many words to show per topic
# We need to manually check whether the topics are different from one
# another or not.
# (A bare `ldamodel.print_topics()` call used to precede this line; its
# return value was discarded, so it has been removed.)
print(ldamodel.print_topics(num_topics=n_topics, num_words=5))


[(0, '0.008*"country" + 0.007*"people" + 0.006*"like" + 0.006*"dont" + 0.005*"even"'), (1, '0.006*"wala" + 0.006*"bilihin" + 0.006*"k" + 0.005*"bansa" + 0.004*"unity"'), (2, '0.008*"share" + 0.006*"price" + 0.005*"mb" + 0.005*"company" + 0.005*"market"')]


In [32]:
# # Comment to prevent loading the pretrained model
# # %%script false

# import pickle

# ldamodel = None
# with open('./out/lda_model.pkl', 'rb') as f:
#     ldamodel = pickle.load(f)

In [33]:
from gensim.models import CoherenceModel

# Score the trained model with the "c_v" coherence measure, computed from
# the tokenised texts and the dictionary built earlier.
coherence_model = CoherenceModel(
    model=ldamodel,
    texts=clean_corpus,
    dictionary=dict_,
    coherence="c_v",
)

# Last expression of the cell: the coherence score is displayed as output.
coherence_model.get_coherence()

0.43126922625818187

In [34]:
# NOTE: Gensim's perplexity value is in logarithmic form. To compare with
# sklearn's perplexity value, np.exp(-1 * gensim.log_perplexity) is used.

# Evaluate the bound a single time and reuse it: the corpus is only scanned
# once, and the exponentiated figure is derived from exactly the value
# printed on the line before it.
# NOTE(review): the model is now queried once instead of twice; if gensim's
# inference draws from the model's random state, later inferred topic
# weights may differ in the last digits from previous runs -- confirm this
# is acceptable.
log_perplexity = ldamodel.log_perplexity(doc_term_matrix)

print(log_perplexity)
print(np.exp(-1 * log_perplexity))


-8.522820523848097
5028.216361810175


In [35]:
# Build one output row per document:
#   [platform, raw text, cleaned text, <one weight per topic>, best topic]
# where "best topic" is the 1-based position of the largest topic weight.

# Position of the first per-topic weight inside a row (after the three
# metadata fields).
first_topic_col = 3

all_rows = []

# Walk the metadata columns and the corpus side by side, by position. The
# previous version looked values up with dataframe.loc[index, ...] using the
# enumerate counter as a label, which is only correct when the frame carries
# a default 0..n-1 index and costs one scalar lookup per field per row.
for platform, raw_text, doc_in_words in zip(
    dataframe["platform"], dataframe["raw"], clean_corpus
):
    doc_in_words_as_string = ' '.join(doc_in_words)
    row_vals = [platform, raw_text, doc_in_words_as_string] + list(np.zeros(n_topics))

    # (topic id, weight) pairs inferred by the model for this document.
    doc_topics = ldamodel.get_document_topics(dict_.doc2bow(doc_in_words))

    # Topics that are not present in the returned pairs keep their 0.0 default.
    for topic_id, topic_weight in doc_topics:
        row_vals[first_topic_col + topic_id] = topic_weight

    index_of_best_topic = np.argmax(row_vals[first_topic_col:])

    # Stored 1-based so it lines up with the "Topic 1..n" column names.
    row_vals.append(index_of_best_topic + 1)
    all_rows.append(row_vals)


In [36]:
import pyLDAvis

import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()
%matplotlib inline

gensimvis.prepare(ldamodel ,doc_term_matrix , dict_)


  by='saliency', ascending=False).head(R).drop('saliency', 1)


In [37]:
import pandas as pd

# Column layout mirrors each entry of all_rows: three metadata fields, one
# weight column per topic ("Topic 1" .. "Topic n"), then the best-topic id.
topic_columns = [f"Topic {i+1}" for i in range(n_topics)]
labelled_dataset = pd.DataFrame(
    all_rows,
    columns=["Platform", "Raw", "Text", *topic_columns, "Best Topic"],
)

# Last expression of the cell: display the labelled frame.
labelled_dataset

Unnamed: 0,Platform,Raw,Text,Topic 1,Topic 2,Topic 3,Best Topic
0,Facebook,Why nowadays every thing seem to be increasin...,nowadays every thing seem increasing governanc...,0.960316,0.019696,0.019989,1
1,Facebook,I will have to disagree.. we’re not that high!!,disagree high,0.772866,0.114139,0.112995,1
2,Facebook,"Wag po tayong mag-alala. Naniniwala po ako, is...",tayong magalala naniniwala isusuprise sir bbm ...,0.021459,0.957550,0.020991,2
3,Facebook,Ok lang yang lahat naman nang bansa ganyan. Sa...,ok yang nang bansa ganyan selfish fanatic blen...,0.037489,0.925430,0.037081,2
4,Facebook,Sama-sama tayong BABAON muli.,samasama tayong babaon,0.083516,0.832983,0.083501,2
...,...,...,...,...,...,...,...
4691,Youtube,mukhang nakashabu,mukhang nakashabu,0.111293,0.777430,0.111277,2
4692,Youtube,Bbm is mixed up and confusing on economic terms,bbm mixed confusing economic term,0.881969,0.060535,0.057496,1
4693,Youtube,Mr.utal utal,mrutal utal,0.111413,0.777200,0.111387,2
4694,Youtube,Kung Si Leni Pa Yan \r\n\r\nLutang Na Tayo,leni lutang,0.117132,0.771727,0.111141,2


In [38]:
# Persist the labelled documents as CSV, leaving out the DataFrame index.
labelled_dataset.to_csv(
    path_or_buf="./data/labelled_dataset.csv",
    index=False,
)

In [39]:
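# Dump model (uncomment the line below to save the trained model; it also needs `import pickle`, which currently only exists as a commented-out line in the model-loading cell)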
# pickle.dump(ldamodel, open('./out/lda_model.pkl','wb'))