In [15]:
# Import Libraries
# for text preprocessing
import re
# import spacy
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string
# import numpy for matrix operation
import numpy as np
# Importing Gensim
import gensim
from gensim import corpora  
import pandas as pd

In [16]:
import ast

# Read the preprocessed dataset from disk.
dataframe = pd.read_csv("./data/preprocessed.csv")

# Each "preprocessed" cell is stored as the string form of a Python list;
# literal_eval turns it back into a real list without executing code.
clean_corpus = list(map(ast.literal_eval, dataframe["preprocessed"]))

In [17]:
# Build the gensim vocabulary: each unique term in the corpus gets an integer id.
dict_ = corpora.Dictionary(documents=clean_corpus)

In [18]:
# Encode every document as a bag-of-words vector using the dictionary built
# above, giving the document-term matrix (one entry per document).
doc_term_matrix = list(map(dict_.doc2bow, clean_corpus))

In [19]:
# This cell retrains the LDA model every time it runs. To SKIP retraining
# (e.g. when a saved model is loaded instead), uncomment the `%%script false`
# magic below; a cell magic must be the first line of the cell to take effect.
# %%script false

# Number of topics to extract; defined once so training and reporting agree.
NUM_TOPICS = 6

# Creating the object for LDA model using gensim library
Lda = gensim.models.ldamodel.LdaModel

# Running and training the LDA model on the document term matrix.
# random_state is fixed so repeated runs are reproducible; eval_every=None
# turns off the periodic perplexity estimate during training (per gensim docs).
ldamodel = Lda(corpus=doc_term_matrix, num_topics=NUM_TOPICS, id2word=dict_, passes=20, random_state=20, eval_every=None)

# Print the top words of every topic.
# We need to manually check whether the topics are different from one another or not.
print(ldamodel.print_topics(num_topics=NUM_TOPICS, num_words=5))

# num_topics: how many topics to show
# num_words: how many of the highest-weighted words to show per topic


[(0, '0.010*"price" + 0.008*"country" + 0.007*"rate" + 0.007*"like" + 0.006*"people"'), (1, '0.009*"golden" + 0.009*"peso" + 0.007*"haha" + 0.006*"daw" + 0.005*"bigas"'), (2, '0.007*"dont" + 0.007*"economy" + 0.006*"one" + 0.006*"people" + 0.005*"like"'), (3, '0.012*"wala" + 0.010*"bilihin" + 0.008*"sana" + 0.007*"gobyerno" + 0.006*"tapos"'), (4, '0.011*"share" + 0.008*"mb" + 0.007*"company" + 0.007*"ipo" + 0.006*"billion"'), (5, '0.011*"k" + 0.006*"covid" + 0.005*"kuryente" + 0.005*"mahal" + 0.005*"pbbm"')]


In [20]:
# Loads the previously saved model when one exists on disk, replacing any
# model already in memory. To SKIP loading and keep the in-memory model,
# uncomment the `%%script false` magic below; a cell magic must be the first
# line of the cell to take effect.
# %%script false

import os
import pickle

PRETRAINED_MODEL_PATH = './out/lda_model.pkl'

if os.path.exists(PRETRAINED_MODEL_PATH):
    # NOTE(security): pickle.load can execute arbitrary code while unpickling;
    # only load model files that were produced by this notebook.
    with open(PRETRAINED_MODEL_PATH, 'rb') as f:
        ldamodel = pickle.load(f)
elif globals().get('ldamodel') is None:
    # Nothing on disk and nothing in memory: fail here with a clear message
    # instead of letting a later cell crash on a missing model.
    raise FileNotFoundError(
        f"No saved model at {PRETRAINED_MODEL_PATH!r} and no model in memory; "
        "run the training cell first."
    )
# Otherwise no saved model exists yet, so the in-memory model is kept as-is.

In [21]:
from gensim.models import CoherenceModel

# Score the topic model against the tokenised corpus using the "c_v"
# coherence measure.
coherence_model = CoherenceModel(
    model=ldamodel,
    texts=clean_corpus,
    dictionary=dict_,
    coherence="c_v",
)

# Leave the score as the last expression so the notebook displays it.
coherence_score = coherence_model.get_coherence()
coherence_score

0.6673389727540426

In [22]:
# NOTE: Gensim's perplexity value is in logarithmic form. To compare with
# sklearn's perplexity value, np.exp(-1 * gensim.log_perplexity) is used.
# NOTE(review): gensim's own log message appears to report 2 ** (-bound)
# rather than e ** (-bound) -- confirm which base is intended before
# comparing numbers across libraries.

# Evaluate the bound once: the call is expensive, and reusing the value
# guarantees both printed numbers describe the same estimate.
log_perplexity = ldamodel.log_perplexity(doc_term_matrix)

print(log_perplexity)
print(np.exp(-1 * log_perplexity))


-11.19287994883112
72610.99637295035


In [23]:
# Build one record per document:
#   [platform, text, P(topic 1), ..., P(topic K), best topic (1-based)]
# K is read from the model so the row width always matches it, rather than
# relying on a hard-coded topic count.
num_topics = ldamodel.num_topics
all_rows = []

# Pair documents with their platform positionally; clean_corpus was built
# from the same dataframe, in the same row order.
for platform, doc_in_words in zip(dataframe['platform'], clean_corpus):
    # Topics that get_document_topics does not return keep probability 0.
    topic_probs = [0.0] * num_topics
    for topic_id, probability in ldamodel.get_document_topics(dict_.doc2bow(doc_in_words)):
        topic_probs[topic_id] = probability

    # argmax is 0-based; topics are reported 1-based to match the
    # "Topic N" column naming.
    best_topic = int(np.argmax(topic_probs)) + 1

    all_rows.append([platform, ' '.join(doc_in_words), *topic_probs, best_topic])


In [24]:
# Column layout mirrors the rows built above: platform, text, one probability
# column per model topic ("Topic 1" .. "Topic K"), then the best topic.
topic_columns = [f"Topic {topic_number}" for topic_number in range(1, ldamodel.num_topics + 1)]
labelled_dataset = pd.DataFrame(all_rows, columns=["Platform", "Text", *topic_columns, "Best Topic"])

# Last expression of the cell, so the notebook renders the (truncated) table.
labelled_dataset

Unnamed: 0,Platform,Text,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Best Topic
0,Facebook,nowadays every thing seem increasing governanc...,0.872072,0.000000,0.000000,0.090702,0.000000,0.000000,1
1,Facebook,disagree high,0.056015,0.056168,0.055559,0.720877,0.055559,0.055822,4
2,Facebook,tayong magalala naniniwala isusuprise sir bbm ...,0.010428,0.010424,0.010568,0.418144,0.010423,0.540014,6
3,Facebook,ok yang nang bansa ganyan selfish fanatic blen...,0.131990,0.018559,0.018721,0.573137,0.018559,0.239034,4
4,Facebook,samasama tayong babaon,0.041692,0.041692,0.041819,0.542127,0.290956,0.041715,4
...,...,...,...,...,...,...,...,...,...
4691,Youtube,mukhang nakashabu,0.055780,0.055779,0.055781,0.055782,0.390796,0.386081,5
4692,Youtube,bbm mixed confusing economic term,0.028045,0.185259,0.028051,0.028246,0.193463,0.536937,6
4693,Youtube,mrutal utal,0.055651,0.055651,0.055652,0.055652,0.388764,0.388631,5
4694,Youtube,leni lutang,0.055619,0.055618,0.055619,0.388053,0.055620,0.389472,6


In [25]:
# Write the labelled dataset to CSV, leaving out the DataFrame index column.
labelled_csv_path = "./data/labelled_dataset.csv"
labelled_dataset.to_csv(labelled_csv_path, index=False)

In [26]:
# Dump model
# Imported here as well so this cell still works when the optional
# model-loading cell (which also imports pickle) has been skipped.
import os
import pickle

# Make sure the output directory exists before writing into it.
os.makedirs('./out', exist_ok=True)

# Use a context manager so the file handle is flushed and closed even if
# pickling fails part-way through.
with open('./out/lda_model.pkl', 'wb') as model_file:
    pickle.dump(ldamodel, model_file)