In [2]:
# Import Libraries
# for text preprocessing
import re
# import spacy
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string
# import numpy for matrix operation
import numpy as np
# Importing Gensim
import gensim
from gensim import corpora  
import pandas as pd

In [3]:
import ast

# Read the preprocessed dataset from disk.
dataframe = pd.read_csv("./data/preprocessed.csv")

# The "preprocessed" column is expected to hold the text of a Python list literal per row;
# parse each one back into an actual list of tokens.
clean_corpus = list(map(ast.literal_eval, dataframe["preprocessed"]))

In [4]:
# Build the gensim dictionary: every unique term in the corpus is assigned an integer id.
dict_ = corpora.Dictionary(documents=clean_corpus)

In [5]:
# Encode every document as a bag-of-words vector using the dictionary; the resulting
# list (one entry per document) is the document-term matrix for the whole corpus.
doc_term_matrix = list(map(dict_.doc2bow, clean_corpus))

In [6]:
# Toggle: uncommenting the magic below disables this cell, i.e. the model is NOT retrained.
# IPython only recognises a cell magic on the very first line of a cell, so move it
# there when enabling it.
# %%script false

# Number of topics to extract. Kept in one place so training and printing stay in sync.
num_topics = 6

# Alias for the gensim LDA model class.
Lda = gensim.models.ldamodel.LdaModel

# Train the LDA model on the document-term matrix.
#   passes=20        -> number of passes over the corpus during training
#   random_state=20  -> fixed seed so repeated runs are comparable
#   eval_every=None  -> no perplexity evaluation during training
ldamodel = Lda(corpus=doc_term_matrix, num_topics=num_topics, id2word=dict_, passes=20, random_state=20, eval_every=None)

# Show the 5 highest-weighted words of every topic. The topics still have to be
# inspected manually to judge whether they are distinct from one another.
#   num_topics: how many topics to show
#   num_words:  how many words to show per topic
print(ldamodel.print_topics(num_topics=num_topics, num_words=5))


[(0, '0.010*"diokno" + 0.009*"inflation" + 0.007*"pbbm" + 0.005*"sir" + 0.005*"news"'), (1, '0.011*"philippine" + 0.010*"marcos" + 0.010*"world" + 0.010*"inflation" + 0.009*"economy"'), (2, '0.018*"k" + 0.009*"yung" + 0.007*"bbm" + 0.006*"inflation" + 0.006*"peso"'), (3, '0.009*"share" + 0.006*"mb" + 0.006*"price" + 0.006*"market" + 0.006*"company"'), (4, '0.024*"yung" + 0.013*"nyo" + 0.010*"bilihin" + 0.010*"wala" + 0.007*"mahal"'), (5, '0.015*"inflation" + 0.008*"philippine" + 0.008*"people" + 0.008*"dont" + 0.007*"country"')]


In [7]:
# Toggle: uncommenting the magic below disables this cell, so the pickled model is NOT
# loaded. IPython only recognises a cell magic on the very first line of a cell, so
# move it there when enabling it.
# %%script false

import os
import pickle

model_path = './out/lda_model.pkl'

if os.path.exists(model_path):
    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only load model files that were produced by this notebook / a trusted source.
    with open(model_path, 'rb') as f:
        ldamodel = pickle.load(f)
else:
    # No saved model (e.g. fresh checkout). Leave any `ldamodel` already in memory
    # untouched instead of overwriting it with None and raising FileNotFoundError.
    print(f"{model_path} not found; no pretrained model loaded.")

In [8]:
from gensim.models import CoherenceModel

# Set up a c_v coherence evaluation of the current model, using the tokenised
# documents and the dictionary defined earlier in the notebook.
coherence_model = CoherenceModel(
    model=ldamodel,
    texts=clean_corpus,
    dictionary=dict_,
    coherence="c_v",
)

# Left as the cell's final expression so the notebook displays the score.
coherence_model.get_coherence()

0.6006093464326839

In [9]:
# NOTE: gensim reports perplexity in logarithmic form. To compare it with sklearn's
# perplexity value, np.exp(-1 * log_perplexity) is used.

# Evaluate once and reuse the result: each call re-evaluates the whole document-term
# matrix, and reusing the value guarantees both printed numbers describe the same evaluation.
log_perplexity = ldamodel.log_perplexity(doc_term_matrix)

print(log_perplexity)
print(np.exp(-1 * log_perplexity))


-11.069121296141363
64159.30412757534


In [10]:
import numpy as np

# One output row per document, laid out as:
#   [platform, text, weight of topic 1 .. weight of topic 6, 1-based id of the best topic]
all_rows = []

for position, tokens in enumerate(clean_corpus):
    platform = dataframe.loc[position, 'platform']
    text = ' '.join(tokens)

    # Every topic starts at 0; only the topics the model returns for this document
    # are overwritten with their weight.
    weights = list(np.zeros(6))
    for topic_id, probability in ldamodel.get_document_topics(dict_.doc2bow(tokens)):
        weights[topic_id] = probability

    # np.argmax is 0-based while the topic labels are 1-based.
    best_topic = np.argmax(weights) + 1
    all_rows.append([platform, text, *weights, best_topic])


In [11]:
import pandas as pd

# Column layout matches the rows in `all_rows`: platform, text, six topic weights,
# then the 1-based id of the highest-weighted topic.
topic_columns = [f"Topic {n}" for n in range(1, 7)]
labelled_dataset = pd.DataFrame(all_rows, columns=["Platform", "Text", *topic_columns, "Best Topic"])

# Final expression of the cell so the notebook renders the table.
labelled_dataset

Unnamed: 0,Platform,Text,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Best Topic
0,Facebook,nowadays every thing seem increasing governanc...,0.872068,0.000000,0.000000,0.090705,0.000000,0.000000,1
1,Facebook,disagree high,0.056015,0.056123,0.055559,0.720923,0.055559,0.055822,4
2,Facebook,wag tayong magalala naniniwala isusuprise sir ...,0.000000,0.000000,0.000000,0.383420,0.000000,0.577187,6
3,Facebook,ok yang nang bansa ganyan selfish fanatic blen...,0.131092,0.018532,0.018661,0.683483,0.018532,0.129700,4
4,Facebook,samasama tayong babaon,0.041733,0.041702,0.041996,0.291638,0.540676,0.042255,5
...,...,...,...,...,...,...,...,...,...
4720,Youtube,mukhang nakashabu,0.055895,0.055894,0.383666,0.055898,0.392747,0.055900,5
4721,Youtube,bbm mixed confusing economic term,0.347652,0.027958,0.027897,0.299905,0.027884,0.268703,1
4722,Youtube,mrutal utal,0.055670,0.055670,0.055671,0.388296,0.055672,0.389022,6
4723,Youtube,leni lutang,0.718660,0.056020,0.056760,0.056879,0.056089,0.055592,1


In [12]:
# Write the labelled table to disk, leaving out the DataFrame index column.
labelled_dataset.to_csv(path_or_buf="./data/labelled_dataset.csv", index=False)

In [14]:
# Dump model: pickle the current `ldamodel` to ./out/lda_model.pkl, the same path the
# loading cell reads from.
# Imported here because the cell that otherwise imports pickle is optional.
import os
import pickle

# Create the output directory if it does not exist yet, so the dump cannot fail on a fresh checkout.
os.makedirs('./out', exist_ok=True)

# The `with` block closes the file handle even if pickling raises.
with open('./out/lda_model.pkl', 'wb') as f:
    pickle.dump(ldamodel, f)