In [1]:
# Import Libraries
# for text preprocessing
import re
# import spacy
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string
# import numpy for matrix operation
import numpy as np
# Importing Gensim
import gensim
from gensim import corpora  
import pandas as pd

In [2]:
import ast

# Load the preprocessed corpus. The "preprocessed" column is parsed with
# ast.literal_eval to turn each stored string back into a Python list
# (a naive strip/split would leave the quote characters inside every token).
dataframe = pd.read_csv("./data/preprocessed.csv")
clean_corpus = list(map(ast.literal_eval, dataframe["preprocessed"]))

In [3]:
# Build the gensim vocabulary: every unique term in the corpus is assigned an integer id.
dict_ = corpora.Dictionary(documents=clean_corpus)

In [4]:
# Convert each tokenised document into its bag-of-words form using the dictionary,
# giving one doc2bow result per document in corpus order.
doc_term_matrix = list(map(dict_.doc2bow, clean_corpus))

In [5]:
# NUM_TOPICS: how many topics the model should extract.
# NUM_WORDS_PER_TOPIC: how many top-weighted words to show for each topic.
NUM_TOPICS = 6
NUM_WORDS_PER_TOPIC = 5

# Alias for the gensim LDA model class.
Lda = gensim.models.ldamodel.LdaModel

# Train the LDA model on the bag-of-words corpus.
# random_state is fixed so repeated runs produce the same topics.
ldamodel = Lda(
    corpus=doc_term_matrix,
    num_topics=NUM_TOPICS,
    id2word=dict_,
    passes=20,
    random_state=20,
    eval_every=None,
)

# Print every topic as (topic_id, 'weight*"word" + ...').
# The topics must be checked manually to judge whether they are actually
# distinct from one another.
print(ldamodel.print_topics(num_topics=NUM_TOPICS, num_words=NUM_WORDS_PER_TOPIC))


[(0, '0.016*"inflation" + 0.010*"people" + 0.010*"philippine" + 0.010*"country" + 0.008*"dont"'), (1, '0.009*"share" + 0.007*"price" + 0.006*"mb" + 0.006*"market" + 0.006*"company"'), (2, '0.027*"yung" + 0.007*"leni" + 0.007*"bilihin" + 0.007*"inflation" + 0.007*"bansa"'), (3, '0.017*"nyo" + 0.010*"bbm" + 0.008*"wala" + 0.008*"inflation" + 0.007*"golden"'), (4, '0.010*"profit" + 0.007*"land" + 0.006*"build" + 0.006*"million" + 0.006*"q2"'), (5, '0.017*"unity" + 0.007*"question" + 0.006*"loan" + 0.005*"bbm" + 0.005*"marcos"')]


In [6]:
import pickle

# Persist the trained model. The context manager guarantees the file handle is
# flushed and closed even if pickling raises part-way through.
with open('lda_model.pkl', 'wb') as model_file:
    pickle.dump(ldamodel, model_file)

In [7]:
from gensim.models import CoherenceModel

# Score the trained topics with the "c_v" coherence measure, computed from the
# tokenised documents and the dictionary the model was trained with.
coherence_model = CoherenceModel(
    model=ldamodel,
    texts=clean_corpus,
    dictionary=dict_,
    coherence="c_v",
)

# Last expression of the cell, so the coherence score is displayed as the cell output.
coherence_model.get_coherence()

In [None]:
# NOTE: gensim's log_perplexity returns a per-word likelihood bound in logarithmic
# form. To compare it with sklearn's perplexity value, np.exp(-1 * bound) is used.
# NOTE(review): gensim's own documentation describes perplexity as 2^(-bound);
# confirm which base is intended before comparing numbers across libraries.

# Evaluate the bound once and reuse it: this avoids a second full pass over the
# corpus and guarantees both printed numbers come from the same evaluation.
log_perplexity = ldamodel.log_perplexity(doc_term_matrix)

print(log_perplexity)
print(np.exp(-1 * log_perplexity))


-8.549481118559168
5163.983614221804


In [None]:
import numpy as np

# Build one row per document:
#   [platform, text, P(topic 1), ..., P(topic N), best topic (1-based)]
# The topic count is read from the trained model so the row width always matches it.
n_topics = ldamodel.num_topics
all_rows = []

# Pair each document with its platform positionally; clean_corpus was built by
# iterating the dataframe in row order, so this stays aligned regardless of the
# dataframe's index labels.
for platform, doc_in_words in zip(dataframe["platform"], clean_corpus):
    doc_in_words_as_string = ' '.join(doc_in_words)

    # get_document_topics returns (topic_id, probability) pairs and may omit
    # low-probability topics; omitted topics keep a probability of 0.0 here.
    topic_probs = np.zeros(n_topics)
    for topic_id, probability in ldamodel.get_document_topics(dict_.doc2bow(doc_in_words)):
        topic_probs[topic_id] = probability

    # NOTE: on ties (e.g. an all-equal distribution) argmax picks the first
    # topic, so such documents are labelled as topic 1.
    best_topic = int(np.argmax(topic_probs)) + 1

    all_rows.append([platform, doc_in_words_as_string, *topic_probs, best_topic])


['Facebook', '', 0.16666667, 0.16666667, 0.16666667, 0.16666667, 0.16666667, 0.16666667, 1]
['Facebook', '', 0.16666667, 0.16666667, 0.16666667, 0.16666667, 0.16666667, 0.16666667, 1]
['Facebook', '', 0.16666667, 0.16666667, 0.16666667, 0.16666667, 0.16666667, 0.16666667, 1]
['Facebook', '', 0.16666667, 0.16666667, 0.16666667, 0.16666667, 0.16666667, 0.16666667, 1]
['Facebook', '', 0.16666667, 0.16666667, 0.16666667, 0.16666667, 0.16666667, 0.16666667, 1]
['Facebook', '', 0.16666667, 0.16666667, 0.16666667, 0.16666667, 0.16666667, 0.16666667, 1]
['Facebook', '', 0.16666667, 0.16666667, 0.16666667, 0.16666667, 0.16666667, 0.16666667, 1]
['Facebook', '', 0.16666667, 0.16666667, 0.16666667, 0.16666667, 0.16666667, 0.16666667, 1]
['Facebook', '', 0.16666667, 0.16666667, 0.16666667, 0.16666667, 0.16666667, 0.16666667, 1]
['Facebook', '', 0.16666667, 0.16666667, 0.16666667, 0.16666667, 0.16666667, 0.16666667, 1]
['Facebook', '', 0.16666667, 0.16666667, 0.16666667, 0.16666667, 0.16666667, 0.1

In [None]:
import pandas as pd

# Column layout mirrors the rows built above: platform, text, one probability
# column per topic, then the 1-based index of the most probable topic.
# The number of topic columns comes from the trained model instead of a literal.
topic_columns = [f"Topic {i+1}" for i in range(ldamodel.num_topics)]
labelled_dataset = pd.DataFrame(
    all_rows,
    columns=["Platform", "Text"] + topic_columns + ["Best Topic"],
)
labelled_dataset

Unnamed: 0,Platform,Text,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Best Topic
0,Facebook,nowadays every thing seem increasing governanc...,0.872067,0.000000,0.000000,0.090707,0.000000,0.000000,1
1,Facebook,disagree high,0.056015,0.056150,0.055559,0.720895,0.055559,0.055822,4
2,Facebook,wag tayong magalala naniniwala isusuprise sir ...,0.000000,0.000000,0.000000,0.383386,0.000000,0.577221,6
3,Facebook,ok yang nang bansa ganyan selfish fanatic blen...,0.131091,0.018532,0.018657,0.683489,0.018532,0.129700,4
4,Facebook,samasama tayong babaon,0.041733,0.041702,0.041996,0.291639,0.540683,0.042247,5
...,...,...,...,...,...,...,...,...,...
5204,Youtube,mukhang nakashabu,0.055581,0.055581,0.055694,0.055581,0.055581,0.721982,6
5205,Youtube,bbm mixed confusing economic term,0.542274,0.027945,0.027817,0.346267,0.027795,0.027902,1
5206,Youtube,mrutal utal,0.055595,0.055595,0.055595,0.055595,0.722024,0.055596,5
5207,Youtube,leni lutang,0.055560,0.055560,0.309242,0.468519,0.055560,0.055560,4


In [None]:
# Save the labelled dataset to CSV; index=False leaves the row index out of the file.
labelled_dataset.to_csv("./data/labelled_dataset.csv", index=False)