# Install Library

In [None]:
!pip install pyLDAvis

# Import Library

In [None]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim_models

import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional.
# Records are formatted as "time : level : message" and the threshold is set
# to ERROR.
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

# Ignore DeprecationWarning messages so they do not clutter the cell output.
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

  from collections import Iterable


In [None]:
# Download the NLTK 'stopwords' corpus before it is read below.
import nltk
nltk.download('stopwords')
# NLTK stop words: the Indonesian list is stored in `stop_words`, which
# remove_stopwords() reads to filter tokens.
from nltk.corpus import stopwords
stop_words = stopwords.words('indonesian')
# Disabled example of extending the list with extra (English) words:
# stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Load Data

In [None]:
# Read the spreadsheet; the documents to model live in the 'Cluster 0' column.
df = pd.read_excel('44444.xlsx')

# Empty spreadsheet cells are loaded as NaN. Drop them so the later
# str(doc) conversion does not turn missing rows into the literal token "nan".
data = df['Cluster 0'].dropna()

# Create Bigram and Trigram Models

In [None]:
# Build the bigram and trigram models.
# Phrases expects an iterable of token lists. Feeding it raw strings makes it
# iterate over single characters, so every document is tokenized first (with
# the same tokenizer remove_stopwords() uses).
data_words = [simple_preprocess(str(doc)) for doc in data]

bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example (first document only, to keep the output short)
if data_words:
    print(trigram_mod[bigram_mod[data_words[0]]])



[' tagar', ' spbu', ' mypertaminaunfaedah', ' ledak', ' hp', ' larang', ' tanggung', ' main', ' beli', ' bbm', ' up', ' sulit', ' pakai', ' aplikasi', ' api', ' mypertamina', ' manfaat', ' bensin', ' sebentar', ' pertalite', ' atur', ' awas', ' bijak', ' sing', ' aman', ' buka', ' solusi', ' masyarakat', ' isi', ' asyik', ' tekan', ' titip', ' pom', ' pintu', ' toilet', ' pergi', ' timbang', ' survei', ' lapa', ' omg', ' viralkan', ' sila', ' perintah', ' kamipercayaact', ' cocox', ' posisi', ' nitizen', ' salut', ' rurut', ' gampang', ' dengar', ' bakar', ' jam', ' selamat', ' pertamina', ' aktipin', ' sengaja', ' hmn', ' mypertaminaunfaed', ' wajib', ' bahan', ' langgar', ' mati', ' netijen', ' percik', ' padahl', ' picu', ' akal', ' tinggal', ' pasang', ' sehat', ' download', ' hidup', ' pertamini', ' ngu', ' kare', ' spb', ' wilayah', ' bangun', ' ae', ' mudah', ' keun', ' gas', ' bantah', ' proyek', ' kecewa', ' tanda', ' hormat', ' ubtuk', ' halahhalah', ' bismillaah', ' rezim', 

# Preprocessing

In [None]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Tokenize each document and drop stop words.

    Args:
        texts: Iterable of documents. Each item is converted with ``str()``
            and tokenized by ``simple_preprocess``.

    Returns:
        A list with one token list per document, excluding every token that
        appears in the notebook-level ``stop_words`` collection.
    """
    # Build the set once: testing membership against a list rescans the
    # whole list for every single token.
    stop_set = set(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stop_set]
        for doc in texts
    ]

def make_bigrams(texts):
    """Apply the notebook-level ``bigram_mod`` to every document in ``texts``.

    Returns a list holding ``bigram_mod[doc]`` for each ``doc``, in input order.
    """
    merged = []
    for tokens in texts:
        merged.append(bigram_mod[tokens])
    return merged

def make_trigrams(texts):
    """Apply ``bigram_mod`` and then ``trigram_mod`` to every document.

    Returns a list holding ``trigram_mod[bigram_mod[doc]]`` for each ``doc``,
    in input order.
    """
    merged = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged.append(trigram_mod[with_bigrams])
    return merged

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize documents, keeping only tokens with an allowed part of speech.

    Tag reference: https://spacy.io/api/annotation

    Args:
        texts: Iterable of token lists. Each list is joined with single
            spaces and passed to ``nlp``.
        allowed_postags: Collection of ``pos_`` values to keep. The default
            is a tuple so that no mutable object is shared between calls.

    Returns:
        A list with one list of ``lemma_`` values per input document.

    Note:
        ``nlp`` is looked up as a global at call time and is not defined in
        the visible part of this notebook. A pipeline (presumably a spaCy
        model) must be assigned to it before this function is called.
    """
    return [
        [token.lemma_ for token in nlp(" ".join(sent)) if token.pos_ in allowed_postags]
        for sent in texts
    ]

In [None]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# NOTE: no spaCy model is loaded and lemmatization() is not called here.
# The bigrammed tokens are used as-is under the name `data_lemmatized`,
# which the later cells read.
data_lemmatized = data_words_bigrams

# Create Dictionary

In [None]:
# Create Dictionary (token <-> integer id mapping)
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency: one bag-of-words list of (token_id, count) per document
corpus = [id2word.doc2bow(text) for text in texts]

# View only the first few documents; printing the whole corpus floods the output
print(corpus[:5])

[[(0, 1)], [(1, 1)], [], [(2, 1)], [(3, 1)], [(4, 1)], [(5, 1)], [(6, 1)], [(7, 1)], [(8, 1)], [(9, 1)], [(10, 1)], [(11, 1)], [(12, 1)], [(13, 1)], [(14, 1)], [(15, 1)], [(16, 1)], [(17, 1)], [(18, 1)], [(19, 1)], [(20, 1)], [(21, 1)], [(22, 1)], [(23, 1)], [(24, 1)], [(25, 1)], [(26, 1)], [(27, 1)], [(28, 1)], [(29, 1)], [(30, 1)], [(31, 1)], [(32, 1)], [(33, 1)], [(34, 1)], [(35, 1)], [(36, 1)], [(37, 1)], [(38, 1)], [(39, 1)], [(40, 1)], [(41, 1)], [(42, 1)], [(43, 1)], [(44, 1)], [(45, 1)], [(46, 1)], [(47, 1)], [(48, 1)], [(49, 1)], [(50, 1)], [(51, 1)], [(52, 1)], [(53, 1)], [(54, 1)], [(55, 1)], [(56, 1)], [], [(57, 1)], [(58, 1)], [(59, 1)], [(60, 1)], [(61, 1)], [(62, 1)], [(63, 1)], [(64, 1)], [(65, 1)], [(66, 1)], [(67, 1)], [(68, 1)], [(69, 1)], [(70, 1)], [(71, 1)], [(72, 1)], [(73, 1)], [(74, 1)], [(75, 1)], [(76, 1)], [(77, 1)], [(78, 1)], [(79, 1)], [(80, 1)], [(81, 1)], [(82, 1)], [(83, 1)], [(84, 1)], [(85, 1)], [(86, 1)], [(87, 1)], [(88, 1)], [(89, 1)], [(90, 1)], 

In [None]:
# Human readable format of corpus (term-frequency), first few documents only.
# `token_id` is used instead of `id` so the built-in id() is not shadowed.
[[(id2word[token_id], freq) for token_id, freq in doc_bow] for doc_bow in corpus[:5]]

# Create LDA Models

In [None]:
# Train the LDA topic model on the bag-of-words corpus
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=2,
    random_state=100,  # fixed seed
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)

In [None]:
# Print the keywords of each topic (the model is built with num_topics=2)
pprint(lda_model.print_topics())
# Apply the trained model to the corpus
doc_lda = lda_model[corpus]

[(0,
  '0.001*"ori" + 0.001*"oneset" + 0.001*"order" + 0.001*"oppo" + '
  '0.001*"oraimo" + 0.001*"ormas" + 0.001*"organik" + 0.001*"orga" + '
  '0.001*"onlyfans" + 0.001*"optimis"'),
 (1,
  '0.001*"ori" + 0.001*"oneset" + 0.001*"oppo" + 0.001*"order" + 0.001*"ormas" '
  '+ 0.001*"oraimo" + 0.001*"organik" + 0.001*"orga" + 0.001*"onlyfans" + '
  '0.001*"origami"')]


In [None]:
# Compute Perplexity
# log_perplexity() returns the per-word likelihood bound, not the perplexity
# itself. gensim defines perplexity as 2 ** (-bound), and for that value
# lower is better.
per_word_bound = lda_model.log_perplexity(corpus)
print('\nPer-word likelihood bound: ', per_word_bound)
print('Perplexity: ', 2 ** (-per_word_bound))

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -8.949847170964313

Coherence Score:  0.8725118748379629


# Topic Model Visualization

In [None]:
# Visualize the topics
# Switch pyLDAvis to notebook display mode, then prepare the visualization
# from the trained model, the bag-of-words corpus and the dictionary.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
# Bare last expression so the notebook displays the prepared object
vis

  by='saliency', ascending=False).head(R).drop('saliency', 1)


# Coherence Values

In [None]:
from gensim.models.coherencemodel import CoherenceModel
from gensim.models.ldamodel import LdaModel
from gensim.corpora.dictionary import Dictionary
from numpy import array
#function to compute coherence values
def compute_coherence_values(dictionary, corpus, texts, limit, start, step, random_state=100):
    """Train one LDA model per topic count and score each with c_v coherence.

    Args:
        dictionary: Passed to ``LdaModel`` as ``id2word`` and to
            ``CoherenceModel`` as ``dictionary``.
        corpus: Bag-of-words corpus passed to ``LdaModel``.
        texts: Tokenized documents passed to ``CoherenceModel``.
        limit: Exclusive upper bound of the topic counts to try.
        start: First topic count to try.
        step: Increment between successive topic counts.
        random_state: Seed forwarded to every ``LdaModel`` so that repeated
            runs give the same coherence curve. The default matches the seed
            used for the main model in this notebook.

    Returns:
        A ``(model_list, coherence_values)`` tuple with one entry per topic
        count in ``range(start, limit, step)``, in that order.
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = LdaModel(
            corpus=corpus,
            id2word=dictionary,
            num_topics=num_topics,
            iterations=100,
            random_state=random_state,
        )
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

# Coherence Score Visualization

In [None]:
# Range of topic counts to evaluate: range(start, limit, step)
start=1
limit=10
step=1
model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus, texts=data_lemmatized, start=start, limit=limit, step=step)
#show graphs
import matplotlib.pyplot as plt
x = range(start, limit, step)
# The label is attached to the line itself. Passing ("coherence_values") to
# legend() hands it a plain string (the parentheses do not make a tuple),
# so only its first character would be used as the label.
plt.plot(x, coherence_values, label="coherence_values")
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(loc='best')
plt.show()

