In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
import pandas as pd
import numpy as np

# https://www.youtube.com/watch?v=tUh0mgB0QuA
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation

import matplotlib.pyplot as plt

import pyLDAvis.sklearn

In [4]:
import nltk
nltk.download('stopwords')
sw_indo = stopwords.words("indonesian") + list(punctuation)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ANZ007\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
df = pd.read_csv("data/tweets_labelled_nltk.csv")
df.head()

Unnamed: 0,tweet,sentimen
0,menangani kekerasan seksual disahkan enam ...,Negatif
1,menangani kekerasan seksual disahkan enam ...,Negatif
2,wakil ketua mpr ri mahasiswa kawal imple...,Negatif
3,fadel muhammad mahasiswa kawal implementasi,Negatif
4,tanggal chatnya april dijerat,Negatif


In [6]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ANZ007\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.width', 30):
    with open('data/wordcount_topik.txt', 'w') as f:
        print(df['tweet'].str.split(expand=True).stack().value_counts(), file=f)

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
bow = CountVectorizer(ngram_range=(1,2), tokenizer=word_tokenize)
bow_matrix = bow.fit_transform(df.tweet.astype("U"))
print(bow_matrix)

  (0, 25864)	1
  (0, 19143)	2
  (0, 41373)	2
  (0, 10013)	1
  (0, 12366)	1
  (0, 8302)	1
  (0, 7977)	1
  (0, 36007)	1
  (0, 38359)	1
  (0, 26787)	1
  (0, 48180)	1
  (0, 46943)	1
  (0, 36166)	1
  (0, 42130)	1
  (0, 1866)	1
  (0, 25869)	1
  (0, 19224)	2
  (0, 41508)	1
  (0, 10123)	1
  (0, 12367)	1
  (0, 8307)	1
  (0, 7995)	1
  (0, 36019)	1
  (0, 38386)	1
  (0, 26863)	1
  :	:
  (15631, 23334)	1
  (15631, 20396)	1
  (15631, 11655)	1
  (15631, 39258)	1
  (15631, 37801)	1
  (15631, 14596)	1
  (15631, 38510)	1
  (15631, 38524)	1
  (15631, 22590)	1
  (15631, 22596)	1
  (15631, 38563)	1
  (15631, 33574)	1
  (15631, 19846)	1
  (15631, 33575)	1
  (15631, 2651)	1
  (15631, 28025)	1
  (15631, 2657)	1
  (15631, 10384)	1
  (15631, 36561)	1
  (15631, 23589)	1
  (15631, 38581)	1
  (15631, 36562)	1
  (15631, 14638)	1
  (15631, 19848)	1
  (15631, 11613)	1


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

datatfidf = TfidfVectorizer(ngram_range=(1,2), strip_accents='unicode', 
                            analyzer='word', tokenizer=word_tokenize, 
                            use_idf=True, smooth_idf=True, sublinear_tf=True, token_pattern=r'\w+')
datatfidf.fit(df.tweet.astype("U"))
print(datatfidf)

TfidfVectorizer(ngram_range=(1, 2), strip_accents='unicode', sublinear_tf=True,
                token_pattern='\\w+',
                tokenizer=<function word_tokenize at 0x00000288B9278EE0>)

TfidfVectorizer(ngram_range=(1, 2), strip_accents='unicode', sublinear_tf=True,
                token_pattern='\\w+',
                tokenizer=<function word_tokenize at 0x00000288B9278EE0>)


In [10]:
vocab = bow.get_feature_names()
len(vocab)

49955

In [11]:
vocab2 = datatfidf.get_feature_names()
len(vocab2)

49955

## Latent Semantic Analysis (LSA) BoW

In [12]:
from sklearn.decomposition import TruncatedSVD

In [13]:
lsa = TruncatedSVD(n_components=3, n_iter=10, random_state=0)
lsa_matrix = lsa.fit_transform(bow_matrix)

In [14]:
# print(bow_matrix.shape)
print(lsa_matrix.shape)
print(lsa.components_.shape)

(15632, 3)
(3, 49955)


In [15]:
def get_topic(model):
    return [[vocab[idx] for idx in comp.argsort()[-6:] if vocab[idx].isalnum()]
        for comp in model.components_]

In [16]:
get_topic(lsa)

[['maharani', 'pahlawan', 'disahkan', 'puan'],
 ['pidana', 'tindak', 'kekerasan', 'seksual'],
 ['ri', 'ketua', 'maharani', 'dpr', 'disahkan']]

## Latent Semantic Analysis (LSA) TF-IDF

In [17]:
from sklearn.decomposition import TruncatedSVD

In [18]:
lsa2 = TruncatedSVD(n_components=3, n_iter=10, random_state=0)
lsa_matrix2 = lsa2.fit_transform(datatfidf.transform(df.tweet.astype("U")))

In [19]:
# print(datatfidf.shape)
print(lsa_matrix2.shape)
print(lsa2.components_.shape)

(15632, 3)
(3, 49955)


In [20]:
def get_topic(model):
    return [[vocab[idx] for idx in reversed(comp.argsort()[-6:]) if vocab[idx].isalnum()]
        for comp in model.components_]

In [21]:
get_topic(lsa)

[['puan', 'disahkan', 'pahlawan', 'maharani'],
 ['seksual', 'kekerasan', 'tindak', 'pidana'],
 ['disahkan', 'dpr', 'maharani', 'ketua', 'ri']]

## Latent Dirichlet Allocation (LDA) BoW

In [22]:
from sklearn.decomposition import LatentDirichletAllocation

In [23]:
lda = LatentDirichletAllocation(n_components=3, max_iter = 10, learning_method='online', learning_offset=50., random_state=0)
lda_matrix = lda.fit_transform(bow_matrix)

In [24]:
get_topic(lda)

[['puan', 'disahkan', 'banget', 'sah', 'pahlawan'],
 ['seksual', 'kekerasan', 'korban', 'hukum', 'disahkan'],
 ['puan', 'disahkan', 'dpr', 'perempuan', 'maharani']]

In [25]:
vis_lda = pyLDAvis.sklearn.prepare(lda, bow_matrix, bow)
pyLDAvis.save_html(vis_lda, "data/pyldavis-output.html")

from IPython.display import IFrame
IFrame(src='data/pyldavis-output.html', width=1400, height=800)

## Latent Dirichlet Allocation (LDA) TF-IDF

In [26]:
from sklearn.decomposition import LatentDirichletAllocation

In [27]:
lda2 = LatentDirichletAllocation(n_components=3, max_iter = 10, learning_method='online', learning_offset=50., random_state=0)
lda_matrix2 = lda2.fit_transform(datatfidf.transform(df.tweet.astype("U")))

In [28]:
get_topic(lda2)

[['puan', 'disahkan', 'terima', 'kasih', 'pemerintah'],
 ['puan', 'korban', 'indonesia', 'disahkan', 'perempuan', 'maharani'],
 ['disahkan', 'puan', 'perempuan', 'seksual', 'kekerasan', 'dpr']]

In [29]:
vis_lda = pyLDAvis.sklearn.prepare(lda2, datatfidf.transform(df.tweet.astype("U")), datatfidf)
pyLDAvis.save_html(vis_lda, "data/pyldavis-output-2.html")

from IPython.display import IFrame
IFrame(src='data/pyldavis-output-2.html', width=1400, height=800)

In [30]:
nltk.download('punkt')
def word_tokenize_wrapper(text):
    return word_tokenize(text)

df['tweet'] = df['tweet'].astype("U").apply(word_tokenize_wrapper)
df['tweet']
data_words = df['tweet'].values.tolist()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ANZ007\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

0        [menangani, kekerasan, seksual, disahkan, enam...
1        [menangani, kekerasan, seksual, disahkan, enam...
2        [wakil, ketua, mpr, ri, mahasiswa, kawal, impl...
3        [fadel, muhammad, mahasiswa, kawal, implementasi]
4                       [tanggal, chatnya, april, dijerat]
                               ...                        
15627    [koordinator, forum, perempuan, indonesia, ber...
15628    [puan, maharani, dinilai, penuhi, harapan, kau...
15629    [keberadaan, puan, maharani, payung, hukum, me...
15630    [puan, menyebut, kehadiran, wujud, keberpihaka...
15631    [ketua, dpr, ri, puan, maharani, rancangan, un...
Name: tweet, Length: 15632, dtype: object

In [31]:
import gensim.corpora as corpora

# Create Dictionary
id2word = corpora.Dictionary(data_words)

# Create Corpus
texts = data_words

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1][0][:30])

scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.
  _deprecated()


[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 2), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 2), (12, 1), (13, 1), (14, 1)]


In [32]:
import gensim
from gensim.models.coherencemodel import CoherenceModel

In [33]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Compute c_v coherence for various number of topics

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = gensim.models.ldamodel.LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

In [None]:
model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus, texts=data_words, start=2, limit=11, step=1)

In [None]:
# Print the coherence scores
for m, cv in zip(model_list, coherence_values):
    print("Num Topics =", m, " has Coherence Value of", round(cv, 10))