# Choose Number of Topics using LDA algorithm

In [9]:
import gensim
import nltk
nltk.download('wordnet')

import pandas as pd
from sklearn.manifold import TSNE
import time

from tqdm import tqdm
import numpy as np
from sklearn.manifold import TSNE

import matplotlib.pyplot as plt
%matplotlib inline

from wordcloud import WordCloud

from bokeh.plotting import figure, show, output_notebook, save
from bokeh.models import HoverTool, value, LabelSet, Legend, ColumnDataSource
output_notebook()


### BEGIN settings of pyLDAvis ### 
import pyLDAvis.gensim
pyLDAvis.enable_notebook()
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 
### END settings of pyLDAvis ###

[nltk_data] Downloading package wordnet to /home/ubuntu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Preprocess Documents

In [2]:
def read_json_docs(filename):
    """
    Load the raw documents from a CSV file.

    Despite its name, this function reads CSV (not JSON); the name is kept
    so that existing callers keep working.

    Args:
        filename (str): path to a CSV file that has a 'text' column.

    Returns:
        dataset (pandas.Series): the 'text' column of the file.

    Raises:
        KeyError: if the file has no 'text' column.
    """
    try:
        # `on_bad_lines='warn'` skips malformed rows and reports them, which is
        # what `error_bad_lines=False` did (with its default warning enabled).
        # `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0.
        dataset_df = pd.read_csv(filename, on_bad_lines='warn')
    except TypeError:
        # pandas < 1.3 does not know `on_bad_lines`; use the legacy keyword.
        dataset_df = pd.read_csv(filename, error_bad_lines=False)

    return dataset_df['text']


def preprocess(raw_docs):
    """
    Preprocess documents including normalization, lemmatization, tokenization, removing stopwords.

    Each document is tokenized with gensim.utils.simple_preprocess, tokens found
    in gensim's STOPWORDS or in the custom list ('https', 'rt', 'amp') are
    dropped, and the remaining tokens are lemmatized as verbs.

    Entries that are not text (e.g. NaN for a missing value in a pandas Series)
    yield an empty token list, so the output stays positionally aligned with
    the input.

    Args:
        raw_docs (pandas.Series)
    
    Returns:
        docs (list(list(str)))
    """
    lemmatizer = nltk.stem.WordNetLemmatizer()

    # Build the combined stopword set once, not once per document.
    stopwords = gensim.parsing.preprocessing.STOPWORDS.union(['https', 'rt', 'amp'])

    docs = []

    for d in raw_docs:
        if not isinstance(d, (str, bytes)):
            # simple_preprocess cannot decode non-text values; keep a placeholder.
            docs.append([])
            continue

        tokens = gensim.utils.simple_preprocess(d)

        # Stopwords are matched against the raw token, before lemmatization.
        # pos='v' lemmatizes each token as a verb (reduces it to its base form).
        docs.append([lemmatizer.lemmatize(t, pos='v')
                     for t in tokens
                     if t not in stopwords])

    return docs
        
       
def create_dictionary(docs):
    """
    Build a gensim dictionary from tokenized documents.

    Args:
        docs (list(list(str))): one list of tokens per document.

    Returns:
        dictionary (gensim.corpora.Dictionary)
    """
    dictionary = gensim.corpora.Dictionary(docs)
    return dictionary
    
    
def get_bow_corpus(docs, dictionary):
    """
    Encode every tokenized document as a bag-of-words via the dictionary.

    Args:
        docs (list(list(str))): one list of tokens per document.
        dictionary (gensim.corpora.Dictionary): provides doc2bow.

    Returns:
        bows (list(list(tuple(token_id, token_count)))): one entry per
            document, in the same order as docs.
    """
    encoded = []
    for tokens in docs:
        encoded.append(dictionary.doc2bow(tokens))
    return encoded


def get_tfidf_corpus(bows):
    """
    Fit a TF-IDF model on a bag-of-words corpus and apply it to that corpus.

    Args:
        bows (list(list(tuple(token_id, token_count))))

    Returns:
        tuple: (corpus_tfidf, tfidf_model), where tfidf_model is the
            gensim.models.TfidfModel built from bows and corpus_tfidf is
            tfidf_model[bows].
    """
    model = gensim.models.TfidfModel(bows)
    return model[bows], model

# Training Program

In [22]:
# Input data: CSV file loaded through read_json_docs.
filename_dataset = './twitter_trump_2019_05.csv'

# Dictionary pruning thresholds handed to dictionary.filter_extremes.
# no_above is a fraction of the corpus; 1.0 disables the "too common" filter.
no_above = 1.0
no_below = 2

# LDA training settings handed to gensim.models.LdaModel.
passes = 20
num_topics = 3

In [3]:
# Read the 'text' column of the dataset file
raw_docs = read_json_docs(filename_dataset)

# Preprocess documents including normalization, lemmatization, tokenization, removing stopwords.
docs = preprocess(raw_docs)

# Create dictionary (size is printed before pruning for comparison)
dictionary = create_dictionary(docs)
print('Size of unfiltered dictionary: {}'.format(len(dictionary)))

# Remove common/rare words. filter_extremes prunes `dictionary` in place,
# so this must run after the print above and before the corpus is encoded.
dictionary.filter_extremes(no_below=no_below, no_above=no_above)
print('Size of filtered dictionary: {}'.format(len(dictionary)))

# Transform docs to BOWs corpus (encoded with the pruned dictionary)
corpus_bows = get_bow_corpus(docs, dictionary)

# Transform BOWs to TF-IDF corpus; also keep the fitted TF-IDF model
corpus_tfidf, tfidf_model = get_tfidf_corpus(corpus_bows)


Size of unfiltered dictionary: 2724
Size of filtered dictionary: 1162


# Train LDA

In [23]:
# LDA training is stochastic. A fixed seed makes the learned topics (and the
# pyLDAvis view built from them) reproducible under Restart & Run All.
lda_random_state = 42

# Train LDA on the bag-of-words corpus
lda_model_bows = gensim.models.LdaModel(corpus_bows,
                                        num_topics=num_topics,
                                        id2word=dictionary,
                                        passes=passes,
                                        random_state=lda_random_state)

# Alternative (disabled): train LDA on the TF-IDF corpus instead
# lda_model_tfidf = gensim.models.LdaModel(corpus_tfidf,
#                                               num_topics=num_topics,
#                                               id2word=dictionary,
#                                               passes=passes,
#                                               random_state=lda_random_state)

# Choose Number of Topics using pyLDAvis

In [24]:
# Prepare the pyLDAvis data for the bag-of-words LDA model; the bare name
# on the last line makes the notebook display it.
lda_vis = pyLDAvis.gensim.prepare(lda_model_bows, corpus_bows, dictionary)
lda_vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
