# Topic Modeling with gensim

In [None]:
# gensim
import gensim
from gensim import corpora, models, similarities, matutils
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# sklearn
from sklearn import datasets
from sklearn.feature_extraction.text import CountVectorizer

# logging for gensim (set to INFO)
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

import pickle
import numpy as np
import pandas as pd

## Document Preprocessing
We'll need to generate a term-document matrix of word (token) counts for use in LDA.

We'll use `sklearn`'s `CountVectorizer` to generate our term-document matrix of counts. We'll make use of a few parameters to accomplish the following preprocessing of the text documents all within the `CountVectorizer`:
* `analyzer=word`: Tokenize by word
* `ngram_range=(1,2)`: Keep all 1 and 2-word grams
* `stop_words=english`: Remove all English stop words
* `token_pattern=\\b[a-z][a-z]+\\b`: Match all tokens with 2 or more (strictly) alphabet characters

In [None]:
#Open Corpus of News Article Text
# with open('./data/news_df.pickle', 'rb') as file:
with open('./data/news_data_frame_reduced_preprocessed.pickle', 'rb') as file:
# with open('./data/news_fake_real_df_reduced_token.pickle', 'rb') as file:
    news_data_frame = pickle.load(file)

In [None]:
news_data_frame["reshaped_tokenized_text"] = news_data_frame["tokenized_text"].apply(lambda x: ' '.join(x))

In [None]:
#Remove Overfit Words
import re
said_remove = re.compile('(\s*)said(\s*)')
facebook_remove = re.compile('(\s*)facebook(\s*)')
obama_remove = re.compile('(\s*)obama(\s*)')
trump_remove = re.compile('(\s*)trump(\s*)')
hillary_remove = re.compile('(\s*)hillary(\s*)')
clinton_remove = re.compile('(\s*)clinton(\s*)')

news_data_frame["reshaped_tokenized_text"] = news_data_frame["reshaped_tokenized_text"].apply(lambda x: said_remove.sub('', x))
news_data_frame["reshaped_tokenized_text"] = news_data_frame["reshaped_tokenized_text"].apply(lambda x: facebook_remove.sub('', x))
news_data_frame["reshaped_tokenized_text"] = news_data_frame["reshaped_tokenized_text"].apply(lambda x: obama_remove.sub('', x))
news_data_frame["reshaped_tokenized_text"] = news_data_frame["reshaped_tokenized_text"].apply(lambda x: trump_remove.sub('', x))
news_data_frame["reshaped_tokenized_text"] = news_data_frame["reshaped_tokenized_text"].apply(lambda x: hillary_remove.sub('', x))
news_data_frame["reshaped_tokenized_text"] = news_data_frame["reshaped_tokenized_text"].apply(lambda x: clinton_remove.sub('', x))


In [None]:
real_news_data = news_data_frame.loc[news_data_frame['Not_Real_or_Real'] == 0, 'reshaped_tokenized_text']
fake_news_data = news_data_frame.loc[news_data_frame['Not_Real_or_Real'] == 1, 'reshaped_tokenized_text']
all_news_data =  news_data_frame['reshaped_tokenized_text']

In [None]:
# Create a CountVectorizer for parsing/counting words
count_vectorizer = CountVectorizer(ngram_range=(1, 2),  
                                   stop_words='english', token_pattern="\\b[a-z][a-z]+\\b")

count_vectorizer_real = count_vectorizer.fit(real_news_data)
count_vectorizer_fake = count_vectorizer.fit(real_news_data)
count_vectorizer_all = count_vectorizer.fit(real_news_data)

In [None]:
# Create the term-document matrix
# Transpose it so the terms are the rows
doc_word_real_news = count_vectorizer_real.transform(real_news_data).transpose()
doc_word_fake_news = count_vectorizer_fake.transform(fake_news_data).transpose()
doc_word_all_news = count_vectorizer_all.transform(all_news_data).transpose()

In [None]:
print(doc_word_real_news.shape)
print(doc_word_fake_news.shape)
print(doc_word_all_news.shape)

##### Convert to gensim
We need to convert our sparse `scipy` matrix to a `gensim`-friendly object called a Corpus:

In [None]:
# Convert sparse matrix of counts to a gensim corpus
corpus_real_news = matutils.Sparse2Corpus(doc_word_real_news)
corpus_fake_news = matutils.Sparse2Corpus(doc_word_fake_news)
corpus_all_news = matutils.Sparse2Corpus(doc_word_all_news)

##### Map matrix rows to words (tokens)
We need to save a mapping (dict) of row id to word (token) for later use by gensim:

In [None]:
id2word_real = dict((v, k) for k, v in count_vectorizer_real.vocabulary_.items())
id2word_fake = dict((v, k) for k, v in count_vectorizer_fake.vocabulary_.items())
id2word_all = dict((v, k) for k, v in count_vectorizer_all.vocabulary_.items())

## LDA
At this point we can simply plow ahead in creating an LDA model.  It requires our corpus of word counts, mapping of row ids to words, and the number of topics (3).

### Real News

In [None]:
# Create lda model (equivalent to "fit" in sklearn)
lda_real = models.LdaModel(corpus=corpus_real_news, num_topics=3, id2word=id2word_real, passes=20)

### Fake News

In [None]:
# Create lda model (equivalent to "fit" in sklearn)
lda_fake = models.LdaModel(corpus=corpus_fake_news, num_topics=3, id2word=id2word_fake, passes=5)

### All News

In [None]:
# Create lda model (equivalent to "fit" in sklearn)
lda_all = models.LdaModel(corpus=corpus_all_news, num_topics=3, id2word=id2word_all, passes=5)

Let's take a look at what happened.  Here are the 5 most important words for each of the 3 topics we found:

In [None]:
lda.print_topics()

#### Topic Space
If we want to map our documents to the topic space we need to actually use the LdaModel transformer that we created above, like so:

In [None]:
# Transform the docs from the word space to the topic space (like "transform" in sklearn)
lda_corpus = lda[corpus]
lda_corpus

In [None]:
# Store the documents' topic vectors in a list so we can take a peak
lda_docs = [doc for doc in lda_corpus]

Now we can take a look at the document vectors in the topic space, which are measures of the component of each document along each topic.  Thus, at most a document vector can have num_topics=3 nonzero components in the topic space, and most have far fewer.

In [None]:
# Check out the document vectors in the topic space for the first 5 documents
lda_docs[0:5]

In [None]:
ng_train.data[0]

## LDA Visualizaiton Using Tri and Bi Grams

In [None]:
# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [None]:
fileObject = open('./data/grams/bigram_text','rb')  
bi_lem_comb2 = pickle.load(fileObject)

In [None]:
fileObject = open('./data/grams/trigram_text','rb')  
tri_lem_comb2 = pickle.load(fileObject)

In [None]:
bi_gram_series = pd.Series(bi_lem_comb2) 

In [None]:
#Tokenize Bigrams
from nltk.tokenize import word_tokenize
bi_gram_series = bi_gram_series.apply(word_tokenize)

In [None]:
tri_gram_series = pd.Series(tri_lem_comb2) 

In [None]:
#Tokenize Bigrams
tri_gram_series = tri_gram_series.apply(word_tokenize)

In [None]:
from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()

#Lemmatizing Text
tri_gram_lemmatized = tri_gram_series.apply(lambda x: [lemmatizer.lemmatize(y) for y in x])

#Lemmatizing Text
bi_gram_lemmatized = bi_gram_series.apply(lambda x: [lemmatizer.lemmatize(y) for y in x])

In [None]:
id2word_tri = corpora.Dictionary(tri_gram_lemmatized)
id2word_bi = corpora.Dictionary(bi_gram_lemmatized)

In [None]:
# Create Corpus
text_tri = tri_gram_lemmatized
text_bi  = bi_gram_lemmatized

In [None]:
# Term Document Frequency
corpus_tri = [id2word_tri.doc2bow(text) for text in text_tri]
corpus_bi = [id2word_bi.doc2bow(text) for text in text_bi]

### Note
Gensim creates a unique id for each word in the document. The produced corpus shown above is a mapping of (word_id, word_frequency).

For example, (0, 1) above implies, word id 0 occurs once in the first document. Likewise, word id 1 occurs twice and so on.

This is used as the input by the LDA model.

If you want to see what word a given id corresponds to, pass the id as a key to the dictionary.

In [None]:
# Human readable format of corpus (term-frequency)
[[(id2word_tri[id], freq) for id, freq in cp] for cp in corpus_tri[:1]]

# Building the Topic Model
In addition to the corpus and dictionary, will need to provide the number of topics as well.

Apart from that, alpha and eta are hyperparameters that affect sparsity of the topics. According to the Gensim docs, both defaults to 1.0/num_topics prior.

chunksize is the number of documents to be used in each training chunk. update_every determines how often the model parameters should be updated and passes is the total number of training passes.

In [None]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus_tri,
                                           id2word=id2word_tri,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

## View the topics in LDA model
The above LDA model is built with 20 different topics where each topic is a combination of keywords and each keyword contributes a certain weightage to the topic.

You can see the keywords for each topic and the weightage(importance) of each keyword using lda_model.print_topics() as shown next.

In [None]:
# Print the Keyword in the 10 topics
print(lda_model.print_topics())
doc_lda = lda_model[corpus_tri]

## Compute Model Perplexity and Coherence Score
Model perplexity and topic coherence provide a convenient measure to judge how good a given topic model is. In my experience, topic coherence score, in particular, has been more helpful.

In [None]:
### Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus_tri))  # a measure of how good the model is. lower the better.

### Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=tri_gram_lemmatized, dictionary=id2word_tri, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

# Visualize the topics-keywords

In [None]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus_tri, id2word_tri)
vis

# pyLDAvis.sklearn

In [1]:
from __future__ import print_function
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

import pickle

In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [3]:
# gensim
import gensim
from gensim import corpora, models, similarities, matutils
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

from gensim import corpora, models, similarities, matutils
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from gensim.models import Phrases
from gensim.models.phrases import Phraser

# sklearn
from sklearn import datasets
from sklearn.feature_extraction.text import CountVectorizer

# logging for gensim (set to INFO)
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

import pickle
import numpy as np
import pandas as pd

scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.
  _deprecated()


In [44]:
#Open Corpus of News Article Text
with open('./data/news_data_frame_reduced_preprocessed.pickle', 'rb') as file:
    news_data_frame = pickle.load(file)

In [45]:
news_data_frame["reshaped_tokenized_text"] = news_data_frame["tokenized_text"].apply(lambda x: ' '.join(x))

In [46]:
#Remove Overfit Words
import re
said_remove = re.compile(r'(\s*)said(\s*)')
facebook_remove = re.compile(r'(\s*)facebook(\s*)')
obama_remove = re.compile(r'(\s*)obama(\s*)')
trump_remove = re.compile(r'(\s*)trump(\s*)')
hillary_remove = re.compile(r'(\s*)hillary(\s*)')
clinton_remove = re.compile(r'(\s*)clinton(\s*)')

news_data_frame["reshaped_tokenized_text"] = news_data_frame["reshaped_tokenized_text"].apply(lambda x: said_remove.sub(' ', x))
news_data_frame["reshaped_tokenized_text"] = news_data_frame["reshaped_tokenized_text"].apply(lambda x: facebook_remove.sub(' ', x))
news_data_frame["reshaped_tokenized_text"] = news_data_frame["reshaped_tokenized_text"].apply(lambda x: obama_remove.sub(' ', x))
news_data_frame["reshaped_tokenized_text"] = news_data_frame["reshaped_tokenized_text"].apply(lambda x: trump_remove.sub(' ', x))
news_data_frame["reshaped_tokenized_text"] = news_data_frame["reshaped_tokenized_text"].apply(lambda x: hillary_remove.sub(' ', x))
news_data_frame["reshaped_tokenized_text"] = news_data_frame["reshaped_tokenized_text"].apply(lambda x: clinton_remove.sub(' ', x))


In [47]:
#Tokenize Text
from nltk.tokenize import word_tokenize
news_data_frame["reshaped_tokenized_text"] = news_data_frame["reshaped_tokenized_text"].apply(word_tokenize)

In [48]:
real_news_data = news_data_frame.loc[news_data_frame['Not_Real_or_Real'] == 0, 'reshaped_tokenized_text']
fake_news_data = news_data_frame.loc[news_data_frame['Not_Real_or_Real'] == 1, 'reshaped_tokenized_text']
all_news_data =  news_data_frame['reshaped_tokenized_text']

### Prepare Bigram Text

In [49]:
bigram_real = Phrases(real_news_data, min_count=5, threshold = 80);
bigram_mod_real = Phraser(bigram_real);

2019-05-27 17:30:15,120 : INFO : collecting all words and their counts
2019-05-27 17:30:15,121 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2019-05-27 17:30:16,606 : INFO : collected 578030 word types from a corpus of 1061812 words (unigram + bigrams) and 2314 sentences
2019-05-27 17:30:16,608 : INFO : using 578030 counts as vocab in Phrases<0 vocab, min_count=5, threshold=80, max_vocab_size=40000000>
2019-05-27 17:30:16,624 : INFO : source_vocab length 578030
2019-05-27 17:30:21,660 : INFO : Phraser built with 6561 6561 phrasegrams


In [50]:
bigram_fake = Phrases(fake_news_data, min_count=5, threshold = 80);
bigram_mod_fake = Phraser(bigram_fake);

2019-05-27 17:30:21,665 : INFO : collecting all words and their counts
2019-05-27 17:30:21,666 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2019-05-27 17:30:22,681 : INFO : collected 518114 word types from a corpus of 707360 words (unigram + bigrams) and 2112 sentences
2019-05-27 17:30:22,682 : INFO : using 518114 counts as vocab in Phrases<0 vocab, min_count=5, threshold=80, max_vocab_size=40000000>
2019-05-27 17:30:22,693 : INFO : source_vocab length 518114
2019-05-27 17:30:27,102 : INFO : Phraser built with 2266 2266 phrasegrams


In [51]:
bigram_all = Phrases(all_news_data, min_count=5, threshold = 80);
bigram_mod_all = Phraser(bigram_all);

2019-05-27 17:30:27,109 : INFO : collecting all words and their counts
2019-05-27 17:30:27,109 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2019-05-27 17:30:29,866 : INFO : collected 1019403 word types from a corpus of 1769172 words (unigram + bigrams) and 4426 sentences
2019-05-27 17:30:29,867 : INFO : using 1019403 counts as vocab in Phrases<0 vocab, min_count=5, threshold=80, max_vocab_size=40000000>
2019-05-27 17:30:29,893 : INFO : source_vocab length 1019403
2019-05-27 17:30:38,759 : INFO : Phraser built with 8297 8297 phrasegrams


In [52]:
def make_bigrams(texts, type='real'):
    if type == 'real':
        return [bigram_mod_real[doc] for doc in texts]
    elif type == 'fake':
        return [bigram_mod_fake[doc] for doc in texts]
    else:
        return [bigram_mod_all[doc] for doc in texts]

In [53]:
words_bigrams_real = make_bigrams(real_news_data, 'real')
words_bigrams_fake = make_bigrams(real_news_data, 'fake')
words_bigrams_all = make_bigrams(real_news_data, 'all')

### Prepare Trigram Text

In [54]:
trigram_real = Phrases(real_news_data, threshold = 80);
trigram_mod_real = Phraser(trigram_real);

2019-05-27 17:30:45,865 : INFO : collecting all words and their counts
2019-05-27 17:30:45,866 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2019-05-27 17:30:47,444 : INFO : collected 578030 word types from a corpus of 1061812 words (unigram + bigrams) and 2314 sentences
2019-05-27 17:30:47,444 : INFO : using 578030 counts as vocab in Phrases<0 vocab, min_count=5, threshold=80, max_vocab_size=40000000>
2019-05-27 17:30:47,463 : INFO : source_vocab length 578030
2019-05-27 17:30:52,789 : INFO : Phraser built with 6561 6561 phrasegrams


In [55]:
trigram_fake = Phrases(fake_news_data, threshold = 80);
trigram_mod_fake = Phraser(trigram_fake);

2019-05-27 17:30:52,796 : INFO : collecting all words and their counts
2019-05-27 17:30:52,797 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2019-05-27 17:30:54,116 : INFO : collected 518114 word types from a corpus of 707360 words (unigram + bigrams) and 2112 sentences
2019-05-27 17:30:54,117 : INFO : using 518114 counts as vocab in Phrases<0 vocab, min_count=5, threshold=80, max_vocab_size=40000000>
2019-05-27 17:30:54,137 : INFO : source_vocab length 518114
2019-05-27 17:30:59,277 : INFO : Phraser built with 2266 2266 phrasegrams


In [56]:
trigram_all = Phrases(all_news_data, threshold = 80);
trigram_mod_all = Phraser(trigram_all);

2019-05-27 17:30:59,282 : INFO : collecting all words and their counts
2019-05-27 17:30:59,283 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2019-05-27 17:31:02,521 : INFO : collected 1019403 word types from a corpus of 1769172 words (unigram + bigrams) and 4426 sentences
2019-05-27 17:31:02,522 : INFO : using 1019403 counts as vocab in Phrases<0 vocab, min_count=5, threshold=80, max_vocab_size=40000000>
2019-05-27 17:31:02,550 : INFO : source_vocab length 1019403
2019-05-27 17:31:12,481 : INFO : Phraser built with 8297 8297 phrasegrams


In [57]:
def make_trigrams(texts, type='real'):
    if type == 'real':
        return [trigram_mod_real[doc] for doc in texts]
    elif type == 'fake':
        return [trigram_mod_fake[doc] for doc in texts]
    else:
        return [trigram_mod_all[doc] for doc in texts]

In [82]:
words_trigrams_real = pd.Series(make_trigrams(real_news_data, 'real'))
words_trigrams_fake = pd.Series(make_trigrams(real_news_data, 'fake'))
words_trigrams_all = pd.Series(make_trigrams(real_news_data, 'all'))

In [83]:
real_extracted = words_trigrams_real.apply(lambda x: ' '.join(x))
fake_extracted = words_trigrams_fake.apply(lambda x: ' '.join(x))
all_extracted = words_trigrams_all.apply(lambda x: ' '.join(x))

## Convert to document-term matrix
Next, the raw documents are converted into document-term matrix, possibly as raw counts or in TF-IDF form.

In [93]:
#Bigrams
tf_vectorizer_bi = CountVectorizer(strip_accents = 'unicode',
                                stop_words = 'english',
                                lowercase = True,
                                token_pattern = r'\b[a-zA-Z]{3,}\b',
                                max_df = 0.5, 
                                min_df = 10)

dtm_tf_bi = tf_vectorizer_bi.fit_transform(all_extracted)
print(dtm_tf_bi.shape)

tfidf_vectorizer_bi = TfidfVectorizer(**tf_vectorizer_bi.get_params())
dtm_tfidf_bi = tfidf_vectorizer_bi.fit_transform(all_extracted)
print(dtm_tfidf_bi.shape)

(2314, 8389)




(2314, 8389)


In [109]:
#Trigrams
tf_vectorizer_tri = CountVectorizer(strip_accents = 'unicode',
                                stop_words = 'english',
                                lowercase = True,
                                token_pattern = r'\b[a-zA-Z]{3,}\b',
                                max_df = 0.5, 
                                min_df = 10)

dtm_tf_tri = tf_vectorizer_tri.fit_transform(all_extracted)
print(dtm_tf_tri.shape)

tfidf_vectorizer_tri = TfidfVectorizer(**tf_vectorizer_tri.get_params())
dtm_tfidf_tri = tfidf_vectorizer_tri.fit_transform(all_extracted)
print(dtm_tfidf_tri.shape)

(2314, 8389)




(2314, 8389)


## Fit Latent Dirichlet Allocation models
Finally, the LDA models are fitted.

In [94]:
#Bigrams
# for TF DTM
lda_tf_bi = LatentDirichletAllocation(n_components=20, random_state=0)
lda_tf_bi.fit(dtm_tf_bi)

# for TFIDF DTM
lda_tfidf_bi = LatentDirichletAllocation(n_components=20, random_state=0)
lda_tfidf_bi.fit(dtm_tfidf_bi)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=20, n_jobs=None,
                          perp_tol=0.1, random_state=0, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [110]:
#Trigrames
# for TF DTM
lda_tf_tri = LatentDirichletAllocation(n_components=20, random_state=0)
lda_tf_tri.fit(dtm_tf_tri)

# for TFIDF DTM
lda_tfidf_tri = LatentDirichletAllocation(n_components=20, random_state=0)
lda_tfidf_tri.fit(dtm_tfidf_tri)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=20, n_jobs=None,
                          perp_tol=0.1, random_state=0, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

## Visualizing the models with pyLDAvis

In [89]:
bi_real = pyLDAvis.sklearn.prepare(lda_tf_bi, dtm_tf_bi, tf_vectorizer_bi)
bi_real

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [92]:
bi_fake = pyLDAvis.sklearn.prepare(lda_tf_bi, dtm_tf_bi, tf_vectorizer_bi)
bi_fake

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [95]:
bi_all = pyLDAvis.sklearn.prepare(lda_tf_bi, dtm_tf_bi, tf_vectorizer_bi)
bi_all

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [104]:
tri_real = pyLDAvis.sklearn.prepare(lda_tfidf_tri, dtm_tfidf_tri, tfidf_vectorizer_tri)
tri_real

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [108]:
tri_fake = pyLDAvis.sklearn.prepare(lda_tfidf_tri, dtm_tfidf_tri, tfidf_vectorizer_tri)
tri_fake

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [111]:
tri_all =  pyLDAvis.sklearn.prepare(lda_tfidf_tri, dtm_tfidf_tri, tfidf_vectorizer_tri)
tri_all

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
