In [1]:
raw_corpus = ["Human machine interface for lab abc computer applications",
             "A survey of user opinion of computer system response time",
             "The EPS user interface management system",
             "System and human system engineering testing of EPS",              
             "Relation of user perceived response time to error measurement",
             "The generation of random binary unordered trees",
             "The intersection graph of paths in trees",
             "Graph minors IV Widths of trees and well quasi ordering",
             "Graph minors A survey"]

In [5]:
stoplist = set('for a of the and to in'.split(' '))
# Lowercase each document, split it by white space and filter out stopwords
texts = [[word for word in document.lower().split() if word not in stoplist
        ] for document in raw_corpus]

# Count word frequencies
from collections import defaultdict
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

# Only keep words that appear more than once
processed_corpus = [[token for token in text if frequency[token]>1 ] for text in texts]
processed_corpus

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]

In [4]:
from gensim import corpora

dictionary = corpora.Dictionary(processed_corpus)
print(dictionary)



Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...)


In [6]:
print(dictionary.token2id)

{'computer': 0, 'human': 1, 'interface': 2, 'response': 3, 'survey': 4, 'system': 5, 'time': 6, 'user': 7, 'eps': 8, 'trees': 9, 'graph': 10, 'minors': 11}


In [7]:

new_doc = "Human computer interaction"
new_vec = dictionary.doc2bow(new_doc.lower().split())
new_vec

[(0, 1), (1, 1)]

In [8]:
bow_corpus = [dictionary.doc2bow(text) for text in processed_corpus]
bow_corpus

[[(0, 1), (1, 1), (2, 1)],
 [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)],
 [(2, 1), (5, 1), (7, 1), (8, 1)],
 [(1, 1), (5, 2), (8, 1)],
 [(3, 1), (6, 1), (7, 1)],
 [(9, 1)],
 [(9, 1), (10, 1)],
 [(9, 1), (10, 1), (11, 1)],
 [(4, 1), (10, 1), (11, 1)]]

In [9]:
bow_corpus = [dictionary.doc2bow(text) for text in processed_corpus]
bow_corpus

[[(0, 1), (1, 1), (2, 1)],
 [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)],
 [(2, 1), (5, 1), (7, 1), (8, 1)],
 [(1, 1), (5, 2), (8, 1)],
 [(3, 1), (6, 1), (7, 1)],
 [(9, 1)],
 [(9, 1), (10, 1)],
 [(9, 1), (10, 1), (11, 1)],
 [(4, 1), (10, 1), (11, 1)]]

In [10]:
from gensim import models
# train the model
tfidf = models.TfidfModel(bow_corpus)
# transform the "system minors" string
tfidf[dictionary.doc2bow("system minors".lower().split())]

[(5, 0.5898341626740045), (11, 0.8075244024440723)]

## WordRank gensim

https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/WordRank_wrapper_quickstart.ipynb

In [12]:
from gensim.models.wrappers import Wordrank

wr_path = 'wordrank' # path to Wordrank directory
out_dir = 'model' # name of output directory to save data to
data = './lee.cor' # sample corpus

model = Wordrank.train(wr_path, data, out_dir, iter=11, dump_period=5)

FileExistsError: [Errno 17] File exists: 'wordrank/model/meta'

In [13]:
model.most_similar('President')

NameError: name 'model' is not defined

In [14]:
model.similarity('President', 'military')

NameError: name 'model' is not defined

## fasttext gensim

https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/FastText_Tutorial.ipynb

In [15]:
from gensim.models.fasttext import FastText as FT_gensim
from gensim.test.utils import datapath

# Set file names for train and test data
corpus_file = datapath('lee_background.cor')

model_gensim = FT_gensim(size=100)

# build the vocabulary
model_gensim.build_vocab(corpus_file=corpus_file)

# train the model
model_gensim.train(
    corpus_file=corpus_file, epochs=model_gensim.epochs,
    total_examples=model_gensim.corpus_count, total_words=model_gensim.corpus_total_words
)

print(model_gensim)

FastText(vocab=1762, size=100, alpha=0.025)


In [17]:
print('night' in model_gensim.wv.vocab)
print('nights' in model_gensim.wv.vocab)
print(model_gensim['night'])
print(model_gensim['nights'])

True
False
[ 1.07354417e-01  1.04594184e-02 -5.71774840e-01  5.04119098e-01
  5.87822914e-01 -3.31964374e-01 -1.91451162e-01 -2.66763456e-02
  4.30513501e-01  3.45042586e-01 -6.28720999e-01 -1.91997178e-02
 -6.53189719e-01  4.12460417e-01  2.96156883e-01 -5.20451292e-02
 -1.71165690e-01  1.80118456e-01  2.49184221e-01 -3.77968967e-01
 -2.34638289e-01  2.75152713e-01 -3.89198571e-01 -1.04820792e-04
 -8.28804731e-01  7.28953600e-01  1.05358765e-01  1.61917821e-01
  4.11544144e-01  3.89883062e-03 -6.59689307e-01  2.22101852e-01
  8.70955810e-02 -4.66582984e-01  4.65454102e-01  1.27155274e-01
 -1.62015662e-01 -8.85693952e-02  4.38466638e-01  2.33619004e-01
 -1.04455706e-02 -7.45560080e-02  3.92103940e-01 -6.35504350e-02
  1.32898152e-01  1.87912285e-01 -1.41070589e-01  2.21928105e-01
 -2.59709377e-02 -3.80960643e-01 -5.59951603e-01 -5.58635175e-01
  8.76470804e-02  1.06419958e-02  4.28282589e-01 -8.04397702e-01
 -1.07582688e-01 -1.70826063e-01  1.67557262e-02 -2.55352147e-02
  2.08238497e-

  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


## Using wrappers for Scikit learn API

https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/sklearn_api.ipynb

In [18]:
from gensim.sklearn_api import LdaTransformer

In [19]:
from gensim.corpora import Dictionary
texts = [
    ['complier', 'system', 'computer'],
    ['eulerian', 'node', 'cycle', 'graph', 'tree', 'path'],
    ['graph', 'flow', 'network', 'graph'],
    ['loading', 'computer', 'system'],
    ['user', 'server', 'system'],
    ['tree', 'hamiltonian'],
    ['graph', 'trees'],
    ['computer', 'kernel', 'malfunction', 'computer'],
    ['server', 'system', 'computer']
]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

In [20]:
corpus

[[(0, 1), (1, 1), (2, 1)],
 [(3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1)],
 [(5, 2), (9, 1), (10, 1)],
 [(1, 1), (2, 1), (11, 1)],
 [(2, 1), (12, 1), (13, 1)],
 [(8, 1), (14, 1)],
 [(5, 1), (15, 1)],
 [(1, 2), (16, 1), (17, 1)],
 [(1, 1), (2, 1), (12, 1)]]

In [23]:
model = LdaTransformer(num_topics=4, id2word=dictionary, iterations=20, random_state=1)
model.fit(corpus)
model.transform(corpus)

array([[0.06260677, 0.06266407, 0.8119805 , 0.06274869],
       [0.88988334, 0.03759476, 0.03661389, 0.03590799],
       [0.05085288, 0.05071966, 0.8480455 , 0.05038194],
       [0.06260499, 0.06266756, 0.8119778 , 0.06274967],
       [0.06262801, 0.06270411, 0.8118702 , 0.06279765],
       [0.7489928 , 0.08407857, 0.08337071, 0.08355792],
       [0.08486405, 0.08463454, 0.74658567, 0.08391576],
       [0.05010197, 0.05016245, 0.84949183, 0.05024375],
       [0.06256833, 0.06260946, 0.81215537, 0.06266686]], dtype=float32)

## Integration with Sklearn
To provide a better example of how it can be used with Sklearn, Let's use CountVectorizer method of sklearn. For this example we will use 20 Newsgroups data set. We will only use the categories rec.sport.baseball and sci.crypt and use it to generate topics.

https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/sklearn_api.ipynb

In [24]:
import numpy as np
from gensim import matutils
from gensim.models.ldamodel import LdaModel
from sklearn.datasets import fetch_20newsgroups
from gensim.sklearn_api.ldamodel import LdaTransformer

In [25]:
rand = np.random.mtrand.RandomState(1) # set seed for getting same result
cats = ['rec.sport.baseball', 'sci.crypt']
data = fetch_20newsgroups(subset='train', categories=cats, shuffle=True)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [26]:
data_texts = [_.split() for _ in data.data]
id2word = Dictionary(data_texts)
corpus = [id2word.doc2bow(i.split()) for i in data.data]

In [27]:
obj = LdaTransformer(id2word=id2word, num_topics=5, iterations=20)
lda = obj.fit(corpus)

In [32]:
lda.transform(corpus)

array([[0.17836611, 0.0523372 , 0.        , 0.5488369 , 0.21117735],
       [0.98500174, 0.        , 0.        , 0.        , 0.        ],
       [0.04092231, 0.        , 0.        , 0.4762131 , 0.4780422 ],
       ...,
       [0.595756  , 0.02553337, 0.16302142, 0.08455595, 0.13113326],
       [0.2834982 , 0.        , 0.22094913, 0.11297537, 0.3776973 ],
       [0.03466221, 0.8910917 , 0.        , 0.        , 0.06462381]],
      dtype=float32)

In [66]:
lda.transform(corpus).shape

(300, 5)

In [67]:
len(corpus)

300

In [69]:
corpus[10]

[(0, 1),
 (4, 1),
 (7, 1),
 (21, 1),
 (33, 1),
 (57, 3),
 (82, 1),
 (86, 1),
 (102, 2),
 (106, 1),
 (123, 2),
 (125, 2),
 (169, 1),
 (206, 1),
 (217, 1),
 (254, 1),
 (256, 1),
 (329, 1),
 (347, 1),
 (393, 4),
 (551, 1),
 (576, 1),
 (610, 1),
 (618, 1),
 (619, 1),
 (622, 2),
 (628, 1),
 (639, 1),
 (680, 1),
 (681, 1),
 (682, 2),
 (683, 1),
 (684, 1),
 (685, 1),
 (686, 1),
 (687, 1),
 (688, 1),
 (689, 1),
 (690, 1),
 (691, 1),
 (692, 1),
 (693, 1),
 (694, 1),
 (695, 1),
 (696, 1),
 (697, 1),
 (698, 1),
 (699, 1),
 (700, 1),
 (701, 1),
 (702, 1),
 (703, 1),
 (704, 1),
 (705, 2),
 (706, 1),
 (707, 1),
 (708, 2),
 (709, 1),
 (710, 1),
 (711, 1),
 (712, 1),
 (713, 1),
 (714, 1),
 (715, 1),
 (716, 1),
 (717, 1),
 (718, 1),
 (719, 1),
 (720, 1),
 (721, 1),
 (722, 1),
 (723, 1)]

In [70]:
lda.transform(corpus[10])

array([[0.4663972 , 0.        , 0.08512421, 0.36374238, 0.08163416]],
      dtype=float32)

In [28]:
from sklearn.model_selection import GridSearchCV

In [29]:
obj = LdaTransformer(id2word=id2word, num_topics=2, iterations=5, scorer='u_mass') # here 'scorer' can be 'perplexity' or 'u_mass'
parameters = {'num_topics': (2, 3, 5, 10), 'iterations': (1, 20, 50)}

# set `scoring` as `None` to use the inbuilt score function of `SklLdaModel` class
model = GridSearchCV(obj, parameters, cv=3, scoring=None)
model.fit(corpus)

model.best_params

AttributeError: 'GridSearchCV' object has no attribute 'best_params'

In [34]:
model.best_params_

{'iterations': 50, 'num_topics': 2}

You can also supply a custom scoring function of your choice using the scoring parameter of GridSearchCV function. The example shown below uses c_v mode of CoherenceModel class for computing the scores of the candidate models.

In [35]:
from gensim.models.coherencemodel import CoherenceModel

# supplying a custom scoring function
def scoring_function(estimator, X, y=None):
    goodcm = CoherenceModel(model=estimator.gensim_model, texts=data_texts, dictionary=estimator.gensim_model.id2word, coherence='c_v')
    return goodcm.get_coherence()

obj = LdaTransformer(id2word=id2word, num_topics=5, iterations=5)
parameters = {'num_topics': (2, 3, 5, 10), 'iterations': (1, 20, 50)}

# set `scoring` as your custom scoring function
model = GridSearchCV(obj, parameters, cv=2, scoring=scoring_function)
model.fit(corpus)

model.best_params_

{'iterations': 50, 'num_topics': 2}

Example of Using Pipeline

In [None]:
from sklearn.pipeline import Pipeline
from sklearn import linear_model

def print_features_pipe(clf, vocab, n=10):
    ''' Better printing for sorted list '''
    coef = clf.named_steps['classifier'].coef_[0]
    print coef
    print 'Positive features: %s' % (' '.join(['%s:%.2f' % (vocab[j], coef[j]) for j in np.argsort(coef)[::-1][:n] if coef[j] > 0]))
    print 'Negative features: %s' % (' '.join(['%s:%.2f' % (vocab[j], coef[j]) for j in np.argsort(coef)[:n] if coef[j] < 0]))

In [None]:
id2word = Dictionary([_.split() for _ in data.data])
corpus = [id2word.doc2bow(i.split()) for i in data.data]

In [None]:
model = LdaTransformer(num_topics=15, id2word=id2word, iterations=10, random_state=37)
clf = linear_model.LogisticRegression(penalty='l2', C=0.1)  # l2 penalty used
pipe = Pipeline([('features', model,), ('classifier', clf)])
pipe.fit(corpus, data.target)
print_features_pipe(pipe, id2word.values())

print(pipe.score(corpus, data.target))

## LSI Model

## News classification with topic models in gensim

News article classification is a task which is performed on a huge scale by news agencies all over the world. We will be looking into how topic modeling can be used to accurately classify news articles into different categories such as sports, technology, politics etc.

Our aim in this tutorial is to come up with some topic model which can come up with topics that can easily be interpreted by us. Such a topic model can be used to discover hidden structure in the corpus and can also be used to determine the membership of a news article into one of the topics.

For this tutorial, we will be using the Lee corpus which is a shortened version of the Lee Background Corpus. The shortened version consists of 300 documents selected from the Australian Broadcasting Corporation's news mail service. It consists of texts of headline stories from around the year 2000-2001.

Accompanying slides can be found here.

Requirements
In this tutorial we look at how different topic models can be easily created using gensim. Following are the dependencies for this tutorial:

- Gensim Version >=0.13.1 would be preferred since we will be using topic coherence metrics extensively here.
- matplotlib
- nltk.stopwords and nltk.wordnet
- pyLDAVis
We will be playing around with 4 different topic models here:

- LSI (Latent Semantic Indexing)
- HDP (Hierarchical Dirichlet Process)
- LDA (Latent Dirichlet Allocation)
- LDA (tweaked with topic coherence to find optimal number of topics) and
- LDA as LSI with the help of topic coherence metrics
First we'll fit those topic models on our existing data then we'll compare each against the other and see how they rank in terms of human interpretability.

All can be found in gensim and can be easily used in a plug-and-play fashion. We will tinker with the LDA model using the newly added topic coherence metrics in gensim based on this paper by Roeder et al and see how the resulting topic model compares with the exsisting ones

https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/gensim_news_classification.ipynb

In [37]:
import os
import re
import operator
import matplotlib.pyplot as plt
import warnings
import gensim
import numpy as np
warnings.filterwarnings('ignore')  # Let's not pay heed to them right now

import nltk
nltk.download('stopwords') # Let's make sure the 'stopword' package is downloaded & updated
nltk.download('wordnet') # Let's also download wordnet, which will be used for lemmatization

from gensim.models import CoherenceModel, LdaModel, LsiModel, HdpModel
from gensim.models.wrappers import LdaMallet
from gensim.corpora import Dictionary
from pprint import pprint
from smart_open import smart_open

%matplotlib inline

[nltk_data] Downloading package stopwords to /home/lc/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /home/lc/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [38]:
test_data_dir = '{}'.format(os.sep).join([gensim.__path__[0], 'test', 'test_data'])
lee_train_file = test_data_dir + os.sep + 'lee_background.cor'

In [39]:
with smart_open(lee_train_file, 'rb') as f:
    for n, l in enumerate(f):
        if n < 5:
            print([l])

[b'Hundreds of people have been forced to vacate their homes in the Southern Highlands of New South Wales as strong winds today pushed a huge bushfire towards the town of Hill Top. A new blaze near Goulburn, south-west of Sydney, has forced the closure of the Hume Highway. At about 4:00pm AEDT, a marked deterioration in the weather as a storm cell moved east across the Blue Mountains forced authorities to make a decision to evacuate people from homes in outlying streets at Hill Top in the New South Wales southern highlands. An estimated 500 residents have left their homes for nearby Mittagong. The New South Wales Rural Fire Service says the weather conditions which caused the fire to burn in a finger formation have now eased and about 60 fire units in and around Hill Top are optimistic of defending all properties. As more than 100 blazes burn on New Year\'s Eve in New South Wales, fire crews have been called to new fire at Gunning, south of Goulburn. While few details are available at 

In [40]:
def build_texts(fname):
    """
    Function to build tokenized texts from file
    
    Parameters:
    ----------
    fname: File to be read
    
    Returns:
    -------
    yields preprocessed line
    """
    with smart_open(fname, 'rb') as f:
        for line in f:
            yield gensim.utils.simple_preprocess(line, deacc=True, min_len=3)

In [41]:
train_texts = list(build_texts(lee_train_file))

In [42]:
len(train_texts)

300

In [45]:
len(train_texts[0])

256

In [46]:
bigram = gensim.models.Phrases(train_texts)

In [47]:
bigram[['new', 'york', 'example']]

['new_york', 'example']

In [48]:
from gensim.utils import lemmatize
from nltk.corpus import stopwords

In [49]:
stops = set(stopwords.words('english'))

In [50]:
def process_texts(texts):
    """
    Function to process texts. Following are the steps we take:
    
    1. Stopword Removal.
    2. Collocation detection.
    3. Lemmatization (not stem since stemming can reduce the interpretability).
    
    Parameters:
    ----------
    texts: Tokenized texts.
    
    Returns:
    -------
    texts: Pre-processed tokenized texts.
    """
    texts = [[word for word in line if word not in stops] for line in texts]
    texts = [bigram[line] for line in texts]
    
    from nltk.stem import WordNetLemmatizer
    lemmatizer = WordNetLemmatizer()

    texts = [[word for word in lemmatizer.lemmatize(' '.join(line), pos='v').split()] for line in texts]
    return texts

In [51]:
train_texts = process_texts(train_texts)
train_texts[5:6]

[['federal_government',
  'says',
  'safe',
  'afghani',
  'asylum_seekers',
  'australia',
  'return',
  'home',
  'environment',
  'becomes',
  'secure',
  'government',
  'suspended',
  'applications',
  'interim_government',
  'established',
  'kabul',
  'foreign_affairs',
  'minister_alexander',
  'downer',
  'refused',
  'say',
  'long',
  'claims',
  'process',
  'put',
  'hold',
  'says',
  'major',
  'threat',
  'people',
  'seeking',
  'asylum',
  'longer',
  'many',
  'afghans',
  'tried',
  'get',
  'australia',
  'matter',
  'britain',
  'countries',
  'north',
  'west',
  'europe',
  'claimed',
  'fleeing',
  'taliban',
  'said',
  'well',
  'taliban',
  'longer',
  'power',
  'afghanistan',
  'taliban',
  'finished',
  'meanwhile',
  'mass',
  'airlift',
  'detainees',
  'christmas',
  'island',
  'pacific',
  'island',
  'nauru',
  'total',
  'people',
  'flown',
  'island',
  'two',
  'operations',
  'using',
  'chartered',
  'aircraft',
  'second',
  'airlift',
  'tod

In [52]:
dictionary = Dictionary(train_texts)
corpus = [dictionary.doc2bow(text) for text in train_texts]

## Topic modeling with LSI
This is a useful topic modeling algorithm in that it can rank topics by itself. Thus it outputs topics in a ranked order. However it does require a num_topics parameter (set to 200 by default) to determine the number of latent dimensions after the SVD.

In [54]:
lsimodel = LsiModel(corpus=corpus, num_topics=10, id2word=dictionary)

In [55]:
lsimodel.show_topics(num_topics=5)

[(0,
  '0.542*"said" + 0.349*"says" + 0.127*"arafat" + 0.122*"palestinian" + 0.118*"people" + 0.117*"israeli" + 0.112*"two" + 0.110*"australian" + 0.110*"also" + 0.107*"australia"'),
 (1,
  '-0.407*"says" + 0.321*"arafat" + 0.315*"palestinian" + 0.273*"israeli" + 0.192*"israel" + 0.173*"sharon" + -0.145*"australia" + -0.144*"australian" + 0.140*"west_bank" + 0.136*"hamas"'),
 (2,
  '0.349*"says" + -0.330*"said" + -0.202*"afghanistan" + -0.191*"bin_laden" + -0.179*"taliban" + -0.169*"pakistan" + 0.162*"australia" + 0.149*"arafat" + -0.126*"tora_bora" + 0.123*"israeli"'),
 (3,
  '0.293*"fire" + 0.240*"sydney" + -0.214*"says" + 0.184*"firefighters" + 0.171*"south" + 0.165*"new_south" + 0.165*"wales" + 0.163*"north" + 0.161*"fires" + -0.152*"afghanistan"'),
 (4,
  '-0.220*"said" + -0.172*"test" + -0.170*"match" + 0.154*"afghanistan" + 0.150*"government" + 0.148*"says" + -0.144*"first" + 0.142*"fire" + -0.139*"australia" + 0.136*"force"')]

In [56]:
lsitopics = lsimodel.show_topics(formatted=False)

In [57]:
lsimodel.show_topics()

[(0,
  '0.542*"said" + 0.349*"says" + 0.127*"arafat" + 0.122*"palestinian" + 0.118*"people" + 0.117*"israeli" + 0.112*"two" + 0.110*"australian" + 0.110*"also" + 0.107*"australia"'),
 (1,
  '-0.407*"says" + 0.321*"arafat" + 0.315*"palestinian" + 0.273*"israeli" + 0.192*"israel" + 0.173*"sharon" + -0.145*"australia" + -0.144*"australian" + 0.140*"west_bank" + 0.136*"hamas"'),
 (2,
  '0.349*"says" + -0.330*"said" + -0.202*"afghanistan" + -0.191*"bin_laden" + -0.179*"taliban" + -0.169*"pakistan" + 0.162*"australia" + 0.149*"arafat" + -0.126*"tora_bora" + 0.123*"israeli"'),
 (3,
  '0.293*"fire" + 0.240*"sydney" + -0.214*"says" + 0.184*"firefighters" + 0.171*"south" + 0.165*"new_south" + 0.165*"wales" + 0.163*"north" + 0.161*"fires" + -0.152*"afghanistan"'),
 (4,
  '-0.220*"said" + -0.172*"test" + -0.170*"match" + 0.154*"afghanistan" + 0.150*"government" + 0.148*"says" + -0.144*"first" + 0.142*"fire" + -0.139*"australia" + 0.136*"force"'),
 (5,
  '-0.282*"said" + 0.265*"afghanistan" + 0.204

## Topic modeling with HDP
An HDP model is fully unsupervised. It can also determine the ideal number of topics it needs through posterior inference.

In [58]:
hdpmodel = HdpModel(corpus=corpus, id2word=dictionary)

In [59]:
hdpmodel.show_topics()

[(0,
  '0.006*said + 0.003*israeli + 0.003*palestinian + 0.002*three + 0.002*group + 0.002*west_bank + 0.002*krishna + 0.002*sharon + 0.002*hare + 0.002*killed + 0.002*ashes + 0.002*benares + 0.002*government + 0.002*arafat + 0.002*adventure_world + 0.002*near + 0.002*police + 0.002*canyoning + 0.002*israel + 0.002*hindus'),
 (1,
  '0.006*said + 0.003*airport + 0.003*taliban + 0.002*eight + 0.002*one + 0.002*kandahar + 0.002*commission + 0.002*today + 0.002*killed + 0.002*opposition + 0.002*left + 0.002*half + 0.001*wounded + 0.001*civilians + 0.001*collapse + 0.001*end + 0.001*lali + 0.001*agha + 0.001*city + 0.001*gul'),
 (2,
  '0.003*says + 0.003*match + 0.003*said + 0.002*israeli + 0.002*team + 0.002*rafter + 0.002*france + 0.002*tennis + 0.002*australia + 0.001*government + 0.001*attacks + 0.001*guarantee + 0.001*could + 0.001*john + 0.001*still + 0.001*want + 0.001*deciding + 0.001*house + 0.001*decision + 0.001*disappointed'),
 (3,
  '0.004*says + 0.003*india + 0.003*government 

## Topic modeling using LDA
This is one the most popular topic modeling algorithms today. It is a generative model in that it assumes each document is a mixture of topics and in turn, each topic is a mixture of words. To understand it better you can watch this lecture by David Blei. Let's choose 10 topics to initialize this.

In [60]:
ldamodel = LdaModel(corpus=corpus, num_topics=10, id2word=dictionary)

In [63]:
import pyLDAvis.gensim

In [62]:
!pip install pyLDAvis

Looking in indexes: https://pypi.douban.com/simple
Collecting pyLDAvis
[?25l  Downloading https://pypi.doubanio.com/packages/a5/3a/af82e070a8a96e13217c8f362f9a73e82d61ac8fff3a2561946a97f96266/pyLDAvis-2.1.2.tar.gz (1.6MB)
[K     |████████████████████████████████| 1.6MB 869kB/s eta 0:00:01
Collecting joblib>=0.8.4 (from pyLDAvis)
[?25l  Downloading https://pypi.doubanio.com/packages/cd/c1/50a758e8247561e58cb87305b1e90b171b8c767b15b12a1734001f41d356/joblib-0.13.2-py2.py3-none-any.whl (278kB)
[K     |████████████████████████████████| 286kB 6.9MB/s eta 0:00:01
Collecting funcy (from pyLDAvis)
  Downloading https://pypi.doubanio.com/packages/b3/23/d1f90f4e2af5f9d4921ab3797e33cf0503e3f130dd390a812f3bf59ce9ea/funcy-1.12-py2.py3-none-any.whl
Building wheels for collected packages: pyLDAvis
  Building wheel for pyLDAvis (setup.py) ... [?25ldone
[?25h  Stored in directory: /home/lc/.cache/pip/wheels/3e/02/05/df49a05080c8fc4c5652bce0c442c8416df1b3bb48550de79b
Successfully built pyLDAvis
Ins

In [64]:
pyLDAvis.enable_notebook()

In [65]:
pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)