In [1]:
from __future__ import print_function
from time import time

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups

import numpy as np

In [2]:
from __future__ import print_function
from time import time

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups

# Sizes that drive the rest of the notebook, grouped by what they control.
# n_samples: how many posts are kept from the dataset;
# n_features: vocabulary cap handed to the vectorizers.
n_samples, n_features = 2000, 1000
# n_components: number of topics requested from the model;
# n_top_words: how many words are listed per topic.
n_components, n_top_words = 10, 20


def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted words of every topic in ``model``.

    One line is printed per row of ``model.components_`` in the form
    ``Topic #<index>: <word> <word> ...``, with the words ordered from the
    largest weight to the smallest. A blank line is printed after the last
    topic.

    Parameters
    ----------
    model : object
        Anything exposing ``components_``, an iterable of per-topic weight
        arrays that support ``argsort()``.
    feature_names : sequence of str
        Maps a column index of ``components_`` to the word it stands for.
    n_top_words : int
        How many words to list for each topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort() is ascending, so walking the tail backwards yields the
        # n_top_words largest weights, biggest first.
        top_indices = weights.argsort()[:-n_top_words - 1:-1]
        top_words = [feature_names[idx] for idx in top_indices]
        header = "Topic #%d: " % topic_idx
        print(header + " ".join(top_words))
    print()

In [4]:
# Fetch the 20 newsgroups posts with headers, footers and quoted replies
# stripped, then keep only the first n_samples of them. The remaining
# filtering heuristics (dropping common English words and words that occur
# in only one document or in more than 95% of the documents) are applied by
# the vectorizers further down.

stripped_parts = ('headers', 'footers', 'quotes')

print("Loading dataset...")
t0 = time()
dataset = fetch_20newsgroups(remove=stripped_parts,
                             shuffle=True, random_state=1)
data_samples = dataset.data[:n_samples]
elapsed = time() - t0
print("done in %0.3fs." % elapsed)

Loading dataset...
done in 3.100s.


In [10]:
# Build the tf-idf document-term matrix intended for NMF.
print("Extracting tf-idf features for NMF...")
# Vocabulary pruning: skip English stop words and terms seen in fewer than
# two documents or in more than 95% of them; max_features is n_features.
tfidf_options = dict(stop_words='english', min_df=2, max_df=0.95,
                     max_features=n_features)
tfidf_vectorizer = TfidfVectorizer(**tfidf_options)
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(data_samples)
elapsed = time() - t0
print("done in %0.3fs." % elapsed)

Extracting tf-idf features for NMF...
done in 0.665s.


In [16]:
# Build the raw term-count document-term matrix that LDA is fitted on.
print("Extracting tf features for LDA...")
# Same vocabulary pruning as the tf-idf vectorizer: English stop words out,
# min_df=2, max_df=0.95, and max_features set to n_features.
tf_options = dict(stop_words='english', min_df=2, max_df=0.95,
                  max_features=n_features)
tf_vectorizer = CountVectorizer(**tf_options)
t0 = time()
tf = tf_vectorizer.fit_transform(data_samples)
elapsed = time() - t0
print("done in %0.3fs." % elapsed)
print()

Extracting tf features for LDA...
done in 0.655s.



In [5]:
# Fit an LDA topic model on the raw term counts and report how long it took.
fit_banner = ("Fitting LDA models with tf features, "
              "n_samples=%d and n_features=%d...")
print(fit_banner % (n_samples, n_features))
lda = LatentDirichletAllocation(
    n_components=n_components,
    max_iter=5,
    learning_method='online',
    learning_offset=50.,
    random_state=0,  # fixed seed so the fit is repeatable
)
t0 = time()
lda.fit(tf)
elapsed = time() - t0
print("done in %0.3fs." % elapsed)

Fitting LDA models with tf features, n_samples=2000 and n_features=1000...
done in 2.879s.


In [6]:
# List the n_top_words highest-weighted words of each fitted LDA topic.
print("\nTopics in LDA model:")
# get_feature_names() was removed in scikit-learn 1.2. Prefer its
# replacement, get_feature_names_out(), and fall back to the old method on
# releases that predate it. list() keeps tf_feature_names a plain list, as
# it was before, whichever method is used.
if hasattr(tf_vectorizer, "get_feature_names_out"):
    tf_feature_names = list(tf_vectorizer.get_feature_names_out())
else:
    tf_feature_names = list(tf_vectorizer.get_feature_names())
print_top_words(lda, tf_feature_names, n_top_words)


Topics in LDA model:
Topic #0: edu com mail send graphics ftp pub available contact university list faq ca information cs 1993 program sun uk mit
Topic #1: don like just know think ve way use right good going make sure ll point got need really time doesn
Topic #2: christian think atheism faith pittsburgh new bible radio games alt lot just religion like book read play time subject believe
Topic #3: drive disk windows thanks use card drives hard version pc software file using scsi help does new dos controller 16
Topic #4: hiv health aids disease april medical care research 1993 light information study national service test led 10 page new drug
Topic #5: god people does just good don jesus say israel way life know true fact time law want believe make think
Topic #6: 55 10 11 18 15 team game 19 period play 23 12 13 flyers 20 25 22 17 24 16
Topic #7: car year just cars new engine like bike good oil insurance better tires 000 thing speed model brake driving performance
Topic #8: people said

In [8]:
# Normalize each topic's word weights so that every row sums to 1, turning
# lda.components_ into one probability distribution over the words per topic.
# keepdims=True keeps the row totals as a column, so broadcasting divides
# every row by its own total in a single vectorized step instead of a
# Python-level loop over topics.
topic_totals = lda.components_.sum(axis=1, keepdims=True)
topic_word_distributions = lda.components_ / topic_totals

In [9]:
# Sanity check: the first topic's normalized weights should add up to 1.
topic_word_distributions[0].sum()

1.0

In [10]:
# Show, for every topic, its n_top_words most probable words together with
# each word's probability under that topic's normalized distribution.
print('Displaying the top', n_top_words, 'words per topic and their probabilities within the topic...')
print()

# The vocabulary lookup does not depend on the loop variables, so do it once
# here rather than once per printed word. Prefer get_feature_names_out()
# when it exists: get_feature_names() was removed in scikit-learn 1.2.
if hasattr(tf_vectorizer, 'get_feature_names_out'):
    vocabulary = tf_vectorizer.get_feature_names_out()
else:
    vocabulary = tf_vectorizer.get_feature_names()

for topic_idx in range(n_components):
    print('[ Topic', topic_idx, ']')
    word_probs = topic_word_distributions[topic_idx]
    # argsort is ascending, so reverse it to walk from the most to the least
    # probable word. Slicing (instead of indexing by rank) also copes with a
    # vocabulary that holds fewer than n_top_words words.
    for word_idx in np.argsort(word_probs)[::-1][:n_top_words]:
        print(vocabulary[word_idx], ':', word_probs[word_idx])
    print()

Displaying the top 20 words per topic and their probabilities within the topic...

[ Topic 0 ]
edu : 0.05201295519414276
com : 0.02330146798243792
mail : 0.018757677200607305
send : 0.017853034313902207
graphics : 0.01595098191416805
ftp : 0.014210395836814476
pub : 0.013452740553974222
available : 0.011952686203526925
contact : 0.011686121720430657
university : 0.010583275527159331
list : 0.009855197508471462
faq : 0.00957636727342271
ca : 0.009509668275976258
information : 0.009114667086822981
cs : 0.008586300586097028
1993 : 0.007287622462962467
program : 0.007127524628490156
sun : 0.007015770896936095
uk : 0.006898763186196177
mit : 0.006855242806574545

[ Topic 1 ]
don : 0.02252631873000027
like : 0.01920069484451604
just : 0.016172949248556064
know : 0.014221316124639132
think : 0.013672203283395329
ve : 0.012894625646772533
way : 0.012293395990878963
use : 0.010477578387284938
right : 0.0104737325409385
good : 0.010437073090386382
going : 0.010000710949059645
make : 0.0090126369