In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import NMF, LatentDirichletAllocation

def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    For each row of ``model.components_`` a ``Topic <index>:`` header is
    printed, followed by one line holding the ``no_top_words`` entries of
    ``feature_names`` whose weights in that row are largest, from the
    largest weight downwards, separated by single spaces.

    Args:
        model: Object exposing ``components_``; it is iterated row by row
            and every row must provide ``argsort()``.
        feature_names: Indexable collection mapping a column index to the
            term printed for that column.
        no_top_words: Number of terms printed per topic.
    """
    for index, weights in enumerate(model.components_):
        print("Topic {:d}:".format(index))
        # argsort() is ascending, so reverse it and keep the leading entries.
        ranked_columns = weights.argsort()[::-1]
        top_terms = [feature_names[col] for col in ranked_columns[:no_top_words]]
        print(" ".join(top_terms))

# Fetch the 20 Newsgroups posts with headers, footers and quoted replies
# removed, shuffled with a fixed seed.
dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
documents = dataset.data
# NOTE: pop() removes the last post from the corpus as well as returning it,
# so the sample printed here is not part of the data the models are fitted on.
sample_document = documents.pop()
print(sample_document)
no_features = 1000

# NMF is able to use tf-idf weighted term frequencies.
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(documents)
# get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() (available since 1.0) is its replacement.
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()

# LDA is a probabilistic graphical model, so it is fitted on raw term counts
# rather than on tf-idf weights.
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(documents)
tf_feature_names = tf_vectorizer.get_feature_names_out()

no_topics = 20

# Run NMF
# The single `alpha` argument was replaced by alpha_W / alpha_H in
# scikit-learn 1.0 and removed in 1.2. The new penalties are multiplied by
# n_features (for W) and n_samples (for H), so the former alpha is divided by
# those sizes to keep the regularisation strength of the original alpha=.1.
nmf_alpha = .1
doc_count, term_count = tfidf.shape
nmf = NMF(
    n_components=no_topics,
    random_state=1,
    alpha_W=nmf_alpha / term_count,
    alpha_H=nmf_alpha / doc_count,
    l1_ratio=.5,
    init='nndsvd',
).fit(tfidf)

# Run LDA
# `n_topics` was renamed to `n_components` in scikit-learn 0.19 and the old
# keyword has since been removed.
lda = LatentDirichletAllocation(
    n_components=no_topics,
    max_iter=5,
    learning_method='online',
    learning_offset=50.,
    random_state=0,
).fit(tf)

# Print the ten strongest terms of every topic, NMF topics first, then LDA.
no_top_words = 10
display_topics(nmf, tfidf_feature_names, no_top_words)
display_topics(lda, tf_feature_names, no_top_words)

                                      ^^^^^^
No argument at all with Murphy.  He scared the hell out of me when he came in
last year.  On the other hand, the club though enough of Boever to put him into
an awful lot of games (he may have led the league in appearances - he did at
least at some point).  He seemed to be a very viable setup guy - but I guess
that's not considered that crucial by the club.  I can just remember two years
ago so well, though...
...

I'm not that concerned.  Those guys have been relatively consistent over the
years and they have no good reasons to decline (no injuries, not old, ...).
I expect them to come through just fine.  It's those guys that have not
been consistently good that are the worrisome part, even if they are coming
through right now.

This sounds like their old road unis.  Pretty dull.  Buttons or pullovers?
I'll check through my uniform book to see if they've always had some orange.


Well, we'll see.  I've got a Astros pullover shirt with the "



Topic 0:
people time good right did ve say make way said
Topic 1:
window problem using server application display screen manager motif running
Topic 2:
god jesus bible christ faith believe christian christians church sin
Topic 3:
game team year games season players play hockey win league
Topic 4:
new 00 sale 10 price offer shipping condition 20 15
Topic 5:
thanks mail advance hi looking info help information address email
Topic 6:
windows file files dos program version ftp ms running directory
Topic 7:
edu soon cs university ftp internet email article pub david
Topic 8:
key chip clipper encryption keys escrow government public algorithm nsa
Topic 9:
drive scsi drives hard disk ide floppy controller cd mac
Topic 10:
just thought ll got tell oh fine wanted mean little
Topic 11:
does know anybody mean work doesn say help exist program
Topic 12:
card video monitor cards drivers bus vga driver color memory
Topic 13:
like sounds looks look sound lot things bike really thing
Topic 14:
don kno