In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [2]:
from sklearn.decomposition import LatentDirichletAllocation as LDA, NMF

## Helper functions

In [3]:
# taken from http://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda.html
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    Parameters
    ----------
    model : object
        Fitted decomposition whose ``components_`` attribute yields one
        weight vector per topic; each vector must support ``argsort()``.
    feature_names : sequence
        Term for each column index of ``components_``.
    n_top_words : int
        Number of terms to show per topic, in descending weight order.

    One line is printed per topic (``Topic #<idx>: term term ...``),
    followed by a single blank line.
    """
    for topic_no, weights in enumerate(model.components_):
        # argsort() is ascending, so walking the tail backwards yields the
        # n_top_words largest weights, biggest first.
        ranked = weights.argsort()[:-n_top_words - 1:-1]
        top_terms = [feature_names[idx] for idx in ranked]
        print("Topic #%d: " % topic_no + " ".join(top_terms))
    print()

In [9]:
def decomposition_helper(data, method, n_components, vectorizer=CountVectorizer, print_num=20):
    """Fit a topic model for several topic counts and print each model's top words.

    The documents are vectorized once; a fresh model is then fitted on the
    same feature matrix for every requested number of components.

    Parameters
    ----------
    data : collection of documents
        Passed unchanged to the vectorizer's ``fit_transform``.
    method : callable
        Decomposition class (e.g. ``LDA`` or ``NMF``). It is invoked as
        ``method(n_components=nc, random_state=666)`` and the result must
        provide ``fit`` and ``components_``.
    n_components : iterable of int
        Topic counts to try, one fitted model per entry.
    vectorizer : callable, default ``CountVectorizer``
        Vectorizer class, constructed with English stop words,
        ``max_df=0.95``, ``min_df=2`` and ``max_features=1000``.
    print_num : int, default 20
        Number of top words printed per topic.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    vectorized = vectorizer(stop_words='english', max_df=0.95,
                            min_df=2, max_features=1000)
    features = vectorized.fit_transform(data)

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer its replacement and fall back only on older releases.
    # The vocabulary does not depend on the component count, so look it up
    # once instead of on every loop iteration.
    if hasattr(vectorized, 'get_feature_names_out'):
        feature_names = vectorized.get_feature_names_out()
    else:
        feature_names = vectorized.get_feature_names()

    for nc in n_components:
        print('For n_components=%d' % nc)
        # Fixed seed keeps the printed topics reproducible between runs.
        model = method(n_components=nc, random_state=666)
        model.fit(features)
        print_top_words(model, feature_names, print_num)
        print('\n')

## 20 Newsgroups (20NG)

In [5]:
# Training split of 20 Newsgroups, with ../data/20newsgroups/ as data_home.
# Headers, footers and quoted replies are removed from every post; the
# shuffle is seeded so the document order is the same on every run.
news_train = fetch_20newsgroups(
    subset='train',
    data_home='../data/20newsgroups/',
    remove=('headers', 'footers', 'quotes'),
    shuffle=True,
    random_state=666,
)

In [6]:
# LDA on raw term counts, fitted for 10, 20 and 50 topics.
decomposition_helper(
    news_train.data,
    method=LDA,
    n_components=[10, 20, 50],
    vectorizer=CountVectorizer,
)

For n_components=10
Topic #0: space new armenian president 1993 armenians research university national turkish center program nasa information april general press comp request launch
Topic #1: good year like new just car game team time got years don think home best better ll games make high
Topic #2: 145 b8f 34u bike w7 pl ah a86 0t chz 2di lk air 1d9 gm ms 7u sl cd d9
Topic #3: use file windows drive software program using files version card data does window problem available dos ftp pc disk image
Topic #4: edu com mail send thanks email information list internet post know cs subject address posting info news article gov group
Topic #5: ax max g9v b8f a86 pl giz 1d9 1t bhj 3t 75u 7ey 2tm 0t bxn 2di wm 34u gk
Topic #6: key use government law gun scsi public chip encryption used keys security control clipper privacy motif state using people rights
Topic #7: people don just think know like say time said did way things going ve make want really right does point
Topic #8: 10 00 25 15 12 11

In [10]:
# NMF on TF-IDF weighted features, fitted for 10, 20 and 50 topics.
decomposition_helper(
    news_train.data,
    method=NMF,
    n_components=[10, 20, 50],
    vectorizer=TfidfVectorizer,
)

For n_components=10
Topic #0: don just people think like know right ve did time say really good way make said going want things thing
Topic #1: card video monitor cards drivers bus vga driver color memory bit ram board mode pc graphics apple 16 modem speed
Topic #2: god jesus bible believe christ faith christian christians church does life sin truth lord say man hell christianity love belief
Topic #3: game team year games season players play hockey win league player teams nhl good runs better best hit think time
Topic #4: new car 00 10 sale price space condition offer used good 20 bike 50 shipping 15 old power interested 30
Topic #5: thanks know does mail advance anybody hi looking info help appreciated information email address post need interested send like appreciate
Topic #6: windows file use files window dos program using problem running version run application server screen ms image help software ftp
Topic #7: edu soon com university cs email article internet send mail ftp david 