In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import LatentDirichletAllocation, NMF
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from HW2 import DATA_DIR


In [6]:
# Cap on the vocabulary size, and how many words are reported per topic.
n_features = 1000
n_top_words = 20

# 20 Newsgroups posts, requested with headers, footers and quoted replies
# removed; the corpus is cached under DATA_DIR.
dataset = fetch_20newsgroups(
    data_home=DATA_DIR,
    remove=('headers', 'footers', 'quotes'),
)
data_samples = dataset.data

# Raw term counts: English stop words are dropped, as are terms that occur
# in more than 95% of the documents or in fewer than 2 documents; at most
# n_features terms are kept.
tf_vectorizer = CountVectorizer(
    max_df=0.95,
    min_df=2,
    max_features=n_features,
    stop_words='english',
)
tf = tf_vectorizer.fit_transform(data_samples)


def print_topics(model, vectorizer, top_n=10):
    """Print the ``top_n`` highest-weighted terms of every topic in ``model``.

    For each row of ``model.components_`` a ``Topic <idx>:`` header is
    printed, followed by a list of ``(term, weight)`` tuples ordered from the
    largest weight to the smallest.

    Args:
        model: Fitted model exposing ``components_``; each row is indexed by
            feature position and must support ``argsort()``.
        vectorizer: Fitted vectorizer exposing ``get_feature_names_out()``
            or, on older scikit-learn releases, ``get_feature_names()``.
        top_n: Number of terms printed per topic.
    """
    # Resolve the vocabulary once instead of rebuilding it for every printed
    # term. Prefer the newer accessor and fall back to the legacy one so the
    # helper works across scikit-learn versions.
    get_names = getattr(vectorizer, "get_feature_names_out", None)
    if get_names is None:
        get_names = vectorizer.get_feature_names
    feature_names = get_names()

    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        # argsort() is ascending, so the reversed tail slice yields the
        # top_n largest weights first.
        print([(feature_names[i], topic[i])
               for i in topic.argsort()[:-top_n - 1:-1]])
def print_top_words(model, feature_names, n_top_words, is_nmf=False):
    """Print one line per topic listing its ``n_top_words`` heaviest terms.

    When ``is_nmf`` is true only the terms are listed. Otherwise every term
    is followed by its weight divided by the sum of that topic's weights.

    Args:
        model: Fitted model exposing ``components_``; each row is indexed by
            feature position and must support ``argsort()`` and ``sum()``.
        feature_names: Sequence mapping a feature index to its term.
        n_top_words: Number of terms printed per topic.
        is_nmf: Print bare terms without the normalised weight.
    """
    for topic_no, weights in enumerate(model.components_):
        # TODO: normalisation for NMF
        # Indices of the largest weights, biggest first.
        ranked = weights.argsort()[::-1][:n_top_words]
        if is_nmf:
            parts = [feature_names[j] for j in ranked]
        else:
            total = weights.sum()
            parts = [feature_names[j] + " " + str(weights[j] / total)
                     for j in ranked]
        print("Topic #%d: " % topic_no + " ".join(parts))


# The vocabulary does not depend on k, so resolve it once before the loop.
# Prefer the newer accessor and fall back to the legacy one so the script
# runs across scikit-learn versions; list() keeps the result a plain list.
_get_feature_names = getattr(tf_vectorizer, "get_feature_names_out", None)
if _get_feature_names is None:
    _get_feature_names = tf_vectorizer.get_feature_names
tf_feature_names = list(_get_feature_names())

# Fit NMF and LDA on the term-count matrix for several topic counts and
# print the top words of every topic for each model.
for k in [10, 20, 50]:
    # NOTE(review): NMF's `alpha` argument is deprecated in newer
    # scikit-learn releases in favour of `alpha_W` / `alpha_H`, which are
    # scaled differently; left unchanged here to keep the results -- confirm
    # against the installed version.
    nmf = NMF(n_components=k, random_state=1,
              alpha=.1, l1_ratio=.5).fit(tf)
    # Seeded like NMF above so the printed LDA topics are reproducible.
    lda = LatentDirichletAllocation(n_components=k, learning_method='online',
                                    random_state=1)
    lda.fit(tf)
    print("\nTopics in NMF model:")
    print_top_words(nmf, tf_feature_names, n_top_words, True)
    print("\nTopics in LDA model:")
    print_top_words(lda, tf_feature_names, n_top_words)



Topics in NMF model:
Topic #0: max giz bhj qax bxn rlk chz air end fax output van let scx stream big file buf -bit edu
Topic #1: edu available com pub ftp version server graphics widget tar mit window subject sun motif data set export software information
Topic #2: people said know don just didn like say time god went think did going came way says come armenians told
Topic #3: file ---------------------------------------------------------------------- gun congress control firearms states united house use amendment crime directory law american second code march new national
Topic #4: output file entry program stream check line buf build open null section rules return int info size year read write
Topic #5: jpeg image gif file images format files version quality free color software programs use available display -bit don graphics ftp
Topic #6: stephanopoulos president know think going don said package did groups working just administration jobs say time press believe yes mean
Topic #7: 


Topics in NMF model:
Topic #0: max bhj giz qax rlk end fax van let red comp standard scsi- total years built build scsi hard avoid
Topic #1: file ---------------------------------------------------------------------- gun congress control firearms states united house amendment crime directory american law march second use code national issue
Topic #2: said didn know people don went just came say says like going told did started saw time took come armenians
Topic #3: argument true example does false form used truth evidence question logic general bible right arguments particular cause known event note
Topic #4: output file program entry stream buf check line open build null return int read size write input info year section
Topic #5: jpeg image gif file images format quality version files free color use programs software available display don -bit does note
Topic #6: stephanopoulos president know going don think said did package groups working tax mean press yes time believe just jobs s