In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [2]:
from sklearn.decomposition import LatentDirichletAllocation as LDA, NMF

In [23]:
from elasticsearch import Elasticsearch
# Module-level Elasticsearch client; es_topic_indexer reads this global when indexing topics.
# NOTE(review): no hosts are passed, so the client's built-in defaults apply —
# presumably a local node; confirm before running against another cluster.
es = Elasticsearch()

## Helper functions

In [3]:
# partly taken from http://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda.html
def print_top_words(model, feature_names, n_top_words):
    """Print and return the highest-weighted words of every topic in a fitted model.

    Parameters
    ----------
    model : object with a ``components_`` attribute
        Each row of ``components_`` is treated as one topic and must support
        ``argsort()`` and ``sum()`` (e.g. a row of a NumPy array).
    feature_names : sequence of str
        Vocabulary, indexed by column of ``components_``.
    n_top_words : int
        How many words to report per topic.

    Returns
    -------
    dict
        ``{topic_idx: {word: weight}}`` where ``weight`` is the word's component
        value divided by the sum of that topic's row. Words are inserted from
        the highest weight to the lowest.
    """
    ret = {}
    for topic_idx, topic in enumerate(model.components_):
        # The normaliser is identical for every word of the topic, so compute
        # it once instead of re-summing the whole row per reported word.
        total = topic.sum()
        # argsort is ascending; the reversed slice picks the n_top_words largest.
        top_indices = topic.argsort()[:-n_top_words - 1:-1]
        ret[topic_idx] = {feature_names[i]: topic[i] / total for i in top_indices}
        message = "Topic #%d: " % topic_idx
        message += " ".join(k + ' (%.5f)' % v for k, v in ret[topic_idx].items())
        print(message)
    print()
    return ret

In [4]:
def decomposition_helper(data, method='lda', n_components=[10], vectorizer=CountVectorizer, print_num=20):
    """Vectorize ``data`` and fit one topic model per requested component count.

    Parameters
    ----------
    data : iterable of str
        Raw documents.
    method : str
        ``'lda'`` fits LatentDirichletAllocation; any other value fits NMF.
    n_components : iterable of int
        Component counts to try. The default list is only iterated, never
        mutated.
    vectorizer : class
        Vectorizer class (not an instance); it is instantiated here with
        English stop words, ``max_df=0.95``, ``min_df=2`` and a vocabulary
        capped at 1000 features.
    print_num : int
        Number of top words printed and returned per topic.

    Returns
    -------
    (dict, dict)
        ``ret_models`` maps each component count to its fitted model;
        ``ret_topics`` maps each component count to the value returned by
        ``print_top_words`` for that model.
    """
    vectorized = vectorizer(stop_words='english', max_df=0.95,
                            min_df=2, max_features=1000)
    features = vectorized.fit_transform(data)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back for older installations. The vocabulary does
    # not change between models, so resolve it once outside the loop.
    if hasattr(vectorized, 'get_feature_names_out'):
        feature_names = list(vectorized.get_feature_names_out())
    else:
        feature_names = vectorized.get_feature_names()

    ret_models = {}
    ret_topics = {}
    for nc in n_components:
        print('\nFor n_components=%d' % nc)
        if method == 'lda':
            m = LDA(n_components=nc, random_state=666, learning_method='online', n_jobs=-1)
        else:
            m = NMF(n_components=nc, random_state=666)
        m.fit(features)
        ret_models[nc] = m
        ret_topics[nc] = print_top_words(m, feature_names, print_num)
    return ret_models, ret_topics

In [36]:
def es_topic_indexer(topics, index):
    """Send one document per topic to the Elasticsearch index ``index``.

    ``topics`` maps a topic id to a ``{word: weight}`` dict. Each topic is
    indexed through the module-level ``es`` client as a document holding the
    topic id and two parallel lists (the words and their weights, in the
    dict's iteration order).
    """
    for topic_id, word_weights in topics.items():
        body = {
            'topic_id': topic_id,
            'top_words': list(word_weights.keys()),
            'word_prob': list(word_weights.values()),
        }
        es.index(index=index, doc_type='topic', body=body)

## 20NG

In [5]:
# Load the 20 Newsgroups training split with headers, footers and quoted
# replies removed, shuffled with a fixed seed.
news_train = fetch_20newsgroups(
    data_home='../data/20newsgroups/',
    subset='train', remove=('headers', 'footers', 'quotes'),
    shuffle=True, random_state=666)

In [6]:
# Fit LDA on raw term counts for 10, 20 and 50 topics; the models and their
# top-word tables are kept for the Elasticsearch and analysis cells.
lda_models, lda_topics = decomposition_helper(news_train.data, method='lda', n_components=[10, 20, 50], vectorizer=CountVectorizer)


For n_components=10
Topic #0: space (0.02703) new (0.01426) armenian (0.01348) president (0.01322) 1993 (0.01257) armenians (0.01183) research (0.01182) university (0.01134) national (0.01069) turkish (0.01056) center (0.01012) program (0.01006) nasa (0.01000) information (0.00954) april (0.00892) general (0.00812) press (0.00785) comp (0.00765) request (0.00753) launch (0.00723)
Topic #1: good (0.01785) year (0.01696) like (0.01522) new (0.01437) just (0.01280) car (0.01196) game (0.01171) team (0.01030) time (0.01013) got (0.00986) years (0.00899) don (0.00834) think (0.00831) home (0.00802) best (0.00768) better (0.00757) ll (0.00738) games (0.00730) make (0.00701) high (0.00696)
Topic #2: 145 (0.08498) b8f (0.04817) 34u (0.04244) bike (0.04081) w7 (0.03784) pl (0.03738) ah (0.03444) a86 (0.03319) 0t (0.03317) chz (0.03155) 2di (0.02877) lk (0.02830) air (0.02798) 1d9 (0.02764) gm (0.02676) ms (0.02647) 7u (0.02574) sl (0.02511) cd (0.02287) d9 (0.02035)
Topic #3: use (0.01755) fil

In [7]:
# Same sweep with NMF on TF-IDF features; only the printed topics are of
# interest here, so both return values are discarded.
_, _ = decomposition_helper(news_train.data, method='nmf', n_components=[10, 20, 50], vectorizer=TfidfVectorizer)


For n_components=10
Topic #0: don (0.01778) just (0.01737) people (0.01707) think (0.01431) like (0.01203) know (0.00813) right (0.00765) ve (0.00720) did (0.00712) time (0.00708) say (0.00703) really (0.00640) good (0.00634) way (0.00602) make (0.00583) said (0.00570) going (0.00562) want (0.00561) things (0.00554) thing (0.00492)
Topic #1: card (0.08603) video (0.04278) monitor (0.02833) cards (0.02211) drivers (0.02184) bus (0.02041) vga (0.01926) driver (0.01797) color (0.01617) memory (0.01536) bit (0.01509) ram (0.01389) board (0.01304) mode (0.01295) pc (0.01256) graphics (0.01157) apple (0.01099) 16 (0.01097) modem (0.01021) speed (0.01000)
Topic #2: god (0.08988) jesus (0.03210) bible (0.01888) believe (0.01649) christ (0.01634) faith (0.01514) christian (0.01499) christians (0.01433) church (0.01198) does (0.01036) life (0.01022) sin (0.00977) truth (0.00942) lord (0.00942) say (0.00924) man (0.00859) hell (0.00842) christianity (0.00787) love (0.00771) belief (0.00758)
Topi

#### ES

In [37]:
# Index the 10-topic LDA result into the 'topics20ng' Elasticsearch index.
es_topic_indexer(lda_topics[10], 'topics20ng')

#### LDA analysis

In [8]:
# Show the first training document together with its newsgroup label.
news_train.data[0], news_train.target_names[news_train.target[0]]

("You can't make a Citizens arrest on anything but a felony.\n.\n  \n",
 'rec.motorcycles')

In [9]:
# Rebuild the document-term matrix with the same vectorizer settings that
# decomposition_helper uses internally (it does not return its vectorizer), so
# the fitted LDA models can be applied to individual documents below.
vectorizer = CountVectorizer(stop_words='english', max_df=0.95, min_df=2, max_features=1000)
features = vectorizer.fit_transform(news_train.data)

In [15]:
# For every fitted LDA model, pick the topic with the largest weight for the
# first document and list that topic's top words.
for n_topics, model in lda_models.items():
    best_topic = model.transform(features[0])[0].argmax()
    # One print with a doubled newline produces the same blank separator line.
    print(n_topics, list(lda_topics[n_topics][best_topic]), end='\n\n')

10 ['key', 'use', 'government', 'law', 'gun', 'scsi', 'public', 'chip', 'encryption', 'used', 'keys', 'security', 'control', 'clipper', 'privacy', 'motif', 'state', 'using', 'people', 'rights']

20 ['government', 'law', 'gun', 'president', 'state', 'states', 'rights', 'public', 'american', 'people', 'national', 'police', 'control', 'crime', 'guns', 'laws', 'administration', 'united', 'military', 'federal']

50 ['government', 'law', 'public', 'privacy', 'use', 'private', 'crime', 'legal', 'security', 'laws', 'federal', 'administration', 'congress', 'weapons', 'clinton', 'house', 'people', 'encryption', 'protect', 'enforcement']



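The topics get better as the number of components increases: with 20 and 50 components, the best-matching topic for this document consists of law-and-government terms, whereas the 10-component topic also mixes in encryption and hardware words such as `scsi`, `chip` and `motif`.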

## DUC

In [16]:
# Directory holding the DUC documents, relative to the working directory.
data_path = '../../data/DUC/'
# Placeholder; reassigned from get_docs(data_path) in a later cell.
docs = {}

In [17]:
def get_docs(path):
    """Parse every document file directly inside ``path``.

    Sub-directories and entries whose file name contains ``'.txt'`` are
    skipped. Every remaining file is parsed with BeautifulSoup's
    ``'html.parser'``; the stripped text of its ``docno`` element becomes the
    key and the stripped text of its ``text`` element becomes the value.

    Parameters
    ----------
    path : str
        Directory to scan, with or without a trailing separator.

    Returns
    -------
    dict
        ``{docno: text}``, inserted in sorted file-name order.

    Raises
    ------
    AttributeError
        If a parsed file has no ``docno`` or ``text`` element.
    """
    import os
    from bs4 import BeautifulSoup
    ret_dict = {}
    # Sorted so the order of the result does not depend on the order in which
    # the filesystem happens to list the directory.
    for name in sorted(os.listdir(path)):
        # Join the directory exactly once. Prefixing it twice only worked for
        # relative paths whose '..' segments cancelled the repeated prefix.
        full_path = os.path.join(path, name)
        # Test the bare file name so a '.txt' somewhere in the directory part
        # cannot cause every file to be skipped.
        if os.path.isdir(full_path) or '.txt' in name:
            continue
        with open(full_path) as infile:
            soup = BeautifulSoup(infile, 'html.parser')
            ret_dict[soup.docno.text.strip()] = soup.find('text').text.strip()
    return ret_dict

In [18]:
# Parse the DUC files into a {docno: text} mapping.
docs = get_docs(data_path)

In [19]:
# Only the document texts are needed for vectorization; the docno keys are dropped.
duc_data = list(docs.values())

In [33]:
# Fit LDA on raw term counts of the DUC texts for 10, 20 and 50 topics.
lda_models_duc, lda_topics_duc = decomposition_helper(duc_data, method='lda', n_components=[10, 20, 50], vectorizer=CountVectorizer)


For n_components=10
Topic #0: said (0.05070) crash (0.02569) air (0.02131) plane (0.02050) tunnel (0.01421) aircraft (0.01197) flight (0.01077) crashed (0.00981) force (0.00976) jet (0.00971) people (0.00940) 10 (0.00935) engine (0.00934) miles (0.00927) united (0.00909) pilot (0.00903) base (0.00891) military (0.00851) french (0.00822) near (0.00788)
Topic #1: said (0.02908) year (0.01689) disease (0.01292) percent (0.01208) earthquake (0.01065) drought (0.01038) says (0.01032) british (0.00926) people (0.00898) new (0.00755) billion (0.00683) area (0.00680) taylor (0.00676) tornado (0.00648) damage (0.00632) bse (0.00622) years (0.00556) total (0.00546) scientists (0.00537) million (0.00526)
Topic #2: said (0.00225) shining (0.00183) police (0.00179) department (0.00177) slovenia (0.00171) government (0.00163) federal (0.00163) state (0.00161) path (0.00160) hurricane (0.00154) army (0.00151) court (0.00150) people (0.00149) term (0.00147) president (0.00145) new (0.00143) slovenian

In [21]:
# Same sweep with NMF on TF-IDF features; only the printed topics are of
# interest here, so both return values are discarded.
_, _ = decomposition_helper(duc_data, method='nmf', n_components=[10, 20, 50], vectorizer=TfidfVectorizer)


For n_components=10
Topic #0: said (0.02081) crash (0.01975) forest (0.01684) air (0.01486) fires (0.01387) aircraft (0.01209) plane (0.01097) firefighters (0.01049) crashed (0.01043) acres (0.01037) jet (0.00960) base (0.00781) flight (0.00745) pilot (0.00743) park (0.00712) military (0.00690) force (0.00667) area (0.00639) national (0.00632) engine (0.00628)
Topic #1: oil (0.06840) exxon (0.06335) spill (0.04597) valdez (0.04347) cleanup (0.02252) said (0.01898) tanker (0.01839) alaska (0.01718) ship (0.01420) sound (0.01392) guard (0.01215) coast (0.01092) million (0.00932) prince (0.00906) gallons (0.00898) miles (0.00870) vessel (0.00847) 000 (0.00846) environmental (0.00784) william (0.00772)
Topic #2: hurricane (0.06818) hurricanes (0.02607) storm (0.01943) sheets (0.01790) storms (0.01578) mph (0.01526) atlantic (0.01471) winds (0.01406) tropical (0.01366) florida (0.01323) said (0.01310) gilbert (0.01178) hugo (0.01132) season (0.01120) gray (0.01095) center (0.01050) forecas

#### ES

In [38]:
# Index the 10-topic LDA result for DUC into the 'topicsduc' Elasticsearch index.
es_topic_indexer(lda_topics_duc[10], 'topicsduc')