In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [2]:
from sklearn.decomposition import LatentDirichletAllocation as LDA, NMF
from tqdm import tqdm

In [3]:
from elasticsearch import Elasticsearch
# Module-level client created with no arguments; es_topic_indexer and
# es_text_indexer use it as a global.
es = Elasticsearch()

## Helper functions

In [4]:
# partly taken from http://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda.html
def print_top_words(model, feature_names, n_top_words):
    """Print and return the highest-weighted words of every topic in ``model``.

    Parameters
    ----------
    model : object
        Must expose ``components_``; each row is treated as one topic's
        weights over the vocabulary (rows must support ``argsort`` and ``sum``).
    feature_names : sequence of str
        Vocabulary, indexable by column position of ``components_``.
    n_top_words : int
        How many of the largest-weight words to keep per topic.

    Returns
    -------
    dict
        ``{topic_idx: {word: weight / sum_of_topic_weights}}`` with the words
        inserted in descending weight order.
    """
    ret = {}
    for topic_idx, topic in enumerate(model.components_):
        # The normalising constant is the same for every word of the topic:
        # compute it once, vectorised, instead of calling the builtin sum()
        # over the whole row for each of the n_top_words entries.
        total = topic.sum()
        # argsort is ascending, so walk it backwards to get the largest weights.
        top_indices = topic.argsort()[:-n_top_words - 1:-1]
        ret[topic_idx] = {feature_names[i]: topic[i] / total for i in top_indices}
        message = "Topic #%d: " % topic_idx
        message += " ".join([k + ' (%.5f)' % v for k, v in ret[topic_idx].items()])
        print(message)
    print()
    return ret

In [5]:
def decomposition_helper(data, method='lda', n_components=(10,), vectorizer=CountVectorizer, print_num=20):
    """Vectorise ``data`` and fit one topic model per requested topic count.

    Parameters
    ----------
    data : iterable of str
        Raw documents passed to the vectorizer's ``fit_transform``.
    method : str
        ``'lda'`` fits ``LatentDirichletAllocation``; any other value fits ``NMF``.
    n_components : iterable of int
        Topic counts to try; one model is fitted for each.
    vectorizer : class
        Vectorizer class (not instance); it is instantiated here with English
        stop words, ``max_df=0.95``, ``min_df=2`` and ``max_features=1000``.
    print_num : int
        Number of top words printed and returned per topic.

    Returns
    -------
    (dict, dict)
        ``{n_components: fitted model}`` and
        ``{n_components: output of print_top_words for that model}``.
        The fitted vectorizer itself is not returned.
    """
    vectorized = vectorizer(stop_words='english', max_df=0.95,
                            min_df=2, max_features=1000)
    features = vectorized.fit_transform(data)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on old releases. The vocabulary does not
    # depend on the topic count, so it is looked up once outside the loop.
    if hasattr(vectorized, 'get_feature_names_out'):
        feature_names = [str(name) for name in vectorized.get_feature_names_out()]
    else:
        feature_names = vectorized.get_feature_names()

    ret_models = {}
    ret_topics = {}
    for nc in n_components:
        print('\nFor n_components=%d' % nc)
        if method == 'lda':
            m = LDA(n_components=nc, random_state=666, learning_method='online', n_jobs=-1)
        else:
            m = NMF(n_components=nc, random_state=666)
        m.fit(features)
        ret_models[nc] = m
        ret_topics[nc] = print_top_words(m, feature_names, print_num)
    return ret_models, ret_topics

In [6]:
def es_topic_indexer(topics, index):
    """Write one Elasticsearch document per topic.

    ``topics`` maps a topic id to a ``{word: weight}`` mapping. For each topic
    the words and their weights are stored as two parallel lists and sent to
    ``index`` through the module-level ``es`` client.
    """
    for tid, word_weights in topics.items():
        payload = {
            'topic_id' : tid,
            'top_words' : list(word_weights.keys()),
            'word_prob' : list(word_weights.values()),
        }
        es.index(index=index, doc_type='topic', body=payload)

In [7]:
def es_text_indexer(data, index, gold=False):
    """Write one Elasticsearch document per entry of ``data``.

    ``data`` maps a document id to a record with ``'text'``, ``'topics'`` and
    ``'pd'`` entries (plus ``'gold'`` when ``gold`` is true). Only the first
    five topics / values are stored, each as a comma-separated string. Documents
    are sent to ``index`` through the module-level ``es`` client.
    """
    def _first_five(values):
        # Comma-separated rendering of the leading five items.
        return ', '.join(map(str, values[:5]))

    for doc_id, record in tqdm(data.items()):
        payload = dict(
            doc_id=doc_id,
            doc_text=record['text'],
            doc_topics=_first_five(record['topics']),
            doc_topics_pd=_first_five(record['pd']),
        )
        if gold:
            payload['gold_summary'] = record['gold']
        es.index(index=index, doc_type='doc', body=payload)

## 20NG

In [8]:
# Load the 20 Newsgroups training subset with 'headers', 'footers' and 'quotes'
# removed; shuffling uses a fixed random_state.
news_train = fetch_20newsgroups(
    data_home='../data/20newsgroups/',
    subset='train', remove=('headers', 'footers', 'quotes'),
    shuffle=True, random_state=666)

In [9]:
%%time
# Fit LDA on count features of the 20NG training text for 10, 20 and 50 topics,
# keeping both the fitted models and their top words (keyed by topic count).
lda_models, lda_topics = decomposition_helper(news_train.data, method='lda', n_components=[10, 20, 50], vectorizer=CountVectorizer)

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 7.63 µs

For n_components=10
Topic #0: space (0.02703) new (0.01426) armenian (0.01348) president (0.01322) 1993 (0.01257) armenians (0.01183) research (0.01182) university (0.01134) national (0.01069) turkish (0.01056) center (0.01012) program (0.01006) nasa (0.01000) information (0.00954) april (0.00892) general (0.00812) press (0.00785) comp (0.00765) request (0.00753) launch (0.00723)
Topic #1: good (0.01785) year (0.01696) like (0.01522) new (0.01437) just (0.01280) car (0.01196) game (0.01171) team (0.01030) time (0.01013) got (0.00986) years (0.00899) don (0.00834) think (0.00831) home (0.00802) best (0.00768) better (0.00757) ll (0.00738) games (0.00730) make (0.00701) high (0.00696)
Topic #2: 145 (0.08498) b8f (0.04817) 34u (0.04244) bike (0.04081) w7 (0.03784) pl (0.03738) ah (0.03444) a86 (0.03319) 0t (0.03317) chz (0.03155) 2di (0.02877) lk (0.02830) air (0.02798) 1d9 (0.02764) gm (0.02676) ms (0.02647) 7u (0.02574) sl

In [7]:
# Same sweep with NMF on TF-IDF features; only the printed topics are wanted,
# so both return values are discarded.
_, _ = decomposition_helper(news_train.data, method='nmf', n_components=[10, 20, 50], vectorizer=TfidfVectorizer)


For n_components=10
Topic #0: don (0.01778) just (0.01737) people (0.01707) think (0.01431) like (0.01203) know (0.00813) right (0.00765) ve (0.00720) did (0.00712) time (0.00708) say (0.00703) really (0.00640) good (0.00634) way (0.00602) make (0.00583) said (0.00570) going (0.00562) want (0.00561) things (0.00554) thing (0.00492)
Topic #1: card (0.08603) video (0.04278) monitor (0.02833) cards (0.02211) drivers (0.02184) bus (0.02041) vga (0.01926) driver (0.01797) color (0.01617) memory (0.01536) bit (0.01509) ram (0.01389) board (0.01304) mode (0.01295) pc (0.01256) graphics (0.01157) apple (0.01099) 16 (0.01097) modem (0.01021) speed (0.01000)
Topic #2: god (0.08988) jesus (0.03210) bible (0.01888) believe (0.01649) christ (0.01634) faith (0.01514) christian (0.01499) christians (0.01433) church (0.01198) does (0.01036) life (0.01022) sin (0.00977) truth (0.00942) lord (0.00942) say (0.00924) man (0.00859) hell (0.00842) christianity (0.00787) love (0.00771) belief (0.00758)
Topi

#### ES

In [37]:
# Store the top words of the 10-topic 20NG LDA model in the 'topics20ng' index.
es_topic_indexer(lda_topics[10], 'topics20ng')

In [11]:
# decomposition_helper does not return its fitted vectorizer, so build one with
# the same arguments and fit it on the same text to transform single documents.
vectorizer = CountVectorizer(stop_words='english', max_df=0.95, 
                             min_df=2, max_features=1000)
vectorizer.fit(news_train.data)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.95, max_features=1000, min_df=2,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [82]:
# Per-document record for the first 100 training documents:
# raw text, topic ids ranked by weight, and the weights in that same order.
_20ng_data = {}
# Wrap the list itself (not the enumerate object) so tqdm can read its length
# and display a real progress bar with a total.
for idx, doc in enumerate(tqdm(news_train.data[:100])):
    # Output row of the 10-topic LDA model for this single document
    # (``pd`` here is a plain variable name, unrelated to pandas).
    pd = lda_models[10].transform(vectorizer.transform([doc]))[0]
    # Sort once and reuse the order for both the ids and the weights.
    ranked = pd.argsort()[::-1]
    _20ng_data[idx] = {
        'text': doc,
        'topics': ranked,
        'pd': pd[ranked],
    }

100it [00:22,  4.40it/s]


In [85]:
# Index the prepared 20NG records into the '20ng' index (no gold summaries).
es_text_indexer(_20ng_data, '20ng')

100%|██████████| 100/100 [00:06<00:00, 14.58it/s]


#### LDA analysis

In [59]:
# Show the first training document together with its newsgroup name.
news_train.data[0], news_train.target_names[news_train.target[0]]

("You can't make a Citizens arrest on anything but a felony.\n.\n  \n",
 'rec.motorcycles')

In [60]:
# Rebuild count features for the whole training set with the same vectorizer
# arguments decomposition_helper uses. Note: this rebinds the global `vectorizer`.
vectorizer = CountVectorizer(stop_words='english', max_df=0.95, min_df=2, max_features=1000)
features = vectorizer.fit_transform(news_train.data)

In [62]:
# For every fitted LDA model, find the topic with the largest weight for the
# first document and list that topic's top words.
for n_topics, model in lda_models.items():
    topic_scores = model.transform(features[0])[0]
    best_topic = topic_scores.argmax()
    print(n_topics, list(lda_topics[n_topics][best_topic]))
    print()

10 ['key', 'use', 'government', 'law', 'gun', 'scsi', 'public', 'chip', 'encryption', 'used', 'keys', 'security', 'control', 'clipper', 'privacy', 'motif', 'state', 'using', 'people', 'rights']

20 ['government', 'law', 'gun', 'president', 'state', 'states', 'rights', 'public', 'american', 'people', 'national', 'police', 'control', 'crime', 'guns', 'laws', 'administration', 'united', 'military', 'federal']

50 ['government', 'law', 'public', 'privacy', 'use', 'private', 'crime', 'legal', 'security', 'laws', 'federal', 'administration', 'congress', 'weapons', 'clinton', 'house', 'people', 'encryption', 'protect', 'enforcement']



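The best-matching topic becomes more coherent as the number of components increases: with 10 components it mixes encryption and hardware terms (e.g. 'scsi', 'chip', 'motif') with legal ones, while with 20 and 50 components it focuses on law and government.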

## DUC

In [8]:
# Location of the DUC corpus, relative to the notebook.
data_path = '../../data/DUC/'
# Placeholder; replaced by the result of get_docs(data_path).
docs = {}

In [9]:
def get_docs(path):
    """Collect DUC documents and their gold summaries from the summary files.

    Every file in ``<path>/Summaries`` is expected to contain
    ``Abstract: <summary> Introduction: <document text>``. A summary file is
    used only when ``path`` also contains an entry whose name equals the
    summary's file name up to the first ``'.'``, upper-cased; otherwise a
    ``no file for ...`` line is printed and the summary is skipped.

    Parameters
    ----------
    path : str
        Corpus directory; a trailing path separator is optional.

    Returns
    -------
    dict
        ``{DOC_ID: [document_text, gold_summary]}`` with both strings stripped.

    Raises
    ------
    ValueError
        If a summary file has no ``Introduction:`` marker.
    IndexError
        If the text before ``Introduction:`` has no ``Abstract:`` marker.
    """
    import os

    # os.path.join works whether or not `path` ends with a separator.
    gold_path = os.path.join(path, "Summaries")

    ret_dict = {}

    # A set keeps the per-summary membership check constant-time.
    doc_files = set(os.listdir(path))

    for gold_file in os.listdir(gold_path):
        file_name = gold_file.split('.')[0].upper()
        if file_name not in doc_files:
            print('no file for %s' % gold_file)
            continue

        sum_path = os.path.join(gold_path, gold_file)

        with open(sum_path) as sum_file:
            content = sum_file.read().strip()

        # Split on the FIRST marker only: the document body may itself contain
        # the word "Introduction:", which must stay part of the text.
        summary, marker, doc = content.partition('Introduction:')
        if not marker:
            raise ValueError('no "Introduction:" section in %s' % sum_path)
        summary = summary.split('Abstract:', 1)[1]

        ret_dict[file_name] = [doc.strip(), summary.strip()]
    return ret_dict

In [10]:
# Maps upper-cased document id -> [document text, gold summary]; summaries
# without a matching document file are reported and skipped.
docs = get_docs(data_path)

no file for ap890325-0143.txt
no file for ap900928-0054.txt


In [11]:
# Keep only the document text (first element of each entry) for topic modelling.
duc_data = [entry[0] for entry in docs.values()]

In [12]:
# Fit LDA on count features of the DUC documents for 10, 20 and 50 topics,
# keeping both the fitted models and their top words (keyed by topic count).
lda_models_duc, lda_topics_duc = decomposition_helper(duc_data, method='lda', n_components=[10, 20, 50], vectorizer=CountVectorizer)


For n_components=10
Topic #0: said (0.03492) hurricane (0.02118) taylor (0.01344) year (0.01271) eclipse (0.01230) disease (0.01008) drought (0.01001) people (0.00772) storm (0.00725) sun (0.00725) hurricanes (0.00705) center (0.00666) years (0.00647) hospital (0.00594) florida (0.00585) pneumonia (0.00574) farmers (0.00560) miami (0.00547) new (0.00544) 000 (0.00537)
Topic #1: said (0.04476) police (0.02450) oil (0.01474) exxon (0.01263) earthquake (0.00902) forest (0.00884) year (0.00869) 000 (0.00868) national (0.00867) department (0.00852) fires (0.00802) spill (0.00763) officers (0.00727) officials (0.00700) miles (0.00690) officer (0.00680) valdez (0.00675) park (0.00658) state (0.00650) service (0.00607)
Topic #2: said (0.02321) gun (0.01948) right (0.01933) party (0.01903) assassination (0.01902) government (0.01630) people (0.01514) amendment (0.01443) arms (0.01306) second (0.01176) candidate (0.01173) police (0.01125) state (0.01019) military (0.00976) presidential (0.00933

In [13]:
# Same sweep with NMF on TF-IDF features; only the printed topics are wanted,
# so both return values are discarded.
_, _ = decomposition_helper(duc_data, method='nmf', n_components=[10, 20, 50], vectorizer=TfidfVectorizer)


For n_components=10
Topic #0: police (0.05493) officers (0.02296) said (0.01767) gates (0.01551) shining (0.01457) brutality (0.01425) path (0.01198) department (0.01185) officer (0.01098) commission (0.01017) angeles (0.00970) los (0.00961) jackson (0.00951) city (0.00945) black (0.00924) chief (0.00862) report (0.00773) racism (0.00666) guzman (0.00651) complaints (0.00633)
Topic #1: hurricane (0.06908) hurricanes (0.02640) storm (0.01933) sheets (0.01793) storms (0.01608) atlantic (0.01500) mph (0.01490) tropical (0.01395) winds (0.01380) florida (0.01344) said (0.01316) gilbert (0.01174) hugo (0.01129) gray (0.01110) season (0.01096) forecasters (0.01050) center (0.01014) miami (0.00898) louisiana (0.00805) weather (0.00788)
Topic #2: oil (0.06755) exxon (0.06428) spill (0.04570) valdez (0.03968) cleanup (0.02074) alaska (0.01938) said (0.01683) tanker (0.01627) sound (0.01238) ship (0.01162) million (0.00990) guard (0.00981) miles (0.00974) coast (0.00932) wildlife (0.00794) will

#### ES

In [38]:
# Store the top words of the 10-topic DUC LDA model in the 'topicsduc' index.
es_topic_indexer(lda_topics_duc[10], 'topicsduc')

In [14]:
# decomposition_helper does not return its fitted vectorizer, so build one with
# the same arguments and fit it on the DUC text. Note: this rebinds the global
# `vectorizer`.
vectorizer = CountVectorizer(stop_words='english', max_df=0.95, 
                             min_df=2, max_features=1000)
vectorizer.fit(duc_data)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.95, max_features=1000, min_df=2,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [15]:
# Per-document record for every DUC document: raw text, gold summary,
# topic ids ranked by weight, and the weights in that same order.
_duc_data = {}
# Iterate the dict items directly: the enumerate index was never used, and
# handing tqdm a sized view lets it show a progress bar with a total.
for idx, doc in tqdm(docs.items()):
    # Output row of the 10-topic DUC LDA model for this single document
    # (``pd`` here is a plain variable name, unrelated to pandas).
    pd = lda_models_duc[10].transform(vectorizer.transform([doc[0]]))[0]
    # Sort once and reuse the order for both the ids and the weights.
    ranked = pd.argsort()[::-1]
    _duc_data[idx] = {
        'text': doc[0],
        'gold': doc[1],
        'topics': ranked,
        'pd': pd[ranked],
    }

301it [00:53,  5.59it/s]


In [16]:
# Index the prepared DUC records, including gold summaries, into the 'duc' index.
es_text_indexer(_duc_data, 'duc', gold=True)

100%|██████████| 301/301 [00:20<00:00, 14.64it/s]
