In [128]:
from collections import Counter
import re

import numpy as np
from nltk.tokenize import sent_tokenize, word_tokenize
from scipy.stats import entropy as kl_div
from math import log

from tqdm import tqdm

## Load documents and topics from Elasticsearch

In [3]:
from elasticsearch import Elasticsearch
# Elasticsearch client created with no arguments, i.e. the library's default
# connection settings (presumably a local node — confirm for this environment).
es = Elasticsearch()

In [13]:
_text = {
    'size' : 301,
    'query': {
        'match_all' : {}
    }
}

In [20]:
# Run the match-all search against the 'duc' index and keep only the stored
# document (`_source`) of every hit, dropping the surrounding search metadata.
data = [
    hit['_source']
    for hit in es.search(index='duc', doc_type='doc', body=_text)['hits']['hits']
]

In [25]:
_topics = {
    'size' : 10,
    'query': {
        'match_all' : {}
    }
}

In [96]:
# Run the match-all search against the 'topicsduc' index and keep only the
# stored document (`_source`) of every hit.
topics = [
    hit['_source']
    for hit in es.search(index='topicsduc', doc_type='topic', body=_topics)['hits']['hits']
]

## KlSum

In [30]:
# Splits on every single non-word character. The capture group makes
# `split` return the separators too, so punctuation survives as tokens.
regex = re.compile(r"(\W)")


def wc(text):
    """Return a Counter of the tokens in `text`.

    Tokens are the pieces produced by splitting on each non-word character,
    including the separator characters themselves. Empty strings, single
    spaces and newlines are discarded; every other separator (commas,
    periods, tabs, ...) is counted as a token.
    """
    # Reuse the pre-compiled pattern instead of handing the raw pattern to
    # `re.split` on every call.
    return Counter(t for t in regex.split(text) if t and t != ' ' and t != '\n')


def pd(wc):
    """Turn a token-count mapping into a relative-frequency mapping.

    Parameters
    ----------
    wc : mapping of token -> count (e.g. the Counter returned by `wc`).
        NOTE: the parameter name shadows the `wc` function inside this body;
        it is kept so existing callers are unaffected.

    Returns
    -------
    dict mapping each token to count / total. An empty input gives `{}`.
    """
    # Compute the total once rather than once per key.
    total = sum(wc.values())
    return {k: v / total for k, v in wc.items()}

In [31]:
def klsum(document, summary, L, smoothing=0.001):
    """Greedily extend `summary` with sentences from `document` (KLSum).

    On each step every remaining sentence is tentatively appended to the
    summary, the token distribution of that candidate summary is compared
    with the token distribution of the full document via `kl_div`, and the
    sentence giving the smallest divergence is kept.

    Parameters
    ----------
    document : str
        Text to summarise.
    summary : str
        Summary to start from (pass '' to start empty).
    L : int
        Stop once `sent_tokenize(summary)` yields at least this many sentences.
    smoothing : float, optional
        Value used for document tokens that do not occur in the candidate
        summary, so the divergence stays finite. Defaults to the previously
        hard-coded 0.001.

    Returns
    -------
    str
        The summary, selected sentences joined by newlines, stripped. If the
        document has fewer sentences than requested, all of them are used
        and the function returns instead of raising IndexError.
    """
    doc_pd = pd(wc(document))
    vocab = list(doc_pd)
    px = [doc_pd[k] for k in vocab]

    # Tokenise once and remove picked sentences from this list. Re-joining
    # and re-tokenising the leftover text on every pass could shift sentence
    # boundaries between iterations.
    candidates = sent_tokenize(document)

    while candidates and len(sent_tokenize(summary)) < L:
        # Default to the first candidate so that progress is made even if no
        # divergence compares below infinity (e.g. NaN results).
        best_kl, best_idx = float('inf'), 0
        for idx, sent in enumerate(candidates):
            # Score exactly the text that would be stored: the newline
            # separator keeps the last summary token and the first sentence
            # token from merging, and `wc` ignores the newline itself.
            new_pd = pd(wc(summary + "\n" + sent))
            qx = [new_pd.get(k, smoothing) for k in vocab]
            kl = kl_div(px, qx)
            if kl < best_kl:
                best_kl, best_idx = kl, idx

        summary += "\n" + candidates.pop(best_idx)

    return summary.strip()

In [49]:
# One-sentence KLSum summary for every loaded document, keyed by its doc_id.
kl_summaries = {d['doc_id']: klsum(d['doc_text'], '', 1) for d in data}

## LDASum

In [131]:
def ldasum(data, summary, topics, L):
    """Extend `summary` with the highest topic-scoring sentences of a document.

    A sentence's score is the sum, over every topic assigned to the document
    and every token of the sentence, of the topic's probability for that
    token (tokens outside the topic's `top_words` contribute nothing).
    Sentences are appended in descending score order; ties go to the
    earlier sentence.

    Parameters
    ----------
    data : dict
        Must provide 'doc_text' (str) and 'doc_topics' (comma-separated
        topic ids).
    summary : str
        Summary to start from (pass '' to start empty).
    topics : iterable of dict
        Each entry provides 'topic_id', 'top_words' and 'word_prob'.
        Assumes 'top_words' and 'word_prob' are parallel sequences
        (word i has probability i) — TODO confirm against the index schema.
    L : int
        Stop once `sent_tokenize(summary)` yields at least this many sentences.

    Returns
    -------
    str
        The summary, stripped. If the document has fewer sentences than
        requested, all of them are used and the function returns instead of
        failing in `np.argmax` on an empty list.
    """
    doc_topics = {int(t.strip()) for t in data['doc_topics'].split(',')}
    # NOTE: 'doc_topics_pd' used to be parsed here but never took part in the
    # scoring, so it is no longer read.

    # One word -> probability table per topic assigned to this document.
    # `setdefault` keeps the first occurrence of a repeated word, matching
    # what `list.index` returned before.
    word_weights = []
    for topic in topics:
        if topic['topic_id'] in doc_topics:
            weights = {}
            for word, prob in zip(topic['top_words'], topic['word_prob']):
                weights.setdefault(word, prob)
            word_weights.append(weights)

    # A sentence's score does not depend on the summary built so far, so
    # tokenise and score every sentence exactly once.
    candidates = sent_tokenize(data['doc_text'])
    scores = []
    for sent in candidates:
        tokens = word_tokenize(sent)
        score = 0
        for weights in word_weights:
            for word in tokens:
                if word in weights:
                    score += weights[word]
        scores.append(score)

    while candidates and len(sent_tokenize(summary)) < L:
        best = int(np.argmax(scores))  # first maximum wins on ties
        summary += ' \n ' + candidates.pop(best)
        scores.pop(best)

    return summary.strip()

In [132]:
# One-sentence LDA summary for every loaded document, keyed by its doc_id;
# tqdm wraps the iteration to show progress.
lda_summaries = {d['doc_id']: ldasum(d, '', topics, 1) for d in tqdm(data)}

100%|██████████| 301/301 [00:13<00:00, 22.96it/s]


## Index the summaries into Elasticsearch

In [139]:
# Attach both summaries to each document dict (in place) and write the
# enriched document to the 'ducsummary' index.
for d in tqdm(data):
    _id = d['doc_id']
    d.update(kl_summary=kl_summaries[_id], lda_summary=lda_summaries[_id])
    es.index(index='ducsummary', doc_type='doc', body=d)

100%|██████████| 301/301 [00:19<00:00, 15.23it/s]


## TopicSum

In [6]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

In [8]:
# Bag-of-words vectoriser: English stop words removed, document-frequency
# bounds min_df=2 / max_df=0.95, vocabulary capped at 1000 features.
vectorizer = CountVectorizer(
    stop_words='english',
    max_df=0.95,
    min_df=2,
    max_features=1000,
)

# NOTE(review): `duc_data` is not defined in the cells shown here —
# presumably the list of raw document texts; confirm on a fresh kernel.
features = vectorizer.fit_transform(duc_data)

# 10-topic LDA with a fixed seed, online learning, using all available cores.
model = LatentDirichletAllocation(
    n_components=10,
    random_state=666,
    learning_method='online',
    n_jobs=-1,
)
model.fit(features)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=10, n_jobs=-1, n_topics=None, perp_tol=0.1,
             random_state=666, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [135]:
def topicsum(document, summary, L):
    """Greedily extend `summary` so its topic mixture tracks the document's.

    The document and each candidate summary are mapped to topic
    distributions with the module-level `vectorizer` and `model`. On each
    step the sentence whose addition gives the smallest `kl_div` between the
    document's distribution and the candidate summary's is kept.

    Parameters
    ----------
    document : str
        Text to summarise.
    summary : str
        Summary to start from (pass '' to start empty).
    L : int
        Stop once `sent_tokenize(summary)` yields at least this many sentences.

    Returns
    -------
    str
        The summary, selected sentences joined by newlines, stripped. If the
        document has fewer sentences than requested, all of them are used
        and the function returns instead of raising IndexError.
    """
    px = model.transform(vectorizer.transform([document]))[0]

    # Tokenise once and remove picked sentences from this list. Re-joining
    # and re-tokenising the leftover text on every pass could shift sentence
    # boundaries between iterations.
    candidates = sent_tokenize(document)

    while candidates and len(sent_tokenize(summary)) < L:
        # Default to the first candidate so that progress is made even if no
        # divergence compares below infinity (e.g. NaN results).
        best_kl, best_idx = float('inf'), 0
        for idx, sent in enumerate(candidates):
            # Score exactly the text that would be stored, newline separator
            # included, so adjacent words cannot fuse into one token.
            candidate_summary = summary + "\n" + sent
            qx = model.transform(vectorizer.transform([candidate_summary]))[0]
            kl = kl_div(px, qx)
            if kl < best_kl:
                best_kl, best_idx = kl, idx

        summary += "\n" + candidates.pop(best_idx)

    return summary.strip()

In [136]:
# Smoke test: print a two-sentence TopicSum summary for a single document.
# print() is used so the newline between the selected sentences is rendered.
# NOTE(review): `docs` is not defined in the cells shown here — presumably a
# doc_id -> text mapping; confirm it exists on a fresh kernel.
print(topicsum(docs['AP880217-0175'], '', 2))

The group contends that including the estimated 2 million or
more illegal aliens in the national head count, which is used to
distribute seats in the House of Representatives, will cause unfair
shifts of seats from one state to another.
Some 40 members of the House joined the Federation for American
Immigration Reform in announcing that the suit would be filed
Thursday in U.S. District Court in Pittsburgh, spokesmen said at a
news conference here.
