In [1]:
from collections import Counter
import re

import numpy as np
from nltk.tokenize import sent_tokenize, word_tokenize
from scipy.stats import entropy as kl_div
from math import log

In [2]:
data_path = '../../data/DUC/'
docs = {}

In [3]:
def get_docs(path):
    import os
    from bs4 import BeautifulSoup
    ret_dict = {}
    files = os.listdir(path)
    for file in files:
        file = path + file
        if os.path.isdir(file) or '.txt' in file:
            continue
        with open(path + file) as infile:
            soup = BeautifulSoup(infile, 'html.parser')
            ret_dict[soup.docno.text.strip()] = soup.find('text').text.strip()
    return ret_dict

In [4]:
%%time
docs = get_docs(data_path)

CPU times: user 535 ms, sys: 40.9 ms, total: 576 ms
Wall time: 2.37 s


In [5]:
duc_data = [v for k, v in docs.items()]

## KlSum

In [87]:
regex = re.compile(r"(\W)")
wc = lambda text: Counter([t for t in re.split(r"(\W)", text) if t and t != ' ' and t != '\n'])
pd = lambda wc: {k: v/sum(wc.values()) for k, v in wc.items()}

In [110]:
def klsum(document, summary, L):
    doc_sent = sent_tokenize(document)
    doc_wc = wc(document)
    doc_pd = pd(doc_wc)
    px = [p for p in doc_pd.values()]
    
    while len(sent_tokenize(summary)) < L:
        sentences = sent_tokenize(document)
        
        _min, _min_id = 999, -1
        for idx, sent in enumerate(sentences):
            new_sum = summary + sent
            new_pd = pd(wc(new_sum))
            qx = [new_pd[k] if k in new_pd else 0.001 for k in doc_pd.keys()]
            kl = kl_div(px, qx)
            if kl < _min:
                _min, _min_id = kl, idx
        
        summary += "\n" + sentences[_min_id]
        document = " ".join(sentences[:_min_id] + sentences[_min_id+1:])
    
    return summary.strip()

In [132]:
%%time
print(klsum_new(docs['AP880217-0175'], '', 2))




Census officials say they are required to count everyone by the
U.S. Constitution, which does not mention citizenship but only
instructs that the House apportionment be based on the ``whole
number of persons'' residing in the various states.


Rep. Tom Ridge, R-Pa., said the Census Bureau should actually
count everyone but that it should develop a method to determine how
many people are illegally in the country, and them deduct that
number from the figures used for reapportioning Congress.
CPU times: user 7.98 ms, sys: 10 µs, total: 7.99 ms
Wall time: 7.67 ms


## TopicSum

In [6]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

In [8]:
vectorizer = CountVectorizer(stop_words='english', max_df=0.95, min_df=2, max_features=1000)
features = vectorizer.fit_transform(duc_data)
model = LatentDirichletAllocation(n_components=10, random_state=666, learning_method='online', n_jobs=-1)
model.fit(features)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=10, n_jobs=-1, n_topics=None, perp_tol=0.1,
             random_state=666, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [135]:
def topicsum(document, summary, L):
    px = model.transform(vectorizer.transform([document]))[0]
    
    while len(sent_tokenize(summary)) < L:
        sentences = sent_tokenize(document)
        
        _min, _min_id = 999, -1
        for idx, sent in enumerate(sentences):
            new_sum = summary + sent
            qx = model.transform(vectorizer.transform([new_sum]))[0]
            kl = kl_div(px, qx)
            if kl < _min:
                _min, _min_id = kl, idx
        
        summary += "\n" + sentences[_min_id]
        document = " ".join(sentences[:_min_id] + sentences[_min_id+1:])
    
    return summary.strip()

In [136]:
print(topicsum(docs['AP880217-0175'], '', 2))

The group contends that including the estimated 2 million or
more illegal aliens in the national head count, which is used to
distribute seats in the House of Representatives, will cause unfair
shifts of seats from one state to another.
Some 40 members of the House joined the Federation for American
Immigration Reform in announcing that the suit would be filed
Thursday in U.S. District Court in Pittsburgh, spokesmen said at a
news conference here.
