In [24]:
from collections import Counter
import re

import numpy as np
from nltk.tokenize import sent_tokenize, word_tokenize
from scipy.stats import entropy as kl_div
from math import log

from tqdm import tqdm
from rouge import Rouge

## Load topics and data from ES

In [2]:
from elasticsearch import Elasticsearch
# Shared client used by every cell below; created with the constructor defaults
# (no hosts or credentials are passed here).
es = Elasticsearch()

In [3]:
def indexer(index, doc_type, body):
    """Run a search through the shared ``es`` client and return the stored documents.

    Args:
        index: name of the index to search.
        doc_type: document type, passed straight through to ``es.search``.
        body: query body, passed straight through to ``es.search``.

    Returns:
        A list with the ``_source`` entry of every hit in the response; the
        rest of each hit is discarded.
    """
    response = es.search(index=index, doc_type=doc_type, body=body)
    return [hit['_source'] for hit in response['hits']['hits']]

In [5]:
# Match-all query capped at 301 hits, run against the DUC summary index.
_text = {'size': 301, 'query': {'match_all': {}}}
duc_es = indexer(index='ducsummary', doc_type='doc', body=_text)

In [6]:
# Match-all query capped at 500 hits, run against the 20 Newsgroups summary index.
_text = {'size': 500, 'query': {'match_all': {}}}
_20ng_es = indexer(index='20ngsummary', doc_type='doc', body=_text)

In [7]:
# The same match-all query (at most 10 hits) is sent to both topic indices.
_topics = {'size': 10, 'query': {'match_all': {}}}
duc_topics = indexer(index='topicsduc', doc_type='topic', body=_topics)
_20ng_topics = indexer(index='topics20ng', doc_type='topic', body=_topics)

## KlSum

In [16]:
# Splits on every non-word character; the capture group keeps those separators
# in the result, so punctuation ends up counted as tokens too.
regex = re.compile(r"(\W)")

def wc(text):
    """Count the tokens of ``text``.

    The text is split with ``regex``. Empty strings, single spaces and
    newlines are dropped; every other piece (words, punctuation, other
    whitespace such as tabs) is counted.

    Returns:
        A ``Counter`` mapping token -> number of occurrences.
    """
    return Counter(t for t in regex.split(text) if t and t != ' ' and t != '\n')

def pd(wc):
    """Turn a token-count mapping into relative frequencies.

    Args:
        wc: mapping of token -> count (for example the output of ``wc``).

    Returns:
        A dict mapping each token to ``count / total``; empty input gives ``{}``.
    """
    # Total is computed once rather than once per key.
    total = sum(wc.values())
    return {k: v / total for k, v in wc.items()}

In [42]:
def klsum(document, summary, L):
    """Build an extractive summary by greedily minimising KL divergence.

    On every round each remaining sentence is tentatively appended to
    ``summary``; the sentence whose candidate summary has the token
    distribution closest to the whole document's (lowest ``kl_div``) is kept
    and removed from the pool.

    Args:
        document: text to summarise.
        summary: text to start from (pass ``''`` for a fresh summary).
        L: target number of sentences; rounds continue while ``summary``
            tokenizes into fewer than ``L`` sentences.

    Returns:
        ``document.strip()`` when the document has at most one sentence,
        otherwise the stripped summary with the chosen sentences separated by
        newlines. If the document runs out of sentences before ``L`` is
        reached, the summary built so far is returned.
    """
    # The document is split once; the pool is then edited as a list rather
    # than re-joined and re-tokenized every round.
    remaining = list(sent_tokenize(document))
    if len(remaining) <= 1:
        return document.strip()

    # Reference distribution: fixed for the whole run and computed from the
    # original document, not from the shrinking pool of sentences.
    doc_pd = pd(wc(document))
    vocab = list(doc_pd)
    px = [doc_pd[k] for k in vocab]

    # The `remaining` check stops the loop when L exceeds the number of
    # sentences, instead of indexing into an empty list.
    while remaining and len(sent_tokenize(summary)) < L:
        best_kl, best_idx = float('inf'), 0
        for idx, sent in enumerate(remaining):
            cand_pd = pd(wc(summary + sent))
            # Document words missing from the candidate get a small stand-in
            # probability so the divergence stays finite.
            qx = [cand_pd.get(k, 0.001) for k in vocab]
            kl = kl_div(px, qx)
            if kl < best_kl:
                best_kl, best_idx = kl, idx

        summary += "\n" + remaining[best_idx]
        del remaining[best_idx]

    return summary.strip()

In [9]:
# One-sentence KLSum summary for every DUC document, keyed by doc_id.
duc_kl = {doc['doc_id']: klsum(doc['doc_text'], '', 1) for doc in duc_es}

In [43]:
# One-sentence KLSum summary for every 20 Newsgroups document, keyed by doc_id.
# tqdm wraps the list directly so the progress bar can show the total.
_20ng_kl = {}
for d in tqdm(_20ng_es):
    _20ng_kl[d['doc_id']] = klsum(d['doc_text'], '', 1)

501it [00:01, 395.96it/s]


## LDASum

In [52]:
def ldasum(data, summary, topics, L):
    """Build an extractive summary from the sentences richest in topic words.

    A sentence's score is the sum, over every topic assigned to the document,
    of the ``word_prob`` of each of its tokens that appears in that topic's
    ``top_words``. The best-scoring remaining sentence is appended to
    ``summary`` until it tokenizes into at least ``L`` sentences or no
    sentences are left.

    Args:
        data: mapping with ``'doc_text'`` (the document) and ``'doc_topics'``
            (comma-separated integer topic ids).
        summary: text to start from (pass ``''`` for a fresh summary).
        topics: iterable of mappings with ``'topic_id'``, ``'top_words'`` and
            ``'word_prob'``; the last two are assumed to be parallel lists.
        L: target number of summary sentences.

    Returns:
        ``data['doc_text']`` unchanged when it has at most one sentence,
        otherwise the stripped summary, each chosen sentence preceded by a
        space-newline-space separator.
    """
    doc = data['doc_text']
    doc_topics = {int(t.strip()) for t in data['doc_topics'].split(',')}
    # NOTE(review): data['doc_topics_pd'] is not consulted; scores use the
    # per-topic word probabilities only — confirm that is the intended weighting.

    # The document is split once; the pool is then edited as a list rather
    # than re-joined and re-tokenized every round.
    sentences = list(sent_tokenize(doc))
    if len(sentences) <= 1:
        return doc

    # Word -> probability lookup for each topic assigned to this document, in
    # `topics` order. setdefault keeps the first probability if a word repeats,
    # matching what a list.index() lookup would return.
    topic_weights = []
    for topic in topics:
        if topic['topic_id'] in doc_topics:
            weights = {}
            for word, prob in zip(topic['top_words'], topic['word_prob']):
                weights.setdefault(word, prob)
            topic_weights.append(weights)

    # A sentence's score does not depend on the other sentences, so it is
    # computed once up front (topic by topic, to keep the summation order).
    tokens = [word_tokenize(sent) for sent in sentences]
    score = [0] * len(sentences)
    for weights in topic_weights:
        for sidx, words in enumerate(tokens):
            for word in words:
                if word in weights:
                    score[sidx] += weights[word]

    # The `sentences` check stops the loop when L exceeds the number of
    # sentences, instead of calling argmax on an empty list.
    while sentences and len(sent_tokenize(summary)) < L:
        best = int(np.argmax(score))  # ties resolve to the earliest sentence
        summary += ' \n ' + sentences[best]
        del sentences[best]
        del score[best]

    return summary.strip()

In [53]:
# One-sentence LDASum summary for every 20 Newsgroups document, keyed by doc_id.
_20_lda = {doc['doc_id']: ldasum(doc, '', _20ng_topics, 1) for doc in tqdm(_20ng_es)}

100%|██████████| 500/500 [00:05<00:00, 91.02it/s]


In [55]:
# One-sentence LDASum summary for every DUC document, keyed by doc_id.
duc_lda = {doc['doc_id']: ldasum(doc, '', duc_topics, 1) for doc in tqdm(duc_es)}

100%|██████████| 301/301 [00:13<00:00, 21.57it/s]


## Write the summaries back to ES

In [56]:
# Attach both summaries to each 20 Newsgroups record (in place) and index it.
# NOTE(review): no `id` is passed to es.index, so this presumably creates new
# documents instead of updating the ones that were read — confirm.
for record in tqdm(_20ng_es):
    doc_id = record['doc_id']
    record.update(kl_summary=_20ng_kl[doc_id], lda_summary=_20_lda[doc_id])
    es.index(index='20ngsummary', doc_type='doc', body=record)

100%|██████████| 500/500 [00:35<00:00, 14.08it/s]


## Extra Credit

In [17]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

In [18]:
# Raw text of every DUC document, in the order the documents were fetched.
duc_data = [doc['doc_text'] for doc in duc_es]

In [19]:
# Bag-of-words counts over the DUC texts, then a 10-topic LDA model fit on them.
vectorizer = CountVectorizer(
    stop_words='english',
    max_df=0.95,
    min_df=2,
    max_features=1000,
)
features = vectorizer.fit_transform(duc_data)
model = LatentDirichletAllocation(
    n_components=10,
    learning_method='online',
    random_state=666,  # fixed seed
    n_jobs=-1,
)
model.fit(features)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=10, n_jobs=-1, n_topics=None, perp_tol=0.1,
             random_state=666, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [20]:
def topicsum(document, summary, L):
    """Build an extractive summary by matching the document's topic mixture.

    Works like a greedy KL-divergence summariser, but the two distributions
    compared are the outputs of ``model.transform`` for the document and for
    each candidate summary, using the notebook-level ``vectorizer`` and
    ``model``.

    Args:
        document: text to summarise.
        summary: text to start from (pass ``''`` for a fresh summary).
        L: target number of sentences; rounds continue while ``summary``
            tokenizes into fewer than ``L`` sentences.

    Returns:
        The stripped summary with the chosen sentences separated by newlines.
        If the document runs out of sentences before ``L`` is reached, the
        summary built so far is returned.
    """
    # Reference distribution: computed once from the full document.
    px = model.transform(vectorizer.transform([document]))[0]

    # The document is split once; the pool is then edited as a list rather
    # than re-joined and re-tokenized every round.
    remaining = list(sent_tokenize(document))

    # The `remaining` check stops the loop when L exceeds the number of
    # sentences, instead of indexing into an empty list.
    while remaining and len(sent_tokenize(summary)) < L:
        best_kl, best_idx = float('inf'), 0
        for idx, sent in enumerate(remaining):
            qx = model.transform(vectorizer.transform([summary + sent]))[0]
            kl = kl_div(px, qx)
            if kl < best_kl:
                best_kl, best_idx = kl, idx

        summary += "\n" + remaining[best_idx]
        del remaining[best_idx]

    return summary.strip()

In [28]:
# Pull the text, gold summary and both stored summaries of one DUC document.
# NOTE(review): nothing is assigned if no record carries this doc_id.
for d in duc_es:
    if d['doc_id'] != 'AP880217-0175':
        continue
    test = d['doc_text']
    ref = d['gold_summary']
    lda = d['lda_summary']
    kl = d['kl_summary']

In [40]:
# Topic-distribution summary of the selected document, with a two-sentence target.
hyp = topicsum(test, '', 2)

In [25]:
# Scorer from the `rouge` package, created with its default settings.
rouge = Rouge()

In [42]:
# ROUGE-2 'f' score of the topicsum summary against the gold summary.
rouge.get_scores(hyp, ref)[0]['rouge-2']['f']

0.3333333284621247

In [43]:
# ROUGE-2 'f' score of the stored LDA summary against the gold summary.
rouge.get_scores(lda, ref)[0]['rouge-2']['f']

0.3053435073970049

In [44]:
# ROUGE-2 'f' score of the stored KL summary against the gold summary.
rouge.get_scores(kl, ref)[0]['rouge-2']['f']

0.0312499960986333

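Looks like comparing the document (PD) and summary (PS) topic distributions does give better results: on this one document the ROUGE-2 F-score is 0.333 for the topic-based summary, versus 0.305 for LDASum and 0.031 for KLSum.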