In [1]:
from math import log10

from summa.pagerank_weighted import pagerank_weighted_scipy as _pagerank
from summa.preprocessing.textcleaner import clean_text_by_sentences as _clean_text_by_sentences
from summa.commons import build_graph as _build_graph
from summa.commons import remove_unreachable_nodes as _remove_unreachable_nodes
from summa.keywords import keywords
from summa.pagerank_weighted import pagerank_weighted as pagerank
from topmine_src import phrase_mining
from PyRouge.pyrouge import Rouge
from bm25 import get_bm25_weights as _bm25_weights
from six.moves import xrange

## Data Reading and Phrase Extraction

In [2]:
# Names of the documents to summarise. Each name is appended to
# '../data/document/' to locate the source text and to '../data/summary/'
# to locate its reference summary.
# Full 15-document list (currently disabled in favour of a single-file run):
# file_name = ['1.txt', '2.txt', '3.txt', '4.txt', '5.txt', '6.txt', '7.txt', '8.txt', '9.txt',
#              '10.txt', '11.txt', '12.txt', '13.txt', '14.txt', '15.txt']
file_name = ['1.txt']

In [3]:
def phrase_extract(file_name, min_support=2, alpha=4, max_phrase_size=10):
    """Mine the frequent phrases of a document with ``topmine_src.phrase_mining``.

    Args:
        file_name: path of the text file handed to ``PhraseMining``.
        min_support: minimum number of occurrences each phrase must have.
        alpha: threshold for merging two words into a phrase. A lower value
            leads to higher recall and lower precision.
        max_phrase_size: length of the maximum phrase size.

    Returns:
        The result of ``PhraseMining.get_frequent_phrases(min_support)``.
        NOTE(review): presumably a mapping of phrase -> frequency; confirm
        against ``topmine_src``.
    """
    phrase_miner = phrase_mining.PhraseMining(file_name, min_support, max_phrase_size, alpha)

    # mine() is run before the frequent phrases are queried, as before; the
    # values it returns (partitioned docs, vocabulary index) are not used here.
    phrase_miner.mine()

    return phrase_miner.get_frequent_phrases(min_support)

## Build Graph

In [4]:
def _set_graph_edge_weights(graph):
    WEIGHT_THRESHOLD = 1.e-3
    weights = _bm25_weights(graph.nodes())
    i = 0
    for sentence_1 in graph.nodes():
        j = 0
        for sentence_2 in graph.nodes():
            weight = (weights[i][j] + 100) * 0.1
            if i == j or weight < WEIGHT_THRESHOLD:
                continue

            edge = (sentence_1, sentence_2)
            if sentence_1 != sentence_2 and not graph.has_edge(edge):
                    graph.add_edge(edge, weight * graph.node_weight[sentence_2])
            j += 1
        i += 1

    # Handles the case in which all similarities are zero.
    # The resultant summary will consist of random sentences.
    if all(graph.edge_weight(edge) == 0 for edge in graph.edges()):
        _create_valid_graph(graph)

In [5]:
def _create_valid_graph(graph):
    nodes = graph.nodes()

    for i in range(len(nodes)):
        for j in range(len(nodes)):
            if i == j:
                continue

            edge = (nodes[i], nodes[j])

            if graph.has_edge(edge):
                graph.del_edge(edge)

            graph.add_edge(edge, 1)

In [6]:
def _get_similarity(s1, s2):
    words_sentence_one = s1.split()
    words_sentence_two = s2.split()

    common_word_count = len(set(words_sentence_one) & set(words_sentence_two))

    log_s1 = log10(len(words_sentence_one))
    log_s2 = log10(len(words_sentence_two))

    if log_s1 + log_s2 == 0:
        return 0

    return common_word_count / (log_s1 + log_s2)

In [7]:
def _format_results(extracted_sentences, split, score):
    if score:
        return [(sentence.text, sentence.score) for sentence in extracted_sentences]
    if split:
        return [sentence.text for sentence in extracted_sentences]
    return " ".join([sentence.text for sentence in extracted_sentences])

In [8]:
def _get_sentences_with_word_count(sentences, words):
    """ Given a list of sentences, returns a list of sentences with a
    total word count similar to the word count provided.
    """
    word_count = 0
    selected_sentences = []
    # Loops until the word count is reached.
    for sentence in sentences:
        words_in_sentence = len(sentence.text.split())

        # Checks if the inclusion of the sentence gives a better approximation
        # to the word parameter.
        if abs(words - word_count - words_in_sentence) > abs(words - word_count):
            return selected_sentences

        selected_sentences.append(sentence)
        word_count += words_in_sentence

    return selected_sentences

In [9]:
def _extract_most_important_sentences(sentences, ratio, words):
    sentences.sort(key=lambda s: s.score, reverse=True)

    # If no "words" option is selected, the number of sentences is
    # reduced by the provided ratio.
    if words is None:
        length = len(sentences) * ratio
        return sentences[:int(length)]

    # Else, the ratio is ignored.
    else:
        return _get_sentences_with_word_count(sentences, words)


In [10]:
def get_graph(text, language="english"):
    """Build the sentence graph of ``text`` and assign its edge weights.

    The text is split into cleaned sentences, a graph is built from their
    tokens, and ``_set_graph_edge_weights`` fills in the edges.

    NOTE(review): ``_build_graph`` is called here with a single list of
    tokens, whereas ``get_summary`` calls it with ``(sentences,
    frequent_phrases)``, and ``_set_graph_edge_weights`` reads
    ``graph.node_weight`` -- confirm this call shape still matches the
    ``_build_graph`` in use.
    """
    tokens = [sentence.token for sentence in _clean_text_by_sentences(text, language)]

    sentence_graph = _build_graph(tokens)
    _set_graph_edge_weights(sentence_graph)

    return sentence_graph

## Add weights to sentences containing important phrases

In [11]:
def _add_scores_to_sentences(sentences, scores, frequent_phrases):
    for sentence in sentences:
        # Adds the score to the object if it has one.
        if sentence.token in scores:
            sentence.score = scores[sentence.token]
        else:
            sentence.score = 0
            
#         for phrase in frequent_phrases:
#             if((sentence.text.lower()).find(phrase) >= 0):
#                 sentence.score += frequent_phrases[phrase] * 0.01


In [12]:
def get_summary(file, ratio=0.2, words=None, split=False, scores=False,
                additional_stopwords=None):
    """Summarise one document from '../data/document/'.

    Args:
        file: name of the document, relative to '../data/document/'.
        ratio: fraction of the sentences to keep when ``words`` is None.
        words: approximate word budget of the summary; when given, ``ratio``
            is ignored.
        split: return a list of sentence texts instead of one joined string.
        scores: return a list of ``(text, score)`` tuples; takes precedence
            over ``split``.
        additional_stopwords: forwarded to ``_clean_text_by_sentences``.

    Returns:
        The summary in the form selected by ``split`` / ``scores`` (a single
        string by default). If the sentence graph ends up empty, the empty
        form of that result is returned.
    """
    path = '../data/document/' + file

    # Read the document that was actually requested (not always the first
    # entry of the global file list).
    with open(path, 'r', encoding='utf-8') as f:
        data = f.read()
    # Line breaks are removed outright (not replaced by spaces).
    text = data.replace('\n', '')

    frequent_phrases = phrase_extract(path)

    # Gets a list of processed sentences.
    sentences = _clean_text_by_sentences(text, 'english', additional_stopwords)

    # Creates the graph and calculates the similarity coefficient for every pair of nodes.
    graph = _build_graph(sentences, frequent_phrases)
    _set_graph_edge_weights(graph)

    # Remove all nodes with all edges weights equal to zero.
    _remove_unreachable_nodes(graph)

    # PageRank cannot be run in an empty graph, so stop here with an empty summary.
    if len(graph.nodes()) == 0:
        print('Graph is Empty')
        return _format_results([], split, scores)

    # Ranks the tokens using the PageRank algorithm. Returns dict of sentence -> score
    pagerank_scores = _pagerank(graph)

    # Adds the summa scores to the sentence objects.
    _add_scores_to_sentences(sentences, pagerank_scores, frequent_phrases)

    # Extracts the most important sentences with the selected criterion.
    extracted_sentences = _extract_most_important_sentences(sentences, ratio, words)

    # Sorts the extracted sentences by order of appearance in the original text.
    extracted_sentences.sort(key=lambda s: s.index)

    return _format_results(extracted_sentences, split, scores)

### Evaluate

In [13]:
# Generate one summary per document, in the order of file_name.
res = [get_summary(file) for file in file_name]
print(res)

['However, only about 25 to 30% of the homes in the earthquake-prone San Francisco area have earthquake insurance. The damage goes far beyond that of Hurricane Hugo along the Carolinas last month. About 25% of the policyholders have earthquake insurance.']


In [14]:
# Load the reference summary of each document, flattened onto a single line.
refer = []
for file in file_name:
    with open ('../data/summary/' + file, 'r', encoding='utf-8') as f:
        refer.append(f.read().replace('\n',''))
print(refer)

["The White House and the California government are provided aid to Tuesday's California earthquake victims. Only 25-30% of homes in the San Francisco area are insured. Insurers may have to pay billions but no company is expected to suffer financial damage. Insurers price-cutting may end as prices rise from reinsurance investments."]


In [15]:
# Average ROUGE-L precision, recall and F-score over all documents.
rouge = Rouge()
n_files = len(file_name)
precision = recall = f_score = 0
for i in range(n_files):
    p, r, f = rouge.rouge_l([res[i]], [refer[i]])
    precision, recall, f_score = precision + p, recall + r, f_score + f

# Printed in the order: precision, recall, F-score.
for total in (precision, recall, f_score):
    print(total / n_files)

0.5158730158730159
0.39274924471299094
0.44597007613250755
