In [23]:
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer as SummarizerLex
from sumy.summarizers.sum_basic import SumBasicSummarizer as SummarizerSumBasic
from sumy.summarizers.text_rank import TextRankSummarizer  as SummarizerTextrank
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words

# Language name handed to sumy's Stemmer, Tokenizer and get_stop_words below.
LANGUAGE = "english"

import sys
# Put ../src first on the import path before importing the local module below.
# NOTE(review): presumably `metrics` lives in ../src — confirm.
sys.path.insert(0, '../src')

# Local evaluation helpers; this notebook only uses `me.create_report_valid`.
import metrics as me

In [24]:
def summarization_one_file(summarizer, parser, SENTENCES_COUNT):
    """Summarize one parsed document and return the chosen sentences as strings.

    Args:
        summarizer: Callable invoked as ``summarizer(document, count)``; it
            must return an iterable of the selected sentences.
        parser: Object exposing the parsed text via its ``document`` attribute.
        SENTENCES_COUNT: Sentence count forwarded unchanged to ``summarizer``.

    Returns:
        list[str]: ``str()`` of every sentence the summarizer produced, in the
        order the summarizer produced them.
    """
    selected = summarizer(parser.document, SENTENCES_COUNT)
    return [str(item) for item in selected]

def summarization_all_files(patents, model='textrank', SENTENCES_COUNT=3):
    """Summarize every text in ``patents`` and save the summaries to disk.

    Each text is parsed with ``PlaintextParser``, summarized with the chosen
    sumy summarizer, and its selected sentences are joined with single spaces.
    The summaries are also written to ``"<model>_summ.txt"`` in the current
    working directory, one summary per line.

    Args:
        patents: Iterable of plain-text documents (one string per document).
        model: Which summarizer to use: ``'textrank'``, ``'sumbasic'`` or
            ``'lexrank'``.
        SENTENCES_COUNT: Number of sentences requested from the summarizer
            for each document.

    Returns:
        list[str]: One summary string per input document, in input order.

    Raises:
        ValueError: If ``model`` is not one of the supported names.
    """
    summarizer_classes = {
        'textrank': SummarizerTextrank,
        'sumbasic': SummarizerSumBasic,
        'lexrank': SummarizerLex,
    }
    # Fail early with a clear message (and before creating the output file)
    # instead of hitting an unbound `summarizer` inside the loop.
    if model not in summarizer_classes:
        raise ValueError(
            "Unknown model {!r}; expected one of {}".format(
                model, sorted(summarizer_classes)))

    summarizer = summarizer_classes[model](Stemmer(LANGUAGE))
    summarizer.stop_words = get_stop_words(LANGUAGE)

    # The tokenizer does not depend on the document, so build it once rather
    # than once per text.
    tokenizer = Tokenizer(LANGUAGE)

    summaries = []

    # `with` guarantees the file is closed even if summarization raises.
    with open("{}_summ.txt".format(model), 'w') as f:
        for text in patents:
            parser = PlaintextParser(text, tokenizer)
            summ = summarization_one_file(summarizer, parser, SENTENCES_COUNT=SENTENCES_COUNT)
            summ = ' '.join(summ)
            summaries.append(summ)
            # Terminate each summary so the file stays one summary per line
            # instead of a single run-on line.
            f.write(summ + "\n")

    return summaries

In [34]:
def evaluation(candidates, references, sources, algorithm):
    """Produce the validation report for one algorithm's summaries.

    Forwards ``candidates``, ``references`` and ``sources`` to
    ``me.create_report_valid`` together with the ROUGE_1, ROUGE_2, ROUGE_L
    and BLEU metric names. The report path passed as ``name_file`` is
    ``../validation/<algorithm>.xml``.

    Args:
        candidates: Generated summaries to score.
        references: Reference texts passed alongside the candidates.
        sources: Source documents passed alongside the candidates.
        algorithm: Name used to build the report file name.
    """
    report_path = "../validation/{}.xml".format(algorithm)
    me.create_report_valid(
        candidates,
        references,
        sources,
        name_file=report_path,
        metrics=["ROUGE_1", "ROUGE_2", "ROUGE_L", "BLEU"],
    )

In [111]:
# Load the validation split as lists of lines (trailing newlines are kept).
# Context managers close both files once they have been read.
# NOTE(review): presumably line i of `titles` is the reference for line i of
# `patents` — confirm the two files are aligned.
with open("../sumdata/abstract.valid.pp.txt") as abstracts_file:
    patents = abstracts_file.readlines()
with open("../sumdata/title.valid.pp.txt") as titles_file:
    titles = titles_file.readlines()

8307

# TextRank

In [116]:
# Build a one-sentence TextRank summary for every abstract
# (this also writes them to textrank_summ.txt).
candidates_text = summarization_all_files(patents, model='textrank', SENTENCES_COUNT=1)
# Score the summaries against the titles; the report path is ../validation/textrank.xml.
evaluation(candidates_text, titles, patents, 'textrank')

# SumBasic

In [118]:
# Build a one-sentence SumBasic summary for every abstract
# (this also writes them to sumbasic_summ.txt).
candidates_sumbasic = summarization_all_files(patents, model='sumbasic', SENTENCES_COUNT=1)
# Score the summaries against the titles; the report path is ../validation/sumbasic.xml.
evaluation(candidates_sumbasic, titles, patents, 'sumbasic')