In [23]:
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from bs4 import BeautifulSoup

import random
import json
import os
import re
import pandas as pd

from sumy.summarizers.lex_rank import LexRankSummarizer as SummarizerLex
from sumy.summarizers.sum_basic import SumBasicSummarizer as SummarizerSumBasic
from sumy.summarizers.text_rank import TextRankSummarizer  as SummarizerTextrank
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words


# Language name handed to sumy's Stemmer, Tokenizer and get_stop_words in this notebook.
LANGUAGE = "english"

In [32]:
import sys

# Put ../src first on the import path (presumably where the local `rouge` module lives — confirm).
sys.path.insert(0, '../src')

In [33]:
import rouge

In [24]:
def summarization_one_file(summarizer, parser, SENTENCES_COUNT):
    """Summarize one parsed document and return its sentences as strings.

    Args:
        summarizer: Callable invoked as ``summarizer(document, count)`` that
            yields the selected sentences.
        parser: Object exposing the parsed text as ``parser.document``.
        SENTENCES_COUNT: Number of sentences requested from the summarizer.

    Returns:
        list[str]: ``str()`` of every sentence, in the order the summarizer
        produced them.
    """
    selected = summarizer(parser.document, SENTENCES_COUNT)
    return [str(sentence) for sentence in selected]

def summarization_all_files(patents, model='lex', SENTENCES_COUNT=3):
    """Summarize every text in ``patents`` with the chosen sumy summarizer.

    The summaries are returned and also written to ``"<model>_summ.txt"`` in
    the current working directory, one summary per line, so that line ``i`` of
    the file corresponds to ``patents[i]``.

    Args:
        patents: Iterable of plain-text documents.
        model: ``'lex'`` (LexRank), ``'textrank'`` (TextRank) or
            ``'sumbasic'`` (SumBasic).
        SENTENCES_COUNT: Number of sentences requested per document.

    Returns:
        list[str]: One summary per input text; the selected sentences of a
        document are joined with a single space.

    Raises:
        ValueError: If ``model`` is not one of the supported names.
    """
    supported_models = ('lex', 'textrank', 'sumbasic')
    # Fail early with a clear message instead of hitting an unbound
    # ``summarizer`` later on (and before any output file is created).
    if model not in supported_models:
        raise ValueError(
            "Unknown model {!r}; expected one of {}".format(model, supported_models)
        )

    summarizer_classes = {
        'lex': SummarizerLex,
        'textrank': SummarizerTextrank,
        'sumbasic': SummarizerSumBasic,
    }
    summarizer = summarizer_classes[model](Stemmer(LANGUAGE))
    summarizer.stop_words = get_stop_words(LANGUAGE)

    # The tokenizer does not depend on the document, so build it once.
    tokenizer = Tokenizer(LANGUAGE)

    summaries = []
    # ``with`` guarantees the file is closed even if summarization raises.
    with open("{}_summ.txt".format(model), 'w') as out_file:
        for text in patents:
            parser = PlaintextParser(text, tokenizer)
            sentences = summarization_one_file(
                summarizer, parser, SENTENCES_COUNT=SENTENCES_COUNT
            )
            summ = ' '.join(sentences)
            summaries.append(summ)
            # Newline-terminated so consecutive summaries stay separable.
            out_file.write(summ + "\n")

    return summaries

In [34]:
def evaluation(candidates, references, sources, algorithm):
    """Hand the summaries to ``rouge.create_report_valid`` for scoring.

    Args:
        candidates: Generated summaries.
        references: Reference texts the candidates are compared with.
        sources: Source documents the summaries were produced from.
        algorithm: Label used to build the report file name,
            ``../validation/<algorithm>.xml``.

    The requested metrics are ROUGE_1, ROUGE_2, ROUGE_L and BLEU. What the
    report contains is decided by the local ``rouge`` module.
    """
    report_path = f"../validation/{algorithm}.xml"
    rouge.create_report_valid(
        candidates,
        references,
        sources,
        name_file=report_path,
        metrics=["ROUGE_1", "ROUGE_2", "ROUGE_L", "BLEU"],
    )

In [111]:
# Read the validation abstracts into a list of lines; the context manager
# closes the file handle instead of leaving it to the garbage collector.
with open("../sumdata/abstract.valid.pp.txt") as abstracts_file:
    patents = abstracts_file.readlines()
len(patents)

8307

In [112]:
# Read the validation titles into a list of lines; the context manager
# closes the file handle instead of leaving it to the garbage collector.
with open("../sumdata/title.valid.pp.txt") as titles_file:
    titles = titles_file.readlines()

# TextRank

In [116]:
# One-sentence TextRank summary per abstract; the call also writes textrank_summ.txt.
candidates_text = summarization_all_files(patents, model='textrank', SENTENCES_COUNT=1)

In [117]:
# Evaluate the TextRank summaries with the titles as references; report path is ../validation/textrank.xml.
evaluation(candidates_text, titles, patents, 'textrank')

0
1000
2000
3000
4000
5000
6000
7000
8000


# SumBasic

In [118]:
# One-sentence SumBasic summary per abstract; the call also writes sumbasic_summ.txt.
candidates_sumbasic = summarization_all_files(patents, model='sumbasic', SENTENCES_COUNT=1)

In [119]:
# Evaluate the SumBasic summaries with the titles as references; report path is ../validation/sumbasic.xml.
evaluation(candidates_sumbasic, titles, patents, 'sumbasic')

0
1000
2000
3000
4000
5000
6000
7000
8000


In [None]:
import spacy

# spaCy pipeline used for POS tagging in the cells that follow.
nlp = spacy.load("en_core_web_sm")

In [57]:
# Collect the POS tag of every token in the first entry of `candidates_lex`.
# NOTE(review): `candidates_lex` is not assigned in any cell visible here —
# confirm it is defined before this cell runs on a fresh kernel.
doc = nlp(candidates_lex[0])

list_pos = [token.pos_ for token in doc]

In [120]:
# Read the e1 results file into a list of lines; the context manager closes
# the file handle instead of leaving it to the garbage collector.
with open("../hybrid_results/e1.txt") as e1_file:
    e1 = e1_file.readlines()
len(e1)

8306

In [49]:
import nltk
from nltk.corpus import stopwords
# NLTK's English stop-word list; filter_summaries tests words for membership in it.
sw = stopwords.words('english')

In [127]:
def filter_summaries(candidates, max_content_words=15, tagger=None, stop_words=None):
    """Shorten each candidate summary and make it end on a noun where possible.

    For every candidate the function:

    1. splits it on single spaces and keeps the leading words up to, but not
       including, the first non-stop-word that would exceed
       ``max_content_words``;
    2. drops trailing words whose POS tag is not ``"NOUN"`` so the result ends
       on a noun. If no kept word is tagged as a noun, the kept words are
       returned untrimmed.

    POS tags are looked up by position: the i-th space-separated word is paired
    with the i-th token returned by the tagger. Words without a matching token
    are treated as non-nouns.
    NOTE(review): this assumes the input is already tokenized so that words and
    tagger tokens line up — confirm for the data being filtered.

    Args:
        candidates: Iterable of summary strings.
        max_content_words: Maximum number of non-stop-words kept per summary.
        tagger: Callable mapping a string to a sequence of tokens exposing
            ``pos_``. Defaults to the module-level spaCy pipeline ``nlp``.
        stop_words: Container of stop words. Defaults to the module-level
            ``sw``.

    Returns:
        list[str]: One filtered summary per candidate, in input order.
    """
    if tagger is None:
        tagger = nlp
    if stop_words is None:
        stop_words = sw

    format_candidates = []

    for candidate in candidates:
        pos_tags = [token.pos_ for token in tagger(candidate)]

        # Keep the prefix that holds at most `max_content_words` non-stop-words.
        kept_words = []
        content_count = 0
        for word in candidate.split(" "):
            if word not in stop_words:
                content_count += 1
                if content_count > max_content_words:
                    break
            kept_words.append(word)

        # Walk back from the last kept word and stop at the nearest noun. Only
        # positions that actually have a tag are inspected, so an empty
        # candidate or a tagger returning fewer tokens than there are words
        # cannot raise IndexError.
        end = min(len(kept_words), len(pos_tags))
        while end > 0 and pos_tags[end - 1] != "NOUN":
            end -= 1

        # No noun found: nothing to anchor the cut on, keep the words as they are.
        trimmed = kept_words[:end] if end > 0 else kept_words
        format_candidates.append(" ".join(trimmed))

    return format_candidates


In [128]:
# Post-process the e1 lines with filter_summaries.
filtered_e1 = filter_summaries(e1)

In [130]:
# NOTE(review): this evaluates the raw `e1` lines, not `filtered_e1` — confirm which is intended.
# The recorded outputs show 8306 lines in e1 versus 8307 abstracts; check that
# candidates, references and sources still line up index by index.
evaluation(e1, titles, patents, 'hts_e1')

0
1000
2000
3000
4000
5000
6000
7000
8000
