In [2]:
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from bs4 import BeautifulSoup

import random
import json
import os
import re
import pandas as pd

from transformers import AutoConfig
from transformers import AutoTokenizer
from transformers import AutoModel
from transformers import *
from summarizer import Summarizer
from summarizer.coreference_handler import CoreferenceHandler

In [3]:
import sys
sys.path.insert(1, '/media/cinthia/Dados/Mestrado/text-summarizer-long-documents/src')

import importlib
importlib.reload(evaluate)

NameError: name 'evaluate' is not defined

In [4]:
from sumy.summarizers.lex_rank import LexRankSummarizer as SummarizerLex
from sumy.summarizers.sum_basic import SumBasicSummarizer as SummarizerSumBasic
from sumy.summarizers.text_rank import TextRankSummarizer  as SummarizerTextrank
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words


def summarization_one_file(summarizer, parser, SENTENCES_COUNT):

    sentences = []
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        sentences.append(str(sentence))

    return sentences

def summarization_all_files(df, model='lex', section='intro', SENTENCES_COUNT=3):

    stemmer = Stemmer(LANGUAGE)

    if model == 'lex':

        summarizer = SummarizerLex(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)
    
    elif model == 'textrank':

        summarizer = SummarizerTextrank(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)

    elif model == 'sumbasic':

        summarizer = SummarizerSumBasic(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)


    summaries = []

    f = open("{}_{}_summ.txt".format(model, section), 'w')
    for text in df['pp_reference']:

        parser = PlaintextParser(text, Tokenizer(LANGUAGE))
        summ = summarization_one_file(summarizer, parser, SENTENCES_COUNT=3)
        summ = ' '.join(summ)
        summaries.append(summ)
        f.write(summ)

    f.close()

    return summaries

In [31]:
def summarization_all_files_bert(df, model, model_name='bert_basic', section='intro', SENTENCES_COUNT=3):

    summaries = []

    f = open("{}_{}_summ.txt".format(model_name, section), 'w')
    for text in df['pp_reference']:

        summ = model(text, num_sentences=SENTENCES_COUNT)
        summaries.append(summ)
        f.write(summ)

    f.close()

    return summaries

In [5]:
def load_name_files(path_base, files):

    texts = []
    for file in files:
        texts.append(json.load(open('{}/{}'.format(path_base, file))))

    return texts

In [6]:
def load_files(files, path_base):

    section_1 = []
    section_2 = []
    section_3 = []
    section_4 = []
    keywords = []

    texts = load_name_files(path_base, files)

    for i in texts:

        section_1.append(format_intro(i.get('sec_abstract')))
        section_2.append(format_intro(i.get('sec_introduction')))
        section_3.append(format_intro(i.get('sec_materials_and_methods')))
        section_4.append(format_intro(i.get('sec_results_and_conclusion')))
        keywords.append(i.get('sec_keyword'))

    return section_1, section_2, section_3, section_4, keywords

In [7]:
def get_number_sentences(text):

    model = Summarizer()
    k = model.calculate_optimal_k(text, k_max=5)

    return k

In [9]:
def get_citations(text):

  soup = BeautifulSoup(text, 'html.parser')
  bib = soup.findAll('xref')

  return bib

def replace_bib(text, bibs):

  for i in bibs:
    text = text.replace(str(i), '')
    
  return text

def remove_citations(xml, text):
  
  bibs = get_citations(xml)
  text = replace_bib(text, bibs)
  text = format_text(text, post_processing=True)
  
  return text

def format_intro(text):

  text = text.replace("INTRODUCTION", "")
  text = text.replace("Introduction", "")
  text = text.replace('\n\nOBJECTIVE\n', '')
  text = text.replace('\n\nObjectives\n', '')
  text = text.replace('\nSummary\n\n', '')
  text = text.replace("\n", "")

  return text

def format_xml(xml):

  xml = xml.replace(".<xref", ". <xref")
  xml = xml.replace("</p>","</p>  " )
  xml = xml.replace('.</p>', "</p>.")
  xml = xml.replace('<title-introduction><title></title>', '')
  xml = xml.replace('</title-introduction>', '')
  xml = xml.replace("<italic>et al</italic>.", "<italic>et al</italic>")

  return xml

def format_text(text, post_processing=False):

  text = text.replace(".<xref", ". <xref")
  text = text.replace("</p>","</p> ")
  text = text.replace('.</p>', "</p>.")
  if post_processing:
    text = text.replace("-", " ")
    text = text.replace("–", '')
    text = re.sub(r'(?s)\(.*?\)', '', text) 
    text = re.sub(r'(?s)\[.*?\]', '', text) 
    text = text.replace("(,)", "")
    text = text.replace("()", "")
    text = text.replace("[,]", "")
    text = text.replace("[]", "")
    text = text.replace("(; )", "")
    text = text.replace("(; )", "")
    text = re.sub(r'(?s)<title>.*?</title>', '', text) 

  return text

In [10]:
def preprocess(section, reference):

    xml = format_xml(str(section))
    text = format_text(str(section), post_processing=False)
    reference = format_text(str(reference), post_processing=True)

    bibs = get_citations(xml)
    text = replace_bib(text, bibs)
    text = format_text(text, post_processing=True)

    soup = BeautifulSoup(text)
    text = soup.get_text()

    soup = BeautifulSoup(reference)
    reference = soup.get_text()

    return text, reference

In [11]:
def preprocess_all(sources, references):

    pp_source = []
    pp_references = []

    for i in range(len(references)):

        text, reference = preprocess(sources[i], references[i])

        pp_source.append(text)
        pp_references.append(text)

    pp_texts = {'pp_source': pp_source, 'pp_reference': pp_references}

    return pp_texts

In [12]:
def evaluation(candidates, references, sources, algorithm, section):

    metrics=["ROUGE_1", "ROUGE_2", "ROUGE_L", "BLEU"]
    evaluate.create_report_valid(
            candidates, references, sources,
            name_file="../validation/validation_{}_{}.xml".format(algorithm, section),
            metrics=metrics)

In [None]:
import numpy as np

def count_len(text):

    count_sentences = []
    count_words = []

    for i in text:
        count_sentences.append(len(i.split('.')))
        count_words.append(len(i.split(' ')))

    print("Número médio de sentenças: {}".format(np.mean(count_sentences)))
    print("Número médio de palavras: {}".format(np.mean(count_words)))

    return count_sentences, count_words

# Load Data

In [33]:
df = pd.read_csv("files.csv")
name_files = df['files'].tolist()

In [14]:
path_base = '../../sumdata/dataset_articles'
section_1, section_2, section_3, section_4, keywords = load_files(name_files, path_base)

In [15]:
all_section = [section_2[i] + " " +  section_3[i] + " " + section_4[i] for i in range(len(section_1))]

In [16]:
LANGUAGE = "english"

k2 = [3] *len(section_1)
k3 = [3] *len(section_1)
k4 = [3] *len(section_1)
k5 = [9] *len(section_1)

In [17]:
pp_intro = preprocess_all(section_2, section_1)
pp_mat = preprocess_all(section_3, section_1)
pp_conc = preprocess_all(section_4, section_1)

In [30]:
pp_all = preprocess_all(all_section, section_1)

In [None]:
custom_config = AutoConfig.from_pretrained('allenai/scibert_scivocab_uncased')
custom_config.output_hidden_states=True
custom_tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')
custom_model = AutoModel.from_pretrained('allenai/scibert_scivocab_uncased', config=custom_config)

In [None]:
bertbasic = Summarizer()

# Introduction

In [34]:
count_sentences, count_words = count_len(pp_intro['pp_reference'])
values['sentences_ref'] = count_sentences
values['words_ref'] = count_words

Número médio de sentenças: 10.873
Número médio de palavras: 210.2152


In [35]:
count_sentences, count_words = count_len(pp_intro['pp_source'])
values['sentences_intro'] = count_sentences
values['words_intro'] = count_words

Número médio de sentenças: 22.6116
Número médio de palavras: 540.2956


## LexRank

In [29]:
candidates_lex = summarization_all_files(pp_intro, model='lex', section='intro', SENTENCES_COUNT=3)

## TextRank

In [None]:
candidates_text = summarization_all_files(pp_intro, model='texrank', section='intro', SENTENCES_COUNT=3)

## SumBasic

In [None]:
candidates_sumbasic = summarization_all_files(pp_intro, model='sumbasic', section='intro', SENTENCES_COUNT=3)

# BERT-Basic

In [None]:
candidates_bertbasic = summarization_all_files_bert(pp_intro, bertbasic, model_name='bert_basic', section='intro', SENTENCES_COUNT=3)

## SciBERT Summ

In [32]:
candidates_custombert = summarization_all_files_bert(pp_intro, custom_model, model_name='custom_bert', section='intro', SENTENCES_COUNT=3)

SyntaxError: invalid syntax (<ipython-input-32-eb1b9ec2fb2c>, line 1)

In [38]:
print("Evaluation TextRank Results")
evaluation(candidates=candidates_text, references=pp_intro['pp_reference'], sources=pp_intro['pp_source'], algorithm="text", section="intro")
print("\nEvaluation LexRank Results")
evaluation(candidates=candidates_lex, references=pp_intro['pp_reference'], sources=pp_intro['pp_source'], algorithm="lex", section="intro")
print("\nEvaluation Sumbasic Results")
evaluation(candidates=candidates_sumbasic, references=pp_intro['pp_reference'], sources=pp_intro['pp_source'], algorithm="sumbasic", section="intro")
print("\nEvaluation Sumbasic Results")
evaluation(candidates=candidates_bertbasic, references=pp_intro['pp_reference'], sources=pp_intro['pp_source'], algorithm="bertbasic", section="intro")
print("\nEvaluation SciBERT Summ Results")
evaluation(candidates=candidates_custombert, references=pp_intro['pp_reference'], sources=pp_intro['pp_source'], algorithm="custombert", section="intro")

0
1000
2000
3000
4000
0
1000
2000
3000
4000
0
1000
2000
3000
4000


# Materials and Methods

In [40]:
count_sentences, count_words = count_len(source)
values['sentences_mat'] = count_sentences
values['words_mat'] = count_words

Número médio de sentenças: 59.1334
Número médio de palavras: 1077.3682


## LexRank

In [39]:
candidates_lex = summarization_all_files(pp_mat, model='lex', section='mat', SENTENCES_COUNT=3)

## TextRank

In [None]:
candidates_text = summarization_all_files(pp_mat, model='texrank', section='mat', SENTENCES_COUNT=3)

## Sumbasic

In [None]:
candidates_sumbasic = summarization_all_files(pp_mat, model='sumbasic', section='mat', SENTENCES_COUNT=3)

# BERT Basic

In [None]:
candidates_bertbasic = summarization_all_files_bert(pp_mat, bertbasic, model_name='bert_basic', section='mat', SENTENCES_COUNT=3)

## Scibert summ

In [None]:
candidates_custombert = summarization_all_files_bert(pp_mat, custom_model, model_name='custom_bert', section='mat', SENTENCES_COUNT=3)

In [43]:
print("Evaluation TextRank Results")
evaluation(candidates=candidates_text, references=pp_mat['pp_reference'], sources=pp_mat['pp_source'], algorithm="text", section="mat")
print("\nEvaluation LexRank Results")
evaluation(candidates=candidates_lex, references=pp_mat['pp_reference'], sources=pp_mat['pp_source'], algorithm="lex", section="mat")
print("\nEvaluation Sumbasic Results")
evaluation(candidates=candidates_sumbasic, references=pp_mat['pp_reference'], sources=pp_mat['pp_source'], algorithm="sumbasic", section="mat")
print("\nEvaluation Sumbasic Results")
evaluation(candidates=candidates_bertbasic, references=pp_mat['pp_reference'], sources=pp_mat['pp_source'], algorithm="bertbasic", section="mat")
print("\nEvaluation SciBERT Summ Results")
evaluation(candidates=candidates_custombert, references=pp_mat['pp_reference'], sources=pp_mat['pp_source'], algorithm="custombert", section="mat")


Evaluation TextRank Results
0
1000
2000
3000
4000

Evaluation LexRank Results
0
1000
2000
3000
4000

Evaluation Sumbasic Results
0
1000
2000
3000
4000


# Conclusion

In [47]:
count_sentences, count_words = count_len(source)
values['sentences_conc'] = count_sentences
values['words_conc'] = count_words

Número médio de sentenças: 109.9564
Número médio de palavras: 2081.2918


## LexRank

In [48]:
candidates_lex = summarization_all_files(pp_conc, model='lex', section='conc', SENTENCES_COUNT=3)

## TextRank

In [None]:
candidates_text = summarization_all_files(pp_conc, model='texrank', section='conc', SENTENCES_COUNT=3)

## SumBasic

In [None]:
candidates_sumbasic = summarization_all_files(pp_conc, model='sumbasic', section='conc', SENTENCES_COUNT=3)

# BERT - Basic

In [None]:
candidates_bertbasic = summarization_all_files_bert(pp_conc, bertbasic, model_name='bert_basic', section='conc', SENTENCES_COUNT=3)

## Scibert Summ

In [None]:
candidates_custombert = summarization_all_files_bert(pp_conc, custom_model, model_name='custom_bert', section='conc', SENTENCES_COUNT=3)

In [51]:
print("Evaluation TextRank Results")
evaluation(candidates=candidates_text, references=pp_conc['pp_reference'], sources=pp_conc['pp_source'], algorithm="text", section="conc")
print("\nEvaluation LexRank Results")
evaluation(candidates=candidates_lex, references=pp_conc['pp_reference'], sources=pp_conc['pp_source'], algorithm="lex", section="conc")
print("\nEvaluation Sumbasic Results")
evaluation(candidates=candidates_sumbasic, references=pp_conc['pp_reference'], sources=pp_conc['pp_source'], algorithm="sumbasic", section="conc")
print("\nEvaluation Sumbasic Results")
evaluation(candidates=candidates_bertbasic, references=pp_conc['pp_reference'], sources=pp_conc['pp_source'], algorithm="bertbasic", section="conc")
print("\nEvaluation SciBERT Summ Results")
evaluation(candidates=candidates_custombert, references=pp_conc['pp_reference'], sources=pp_conc['pp_source'], algorithm="custombert", section="conc")


Evaluation TextRank Results
0
1000
2000
3000
4000

Evaluation LexRank Results
0
1000
2000
3000
4000

Evaluation Sumbasic Results
0
1000
2000
3000
4000


# All text

## LexRank

In [None]:
candidates_lex = summarization_all_files(pp_all, model='lex', section='all', SENTENCES_COUNT=3)

## TextRank

In [None]:
candidates_text = summarization_all_files(pp_all, model='texrank', section='all', SENTENCES_COUNT=3)

## SumBasic

In [52]:
candidates_sumbasic = summarization_all_files( pp_all, model='sumbasic', section='all', SENTENCES_COUNT=3)

0
1000
2000
3000
4000


## BERT Basic

In [None]:
candidates_bertbasic = summarization_all_files_bert(pp_all, bertbasic, model_name='bert_basic', section='all', SENTENCES_COUNT=3)

## SciBERT Summ

In [None]:
candidates_custombert = summarization_all_files_bert(pp_all, custom_model, model_name='custom_bert', section='all', SENTENCES_COUNT=3)

In [53]:
print("Evaluation TextRank Results")
evaluation(candidates=candidates_text, references=pp_all['pp_reference'], sources=pp_all['pp_source'], algorithm="text", section="all")
print("\nEvaluation LexRank Results")
evaluation(candidates=candidates_lex, references=pp_all['pp_reference'], sources=pp_all['pp_source'], algorithm="lex", section="all")
print("\nEvaluation Sumbasic Results")
evaluation(candidates=candidates_sumbasic, references=pp_all['pp_reference'], sources=pp_all['pp_source'], algorithm="sumbasic", section="all")
print("\nEvaluation Sumbasic Results")
evaluation(candidates=candidates_bertbasic, references=pp_all['pp_reference'], sources=pp_all['pp_source'], algorithm="bertbasic", section="all")
print("\nEvaluation SciBERT Summ Results")
evaluation(candidates=candidates_custombert, references=pp_all['pp_reference'], sources=pp_all['pp_source'], algorithm="custombert", section="all")


Evaluation TextRank Results
0
1000
2000
3000
4000

Evaluation LexRank Results
0
1000
2000
3000
4000

Evaluation Sumbasic Results
0
1000
2000
3000
4000
