In [13]:
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals

import sys
sys.path.insert(0, '/scratch/cinthiasouza/mv-text-summarizer')

from sumy.sumy.parsers.plaintext import PlaintextParser
from sumy.sumy.nlp.tokenizers import Tokenizer
from sumy.sumy.summarizers.lsa import LsaSummarizer as SummarizerLsa
from sumy.sumy.summarizers.lex_rank import LexRankSummarizer as SummarizerLex
from sumy.sumy.summarizers.sum_basic import SumBasicSummarizer as SummarizerSumBasic
from sumy.sumy.summarizers.text_rank import TextRankSummarizer  as SummarizerTextrank
from sumy.sumy.summarizers.lsa import LsaSummarizer  as SummarizerLsa
from sumy.sumy.nlp.stemmers import Stemmer
from sumy.sumy.utils import get_stop_words

LANGUAGE = "english"
SENTENCES_COUNT=3

In [15]:
import pandas as pd

In [14]:
import re

from nltk.stem import PorterStemmer
from bs4 import BeautifulSoup
from src import tokenizer
from src import extract_features
import json
import os
import numpy as np
import pandas as pd

from src import preprocess
from src import loader
from src import utils
from src import summarization

In [22]:
def pp_text(text):
    
    xml = preprocess.format_xml(str(text))
    text = preprocess.format_text(str(text), post_processing=False)

    bibs = extract_features.get_citations(xml)
    text = preprocess.replace_bib(text, bibs)
    text = preprocess.format_text(text, post_processing=True)

    soup = BeautifulSoup(text)
    text = soup.get_text()
    
    return text

In [23]:
def get_summary(df, SENTENCES_COUNT):
    
    sentences  = df.sort_values('ratings')[:SENTENCES_COUNT]['sentences']
    
    return " ".join(sentences)

In [24]:
def create_summaries_lsa(text):

    parser = PlaintextParser(text, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizerlsa = SummarizerLsa(stemmer)
    summarizerlsa.stop_words = get_stop_words(LANGUAGE)

    result = summarizerlsa(parser.document, SENTENCES_COUNT)
    
    aux = result.copy()
    summary_lsa = get_summary(aux, SENTENCES_COUNT)
    
    return summary_lsa

In [25]:
def create_summaries_lex(text):

    parser = PlaintextParser(text, Tokenizer(LANGUAGE))
    
    stemmer = Stemmer(LANGUAGE)
    summarizerLex = SummarizerLex(stemmer)
    summarizerLex.stop_words = get_stop_words(LANGUAGE)

    result = summarizerLex(parser.document, SENTENCES_COUNT)
    
    aux = result.copy()
    summary_lex = get_summary(aux, SENTENCES_COUNT)
    
    return summary_lex

In [26]:
def create_summaries_text(text):

    parser = PlaintextParser(text, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizerLex = SummarizerLex(stemmer)
    summarizerLex.stop_words = get_stop_words(LANGUAGE)

    result = summarizerLex(parser.document, SENTENCES_COUNT)
    
    aux = result.copy()
    summary_text = get_summary(aux, SENTENCES_COUNT)
    
    return summary_text

In [28]:
import pickle

with open('dataset/dataset_{}.pkl'.format('features'), 'rb') as fp:
    dataset = pickle.load(fp)

In [29]:
df = {}
sections=['introduction', 'materials', 'conclusion', 'concat']

for section in sections:
    
    grouped = dataset[section][5].groupby('articles')
    texts = [' '.join(group['sentences']) for idx, group in grouped]
    
    df['articles'] =  [idx for idx, group in grouped]
    df[section] = texts 

In [30]:
references_df = pd.read_csv("dataset/references_df.csv")

In [31]:
df = pd.DataFrame(df)
df = df.merge(references_df, on='articles')
df.to_csv("baselines/lex_text_input.csv", index=False)

In [32]:
df['introduction'] = df['introduction'].apply(pp_text)
df['materials'] = df['materials'].apply(pp_text)
df['conclusion'] = df['conclusion'].apply(pp_text)
df['concat'] = df['concat'].apply(pp_text)

In [33]:
df['text_intro'] =  df['introduction'].apply(create_summaries_text)
df['lex_intro'] =  df['introduction'].apply(create_summaries_lex)

In [34]:
df['text_mat'] =  df['materials'].apply(create_summaries_text)
df['lex_mat'] =  df['materials'].apply(create_summaries_lex)

In [35]:
df['text_conc'] =  df['conclusion'].apply(create_summaries_text)
df['lex_conc'] =  df['conclusion'].apply(create_summaries_lex)

In [36]:
df['text_concat'] =  df['concat'].apply(create_summaries_text)
df['lex_concat'] =  df['concat'].apply(create_summaries_lex)

KeyboardInterrupt: 

In [9]:
def evaluate_summaries(df, name_models, metrics):

    vfunc = np.vectorize(summarization.rouge_metrics)
    
    for name_model in name_models:
        
        df['{}_r1'.format(name_model)],df['{}_r2'.format(name_model)],df['{}_rl'.format(name_model)] = vfunc(df[name_model], df['references'])
        
    return df

In [38]:
import numpy as np

In [39]:
df.to_csv("baselines/lex_text_summaries.csv", index=False)

In [4]:
df = pd.read_csv("baselines/lex_text_summaries.csv")

In [40]:
metrics=['rouge-1', 'rouge-2', 'rouge-l']

In [41]:
name_models = ['text_intro', 'lex_intro', 'text_mat', 'lex_mat', 'text_conc', 'lex_conc']
result= summarization.evaluate_summariesv2(df, name_models, metrics)

In [42]:
summaries_comb = pd.DataFrame()
summaries_comb['references'] = result['references']
summaries_comb['lex'] = result['lex_intro'] + result['lex_mat'] + result['lex_conc']
summaries_comb['text'] = result['text_intro'] + result['text_mat'] + result['text_conc']

In [43]:
name_models = ['text', 'lex']
result_comb = summarization.evaluate_summariesv2(summaries_comb, name_models, metrics)

In [47]:
summaries_comb

Unnamed: 0,references,lex,text,text_rouge-1,text_rouge-2,text_rouge-l,lex_rouge-1,lex_rouge-2,lex_rouge-l
0,context: evidence suggests that babies' fat ma...,Few studies have examined the long term conseq...,Few studies have examined the long term conseq...,0.326693,0.064000,0.202958,0.326693,0.064000,0.202958
1,summary information on the development and fun...,"In humans, this region continues developing th...","In humans, this region continues developing th...",0.327381,0.071856,0.244997,0.327381,0.071856,0.244997
2,"the present study examined ethnic, gender and ...",Adolescence is a period of significant develop...,Adolescence is a period of significant develop...,0.318471,0.051282,0.216704,0.318471,0.051282,0.216704
3,adenotonsillectomy (at) is among the most comm...,The groups also showed no baseline differences...,The groups also showed no baseline differences...,0.342391,0.027322,0.183325,0.342391,0.027322,0.183325
4,the performance of high sensitivity x ray imag...,"Similar to Roos, several other authors have us...","Similar to Roos, several other authors have us...",0.426614,0.113949,0.226958,0.426614,0.113949,0.226958
...,...,...,...,...,...,...,...,...,...
919,abnormal tau hyperphosphorylation and its aggr...,"Indeed, NFTs are observed early in the pathoge...","Indeed, NFTs are observed early in the pathoge...",0.411379,0.070330,0.229748,0.411379,0.070330,0.229748
920,research indicates that sleep duration and qua...,Morning chronotype may be related to successfu...,Morning chronotype may be related to successfu...,0.342222,0.066964,0.197288,0.342222,0.066964,0.197288
921,background: as the prevalence of depression is...,. Hair cortisol may be a particularly useful b...,. Hair cortisol may be a particularly useful b...,0.341365,0.129032,0.258946,0.341365,0.129032,0.258946
922,beats are among the basic units of perceptual ...,"To test signal robustness, external time domai...","To test signal robustness, external time domai...",0.317949,0.036082,0.175050,0.317949,0.036082,0.175050


In [46]:
result_comb

Unnamed: 0,references,lex,text,text_rouge-1,text_rouge-2,text_rouge-l,lex_rouge-1,lex_rouge-2,lex_rouge-l
0,context: evidence suggests that babies' fat ma...,Few studies have examined the long term conseq...,Few studies have examined the long term conseq...,0.326693,0.064000,0.202958,0.326693,0.064000,0.202958
1,summary information on the development and fun...,"In humans, this region continues developing th...","In humans, this region continues developing th...",0.327381,0.071856,0.244997,0.327381,0.071856,0.244997
2,"the present study examined ethnic, gender and ...",Adolescence is a period of significant develop...,Adolescence is a period of significant develop...,0.318471,0.051282,0.216704,0.318471,0.051282,0.216704
3,adenotonsillectomy (at) is among the most comm...,The groups also showed no baseline differences...,The groups also showed no baseline differences...,0.342391,0.027322,0.183325,0.342391,0.027322,0.183325
4,the performance of high sensitivity x ray imag...,"Similar to Roos, several other authors have us...","Similar to Roos, several other authors have us...",0.426614,0.113949,0.226958,0.426614,0.113949,0.226958
...,...,...,...,...,...,...,...,...,...
919,abnormal tau hyperphosphorylation and its aggr...,"Indeed, NFTs are observed early in the pathoge...","Indeed, NFTs are observed early in the pathoge...",0.411379,0.070330,0.229748,0.411379,0.070330,0.229748
920,research indicates that sleep duration and qua...,Morning chronotype may be related to successfu...,Morning chronotype may be related to successfu...,0.342222,0.066964,0.197288,0.342222,0.066964,0.197288
921,background: as the prevalence of depression is...,. Hair cortisol may be a particularly useful b...,. Hair cortisol may be a particularly useful b...,0.341365,0.129032,0.258946,0.341365,0.129032,0.258946
922,beats are among the basic units of perceptual ...,"To test signal robustness, external time domai...","To test signal robustness, external time domai...",0.317949,0.036082,0.175050,0.317949,0.036082,0.175050


In [45]:
result_comb.describe()

Unnamed: 0,text_rouge-1,text_rouge-2,text_rouge-l,lex_rouge-1,lex_rouge-2,lex_rouge-l
count,924.0,924.0,924.0,924.0,924.0,924.0
mean,0.393194,0.116755,0.24754,0.393194,0.116755,0.24754
std,0.072316,0.07582,0.056203,0.072316,0.07582,0.056203
min,0.16775,0.006006,0.106008,0.16775,0.006006,0.106008
25%,0.345557,0.066057,0.210635,0.345557,0.066057,0.210635
50%,0.390023,0.099211,0.237153,0.390023,0.099211,0.237153
75%,0.43524,0.142999,0.270784,0.43524,0.142999,0.270784
max,0.720403,0.536709,0.590199,0.720403,0.536709,0.590199
