In [1]:
import pandas as pd
import numpy as np

from nltk.corpus import stopwords
stop_word = stopwords.words('english')

from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
  
ps = PorterStemmer()

In [2]:
def bold(candidate, common_words):
    
    words = candidate.split(" ")
    
    for i in range(len(words)):
        for j in range(len(common_words)):
        
            if common_words[j] == words[i]:
            
                new_word = "\033[1m" + words[i] + "\033[0m"
                words[i] = new_word
         
    return words

In [3]:
def pp_text(text):
    
    text = text.lower()
    text = text.replace(".", " . ")
    text = text.replace(",", " , ")
    text = text.replace(";", " ; ")
    text = text.replace(":", " : ")

    
    return text

In [4]:
def rev_pp_text(text):
    
    text = text.lower()
    text = text.replace(" . ", ".")
    text = text.replace(" , ", ",")
    text = text.replace(" ; ", ";")
    text = text.replace(" : ", ":")
    text = text.replace(" = ", "=")
    text = text.replace(" + ", "+")
    
    words = text.split(" ")
    words = list(filter(None, words))
    
    text = " ".join(words)
    
    return text

In [5]:
def main_bold(reference, candidate):
    
    reference = rev_pp_text(reference)
    candidate = rev_pp_text(candidate)
    
    pp_reference = pp_text(reference)
    pp_candidate = pp_text(candidate)
    
    words_reference = pp_reference.split(" ")
    words_candidade = pp_candidate.split(" ")

    common_words = list(set(words_reference) & set(words_candidade))

    common_words = [i for i in common_words if not i in stop_word]
    common_words = list(filter(None, common_words))

    bold_text = bold(candidate, common_words)
    bold_text = " ".join(bold_text)
    
    bold_text = rev_pp_text(bold_text)

    return bold_text, common_words

In [6]:
def load_summaries(sections, experiments, path_to_read):
    
    results = {}
    
    for section in sections:
        aux = {}
        for experiment in experiments:
            aux[experiment] =  pd.read_csv("{}/summaries/{}.csv".format(path_to_read, section, experiment))
        results[section] = aux
        
    return results

In [7]:
def get_rouges(section, experiment, columns, summ):
    
    r1 = results[section][experiment][columns[0]][summ]
    r2 = results[section][experiment][columns[1]][summ]
    rl = results[section][experiment][columns[2]][summ]
               
    return r1, r2, rl

In [8]:
def print_results(results, summ):
    reference = results['comb']['ex1']['references'][summ]

    print("\033[1m REFERENCE \033[0m : \n {}\n\n".format(reference))

    #candidate = results['comb']['ex1']['cb'][summ]
    #r1, r2, rl = get_rouges('comb', 'ex1', ['cb_rouge-1', 'cb_rouge-2', 'cb_rouge-l'], summ)
    #bold_text, common_words = main_bold(reference, candidate)

    #print("\033[1m CANDIDATE - CB\033[0m : \n {}".format(bold_text))
    #print("\nNúmero de palavras em comum: {}".format(len(common_words)))
    #print("\nR1: {}, R2: {}, RL: {}\n\n".format(r1, r2, rl))
    
    candidate = results['comb']['ex1']['knn'][summ]
    r1, r2, rl = get_rouges('comb', 'ex1', ['knn_rouge-1', 'knn_rouge-2', 'knn_rouge-l'], summ)
    bold_text, common_words = main_bold(reference, candidate)

    print("\033[1m CANDIDATE - kNN\033[0m : \n {}".format(bold_text))
    print("\nNúmero de palavras em comum: {}".format(len(common_words)))
    print("\nR1: {}, R2: {}, RL: {}\n\n".format(r1, r2, rl))
    
    candidate = results['comb']['ex1']['rf'][summ]
    r1, r2, rl = get_rouges('comb', 'ex1', ['rf_rouge-1', 'rf_rouge-2', 'rf_rouge-l'], summ)
    bold_text, common_words = main_bold(reference, candidate)

    print("\033[1m CANDIDATE - RF\033[0m : \n {}".format(bold_text))
    print("\nNúmero de palavras em comum: {}".format(len(common_words)))
    print("\nR1: {}, R2: {}, RL: {}\n\n".format(r1, r2, rl))
    
    candidate = results['comb']['ex1']['ab'][summ]
    r1, r2, rl = get_rouges('comb', 'ex1', ['ab_rouge-1', 'ab_rouge-2', 'ab_rouge-l'], summ)
    bold_text, common_words = main_bold(reference, candidate)

    print("\033[1m CANDIDATE - AB\033[0m : \n {}".format(bold_text))
    print("\nNúmero de palavras em comum: {}".format(len(common_words)))
    print("\nR1: {}, R2: {}, RL: {}\n\n".format(r1, r2, rl))
    
    candidate = results['comb']['ex1']['cb'][summ]
    r1, r2, rl = get_rouges('comb', 'ex1', ['cb_rouge-1', 'cb_rouge-2', 'cb_rouge-l'], summ)
    bold_text, common_words = main_bold(reference, candidate)

    print("\033[1m CANDIDATE - CB\033[0m : \n {}".format(bold_text))
    print("\nNúmero de palavras em comum: {}".format(len(common_words)))
    print("\nR1: {}, R2: {}, RL: {}\n\n".format(r1, r2, rl))
    
    candidate = results['comb']['ex1']['gb'][summ]
    r1, r2, rl = get_rouges('comb', 'ex1', ['gb_rouge-1', 'gb_rouge-2', 'gb_rouge-l'], summ)
    bold_text, common_words = main_bold(reference, candidate)

    print("\033[1m CANDIDATE - GB\033[0m : \n {}".format(bold_text))
    print("\nNúmero de palavras em comum: {}".format(len(common_words)))
    print("\nR1: {}, R2: {}, RL: {}\n\n".format(r1, r2, rl))
    
    candidate = results['comb']['ex1']['mlp'][summ]
    r1, r2, rl = get_rouges('comb', 'ex1', ['mlp_rouge-1', 'mlp_rouge-2', 'mlp_rouge-l'], summ)
    bold_text, common_words = main_bold(reference, candidate)

    print("\033[1m CANDIDATE - MLP\033[0m : \n {}".format(bold_text))
    print("\nNúmero de palavras em comum: {}".format(len(common_words)))
    print("\nR1: {}, R2: {}, RL: {}\n\n".format(r1, r2, rl))

    df = pd.read_csv("../baselines/summaries/app2_combination.csv")
    candidate = df['max_rouge'][summ]
    r1, r2, rl = df['max_rouge_rouge-1'][summ], df['max_rouge_rouge-2'][summ], df['max_rouge_rouge-l'][summ]
    bold_text, common_words = main_bold(reference, candidate)

    print("\033[1m CANDIDATE - MAX\033[0m : \n {}".format(bold_text))
    print("\nNúmero de palavras em comum: {}".format(len(common_words)))
    print("\nR1: {}, R2: {}, RL: {}\n\n".format(r1, r2, rl))

    df = pd.read_csv("../baselines/summaries/app4_combination.csv")
    candidate = df['first_three'][summ]
    r1, r2, rl = df['first_three_rouge-1'][summ], df['first_three_rouge-2'][summ], df['first_three_rouge-l'][summ]
    bold_text, common_words = main_bold(reference, candidate)

    print("\033[1m CANDIDATE - FT\033[0m : \n {}".format(bold_text))
    print("\nNúmero de palavras em comum: {}".format(len(common_words)))
    print("\nR1: {}, R2: {}, RL: {}\n\n".format(r1, r2, rl))
    

         
    

In [9]:
path_to_read = '/scratch/cinthiasouza/mv-text-summarizer/notebook/mv_models_f1/test_10'

In [10]:
sections = ['comb']
experiments = ['ex1', 'ex2', 'ex3', 'ex4', 'ex5']

In [11]:
results = load_summaries(sections, experiments, path_to_read)

In [60]:
sv_ex1 = results['comb']['ex1']

In [12]:
mv_ex1 = results['comb']['ex1']

In [13]:
x = ['knn_rouge-1', 'rf_rouge-1',  'ab_rouge-1', 'cb_rouge-1', 'gb_rouge-1', 'mlp_rouge-1']
names =  ['kNN', 'RF',  'AB', 'CB', 'GB', 'MLP']

In [39]:
max_df = pd.read_csv("../baselines/summaries/app2_combination.csv")

In [72]:
min_df = pd.read_csv("../baselines/summaries/app3_combination.csv")

In [41]:
FT_df = pd.read_csv("../baselines/summaries/app4_combination.csv")

In [76]:
LEX_df = pd.read_csv("../baselines/lex_result_comb.csv")

In [79]:
import plotly.graph_objects as go

fig = go.Figure()

fig.add_trace(go.Violin(y=max_df['max_rouge_rouge-1'],
                        name="MAX",
                        box_visible=True,
                        meanline_visible=True))

fig.add_trace(go.Violin(y=min_df['min_rouge_rouge-1'],
                        name="MIN",
                        box_visible=True,
                        meanline_visible=True))

fig.add_trace(go.Violin(y=FT_df['first_three_rouge-1'],
                        name="FT",
                        box_visible=True,
                        meanline_visible=True))

fig.add_trace(go.Violin(y=LEX_df['lex_rouge-1'],
                        name="LexRank",
                        box_visible=True,
                        meanline_visible=True))

fig.add_trace(go.Violin(y=results['comb']['ex1']['knn_rouge-1'],
                        name="kNN",
                        box_visible=True,
                        meanline_visible=True))

fig.add_trace(go.Violin(y=results['comb']['ex1']['rf_rouge-1'],
                        name="RF",
                        box_visible=True,
                        meanline_visible=True))

fig.add_trace(go.Violin(y=results['comb']['ex1']['ab_rouge-1'],
                        name="AB",
                        box_visible=True,
                        meanline_visible=True))

fig.add_trace(go.Violin(y=results['comb']['ex1']['cb_rouge-1'],
                        name="CB",
                        box_visible=True,
                        meanline_visible=True))

fig.add_trace(go.Violin(y=results['comb']['ex1']['gb_rouge-1'],
                        name="GB",
                        box_visible=True,
                        meanline_visible=True))

fig.add_trace(go.Violin(y=results['comb']['ex1']['mlp_rouge-1'],
                        name="MLP",
                        box_visible=True,
                        meanline_visible=True))



fig.show()

In [12]:
aux = pd.DataFrame({})
aux['knn'] = results['comb']['ex1']['knn_rouge-1']
aux['rf']=results['comb']['ex1']['rf_rouge-1']
aux['ab']=results['comb']['ex1']['ab_rouge-1']
aux['cb']=results['comb']['ex1']['cb_rouge-1']
aux['gb']=results['comb']['ex1']['gb_rouge-1']
aux['mlp']=results['comb']['ex1']['mlp_rouge-1']

In [61]:
std_mv1 = aux.T.std().tolist()

In [154]:
np.mean(std_mv2)

0.033108170371695056

In [14]:
import plotly.graph_objects as go
import numpy as np


colors = ['rgba(255, 182, 193, .9)', 'rgba(174, 250, 110, .9)', 'rgba(250, 71, 40, .9)', 'rgba(103, 206, 255, .9)',
         'rgba(172, 99, 230, .9)', 'rgba(230, 204, 33, .9)']

fig = go.Figure()

for idx, row in aux[:50].iterrows():
    j = 0
    fig.add_trace(go.Box(
            x=['Summary_{}'.format(idx)], y=list(row)
        ))
    j+=1


# Set options common to all traces with fig.update_traces

#fig.update(layout_yaxis_range = [0.1,0.75])
fig.show()

In [28]:
import plotly.graph_objects as go
import numpy as np


colors = ['rgba(255, 182, 193, .9)', 'rgba(174, 250, 110, .9)', 'rgba(250, 71, 40, .9)', 'rgba(103, 206, 255, .9)',
         'rgba(172, 99, 230, .9)', 'rgba(230, 204, 33, .9)']

fig = go.Figure()

for idx, row in aux[:50].iterrows():
    j = 0
    for i in list(row):
        fig.add_trace(go.Scatter(
            x=['Summary_{}'.format(idx)], y=[i],
            mode='markers',
            marker_color= colors[j]
        ))
        j+=1


# Set options common to all traces with fig.update_traces

fig.update_traces(mode='markers', marker_line_width=2, marker_size=10)
fig.update(layout_yaxis_range = [0.1,0.75])
fig.update_layout( xaxis = dict(
                        tickfont=dict(family='Helvetica', size=15, color='black')),
                 yaxis = dict(
                        tickfont=dict(family='Helvetica', size=15, color='black')))
fig.update_xaxes(color='black') 
fig.update_yaxes(color='black') 
fig.show()

In [68]:
import plotly.graph_objects as go

fig = go.Figure()

fig.add_trace(go.Violin(y=std_mv1,
                        name="METS-F1",
                        box_visible=True,
                        meanline_visible=True))

fig.add_trace(go.Violin(y=std_mv2,
                        name="METS-F2",
                        box_visible=True,
                        meanline_visible=True))

fig.show()




# Exemplo 1

In [97]:
results['comb']['ex1'].loc[344:345]

Unnamed: 0,knn,rf,ab,gb,cb,mlp,articles,references,knn_rouge-1,knn_rouge-2,...,ab_rouge-l,gb_rouge-1,gb_rouge-2,gb_rouge-l,cb_rouge-1,cb_rouge-2,cb_rouge-l,mlp_rouge-1,mlp_rouge-2,mlp_rouge-l
344,Improved chromatographic methods coupled with ...,Addition of basic and fixed charge moieties ha...,Addition of basic and fixed charge moieties ha...,Addition of basic and fixed charge moieties ha...,Addition of basic and fixed charge moieties ha...,Improved chromatographic methods coupled with ...,PMC5624212.json,we evaluate the impact of carbamylation of the...,0.57874,0.197628,...,0.310003,0.613169,0.231405,0.333412,0.55,0.223938,0.31545,0.536765,0.184502,0.315727
345,"On the basis of the published evidence, the Ad...","On the basis of the published evidence, the Ad...","On the basis of the published evidence, the Ad...","On the basis of the published evidence, the Ad...","On the basis of the published evidence, the Ad...","On the basis of the published evidence, the Ad...",PMC3273643.json,background therapeutic hypothermia has been sh...,0.492126,0.26087,...,0.301852,0.528651,0.248609,0.366145,0.492701,0.227106,0.351057,0.424544,0.156406,0.271631


In [177]:
print_results(results, 20)

[1m REFERENCE [0m : 
 past research has observed inverse associations between neighborhood and personal level measures of socioeconomic status and body mass index (bmi), but has not assessed how personal and neighborhood level measures might interact together to predict bmi. using a sample of 13,102 adult residents of new york city who participated in a health survey, cross sectional multi level analyses assessed whether personal income, education and zip code level poverty rates were associated with bmi. demographic, income, education and objectively measured height and weight data were collected in the survey and poverty rates and the proportion of black and hispanic residents in the subjects zip code were retrieved from the census. zip code level population density and land use mix, indices of neighborhood walk ability which are often higher in lower income neighborhoods and are associated with lower bmi, were also measured. after controlling for individual and zip code level demo

In [64]:
print_results(results, 900)

[1m REFERENCE [0m : 
 the present study examined the morphological development of the otolith vestibular receptors in quail. here we describe epithelial growth, hair cell density, stereocilia polarization, and afferent nerve innervation during development. the otolith maculae epithelial areas increased exponentially throughout embryonic development reaching asymptotic values near post hatch day p7. increases in hair cell density were dependent upon macular location; striolar hair cells developed first followed by hair cells in extrastriola regions. stereocilia polarization was initiated early, with defining reversal zones forming at e8. less than half of all immature hair cells observed had non polarized internal kinocilia with the remaining exhibiting planar polarity. immunohistochemistry and neural tracing techniques were employed to examine the shape and location of the striolar regions. initial innervation of the maculae was by small fibers with terminal growth cones at e6, follo

In [86]:
print_results(results, 500)

[1m REFERENCE [0m : 
 objective to estimate if live birth in single blastocyst transfers is correlated with the number of sibling supernumerary vitrified blastocysts (embryos not transferred) generated from that same cycle. design retrospective cohort study. setting a large academic art clinic. patients all single blastocyst transfers in of a sart grade good embryo. interventions none. main outcome measures implantation and live birth. results single blastocyst transfers met inclusion criteria. implantation occurred in 65% and live birth in 54% of cycles. in chi square analysis, patients with supernumerary vitrified blastocysts had statistically higher implantation rate (65% versus 50%) and live birth rate (56% versus 41%) when compared to patients without supernumerary blastocysts. univariate logistic regression demonstrated an increase in implantation (or 1.09, 95%ci 1.031.15) and live birth (or 1.06, 95%ci 1.021.09) with increasing number of supernumerary blastocysts. multivariate

[1m CANDIDATE - MAX[0m : 
 while the absence or presence of [1msupernumerary[0m embryos is [1mcorrelated[0m to implantation, we are not aware of any data examining the correlation of the [1mnumber[0m of [1msupernumerary[0m embryos on [1mimplantation[0m in [1msingle[0m [1mblastocyst[0m transfers. the [1mobjective[0m of this [1mstudy[0m was to [1mestimate[0m if [1mlive[0m [1mbirth[0m in [1msingle[0m [1mblastocyst[0m [1mtransfers[0m was [1mcorrelated[0m with the [1mnumber[0m of [1msibling[0m [1msupernumerary[0m [1mvitrified[0m [1mblastocysts[0m [1mgenerated[0m from that same cycle. we hypothesized that the [1mnumber[0m of [1msupernumerary[0m [1mblastocysts[0m available for vitrification would [1mpositively[0m correlate with [1mimplantation[0m and [1mlive[0m [1mbirth[0m in [1msingle[0m [1mblastocyst[0m [1mtransfers[0m .for most patients, oral contraceptive treatment was initiated 19 days prior to stimulation. for gnrh antago

In [88]:
print_results(results, 670)

[1m REFERENCE [0m : 
 objective to validate a model previously developed using the shared equal access regional cancer hospital (search) database to predict the risk of aggressive recurrence after surgery, defined as a prostate specific antigen (psa) doubling time (dt) of < months, incorporating pathological stage, preoperative psa level and pathological gleason sum, that had an area under the curve (auc) of 0.79 using a cohort of men from the duke prostate center (dpc). patients and methods data were included from men from the dpc database who underwent rp for node negative prostate cancer between and 2003. of these men, had disease recurrence, with a psadt of < months, while either did not have a recurrence but had months of follow up or had a recurrence with a psadt of months. we examined the ability of the search model to predict aggressive recurrence within the dpc cohort, and examined the correlation between the predicted risk of aggressive recurrence and the actual outcome wit

[1m CANDIDATE - MAX[0m : 
 however, the [1mpsadt[0m is not known until [1mmen[0m have already had a [1mrecurrence[0m and have been followed for [1mmonths[0m to years. in a previous study that [1mincluded[0m [1mmen[0m from the [1mshared[0m [1mequal[0m [1maccess[0m [1mregional[0m [1mcancer[0m [1mhospital[0m [1m(search)[0m cohort,we identified three factors that were available at the [1mtime[0m of [1msurgery[0m that could be [1mused[0m to identify [1mmen[0m who were at [1mrisk[0m of an [1maggressive[0m [1mrecurrence[0m (defined as [1mrecurrence[0m with a [1mpsadt[0m of [1m<[0m 9 months), i.e. [1mpreoperative[0m [1mpsa[0m level, [1mpathological[0m [1mgleason[0m sum, and [1mpathological[0m data.therefore, we sought to [1mvalidate[0m this [1mmodel[0m [1musing[0m a [1mcohort[0m of [1mmen[0m treated at the [1mduke[0m [1mprostate[0m [1mcenter[0m (dpc), a tertiary care referral centre, by examining the [1mability[0m of t

In [89]:
print_results(results, 900)

[1m REFERENCE [0m : 
 the present study examined the morphological development of the otolith vestibular receptors in quail. here we describe epithelial growth, hair cell density, stereocilia polarization, and afferent nerve innervation during development. the otolith maculae epithelial areas increased exponentially throughout embryonic development reaching asymptotic values near post hatch day p7. increases in hair cell density were dependent upon macular location; striolar hair cells developed first followed by hair cells in extrastriola regions. stereocilia polarization was initiated early, with defining reversal zones forming at e8. less than half of all immature hair cells observed had non polarized internal kinocilia with the remaining exhibiting planar polarity. immunohistochemistry and neural tracing techniques were employed to examine the shape and location of the striolar regions. initial innervation of the maculae was by small fibers with terminal growth cones at e6, follo

[1m CANDIDATE - MAX[0m : 
 [1mcalyx[0m [1mafferents[0m contain only [1mcalyceal[0m [1mterminals[0m to innervate [1mtype[0m i [1mhair[0m cells. in contrast, [1mbouton[0m [1mafferents[0m are exclusively [1mlocated[0m in the [1mextrastriola[0m [1mregions[0m and a unique [1mband[0m of [1mtype[0m [1mii[0m [1mcells[0m that flanks the [1mreversal[0m line of the utricle. here, we focused on [1mhair[0m [1mcell[0m distribution, [1mstereocilia[0m polarization, and [1mafferent[0m [1mnerve[0m [1minnervation[0m of the [1motolith[0m maculae.20155736 the total [1mareas[0m of the utricular and saccular receptor epithelia were measured (n=3 16 maculae) for each developmental [1mday[0m from e6e15, p0 (hatch), and for [1mpost[0m [1mhatch[0m days p1 (24 hours [1mpost[0m hatch), p7, and p48 (adult). initially, both the utricular and saccular [1mmaculae[0m [1mincreased[0m in area at about the same rate during [1mearly[0m [1membryonic[0m develo