In [1]:
import sys
sys.path.insert(1, '/scratch/cinthiasouza/mv-text-summarizer')

import itertools
import re

import glob, os
import pandas as pd
import json
import spacy
import nltk
import numpy as np
import json
#import smogn
import seaborn as sns
import pickle

from bs4 import BeautifulSoup
from pysbd.utils import PySBDFactory
import math

from sumeval.metrics.rouge import RougeCalculator
rouge = RougeCalculator(stopwords=True, lang="en")
import matplotlib.pyplot as plt
import random
from collections import Counter

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from timeit import default_timer as timer 
from imblearn.over_sampling import SMOTE
from sklearn.metrics import matthews_corrcoef
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from catboost import CatBoostClassifier
#from tensorflow.keras.models import model_from_json

#nltk.download('averaged_perceptron_tagger')
#nltk.download('maxent_ne_chunker')
#nltk.download('words')
#nltk.download('punkt')
#nltk.download('stopwords')

#!python -m spacy download en_core_web_sm
#nlp_sm = spacy.load('en_core_web_sm')

import en_core_web_sm
nlp_md = en_core_web_sm.load()

import en_core_web_md
nlp_md = en_core_web_md.load()
#!python -m spacy download en_core_web_md
#nlp_md = spacy.load('en_core_web_md')

import warnings
warnings.filterwarnings("ignore")

path_base = "/scratch/cinthiasouza/mv-text-summarizer"
path_to_read="/scratch/cinthiasouza/mv-text-summarizer/result/{}/{}_*.csv"

In [2]:
%load_ext autoreload
%autoreload 2

from src import preprocess
from src import extract_features
from src import tokenizer
from src import create_features_df
from src import transform_data
from src import loader
from src import utils
from src import ensemble_tree_models
from src import tunning_hyperparametrs as th
#from src import mlp_regressor
#from src import mlp_classifier
from src import summarization
from src import normalization
from src import ensemble_tree_models as classifiers
from src import utils_classification as utils_clf
from src import evaluate_classifiers as ev
from src import prepare_data
from src import display_results as dr
import joblib
from joblib import Parallel, delayed
#from tensorflow.keras.utils import to_categorical
from src import pipeline_extract_features as pef

In [3]:
from tensorflow.keras.models import model_from_json
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import Precision

In [4]:
def load_keras_model(section, path_to_save):

    json_file = open('{}/mlp_{}.json'.format(path_to_save, section), 'r')
    loaded_model_json = json_file.read()
    json_file.close()
    loaded_model = model_from_json(loaded_model_json)
    # load weights into new model
    loaded_model.load_weights('{}/mlp_{}.h5'.format(path_to_save, section))
    print("Loaded model from disk")
    
    loaded_model.compile(loss='categorical_crossentropy', optimizer=Adam(
                learning_rate=0.001), metrics=[Precision()])
    
    return loaded_model

In [5]:
def load_classifiers(sections, path_to_read, name_models):

    models = {}
    
    for section in sections:
        
        aux = {}
        
        for name_model in name_models:
            if name_model != 'mlp':
                aux[name_model] = joblib.load('{}/{}_{}.pkl'.format(path_to_read, name_model,  section))
            elif name_model == 'mlp':
                aux[name_model] = load_keras_model(section, path_to_read)
            
        models[section] = aux
        
    return models

In [6]:
def remove_ascii(text):
    
    try:
        return re.sub(r'[^\x00-\x7F]+', '', text)
    except:
        return " "

In [7]:
def pipeline_summarization(
    features, scores, references, predictions, section, name_models,
    summ_items, k=3, sort_scores=True, proba=False, ascending=False):


    X_test = features.loc[features['articles'].isin(summ_items)].reset_index(drop=True)
    y_test = scores.loc[scores['articles'].isin(summ_items)].reset_index(drop=True)
    
    vfunc = np.vectorize(remove_ascii)
    X_test['sentences'] = vfunc(X_test['sentences'])

    
    df_proba = summarization.create_df(name_models, X_test, y_test['rouge_1'], predictions, section, proba=proba)
    
    if proba:
        df = summarization.binarize_proba(df_proba.copy(), name_models, k, sort_scores, ascending)
    else:
        df = df_proba.copy()
        
    summaries = summarization.create_summaries(df, references, summ_items, name_models)
    result = summarization.evaluate_summaries(summaries, name_models)
    
    return df_proba, df, summaries, result

In [8]:
def create_df_v2(name_models, x_summ):
    
    df = pd.DataFrame({'sentences': x_summ['sentences'],
                       'rouge_1': list(x_summ['rouge_1']),
                       'articles': x_summ['articles']})
    
    for name_model in name_models:
            df[name_model] = list(x_summ['rouge_1'])
            
    return df

In [9]:
def summarization_target(
    df, references, name_models, summ_items, sort_scores=False, proba=False, ascending=False):
    
    df = create_df_v2(name_models, df)

    df = summarization.binarize_proba(df, name_models, 3, sort_scores, ascending)

    summaries = summarization.create_summaries(df, references, summ_items, name_models)
    result = summarization.evaluate_summaries(summaries, name_models)
    
    return df, summaries, result

In [202]:
with open('dataset5_{}.pkl'.format('features'), 'rb') as fp:
    dataset = pickle.load(fp)
    
columns_name = ['text_rank', 'lex_rank', 'count_one_gram', 'count_one_gram', 'count_one_gram',
        'count_article_keywords',
       'tf-isf', 'position_score', 'paragraph_score',
       'length_score', 'pos_score', 'ner_score', 'dist_centroid']

sections=['introduction', 'materials', 'conclusion']

path_to_save = "/scratch/cinthiasouza/mv-text-summarizer/notebook/models_v5"

In [204]:
sections=['introduction', 'materials', 'conclusion']
name_models = ['knn', 'rf', 'cb', 'mlp']

models = load_classifiers(sections, path_to_save, name_models)

Loaded model from disk
Loaded model from disk
Loaded model from disk


In [205]:
predictions, predictions_proba, results = ev.create_reports(models, dataset, columns_name, name_models, verbose=False)

In [206]:
results

{'introduction': {'knn':              model  precision    recall  f1-score       support
  0              knn   0.920360  0.689158  0.788153  27300.000000
  1              knn   0.288266  0.678578  0.404639   5065.000000
  accuracy       knn   0.687502  0.687502  0.687502      0.687502
  macro avg      knn   0.604313  0.683868  0.596396  32365.000000
  weighted avg   knn   0.821440  0.687502  0.728134  32365.000000,
  'rf':              model  precision    recall  f1-score       support
  0               rf   0.848559  0.997289  0.916932  27300.000000
  1               rf   0.735714  0.040671  0.077081   5065.000000
  accuracy        rf   0.847582  0.847582  0.847582      0.847582
  macro avg       rf   0.792136  0.518980  0.497007  32365.000000
  weighted avg    rf   0.830899  0.847582  0.785498  32365.000000,
  'cb':              model  precision    recall  f1-score       support
  0               cb   0.844021  0.999963  0.915398  27300.000000
  1               cb   0.952381  0.0039

# Summarization

In [21]:
summ_items = pd.read_csv("indices_summ.csv")['summ']

In [16]:
"""summ_items = pd.read_csv("indices_summ.csv")['summ']

path = ["../../PMC002xxxxxx_pp", "../../PMC003xxxxxx_pp", "../../PMC005xxxxxx_pp", "../../PMC006xxxxxx_pp"]
references = []
article_name = []
for i in path:
    
    files = os.listdir(i)
    inter = list(set(files) & set(summ_items))
    
    for j in inter:
        references.append(summarization.get_ref_summary(j, i))
        article_name.append(j)
        
references_df = pd.DataFrame({"articles": article_name, 'references': references})

features_intro, scores_intro = loader.read_features(path="../result/introduction/features_*.csv")
features_mat, scores_mat = loader.read_features(path="../result/materials/features_*.csv")
features_conc, scores_conc = loader.read_features(path="../result/conclusion/features_*.csv")"""

'summ_items = pd.read_csv("indices_summ.csv")[\'summ\']\n\npath = ["../../PMC002xxxxxx_pp", "../../PMC003xxxxxx_pp", "../../PMC005xxxxxx_pp", "../../PMC006xxxxxx_pp"]\nreferences = []\narticle_name = []\nfor i in path:\n    \n    files = os.listdir(i)\n    inter = list(set(files) & set(summ_items))\n    \n    for j in inter:\n        references.append(summarization.get_ref_summary(j, i))\n        article_name.append(j)\n        \nreferences_df = pd.DataFrame({"articles": article_name, \'references\': references})\n\nfeatures_intro, scores_intro = loader.read_features(path="../result/introduction/features_*.csv")\nfeatures_mat, scores_mat = loader.read_features(path="../result/materials/features_*.csv")\nfeatures_conc, scores_conc = loader.read_features(path="../result/conclusion/features_*.csv")'

In [22]:
references_df = pd.read_csv("references_dataset4.csv")

In [23]:
features_intro = pd.read_csv('features_intro.csv')
features_mat = pd.read_csv('features_mat.csv')
features_conc = pd.read_csv('features_conc.csv')

scores_intro = pd.read_csv('scores_intro.csv')
scores_mat = pd.read_csv('scores_mat.csv')
scores_conc = pd.read_csv('scores_conc.csv')

# Experimento 1

Sumarização utilizando os classificadores

In [84]:
pd.unique(df_intro_ex1['articles'])

array(['PMC2757635.json', 'PMC2946068.json', 'PMC2845524.json', ...,
       'PMC2894538.json', 'PMC2767451.json', 'PMC6138536.json'],
      dtype=object)

In [85]:
section = 'introduction'

proba_intro_ex1, df_intro_ex1, summaries_intro_ex1, result_intro_ex1 = pipeline_summarization(
    features_intro, scores_intro, references_df, predictions_proba, section, name_models,
    summ_items, sort_scores=True, proba=True, ascending=False)

In [86]:
section = 'materials'

proba_mat_ex1, df_mat_ex1, summaries_mat_ex1, result_mat_ex1 = pipeline_summarization(
    features_mat, scores_mat, references_df, predictions_proba, section, name_models,
    summ_items, sort_scores=True, proba=True, ascending=False)

In [87]:
section = 'conclusion'

df_proba_conc_ex1, df_conc_ex1, summaries_conc_ex1, result_conc_ex1 = pipeline_summarization(
    features_conc, scores_conc, references_df, predictions_proba, section, name_models,
    summ_items, sort_scores=True, proba=True, ascending=False)

In [243]:
def pipeline_summarization(
    features, scores, references, predictions, section, name_models,
    summ_items, k=3, sort_scores=True, proba=False, ascending=False):
    
    X_test = features
    y_test = scores
    
    vfunc = np.vectorize(remove_ascii)
    X_test['sentences'] = vfunc(X_test['sentences'])

    df_proba = summarization.create_df(name_models, X_test, y_test['rouge_1'], predictions, section, proba=proba)
    
    if proba:
        df = summarization.binarize_proba(df_proba.copy(), name_models, k, sort_scores, ascending)
    else:
        df = df_proba.copy()
        
    summaries = summarization.create_summaries(df, references, summ_items, name_models)
    result = summarization.evaluate_summaries(summaries, name_models)
    
    return df_proba, df, summaries, result

In [244]:
section = 'introduction'

df = dataset[section][5].reset_index(drop=True)
features_intro = df[['sentences', 'articles']]
scores_intro = pd.DataFrame()
scores_intro['rouge_1'] = df['rouge_1']

proba_intro_ex1, df_intro_ex1, summaries_intro_ex1, result_intro_ex1 = pipeline_summarization(
    features_intro, scores_intro, references_df, predictions_proba, section, name_models,
    summ_items, sort_scores=True, proba=True, ascending=False)

In [245]:
section = 'materials'

df = dataset[section][5].reset_index(drop=True)
features_mat = df[['sentences', 'articles']]
scores_mat = pd.DataFrame()
scores_mat['rouge_1'] = df['rouge_1']

proba_mat_ex1, df_mat_ex1, summaries_mat_ex1, result_mat_ex1 = pipeline_summarization(
    features_mat, scores_mat, references_df, predictions_proba, section, name_models,
    summ_items, sort_scores=True, proba=True, ascending=False)

In [246]:
section = 'conclusion'

df = dataset[section][5].reset_index(drop=True)
features_conc = df[['sentences', 'articles']]
scores_conc = pd.DataFrame()
scores_conc['rouge_1'] = df['rouge_1']

df_proba_conc_ex1, df_conc_ex1, summaries_conc_ex1, result_conc_ex1 = pipeline_summarization(
    features_conc, scores_conc, references_df, predictions_proba, section, name_models,
    summ_items, sort_scores=True, proba=True, ascending=False)

In [247]:
summaries_intro_ex1.describe()

Unnamed: 0,knn_r1,knn_r2,knn_rl,rf_r1,rf_r2,rf_rl,cb_r1,cb_r2,cb_rl,mlp_r1,mlp_r2,mlp_rl
count,1689.0,1689.0,1689.0,1689.0,1689.0,1689.0,1689.0,1689.0,1689.0,1689.0,1689.0,1689.0
mean,0.256107,0.097171,0.167135,0.271927,0.103987,0.174626,0.268947,0.101697,0.171426,0.256062,0.096187,0.167813
std,0.086993,0.064588,0.064887,0.089327,0.070218,0.068773,0.088073,0.068612,0.06766,0.086174,0.064182,0.065291
min,0.010471,0.0,0.010471,0.034335,0.0,0.025751,0.035556,0.0,0.026667,0.049587,0.0,0.034091
25%,0.193833,0.050179,0.123077,0.211864,0.054795,0.128655,0.209524,0.054545,0.127451,0.194805,0.052083,0.123711
50%,0.25,0.08658,0.158537,0.266667,0.091429,0.165414,0.265306,0.089286,0.162162,0.25,0.086957,0.156682
75%,0.313131,0.130435,0.201005,0.329268,0.139535,0.208955,0.323077,0.135593,0.205128,0.309179,0.127168,0.201439
max,0.795181,0.666667,0.771084,0.764706,0.701493,0.764706,0.764706,0.701493,0.764706,0.717391,0.6,0.695652


In [248]:
summaries_mat_ex1.describe()

Unnamed: 0,knn_r1,knn_r2,knn_rl,rf_r1,rf_r2,rf_rl,cb_r1,cb_r2,cb_rl,mlp_r1,mlp_r2,mlp_rl
count,1689.0,1689.0,1689.0,1689.0,1689.0,1689.0,1689.0,1689.0,1689.0,1689.0,1689.0,1689.0
mean,0.164424,0.0,0.12024,0.171346,0.0,0.123123,0.168262,0.0,0.118386,0.17212,0.0,0.126519
std,0.120869,0.0,0.105479,0.123345,0.0,0.106887,0.120877,0.0,0.102738,0.125371,0.0,0.11096
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.081633,0.0,0.06015,0.085308,0.0,0.061728,0.085271,0.0,0.059259,0.087432,0.0,0.064171
50%,0.145455,0.0,0.100559,0.153846,0.0,0.103226,0.14876,0.0,0.099237,0.153061,0.0,0.104348
75%,0.221277,0.0,0.146893,0.231023,0.0,0.15,0.227642,0.0,0.14433,0.23053,0.0,0.152284
max,0.862385,0.0,0.862385,0.862385,0.0,0.862385,0.862385,0.0,0.862385,0.862385,0.0,0.862385


In [249]:
summaries_conc_ex1.describe()

Unnamed: 0,knn_r1,knn_r2,knn_rl,rf_r1,rf_r2,rf_rl,cb_r1,cb_r2,cb_rl,mlp_r1,mlp_r2,mlp_rl
count,1689.0,1689.0,1689.0,1689.0,1689.0,1689.0,1689.0,1689.0,1689.0,1689.0,1689.0,1689.0
mean,0.25702,0.118028,0.182031,0.270459,0.118103,0.184509,0.280573,0.12377,0.190924,0.267,0.126804,0.191453
std,0.115989,0.116279,0.107658,0.111505,0.106961,0.100806,0.113344,0.110188,0.101904,0.118945,0.121681,0.11314
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.179104,0.040268,0.113744,0.195652,0.046154,0.120172,0.204545,0.04878,0.125,0.182857,0.044444,0.118721
50%,0.245614,0.083333,0.157635,0.262295,0.089552,0.163424,0.274678,0.095238,0.171053,0.255034,0.087912,0.163522
75%,0.319149,0.151515,0.217391,0.335025,0.155251,0.221106,0.345946,0.162162,0.227979,0.333333,0.162791,0.231481
max,0.890756,0.837607,0.890756,0.835294,0.833333,0.835294,0.835294,0.833333,0.835294,0.787234,0.678571,0.787234


### Combine

In [213]:
summaries_comb_ex1 = summarization.combine_three_summ(summaries_intro_ex1, summaries_mat_ex1, summaries_conc_ex1, references_df, name_models)

In [214]:
result_comb_ex1 = summarization.evaluate_summaries(summaries_comb_ex1, name_models)

In [215]:
result_comb_ex1.describe()

Unnamed: 0,knn_r1,knn_r2,knn_rl,rf_r1,rf_r2,rf_rl,cb_r1,cb_r2,cb_rl,mlp_r1,mlp_r2,mlp_rl
count,1689.0,1689.0,1689.0,1689.0,1689.0,1689.0,1689.0,1689.0,1689.0,1689.0,1689.0,1689.0
mean,0.36581,0.161817,0.219175,0.358944,0.155842,0.210164,0.356502,0.154165,0.20951,0.372054,0.167634,0.22518
std,0.096989,0.096833,0.080916,0.102027,0.091762,0.080891,0.102038,0.091889,0.080869,0.098471,0.100633,0.084651
min,0.071006,0.0,0.047337,0.049587,0.0,0.029038,0.055838,0.0,0.035533,0.076923,0.0,0.050061
25%,0.300469,0.090498,0.162162,0.294118,0.089172,0.154412,0.287625,0.08867,0.152542,0.303571,0.093863,0.166008
50%,0.362416,0.141892,0.204724,0.36019,0.14,0.198758,0.358824,0.13459,0.195918,0.367893,0.142349,0.207254
75%,0.429907,0.214286,0.261981,0.427119,0.20598,0.254181,0.423841,0.203175,0.254902,0.435088,0.220126,0.272446
max,0.683128,0.556017,0.601881,0.708861,0.570213,0.607595,0.708861,0.570213,0.607595,0.695279,0.554113,0.618026


# Experimento 2

Max Rouge Scores

In [250]:
df_intro_ex2, summaries_intro_ex2, result_intro_ex2 = summarization_target(
    df_intro_ex1, references_df, ['max_rouge'], summ_items, sort_scores=True, proba=True, ascending=False)

In [251]:
df_mat_ex2, summaries_mat_ex2, result_mat_ex2 = summarization_target(
    df_mat_ex1, references_df, ['max_rouge'], summ_items, sort_scores=True, proba=True, ascending=False)

In [252]:
df_conc_ex2, summaries_conc_ex2, result_conc_ex = summarization_target(
    df_conc_ex1.copy(), references_df, ['max_rouge'], summ_items, sort_scores=True, proba=True, ascending=False)

In [253]:
summaries_intro_ex2.describe()

Unnamed: 0,max_rouge_r1,max_rouge_r2,max_rouge_rl
count,1689.0,1689.0,1689.0
mean,0.315354,0.148881,0.218688
std,0.095284,0.087043,0.088029
min,0.055046,0.0,0.044199
25%,0.25,0.090909,0.160714
50%,0.307054,0.132701,0.203046
75%,0.37,0.184397,0.256881
max,0.813559,0.793103,0.813559


In [254]:
summaries_mat_ex2.describe()

Unnamed: 0,max_rouge_r1,max_rouge_r2,max_rouge_rl
count,1689.0,1689.0,1689.0
mean,0.220292,0.000592,0.168778
std,0.158207,0.024332,0.152773
min,0.0,0.0,0.0
25%,0.112245,0.0,0.083333
50%,0.197674,0.0,0.133333
75%,0.290909,0.0,0.191617
max,1.0,1.0,1.0


In [255]:
summaries_conc_ex2.describe()

Unnamed: 0,max_rouge_r1,max_rouge_r2,max_rouge_rl
count,1689.0,1689.0,1689.0
mean,0.342265,0.207408,0.266363
std,0.146218,0.179548,0.162993
min,0.0,0.0,0.0
25%,0.242678,0.080808,0.155556
50%,0.321101,0.142857,0.214876
75%,0.414508,0.278481,0.327586
max,1.0,1.0,1.0


# Combine

In [256]:
summaries_comb_ex2 = summarization.combine_three_summ(summaries_intro_ex2, summaries_mat_ex2, summaries_conc_ex2, references_df, ['max_rouge'])
result_comb_ex2 = summarization.evaluate_summaries(summaries_comb_ex2, ['max_rouge'])

In [257]:
result_comb_ex2.describe()

Unnamed: 0,max_rouge_r1,max_rouge_r2,max_rouge_rl
count,1689.0,1689.0,1689.0
mean,0.478175,0.271673,0.319852
std,0.11097,0.138895,0.120285
min,0.101633,0.00885,0.072658
25%,0.4,0.161194,0.226148
50%,0.474474,0.237838,0.295775
75%,0.557143,0.383099,0.396552
max,0.790476,0.698039,0.7393


# Experimento 3 

Min ROUGE scores

In [258]:
df_intro_ex3, summaries_intro_ex3, result_intro_ex3 = summarization_target(
    df_intro_ex1.copy(), references_df, ['min_rouge'], summ_items, sort_scores=True, proba=True, ascending=True)

In [259]:
df_mat_ex3, summaries_mat_ex3, result_mat_ex3 = summarization_target(
    df_mat_ex1.copy(), references_df, ['min_rouge'], summ_items, sort_scores=True, proba=True, ascending=True)

In [260]:
df_conc_ex3, summaries_conc_ex3, result_conc_ex3 = summarization_target(
    df_conc_ex1.copy(), references_df, ['min_rouge'], summ_items, sort_scores=True, proba=True, ascending=True)

In [261]:
summaries_intro_ex3.describe()

Unnamed: 0,min_rouge_r1,min_rouge_r2,min_rouge_rl
count,1689.0,1689.0,1689.0
mean,0.078118,0.011127,0.0534
std,0.062101,0.023339,0.040282
min,0.0,0.0,0.0
25%,0.033898,0.0,0.025
50%,0.066116,0.0,0.047244
75%,0.111111,0.013072,0.072727
max,0.435644,0.26,0.29703


In [262]:
summaries_mat_ex3.describe()

Unnamed: 0,min_rouge_r1,min_rouge_r2,min_rouge_rl
count,1689.0,1689.0,1689.0
mean,0.054739,0.0,0.041504
std,0.08103,0.0,0.066364
min,0.0,0.0,0.0
25%,0.0,0.0,0.0
50%,0.024291,0.0,0.021277
75%,0.07767,0.0,0.057416
max,0.862385,0.0,0.862385


In [263]:
summaries_conc_ex3.describe()

Unnamed: 0,min_rouge_r1,min_rouge_r2,min_rouge_rl
count,1689.0,1689.0,1689.0
mean,0.070818,0.0,0.048874
std,0.092312,0.0,0.065944
min,0.0,0.0,0.0
25%,0.0,0.0,0.0
50%,0.034884,0.0,0.026087
75%,0.104478,0.0,0.069444
max,0.619048,0.0,0.534653


## Combine

In [264]:
summaries_comb_ex3 = summarization.combine_three_summ(summaries_intro_ex3, summaries_mat_ex3, summaries_conc_ex3, references_df, ['min_rouge'])
result_comb_ex3 = summarization.evaluate_summaries(summaries_comb_ex3, ['min_rouge'])

In [265]:
result_comb_ex3.describe()

Unnamed: 0,min_rouge_r1,min_rouge_r2,min_rouge_rl
count,1689.0,1689.0,1689.0
mean,0.13181,0.030816,0.079782
std,0.086786,0.049756,0.056986
min,0.0,0.0,0.0
25%,0.066667,0.0,0.041885
50%,0.118182,0.012618,0.068702
75%,0.180556,0.04,0.102326
max,0.538776,0.516854,0.522222


# Experimento 4

3 primeiras sentenças de cada seção

In [266]:
df_intro_ex4, summaries_intro_ex4, result_intro_ex4 = summarization_target(
    df_intro_ex1.copy(), references_df, ['first_three'], summ_items, sort_scores=False, proba=False, ascending=True)

In [267]:
df_mat_ex4, summaries_mat_ex4, result_mat_ex4 = summarization_target(
    df_mat_ex1.copy(), references_df, ['first_three'], summ_items, sort_scores=False, proba=False, ascending=True)

In [268]:
df_conc_ex4, summaries_conc_ex4, result_conc_ex4 = summarization_target(
    df_conc_ex1.copy(), references_df, ['first_three'], summ_items, sort_scores=False, proba=False, ascending=True)

In [269]:
summaries_intro_ex4.describe()

Unnamed: 0,first_three_r1,first_three_r2,first_three_rl
count,1689.0,1689.0,1689.0
mean,0.156422,0.051535,0.109644
std,0.078781,0.050105,0.056854
min,0.0,0.0,0.0
25%,0.098039,0.017391,0.070707
50%,0.145455,0.03871,0.099291
75%,0.204444,0.071429,0.138889
max,0.469945,0.41989,0.464


In [270]:
summaries_mat_ex4.describe()

Unnamed: 0,first_three_r1,first_three_r2,first_three_rl
count,1689.0,1689.0,1689.0
mean,0.140027,0.0,0.102375
std,0.11002,0.0,0.094406
min,0.0,0.0,0.0
25%,0.069444,0.0,0.051282
50%,0.120219,0.0,0.085561
75%,0.184211,0.0,0.127451
max,0.862385,0.0,0.862385


In [271]:
summaries_conc_ex4.describe()

Unnamed: 0,first_three_r1,first_three_r2,first_three_rl
count,1689.0,1689.0,1689.0
mean,0.226873,0.084297,0.15462
std,0.10778,0.079661,0.08527
min,0.0,0.0,0.0
25%,0.152542,0.030534,0.100629
50%,0.219512,0.066667,0.142857
75%,0.293333,0.113636,0.191489
max,0.910714,0.828402,0.892857


## Combine

In [272]:
summaries_comb_ex4 = summarization.combine_three_summ(summaries_intro_ex4, summaries_mat_ex4, summaries_conc_ex4, references_df, ['first_three'])
result_comb_ex4 = summarization.evaluate_summaries(summaries_comb_ex4, ['first_three'])

In [273]:
result_comb_ex4.describe()

Unnamed: 0,first_three_r1,first_three_r2,first_three_rl
count,1689.0,1689.0,1689.0
mean,0.3014,0.114038,0.18002
std,0.092835,0.079545,0.074439
min,0.021505,0.0,0.021505
25%,0.24,0.061303,0.131274
50%,0.296296,0.097561,0.166667
75%,0.358209,0.144231,0.211111
max,0.681034,0.55914,0.606383


# Summary

In [274]:
result_comb_ex1.describe()

Unnamed: 0,knn_r1,knn_r2,knn_rl,rf_r1,rf_r2,rf_rl,cb_r1,cb_r2,cb_rl,mlp_r1,mlp_r2,mlp_rl
count,1689.0,1689.0,1689.0,1689.0,1689.0,1689.0,1689.0,1689.0,1689.0,1689.0,1689.0,1689.0
mean,0.36581,0.161817,0.219175,0.358944,0.155842,0.210164,0.356502,0.154165,0.20951,0.372054,0.167634,0.22518
std,0.096989,0.096833,0.080916,0.102027,0.091762,0.080891,0.102038,0.091889,0.080869,0.098471,0.100633,0.084651
min,0.071006,0.0,0.047337,0.049587,0.0,0.029038,0.055838,0.0,0.035533,0.076923,0.0,0.050061
25%,0.300469,0.090498,0.162162,0.294118,0.089172,0.154412,0.287625,0.08867,0.152542,0.303571,0.093863,0.166008
50%,0.362416,0.141892,0.204724,0.36019,0.14,0.198758,0.358824,0.13459,0.195918,0.367893,0.142349,0.207254
75%,0.429907,0.214286,0.261981,0.427119,0.20598,0.254181,0.423841,0.203175,0.254902,0.435088,0.220126,0.272446
max,0.683128,0.556017,0.601881,0.708861,0.570213,0.607595,0.708861,0.570213,0.607595,0.695279,0.554113,0.618026


In [275]:
result = pd.concat([result_comb_ex2.describe(), result_comb_ex3.describe() ,result_comb_ex4.describe()], axis=1)
result

Unnamed: 0,max_rouge_r1,max_rouge_r2,max_rouge_rl,min_rouge_r1,min_rouge_r2,min_rouge_rl,first_three_r1,first_three_r2,first_three_rl
count,1689.0,1689.0,1689.0,1689.0,1689.0,1689.0,1689.0,1689.0,1689.0
mean,0.478175,0.271673,0.319852,0.13181,0.030816,0.079782,0.3014,0.114038,0.18002
std,0.11097,0.138895,0.120285,0.086786,0.049756,0.056986,0.092835,0.079545,0.074439
min,0.101633,0.00885,0.072658,0.0,0.0,0.0,0.021505,0.0,0.021505
25%,0.4,0.161194,0.226148,0.066667,0.0,0.041885,0.24,0.061303,0.131274
50%,0.474474,0.237838,0.295775,0.118182,0.012618,0.068702,0.296296,0.097561,0.166667
75%,0.557143,0.383099,0.396552,0.180556,0.04,0.102326,0.358209,0.144231,0.211111
max,0.790476,0.698039,0.7393,0.538776,0.516854,0.522222,0.681034,0.55914,0.606383


In [None]:
from nltk.corpus import stopwords
stop_word = stopwords.words('english')

In [417]:
def bold(candidate, common_words):
    
    words = candidate.split(" ")
    
    for i in range(len(words)):
        for j in range(len(common_words)):
        
            if common_words[j].find(words[i]) != -1:
            
                new_word = "\033[1m" + words[i] + "\033[0m"
                words[i] = new_word
            
    return words

In [423]:
def pp_text(text):
    
    text = text.lower()
    text = text.replace(".", " . ")
    text = text.replace(",", " , ")
    text = text.replace(";", " ; ")
    text = text.replace(":", " : ")
    
    return text

In [402]:
def rev_pp_text(text):
    
    text = text.lower()
    text = text.replace(" . ", ".")
    text = text.replace(" , ", ",")
    text = text.replace(" ; ", ";")
    text = text.replace(" : ", ":")
    text = text.replace(" = ", "=")
    text = text.replace(" + ", "+")
    
    words = text.split(" ")
    words = list(filter(None, words))
    
    text = " ".join(words)
    
    return text

In [424]:
def main_bold(reference, candidate):
    
    reference = rev_pp_text(reference)
    candidate = rev_pp_text(candidate)
    
    pp_reference = pp_text(reference)
    pp_candidate = pp_text(candidate)
    
    words_reference = pp_reference.split(" ")
    words_candidade = pp_candidate.split(" ")

    common_words = list(set(words_reference) & set(words_candidade))

    common_words = [i for i in common_words if not i in stop_word]

    bold_text = bold(candidate, common_words)
    bold_text = " ".join(bold_text)
    
    bold_text = rev_pp_text(bold_text)

    return bold_text, common_words

In [435]:
df_intro_ex1

Unnamed: 0,sentences,rouge_1,articles,knn,rf,cb,mlp
0,Humans have a tendency to nonconsciously and n...,0.054054,PMC2080579.json,0,0,0,0
1,The processes of mimicry seem to rely on a dir...,0.235294,PMC2080579.json,0,0,0,0
2,Evidence is accumulating that perceiving an ac...,0.750000,PMC2080579.json,0,0,0,0
3,This overlap might allow humans to embody the ...,0.074074,PMC2080579.json,0,0,0,0
4,Direct evidence for the embodiment of action c...,0.166667,PMC2080579.json,0,0,0,0
...,...,...,...,...,...,...,...
32360,This goal was considered exploratory because p...,0.200000,PMC6995786.json,0,0,0,0
32361,"In summary, as described in , we developed and...",0.146341,PMC6995786.json,0,1,1,0
32362,The first goal of the present study was to dem...,0.368421,PMC6995786.json,0,0,1,1
32363,The second goal was to establish preliminary e...,0.352941,PMC6995786.json,0,0,0,0


In [436]:
path_to_save = '/scratch/cinthiasouza/mv-text-summarizer/notebook/models_v5/'

'/scratch/cinthiasouza/mv-text-summarizer/notebook/models_v5'

In [None]:
classification/

In [438]:
""""result_comb_ex1.to_csv("{}/summaries/comb_ex1.csv".format(path_to_save), index=False)
result_comb_ex2.to_csv("{}/summaries/comb_ex2.csv".format(path_to_save), index=False)
result_comb_ex3.to_csv("{}/summaries/comb_ex3.csv".format(path_to_save), index=False)
result_comb_ex4.to_csv("{}/summaries/comb_ex4.csv".format(path_to_save), index=False)

result_intro_ex1.to_csv("{}/summaries/intro_ex1.csv".format(path_to_save), index=False)
result_intro_ex2.to_csv("{}/summaries/intro_ex2.csv".format(path_to_save), index=False)
result_intro_ex3.to_csv("{}/summaries/intro_ex3.csv".format(path_to_save), index=False)
result_intro_ex4.to_csv("{}/summaries/intro_ex4.csv".format(path_to_save), index=False)

result_mat_ex1.to_csv("{}/summaries/mat_ex1.csv".format(path_to_save), index=False)
result_mat_ex2.to_csv("{}/summaries/mat_ex2.csv".format(path_to_save), index=False)
result_mat_ex3.to_csv("{}/summaries/mat_ex3.csv".format(path_to_save), index=False)
result_mat_ex4.to_csv("{}/summaries/mat_ex4.csv".format(path_to_save), index=False)

result_conc_ex1.to_csv("{}/summaries/conc_ex1.csv".format(path_to_save), index=False)
result_conc_ex.to_csv("{}/summaries/conc_ex2.csv".format(path_to_save), index=False)
result_conc_ex3.to_csv("{}/summaries/conc_ex3.csv".format(path_to_save), index=False)
result_conc_ex4.to_csv("{}/summaries/conc_ex4.csv".format(path_to_save), index=False)

df_comb_ex1.to_csv("{}/classification/result_comb_ex1.csv".format(path_to_save), index=False)
df_comb_ex2.to_csv("{}/classification/result_comb_ex2.csv".format(path_to_save), index=False)
df_comb_ex3.to_csv("{}/classification/result_comb_ex3.csv".format(path_to_save), index=False)
df_comb_ex4.to_csv("{}/classification/result_comb_ex4.csv".format(path_to_save), index=False)

df_intro_ex1.to_csv("{}/classification/result_intro_ex1.csv".format(path_to_save), index=False)
df_intro_ex2.to_csv("{}/classification/result_intro_ex2.csv".format(path_to_save), index=False)
df_intro_ex3.to_csv("{}/classification/result_intro_ex3.csv".format(path_to_save), index=False)
df_intro_ex4.to_csv("{}/classification/result_intro_ex4.csv".format(path_to_save), index=False)

df_mat_ex1.to_csv("{}/classification/result_mat_ex1.csv".format(path_to_save), index=False)
df_mat_ex2.to_csv("{}/classification/result_mat_ex2.csv".format(path_to_save), index=False)
df_mat_ex3.to_csv("{}/classification/result_mat_ex3.csv".format(path_to_save), index=False)
df_mat_ex4.to_csv("{}/classification/result_mat_ex4.csv".format(path_to_save), index=False)

df_conc_ex1.to_csv("{}/classification/result_conc_ex1.csv".format(path_to_save), index=False)
df_conc_ex2.to_csv("{}/classification/result_conc_ex2.csv".format(path_to_save), index=False)
df_conc_ex3.to_csv("{}/classification/result_conc_ex3.csv".format(path_to_save), index=False)
df_conc_ex4.to_csv("{}/classification/result_conc_ex4.csv".format(path_to_save), index=False)""""

# Exemplo 1

In [378]:
summ=50

## Resumo Referência

In [305]:
result_comb_ex4['references'][summ]

'the opioid receptor is the site of action of opiates and opioids. we examined whether there are differences in cpg dinucleotide methylation in the oprm1 promoter between former heroin addicts and controls. we analyzed methylation at sixteen cpg dinucleotides in dna obtained from lymphocytes of caucasian former severe heroin addicts stabilized in methadone maintenance treatment and caucasian control subjects. direct sequencing of bisulfite treated dna showed that the percent methylation at two cpg sites was significantly associated with heroin addiction. the level of methylation at the cpg site was 25.4% in the stabilized methadone maintained former heroin addicts and 21.4% in controls (p = 0.0035, generalized estimating equations (gee); p = 0.0077, t test; false discovery rate (fdr) = 0.048), and the level of methylation at the +84 cpg dinucleotide site was 7.4% in cases and 5.6% in controls (p = 0.0095, gee; p = 0.0067, t test; fdr = 0.080). both the and the +84 cpg sites are located

## Resumos Candidatos

### Catboost

In [425]:
reference = result_comb_ex4['references'][summ]
candidate = result_comb_ex1['cb'][summ]

bold_text, common_words = main_bold(reference, candidate)
print(bold_text)
print("\n\n Número de palavras em comum: {}".format(len(common_words)))

maternal cocaine administration [1min[0m rats increased [1mmethylation[0m [1mat[0m [1ma[0m [1mcpg[0m [1msite[0m [1min[0m the [1mpromoter[0m of the protein kinase [1mc[0m (pkc) gene [1min[0m fetal heart, and decreased pkc mrna and protein levels, presumably through [1ma[0m decrease [1min[0m ap1 [1mtranscription[0m [1mfactor[0m binding.this variant was [1massociated[0m with vulnerability [1mto[0m develop [1mheroin[0m addiction, and was shown [1mto[0m alter the function of the hypothalamic pituitary adrenal axis [reviewed in].herein, we have [1mexamined[0m the [1mmethylation[0m levels of [1msixteen[0m [1mcpg[0m [1msites[0m [1min[0m the [1moprm1[0m [1mpromoter[0m region [1min[0m [1mformer[0m [1msevere[0m [1mheroin[0m [1maddicts[0m [1min[0m [1mmethadone[0m [1mmaintenance[0m pharmacotherapy and [1min[0m controls.five [1mdna[0m samples chosen [1mat[0m random from [1mcases[0m and five from [1mcontrols[0m were [1mana

### RF

In [431]:
reference = result_comb_ex4['references'][summ]
candidate = result_comb_ex1['rf'][summ]

bold_text, common_words = main_bold(reference, candidate)
print(bold_text)
print("\n\n Número de palavras em comum: {}".format(len(common_words)))

vulnerability to develop [1mheroin[0m [1maddiction[0m is due to drug induced and environmental factors, [1mas[0m well [1mas[0m genetic factors.this variant was [1massociated[0m with vulnerability to develop [1mheroin[0m addiction, and was shown to alter the function of the hypothalamic pituitary adrenal axis [reviewed in].herein, we have [1mexamined[0m the [1mmethylation[0m levels of [1msixteen[0m [1mcpg[0m [1msites[0m [1min[0m the [1moprm1[0m [1mpromoter[0m region [1min[0m [1mformer[0m [1msevere[0m [1mheroin[0m [1maddicts[0m [1min[0m [1mmethadone[0m [1mmaintenance[0m pharmacotherapy and [1min[0m controls.five [1mdna[0m samples chosen [1mat[0m random from [1mcases[0m and five from [1mcontrols[0m were [1manalyzed[0m using the cloning method. our sample consisted of 194 [1mformer[0m [1msevere[0m [1mheroin[0m [1maddicts[0m and 135 [1mcontrol[0m [1msubjects[0m (n=329), all of [1mcaucasian[0m ethnicity, drawn from consecu

### FT

In [426]:
reference = result_comb_ex4['references'][summ]
candidate = result_comb_ex4['first_three'][summ]

bold_text, common_words = main_bold(reference, candidate)
print(bold_text)
print("\n\n Número de palavras em comum: {}".format(len(common_words)))

chronic [1mheroin[0m use disrupts multiple physiological systems, contributing [1mto[0m [1maddiction[0m and relapse. vulnerability [1mto[0m develop [1mheroin[0m [1maddiction[0m is due [1mto[0m drug induced and environmental factors, [1mas[0m well [1mas[0m genetic factors.among the non genetic factors [1mmay[0m be integrated epigenetic factors including [1mdna[0m [1mmethylation[0m .five [1mdna[0m samples chosen [1mat[0m random from [1mcases[0m and five from [1mcontrols[0m were [1manalyzed[0m using the cloning method. the [1mmethylation[0m pattern of each is shown in.we also analyzed, with this method, five selected samples of [1mcases[0m previously evaluated by sequencing/esme analysis .this study demonstrates hypermethylation of [1mtwo[0m [1mcpg[0m [1mdinucleotide[0m [1msites[0m [1min[0m the [1moprm1[0m [1mpromoter[0m region [1min[0m [1mformer[0m [1mheroin[0m addicts. hypermethylation of the 18 and the +204 [1mcpg[0m [1msites

### MAX

In [443]:
summaries_comb_ex1

Unnamed: 0,knn,rf,cb,mlp,articles,references,knn_r1,knn_r2,knn_rl,rf_r1,rf_r2,rf_rl,cb_r1,cb_r2,cb_rl,mlp_r1,mlp_r2,mlp_rl
0,Anogenital human papillomavirus (HPV) infectio...,Anogenital human papillomavirus (HPV) infectio...,A key contributor to the incidence of HPV rela...,Anogenital human papillomavirus (HPV) infectio...,PMC2976820.json,background estimates of human papillomavirus (...,0.232394,0.056738,0.140845,0.286645,0.111475,0.182410,0.309677,0.110390,0.167742,0.290541,0.115646,0.189189
1,Blockers are recommended in the seventh repor...,Blockers are recommended in the seventh repor...,Blockers are recommended in the seventh repor...,Blockers are recommended in the seventh repor...,PMC2999810.json,study objective to evaluate whether the level ...,0.337349,0.103030,0.162651,0.344023,0.117302,0.198251,0.280872,0.082725,0.159806,0.327381,0.089820,0.178571
2,Poor oral health is common in Alaska Native co...,Tooth decay is a multifactorial disease linked...,Tooth decay is a multifactorial disease linked...,Poor oral health is common in Alaska Native co...,PMC6019600.json,objectives dental health aide therapists (dhat...,0.340249,0.175732,0.190871,0.404255,0.200000,0.170213,0.288732,0.063830,0.161972,0.394737,0.194690,0.228070
3,Given the importance of protein protein intera...,Rather than relying on protein protein interac...,Given the importance of protein protein intera...,"However, an artificial dimer of FK506 named FK...",PMC2925120.json,induction of protein protein interactions is a...,0.328571,0.101449,0.214286,0.181818,0.061350,0.109091,0.218391,0.058140,0.149425,0.220779,0.052632,0.103896
4,Many of these injuries require surgery and reh...,Many of these injuries require surgery and reh...,Many of these injuries require surgery and reh...,Many of these injuries require surgery and reh...,PMC2855045.json,purpose to identify pre operative and intra op...,0.364964,0.235294,0.277372,0.364964,0.235294,0.277372,0.375451,0.232727,0.274368,0.351145,0.223077,0.221374
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1684,The protean clinical manifestations of varicel...,The protean clinical manifestations of varicel...,The protean clinical manifestations of varicel...,The cerebrospinal fluid (CSF) usually contains...,PMC2846975.json,we describe an immunocompetent 45 year old wom...,0.504000,0.290323,0.184000,0.431718,0.248889,0.167401,0.457565,0.267658,0.177122,0.351852,0.130841,0.185185
1685,The present report includes results of serum m...,Age and gender are well known correlates of se...,Age and gender are well known correlates of se...,The present report includes results of serum m...,PMC2702997.json,objectives the igf axis plays a significant ro...,0.342342,0.138973,0.186186,0.379009,0.170088,0.192420,0.405556,0.167598,0.216667,0.412500,0.169811,0.193750
1686,The complicating factor in understanding the p...,Some research has begun to investigate these p...,Some research has begun to investigate these p...,The complicating factor in understanding the p...,PMC2804884.json,applying localized external displacement to th...,0.415771,0.086643,0.164875,0.334426,0.066007,0.157377,0.334426,0.066007,0.157377,0.426230,0.105611,0.190164
1687,"In addition, the organ also contains several t...",The organ of Corti contains inner and outer ha...,The organ of Corti contains inner and outer ha...,The organ of Corti contains inner and outer ha...,PMC2894618.json,purpose of review a significant proportion of ...,0.349515,0.117264,0.181230,0.270270,0.108761,0.168168,0.261905,0.107784,0.166667,0.344828,0.083333,0.158621


In [428]:
reference = result_comb_ex4['references'][summ]
candidate = summaries_comb_ex2['max_rouge'][summ]

bold_text, common_words = main_bold(reference, candidate)
print(bold_text)
print("\n\n Número de palavras em comum: {}".format(len(common_words)))

many [1mtranscription[0m [1mfactor[0m [1mbinding[0m sites, such [1mas[0m those [1mfor[0m sp1, have [1mcpg[0m dinucleotides, and when these are methylated, they display altered [1mbinding[0m [1mto[0m their cognate [1mtranscription[0m factors [e.g. (;;)].the [1mopioid[0m [1mreceptor[0m [1mis[0m the [1msite[0m of [1maction[0m of endorphin, morphine, and methadone.herein, we have [1mexamined[0m the [1mmethylation[0m levels of [1msixteen[0m [1mcpg[0m [1msites[0m [1min[0m the [1moprm1[0m [1mpromoter[0m region [1min[0m [1mformer[0m [1msevere[0m [1mheroin[0m [1maddicts[0m [1min[0m [1mmethadone[0m [1mmaintenance[0m pharmacotherapy and [1min[0m controls.methadone [1mstabilized[0m [1mformer[0m [1mheroin[0m [1maddicts[0m were found [1mto[0m have [1msignificantly[0m higher [1mmethylation[0m than [1mcontrols[0m [1mat[0m [1mtwo[0m [1mcpg[0m sites.at the 18 [1mcpg[0m site, the [1mlevel[0m of [1mmethylation[0m was

# Analise de Resultados com a utilização de features posicionais

## Experimento 6

Objetivo: analisar a distribuição entre os scores das métricas rouge entre os resumos candidatos e os resumos referência

1. Crio um resumo composto por 3 sentenças de cada seção;
2. Comparo esse resumo com o resumo referencia. 


In [222]:
import plotly.graph_objects as go
import numpy as np

In [166]:
def plot_hist_rouge_sent(df, title):

    fig = go.Figure()

    for i in df.columns:
        fig.add_trace(go.Histogram(x=df[i], name=i))


    fig.update_layout(barmode='overlay', title=title)
    fig.show()

In [223]:
aux = pd.DataFrame()

aux['Max']= result_comb_ex2['max_rouge_r1']
aux['Min']= result_comb_ex3['min_rouge_r1']
aux['first_three']= result_comb_ex4['first_three_r1']
aux['RF']= result_comb_ex1['rf_r1']
aux['CB']= result_comb_ex1['cb_r1']


plot_hist_rouge_sent(aux, title="Distribuição dos scores de rouge-1 dos resumos candidatos")

In [224]:
aux = pd.DataFrame()

aux['Max']= result_comb_ex2['max_rouge_r2']
aux['Min']= result_comb_ex3['min_rouge_r2']
aux['first_three']= result_comb_ex4['first_three_r2']
aux['RF']= result_comb_ex1['rf_r2']
aux['CB']= result_comb_ex1['cb_r2']


plot_hist_rouge_sent(aux, title="Distribuição dos scores de rouge-2 dos resumos candidatos")

# Experimento 7

Objetivo: Verificar se existe alguma diferença na distribuição de scores das sentenças selecionadas pela abordagem q seleciona as 3 primeiras sentenças e os algoritmos de classificação?


1. Crio um resumo composto por 3 sentenças de cada seção;
2. Comparo esse resumo com o resumo referencia. 


Ap 1 - Selecionou as sentenças 9 sentenças, 3 de cada seção. 
Ap 2 - Selecionou as sentenças 9 sentenças, 3 de cada seção.

Comparamos os resumos de Ap 1 e 2 com o resumo candidato. Os scores obtidos estão sendo apresentados nas figuras.


In [225]:
import plotly.graph_objects as go
import numpy as np

def plot_dist_rouge_sent(df, title, box_visible=False):

    fig = go.Figure()

    for i in df.columns:
        fig.add_trace(go.Violin(y=df[i], name=i, box_visible=box_visible, meanline_visible=True))

    fig.update_traces(orientation='v')
    fig.update_layout(title_text=title)
    fig.show()


In [230]:
r1_plot = pd.DataFrame()

r1_plot['max_rouge']= result_comb_ex2['max_rouge_r1']
r1_plot['min_rouge'] = result_comb_ex3['min_rouge_r1']
r1_plot['first_three'] = result_comb_ex4['first_three_r1']
r1_plot['RF'] = result_comb_ex1['rf_r1']
r1_plot['CB'] = result_comb_ex1['cb_r1']
r1_plot['MLP'] = result_comb_ex1['mlp_r1']


plot_dist_rouge_sent(r1_plot, title="Distribuição dos scores de rouge-1 dos resumos gerados", box_visible=True)

In [231]:
r2_plot = pd.DataFrame()

r2_plot['max_rouge']= result_comb_ex2['max_rouge_r2']
r2_plot['min_rouge'] = result_comb_ex3['min_rouge_r2']
r2_plot['first_three'] = result_comb_ex4['first_three_r2']
r2_plot['RF'] = result_comb_ex1['rf_r2']
r2_plot['CB'] = result_comb_ex1['cb_r2']
r2_plot['MLP'] = result_comb_ex1['mlp_r2']


plot_dist_rouge_sent(r2_plot, title="Distribuição dos scores de rouge-2 dos resumos gerados", box_visible=True)

# Experimento 7

Objetivo: Verificar a distribuição dos scores de rouge-1 das sentenças selecionadas pelos algoritmos e pela abordagem que seleciona as três primeiras sentenças. 

Quero ver se realmente as médias dos scores são iguais pois as sentenças seleciodas possuem scores semelhantes ou se ao combinar as sentenças os resumos obtiveram scores semelhantes.

1. Seleciono três sentenças de cada seção
2. Seleciono o rótulo dessas sentenças e avalio a distribuição.


Obs. O rótulo da sentença é o max do score de rouge-1 da sentença com todas as sentenças do resumo referência.



In [232]:
df_intro_ex2

Unnamed: 0,sentences,rouge_1,articles,max_rouge
0,Humans have a tendency to nonconsciously and n...,0,PMC2080579.json,0
1,The processes of mimicry seem to rely on a dir...,0,PMC2080579.json,0
2,Evidence is accumulating that perceiving an ac...,1,PMC2080579.json,1
3,This overlap might allow humans to embody the ...,0,PMC2080579.json,0
4,Direct evidence for the embodiment of action c...,0,PMC2080579.json,0
...,...,...,...,...
32360,This goal was considered exploratory because p...,0,PMC6995786.json,0
32361,"In summary, as described in , we developed and...",0,PMC6995786.json,0
32362,The first goal of the present study was to dem...,0,PMC6995786.json,0
32363,The second goal was to establish preliminary e...,0,PMC6995786.json,0


In [276]:
label_intro = pd.DataFrame()

label_intro['max_rouge'] = df_intro_ex2.loc[df_intro_ex2['max_rouge'] == 1 ]['rouge_1'].reset_index(drop=True)
label_intro['first_three']  = df_intro_ex4.loc[df_intro_ex4['first_three'] == 1 ]['rouge_1'].reset_index(drop=True)
label_intro['cb'] = df_intro_ex1.loc[df_intro_ex1['cb'] == 1 ]['rouge_1'].reset_index(drop=True)
label_intro['rf'] = df_intro_ex1.loc[df_intro_ex1['rf'] == 1 ]['rouge_1'].reset_index(drop=True)
plot_dist_rouge_sent(label_intro, title="Distribuição dos score max de rouge-1 entre as sentenças selecionadas e o resumo de referência")
                                                                                         

In [277]:
def plot_bar(df, names, title):

    fig = go.Figure()
    
    for i in df.columns:
        fig.add_trace(go.Bar(y=df[i],name=names.get(i)))
        
    fig.update_layout(title_text=title)

    fig.show()

In [278]:
test = pd.DataFrame()

test['rf_gt_ft'] =  label_intro.rf > label_intro.first_three 
test['rf_gt_ft'] =test['rf_gt_ft'].astype(int)
test['rf_lt_ft'] =  label_intro.rf < label_intro.first_three 
test['rf_lt_ft'] =test['rf_lt_ft'].astype(int)
test['rf_eq_ft'] = label_intro.first_three == label_intro.rf
test['rf_eq_ft']=test['rf_eq_ft'].astype(int)
test['cb_gt_ft'] =  label_intro.cb > label_intro.first_three
test['cb_gt_ft'] = test['cb_gt_ft'].astype(int)
test['cb_lt_ft'] =  label_intro.cb < label_intro.first_three
test['cb_lt_ft'] = test['cb_lt_ft'].astype(int)
test['cb_eq_ft'] = label_intro.first_three == label_intro.cb
test['cb_eq_ft']=test['cb_eq_ft'].astype(int)

names = {"rf_gt_ft": "Score RF > FT",  'rf_lt_ft': "Score RF < FT", 'rf_eq_ft': "Score RF = FT", 'cb_gt_ft': "Score CB > FT",  'cb_lt_ft':"Score CB < FT", 'cb_eq_ft':"Score CB = FT"}

count = test.apply(pd.value_counts)
count = count.drop(0, axis=0)

plot_bar(count, names, 'Análise dos scores de ROUGE-1 das sentenças selecionadas - Introdução')

In [279]:
names = {"rf_gt_ft": "Score RF > FT",  'rf_lt_ft': "Score RF < FT", 'rf_eq_ft': "Score RF = FT", 'cb_gt_ft': "Score CB > FT",  'cb_lt_ft':"Score CB < FT", 'cb_eq_ft':"Score CB = FT"}

count = test.apply(pd.value_counts)
count = count.drop(0, axis=0)

plot_bar(count, names, 'Análise dos scores de ROUGE-1 das sentenças selecionadas - Introdução')

In [280]:
label_mat = pd.DataFrame()

label_mat['max_rouge'] = df_mat_ex2.loc[df_mat_ex2['max_rouge'] == 1 ]['rouge_1'].reset_index(drop=True)
label_mat['first_three']  = df_mat_ex4.loc[df_mat_ex4['first_three'] == 1 ]['rouge_1'].reset_index(drop=True)
label_mat['cb'] = df_mat_ex1.loc[df_mat_ex1['cb'] == 1 ]['rouge_1'].reset_index(drop=True)
label_mat['rf'] = df_mat_ex1.loc[df_mat_ex1['rf'] == 1 ]['rouge_1'].reset_index(drop=True)

plot_dist_rouge_sent(label_mat, title="Distribuição dos score max de rouge-1 entre as sentenças selecionadas e o resumo de referência")
                                                                                         

In [281]:
test = pd.DataFrame()

test['rf_gt_ft'] =  label_mat.rf > label_mat.first_three 
test['rf_gt_ft'] =test['rf_gt_ft'].astype(int)
test['rf_lt_ft'] =  label_mat.rf < label_mat.first_three 
test['rf_lt_ft'] =test['rf_lt_ft'].astype(int)
test['rf_eq_ft'] = label_mat.first_three == label_mat.rf
test['rf_eq_ft']=test['rf_eq_ft'].astype(int)
test['cb_gt_ft'] =  label_mat.cb > label_mat.first_three
test['cb_gt_ft'] = test['cb_gt_ft'].astype(int)
test['cb_lt_ft'] =  label_mat.cb < label_mat.first_three
test['cb_lt_ft'] = test['cb_lt_ft'].astype(int)
test['cb_eq_ft'] = label_mat.first_three == label_mat.cb
test['cb_eq_ft']=test['cb_eq_ft'].astype(int)

names = {"rf_gt_ft": "Score RF > FT",  'rf_lt_ft': "Score RF < FT", 'rf_eq_ft': "Score RF = FT", 'cb_gt_ft': "Score CB > FT",  'cb_lt_ft':"Score CB < FT", 'cb_eq_ft':"Score CB = FT"}

count = test.apply(pd.value_counts)
count = count.drop(0, axis=0)

plot_bar(count, names, 'Análise dos scores de ROUGE-1 das sentenças selecionadas - Materiais e Métodos')

In [282]:
label_conc = pd.DataFrame()

label_conc['max_rouge'] = df_conc_ex2.loc[df_conc_ex2['max_rouge'] == 1 ]['rouge_1'].reset_index(drop=True)
label_conc['first_three']  = df_conc_ex4.loc[df_conc_ex4['first_three'] == 1 ]['rouge_1'].reset_index(drop=True)
label_conc['cb'] = df_conc_ex1.loc[df_conc_ex1['cb'] == 1 ]['rouge_1'].reset_index(drop=True)
label_conc['rf'] = df_conc_ex1.loc[df_conc_ex1['rf'] == 1 ]['rouge_1'].reset_index(drop=True)

plot_dist_rouge_sent(label_conc, title="Distribuição dos score max de rouge-1 entre as sentenças selecionadas e o resumo de referência")
                                                                                         

In [283]:
test = pd.DataFrame()

test['rf_gt_ft'] =  label_conc.rf > label_conc.first_three 
test['rf_gt_ft'] =test['rf_gt_ft'].astype(int)
test['rf_lt_ft'] =  label_conc.rf < label_conc.first_three 
test['rf_lt_ft'] =test['rf_lt_ft'].astype(int)
test['rf_eq_ft'] = label_conc.first_three == label_conc.rf
test['rf_eq_ft']=test['rf_eq_ft'].astype(int)
test['cb_gt_ft'] =  label_conc.cb > label_conc.first_three
test['cb_gt_ft'] = test['cb_gt_ft'].astype(int)
test['cb_lt_ft'] =  label_conc.cb < label_conc.first_three
test['cb_lt_ft'] = test['cb_lt_ft'].astype(int)
test['cb_eq_ft'] = label_conc.first_three == label_conc.cb
test['cb_eq_ft']=test['cb_eq_ft'].astype(int)

names = {"rf_gt_ft": "Score RF > FT",  'rf_lt_ft': "Score RF < FT", 'rf_eq_ft': "Score RF = FT", 'cb_gt_ft': "Score CB > FT",  'cb_lt_ft':"Score CB < FT", 'cb_eq_ft':"Score CB = FT"}

count = test.apply(pd.value_counts)
count = count.drop(0, axis=0)

plot_bar(count, names, 'Análise dos scores de ROUGE-1 das sentenças selecionadas - Conclusão')

# Experimento 8

Objetivo: Identificar tendência posicionais na seleção de sentenças.

1. Seleciono três sentenças de cada seção;
2. Categorizo as sentenças em três classes:
    Begin: posição 0, 1, 2
    End: posição num(sentenças)-0,num(sentenças)-1, num(sentenças)-2

In [284]:
def get_positions(df, column_name):

    positions = []
    length = []
    
    for i in summ_items:

        aux = df.loc[ df['articles'] == i].reset_index(drop= True)
        length.append(aux.shape[0])
        positions.append(aux.loc[aux[column_name] == 1].index.tolist())
        
    return positions, length

In [285]:
def count_positions(df, column_name, new_column):
    j = 0
    aux = []
    
    positions, length = get_positions(df, column_name)

    for position in positions:
        for i in position:

            if (i == 0) or (i<4):
                aux.append("begin")
            elif (i == length[j]) or (i <= (length[j]-2)):
                aux.append('end')
            else:
                aux.append('middle')

        j+=1

    count_positions = pd.DataFrame(Counter(aux), index=[new_column]).T.reset_index()
    return count_positions

In [286]:
def count_all_sections(df_intro, df_mat, df_conc, column_name):

    count_intro = count_positions(df_intro, column_name=column_name, new_column='count_intro')
    count_mat = count_positions(df_mat, column_name=column_name, new_column='count_mat')
    count_conc = count_positions(df_conc, column_name=column_name, new_column='count_conc')

    aux = count_intro.merge(count_mat, on='index')
    aux = aux.merge(count_conc, on='index')

    return aux

In [287]:
import plotly.graph_objects as go

def plot_count_positions(df, title):

    x=['end', 'middle', 'begin']

    fig = go.Figure(data=[
        go.Bar(name='Introduction', x=x, y=df['count_intro']),
        go.Bar(name='Materials and Methods', x=x, y=df['count_mat']),
        go.Bar(name='Conclusion', x=x, y=df['count_conc'])])

    fig.update_layout(barmode='group', title=title)
    fig.show()

In [288]:
count_ex1_cb = count_all_sections(df_intro_ex1, df_mat_ex1, df_conc_ex1, column_name='cb')

In [289]:
count_ex1_rf = count_all_sections(df_intro_ex1, df_mat_ex1, df_conc_ex1, column_name='rf')

In [290]:
count_ex2 = count_all_sections(df_intro_ex2, df_mat_ex2, df_conc_ex2, column_name='max_rouge')

In [291]:
plot_count_positions(count_ex1_cb, title="Localização das 3 sentenças com maior probabilidade - CatBoost")

In [292]:
plot_count_positions(count_ex1_rf, title="Localização das 3 sentenças com maior probabilidade - Random Forest")

In [293]:
plot_count_positions(count_ex2, title="Localização das 3 sentenças de maior score por seção")

In [294]:
import plotly.graph_objects as go

import pandas as pd

fig = go.Figure()

x_intro = ['Introduction'] * len(summaries_intro_ex1['mlp_r1'])
x_mat = ['Materials'] * len(summaries_mat_ex1['mlp_r1'])
x_conc = ['Conclusion'] * len(summaries_conc_ex1['mlp_r1'])

fig.add_trace(go.Violin(x = x_intro, y=summaries_intro_ex1['rf_r1'], name='AP1',
                        side='negative'))
             
fig.add_trace(go.Violin(x = x_intro, y=summaries_intro_ex4['first_three_r1'], name='AP4',
                        side='positive'))

fig.add_trace(go.Violin(x = x_mat, y=summaries_mat_ex1['rf_r1'], name='AP1',
                        side='negative'))
             
fig.add_trace(go.Violin(x = x_mat, y=summaries_mat_ex4['first_three_r1'], name='AP4',
                        side='positive'))

fig.add_trace(go.Violin(x = x_conc, y=summaries_conc_ex1['rf_r1'], name='AP1',
                        side='negative'))
             
fig.add_trace(go.Violin(x = x_conc, y=summaries_conc_ex4['first_three_r1'], name='AP4',
                        side='positive'))
             
             
fig.update_traces(meanline_visible=True, orientation='v')
fig.update_layout( violinmode='overlay')
fig.show()