In [1]:
import sys
sys.path.insert(1, '/media/cinthia/Dados/Mestrado/mv-text-summarizer')

import glob, os
import pandas as pd
import json
import spacy
import nltk
import numpy as np
import json

from sumeval.metrics.rouge import RougeCalculator
from bs4 import BeautifulSoup
from pysbd.utils import PySBDFactory
from sklearn.model_selection import StratifiedKFold
from keras.callbacks import EarlyStopping

from src import preprocess
from src import extract_features
from src import tokenizer
from src import create_features_df
from src import transform_data
from src import loader
from src import tunning_hyperparametrs as th
from src import classifiers 
from src import neural_model
from src import normalization

# ROUGE calculator shared by the rest of the notebook (English, stopwords=True).
rouge = RougeCalculator(stopwords=True, lang="en")

# NLTK resources requested on every run of this cell.
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('punkt')

# Download, then load, the small English spaCy pipeline.
# NOTE(review): the download command runs on every execution of this cell —
# consider installing the models once in the environment instead.
!python -m spacy download en_core_web_sm
nlp_sm = spacy.load('en_core_web_sm')

# Download, then load, the medium English spaCy pipeline.
!python -m spacy download en_core_web_md
nlp_md = spacy.load('en_core_web_md')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/cinthia/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /home/cinthia/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /home/cinthia/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package punkt to /home/cinthia/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Collecting en_core_web_sm==2.1.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.1.0/en_core_web_sm-2.1.0.tar.gz (11.1 MB)
[K     |████████████████████████████████| 11.1 MB 5.2 MB/s 
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
Collecting en_core_web_md==2.1.0
  Downloading https://github.

In [2]:
def create_batches(path_base, tam=45):
    """Split the entries of ``path_base`` into ``tam`` batches of file names.

    The directory listing is sorted before splitting because ``os.listdir``
    returns entries in arbitrary order; sorting makes the batches
    reproducible across runs and machines.

    Parameters
    ----------
    path_base : str
        Directory whose entries are distributed over the batches.
    tam : int, default 45
        Number of batches to produce.

    Returns
    -------
    list of numpy.ndarray
        ``tam`` arrays of file names as produced by ``np.array_split``
        (sizes differ by at most one; trailing arrays are empty when there
        are fewer files than batches).
    """
    files = sorted(os.listdir(path_base))
    return np.array_split(files, tam)

In [3]:
# Dataset directory (relative to the notebook) and its listing split into 45 batches.
path_base = "../../sumdata/dataset_articles"
batch_files = create_batches(path_base, tam=45)

In [4]:
def load_batches(files, path_base):
    """Load one batch of files and return their sections as parallel lists.

    Parameters
    ----------
    files : sequence of str
        File names of the batch, forwarded to ``loader.load_files``.
    path_base : str
        Directory containing the files.

    Returns
    -------
    tuple of five lists
        ``(section_1, section_2, section_3, section_4, keywords)``. The four
        sections hold ``preprocess.format_intro`` applied to the
        ``sec_abstract``, ``sec_introduction``, ``sec_materials_and_methods``
        and ``sec_results_and_conclusion`` entries of each loaded text;
        ``keywords`` holds the unprocessed ``sec_keyword`` entries. Position
        ``k`` in every list refers to the same loaded text.
    """
    section_keys = (
        'sec_abstract',
        'sec_introduction',
        'sec_materials_and_methods',
        'sec_results_and_conclusion',
    )
    sections = tuple([] for _ in section_keys)
    keywords = []

    for text in loader.load_files(path_base, files):
        # Same per-text order as the keys above: abstract, introduction,
        # materials and methods, results and conclusion.
        for bucket, key in zip(sections, section_keys):
            bucket.append(preprocess.format_intro(text.get(key)))
        keywords.append(text.get('sec_keyword'))

    section_1, section_2, section_3, section_4 = sections
    return section_1, section_2, section_3, section_4, keywords

In [15]:

def extract_features_batches(batches, path_base, start=0, verbose=True):
    """Run feature extraction over ``batches`` and persist the results in chunks.

    For every file of every batch, the introduction (``section_2``) is
    processed by ``extract_features_file`` with the abstract (``section_1``)
    as reference. Results are kept only when both frames are non-empty and
    have the same number of rows. They are written with ``save_results``
    every 100 files and once more at the end of each batch, so the trailing
    files of a batch are not lost.

    Parameters
    ----------
    batches : iterable
        Batches of file names, each forwarded to ``load_batches``.
    path_base : str
        Directory containing the files.
    start : int, default 0
        Offset added to the number that names the saved files. The number
        keeps growing across batches so a later batch never overwrites the
        files of an earlier one; with the default, the first batch uses the
        plain file position.
    verbose : bool, default True
        Print batch-level progress and forward the flag to
        ``extract_features_file``.

    Returns
    -------
    None
        Results are only written to disk through ``save_results``.
    """
    all_scores = []
    all_features = []
    offset = start  # ``start`` plus the number of files in the batches already processed

    def _flush(tag):
        """Save the accumulated frames under ``tag`` (if any) and reset the buffers."""
        nonlocal all_scores, all_features
        # ``pd.concat`` inside ``save_results`` fails on empty lists, e.g. when
        # every file of the window was rejected.
        if all_features and all_scores:
            save_results(all_features, all_scores, number_text=tag, verbose=False)
        all_scores = []
        all_features = []

    for cont, batch in enumerate(batches, start=1):

        if verbose:
            print("Batch: {} \n".format(cont))

        section_1, section_2, section_3, section_4, keywords = load_batches(
            batch, path_base)
        total = len(section_1)

        if verbose:
            print("Iniciando a extração de features...")
            print("Total de arquivos: {} \n".format(total))

        for i in range(total):

            # NOTE(review): the whole batch ``keywords`` list is passed together
            # with the per-batch position ``i`` — confirm that
            # ``create_features_df.main`` expects exactly this pair.
            features_df, scores_df = extract_features_file(
                section=section_2[i], reference=section_1[i], keywords=keywords,
                number_text=i, verbose=verbose)

            if not (features_df.empty or scores_df.empty):
                # Features and scores are consumed row-for-row later on, so
                # frames of different lengths are discarded.
                if features_df.shape[0] == scores_df.shape[0]:
                    all_scores.append(scores_df)
                    all_features.append(features_df)

            if (i % 100 == 0) and (i != 0):
                print("Quantidade de arquivos processados: {}".format(i))
                print("Saving Results")
                _flush(offset + i)

        # Persist whatever is left after the last multiple of 100.
        if total:
            _flush(offset + total - 1)
        offset += total
    

In [6]:
 def save_results(all_features, all_scores, number_text, name_section='intro', verbose=False):
     """Concatenate the accumulated frames and write them as two CSV files.

     The files are written to ``../result/<name_section>/`` as
     ``features_<number_text>.csv`` and ``scores_<number_text>.csv``, without
     the index. The directory is created when missing.

     Parameters
     ----------
     all_features : list of pandas.DataFrame
         Feature frames to concatenate.
     all_scores : list of pandas.DataFrame
         Score frames to concatenate.
     number_text : int or str
         Tag inserted in both file names.
     name_section : str, default 'intro'
         Sub-directory of ``../result`` that receives the files.
     verbose : bool, default False
         Print a message when there is nothing to save.

     Returns
     -------
     None
     """
     # ``pd.concat`` raises ValueError on an empty list; nothing to write then.
     if not all_features or not all_scores:
         if verbose:
             print("Nothing to save for {}".format(number_text))
         return

     out_dir = "../result/{}".format(name_section)
     # ``to_csv`` is not relied on to create missing parent directories.
     os.makedirs(out_dir, exist_ok=True)

     features_df = pd.concat(all_features)
     scores_df = pd.concat(all_scores)

     features_df.to_csv("{}/features_{}.csv".format(out_dir, number_text), index=False)
     scores_df.to_csv("{}/scores_{}.csv".format(out_dir, number_text), index=False)

In [7]:
def extract_features_file(section, reference, keywords, number_text, verbose=False):
  """Build the feature frame and the score/label frame for one text.

  Parameters
  ----------
  section : object
      Section to extract sentences from; converted with ``str`` before use.
  reference : object
      Reference text the sentences are scored against; converted with ``str``.
  keywords : object
      Forwarded unchanged to ``create_features_df.main``.
  number_text : int
      Identifier stored in the ``number_text`` column of both frames and
      forwarded to ``create_features_df.main``.
  verbose : bool, default False
      Print a message when the text is skipped because of a ``ValueError``.

  Returns
  -------
  tuple of pandas.DataFrame
      ``(features_df, scores_df)``. Both are empty frames when feature or
      label creation raises ``ValueError``.
  """
  xml = preprocess.format_xml(str(section))
  text = preprocess.format_text(str(section), post_processing=False)
  reference = preprocess.format_text(str(reference), post_processing=True)

  # Citations are located in the XML form and replaced in the text before
  # the final post-processing pass.
  bibs = extract_features.get_citations(xml)
  text = preprocess.replace_bib(text, bibs)
  text = preprocess.format_text(text, post_processing=True)

  # Keep only the textual content of whatever markup is left.
  # NOTE(review): no parser is named, so BeautifulSoup picks one from the
  # installed parsers — consider pinning it once the intended one is confirmed.
  soup = BeautifulSoup(text)
  text = soup.get_text()

  sentences = tokenizer.split_sentences([text])
  sentences = list(map(str, sentences[0]))
  sentences = preprocess.format_sentences(sentences)

  try:

    features = create_features_df.main(
      sentences, xml, keywords, number_text, nlp_sm, nlp_md)
    features_df = create_features_df.format_df(sentences, features)
    features_df['number_text'] = [number_text]*len(features_df)

    sentences_ref = tokenizer.split_sentences([reference])
    sentences_ref = list(map(str, sentences_ref[0]))

    scores_df, label = transform_data.main_create_label(sentences, sentences_ref, rouge)
    scores_df['label'] = label
    scores_df['number_text'] = [number_text]*len(scores_df)

    return features_df, scores_df

  except ValueError as error:
    # Best-effort extraction: a text that cannot be processed is skipped, and
    # the caller recognises the skip by the empty frames.
    if verbose:
      print("Skipping text {}: {}".format(number_text, error))
    return pd.DataFrame(), pd.DataFrame()

In [16]:
# Re-declares the dataset directory (same value as in the earlier cell).
path_base = "../../sumdata/dataset_articles"

# Run the extraction on the first batch only.
extract_features_batches(batch_files[:1], path_base, verbose=True)

Batch: 1 

Iniciando a extração de features...
Total de arquivos: 1054 

Quantidade de arquivos processados: 0
Saving Results
Quantidade de arquivos processados: 100
Saving Results


KeyboardInterrupt: 

In [2]:
# Read back the saved feature and score CSVs through the project loader
# (presumably every file matching the pattern is concatenated — verify in
# loader.read_features).
# NOTE(review): save_results writes under "../result/<name_section>/", while
# these patterns look directly in "../result/" — confirm where the CSVs live.
features_df = loader.read_features(path="../result/features_*.csv")
scores_df = loader.read_features(path="../result/scores_*.csv")

In [3]:
# Both frames are used row-for-row below, so their row counts should match.
print(features_df.shape)
print(scores_df.shape)

(20947, 15)
(20947, 4)


In [4]:
# Features: every column except the first one; target: the 'label' column.
# NOTE(review): presumably the first column holds the sentence text — confirm.
columns_name = features_df.columns
X = features_df[columns_name[1:]]
y = scores_df['label']

In [11]:
 # X is overwritten with its normalized version, so running this cell twice
 # normalizes the already-normalized data.
 # NOTE(review): the normalization sees all rows, including those later used
 # for testing/validation — confirm this is intended.
 X = normalization.standart_norm(X)

In [5]:
# Train/test split produced by the project helper.
# NOTE(review): the shapes printed in the next cell add up to fewer rows than
# X has — presumably the helper resamples the data; confirm.
X_train, X_test, y_train, y_test = classifiers.data_classification(X, y)

In [6]:
# Shapes of the resulting train and test feature sets.
print(X_train.shape)
print(X_test.shape)

(4710, 14)
(6285, 14)


# Random Forest Classifier

In [16]:
# Hyper-parameter search for the random forest on the training split.
# NOTE(review): parameters=None presumably selects the helper's default search
# space — confirm in tunning_hyperparametrs.
rfsearch = th.get_hiperparametrs_rf(X_train, y_train, parameters=None)

In [24]:
# Fit the random forest on the training split, passing in the search result.
rf = classifiers.fit_rf(X_train, y_train, rfsearch)

In [27]:
# Evaluate the fitted forest on the test split; `scores` is shown as the cell
# output (the per-class precision/recall/F1 table below).
scores, y_pred = classifiers.evaluate_model(X_test, y_test, rf)
scores

Unnamed: 0,precision,recall,f1-score,support
0,0.925513,0.770237,0.840766,5275.0
1,0.360422,0.676238,0.470224,1010.0
accuracy,0.755131,0.755131,0.755131,0.755131
macro avg,0.642967,0.723237,0.655495,6285.0
weighted avg,0.834702,0.755131,0.78122,6285.0


# Neural Model Classifier

In [7]:
from src import neural_model
import keras

In [34]:
# Fitted model of each fold. The name is kept for later use; it holds the
# models themselves, not the objects returned by ``model.fit``.
history = []

# Metrics reported by ``model.evaluate`` after the loss, in this order.
METRICS = [
      keras.metrics.TruePositives(name='tp'),
      keras.metrics.FalsePositives(name='fp'),
      keras.metrics.TrueNegatives(name='tn'),
      keras.metrics.FalseNegatives(name='fn'), 
      keras.metrics.BinaryAccuracy(name='accuracy'),
      keras.metrics.Precision(name='precision'),
      keras.metrics.Recall(name='recall'),
      keras.metrics.AUC(name='auc'),
      keras.metrics.AUC(name='prc', curve='PR'), # precision-recall curve
]

# ``StratifiedKFold.split`` yields integer positions. They index NumPy arrays
# directly, whereas pandas objects would read them as column/index labels, so
# the folds are taken from array views of the data.
X_values = np.asarray(X)
y_values = np.asarray(y)

kFold = StratifiedKFold(n_splits=5)
# One entry per fold: the loss followed by the METRICS values.
scores = []
for train, test in kFold.split(X_values, y_values):

    model = neural_model.simple_nn(METRICS)
    # Stop when the validation loss has not improved for 40 epochs.
    es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=40)
    model.fit(X_values[train], y_values[train], epochs=100, batch_size=16, validation_split=0.2, callbacks=[es])
    scores.append(model.evaluate(X_values[test], y_values[test], verbose=0))

    history.append(model)

y: 0.8577 - val_precision: 0.6667 - val_recall: 0.0084 - val_auc: 0.7081 - val_prc: 0.2981
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 00093: early stopping


In [21]:
# Train a single network on the training split.
# The early-stopping callback is created here (same settings as in the
# cross-validation loop) so this cell does not depend on a variable left
# behind by that loop.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=40)
model = neural_model.simple_nn(METRICS)
model.fit(X_train, y_train, epochs=100, batch_size=16, validation_split=0.2, callbacks=[es])

0
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x7fbbb459f2b0>

In [30]:
 # Unpack the loss followed by one value per entry of METRICS, in METRICS order.
 loss, tp, fp, tn, fn, acc, prec, recall, auc, prc = model.evaluate(X_test, y_test, verbose=0)


In [35]:
# Per-fold results of the cross-validation loop (loss, then the METRICS values).
# The same name held the random forest report in an earlier cell.
scores

[[0.3765993118286133,
  2.0,
  7.0,
  3510.0,
  671.0,
  0.8381861448287964,
  0.2222222238779068,
  0.002971768146380782,
  0.7706210017204285,
  0.48837658762931824],
 [0.40408533811569214,
  14.0,
  10.0,
  3507.0,
  659.0,
  0.8403341174125671,
  0.5833333134651184,
  0.02080237679183483,
  0.7201049327850342,
  0.3362295925617218],
 [0.4155384302139282,
  18.0,
  9.0,
  3507.0,
  655.0,
  0.8414896130561829,
  0.6666666865348816,
  0.026745913550257683,
  0.6919609308242798,
  0.3183708190917969],
 [0.40109848976135254,
  19.0,
  6.0,
  3510.0,
  654.0,
  0.8424444794654846,
  0.7599999904632568,
  0.028231797739863396,
  0.7205083966255188,
  0.3592396676540375],
 [0.4066544771194458,
  24.0,
  9.0,
  3507.0,
  649.0,
  0.8429219126701355,
  0.7272727489471436,
  0.03566121682524681,
  0.7130293250083923,
  0.347160667181015]]