# Abstractive Text Summarization using Attention and Beam Search

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
from __future__ import absolute_import, division, print_function, unicode_literals

!pip install nltk

try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass
import tensorflow as tf

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split

import unicodedata
import re
import numpy as np
import pandas as pd
import os
import io
import time
import re
import requests
import json
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import os
import pickle

## Preprocess

In [None]:
def load_files(
    train_article_path, train_title_path, valid_article_path, valid_title_path):
    """
    Open the four dataset files for reading.

    Parameters
    ----------
    train_article_path : str
        Path of the file holding the training articles.
    train_title_path : str
        Path of the file holding the training summaries (titles).
    valid_article_path : str
        Path of the file holding the validation articles.
    valid_title_path : str
        Path of the file holding the validation summaries (titles).

    Returns
    -------
    tuple
        Four text-mode file objects, in the order
        (article_train, summary_train, article_valid, summary_valid).
        The files are returned open; nothing in this function closes them.
    """
    paths = (
        train_article_path,
        train_title_path,
        valid_article_path,
        valid_title_path,
    )

    # Open in the same order as the return value.
    handles = []
    for path in paths:
        handles.append(open(path, 'r'))

    article_train, summary_train, article_valid, summary_valid = handles
    return article_train, summary_train, article_valid, summary_valid

def remove_stopwords(line):
    """
    Tokenize a sentence and drop its English stopwords.

    Parameters
    ----------
    line : str
        Sentence to filter.

    Returns
    -------
    list of str
        The tokens produced by `word_tokenize(line)`, in order, without the
        ones found in NLTK's English stopword list.
    """
    english_stopwords = set(stopwords.words('english'))

    kept_tokens = []
    for token in word_tokenize(line):
        if token in english_stopwords:
            continue
        kept_tokens.append(token)

    return kept_tokens

def preprocess(file):
    """
    Normalize every line of an open text file.

    For each line, in this order: lower-case it; delete '#', ',' and '-';
    delete standalone single-character words; delete digit runs; turn the
    remaining punctuation listed in `noise_chars` into spaces; strip
    surrounding whitespace; finally rewrite every '.' as '#'.

    Parameters
    ----------
    file : file object
        Open text file; it is read in full with `file.read()`.

    Returns
    -------
    list of str
        One cleaned string per input line. The empty field produced by a
        final newline is dropped; a last line that is NOT followed by a
        newline is kept (it used to be discarded unconditionally).
    """
    # Compiled once instead of on every line.
    single_char_word = re.compile(r'\b\w\b')
    digit_run = re.compile(r'\d+')
    noise_chars = re.compile(r"[()`#/@';:%<>$&\"{}~+=?|]")

    lines = file.read().split('\n')
    # "a\nb\n".split('\n') ends with '' -- remove only that artifact so a
    # file without a trailing newline does not lose its last record.
    if lines and lines[-1] == '':
        lines.pop()

    processed = []
    for line in lines:
        line = line.lower()
        # These characters are deleted outright (not replaced by a space).
        for char in ('#', ',', '-'):
            line = line.replace(char, '')
        line = single_char_word.sub('', line)
        line = digit_run.sub('', line)
        text = noise_chars.sub(' ', line).strip()
        # Must happen after '#' was removed above, so every '#' left in the
        # output comes from a '.' in the input.
        processed.append(text.replace('.', '#'))

    return processed

def addTokens(
    lines, type_input, summary_max_len, aricle_max_len, stopwords=False):
    """
    Truncate each text to its length limit and wrap it in start/end tokens.

    Every output string has the form 'sostok <words> eostok'. Summaries are
    cut to `summary_max_len` words and articles to `aricle_max_len` words.
    Only articles can have their stopwords removed.

    Parameters
    ----------
    lines : iterable of str
        Texts to process, one per element.
    type_input : str
        Either "summary" or "article".
    summary_max_len : int
        Maximum number of words kept from a summary.
    aricle_max_len : int
        Maximum number of words kept from an article. The spelling is kept
        as-is so existing keyword callers keep working.
    stopwords : bool
        If true (articles only), tokens come from `remove_stopwords(line)`
        instead of a plain split on spaces.

    Returns
    -------
    list of str
        The truncated texts wrapped in 'sostok' / 'eostok'.

    Raises
    ------
    ValueError
        If `type_input` is neither "summary" nor "article".
    """
    # The body used to read `article_max_len`, a name that is not a
    # parameter of this function; bind it to the real argument.
    article_max_len = aricle_max_len

    textos = []
    for line in lines:
        if type_input == "summary":
            words = line.split(" ")[:summary_max_len]
        elif type_input == "article":
            if stopwords:
                words = remove_stopwords(line)[:article_max_len]
            else:
                words = line.split(" ")[:article_max_len]
        else:
            raise ValueError(
                "type_input must be 'summary' or 'article', got %r" % (type_input,))

        # Start and end tokens for each sentence.
        textos.append('sostok ' + " ".join(words) + ' eostok')

    return textos

## Tokenizer

In [None]:
def tokenizer_texts(
    mode, name_model, all_text, article_max_len, summary_max_len,
    article_train, summary_train, path_save, verbose=False):
    """
    Load the model's tokenizer if it was already saved, otherwise fit and
    save a new one, then convert the training texts to padded id sequences.

    The tokenizer lives at `path_save + name_model + '/tokenizer.pickle'`.
    The decision is based on whether that file exists. Previously it was
    based on whether the model directory was non-empty, which failed with
    FileNotFoundError when the directory held other files but no tokenizer.

    Parameters
    ----------
    mode : str
        Training or validation. Kept for interface compatibility; it does
        not influence the result (the earlier `mode == "VALID"` test could
        never change which branch ran).
    name_model : str
        Model name (sub-directory of `path_save`).
    all_text : iterable of str
        Texts the tokenizer is fitted on when a new one is created.
    article_max_len : int
        Length every article sequence is padded/truncated to.
    summary_max_len : int
        Length every summary sequence is padded/truncated to.
    article_train : sequence of str
        Articles to convert.
    summary_train : sequence of str
        Summaries to convert.
    path_save : str
        Base directory; concatenated directly with `name_model`.
    verbose : bool
        If true, print which branch was taken.

    Returns
    -------
    vocab_size : int
        `len(tokenizer.word_index) + 1`.
    tokenizer : Tokenizer
        The loaded or newly fitted tokenizer.
    article_train : array
        Articles as post-padded id sequences.
    summary_train : array
        Summaries as post-padded id sequences.
    """
    tokenizer_path = path_save + name_model + '/tokenizer.pickle'

    if os.path.isfile(tokenizer_path):
        if verbose:
            print("Load a old Tokenizer")

        # NOTE(security): pickle.load can execute arbitrary code; only point
        # path_save at directories you trust.
        with open(tokenizer_path, 'rb') as handle:
            tokenizer = pickle.load(handle)
    else:
        if verbose:
            print("Create a new Tokenizer")

        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(list(all_text))

        with open(tokenizer_path, 'wb') as handle:
            pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

    article_sequences = tokenizer.texts_to_sequences(article_train)
    article_train = pad_sequences(
        article_sequences, maxlen=article_max_len, padding='post')

    summary_sequences = tokenizer.texts_to_sequences(summary_train)
    summary_train = pad_sequences(
        summary_sequences, maxlen=summary_max_len, padding='post')

    # +1 because index 0 is not assigned to any word in `word_index`
    # (pad_sequences fills with 0 by default).
    vocab_size = len(tokenizer.word_index) + 1

    return vocab_size, tokenizer, article_train, summary_train

## Data Preparation

In [None]:
def data_preparation(
    mode, name_model, train_article_path, train_title_path,
    valid_article_path, valid_title_path, path_save, article_max_len=150,
    summary_max_len=15, verbose=False):
    """
    Run every data-preparation step needed before training.

      1. Load files
      2. Preprocess text
      3. Add start and end tokens
      4. Build the train / validation tables
      5. Extract the training columns
      6. Tokenize and pad
      7. Build the tf.data pipeline

    The batch size is read from the module-level `BATCH_SIZE`.

    Parameters
    ----------
    mode : str
        Kind of run (training or validation); forwarded to `tokenizer_texts`.
    name_model : str
        Model name; forwarded to `tokenizer_texts`.
    train_article_path : str
        Path of the file holding the training articles.
    train_title_path : str
        Path of the file holding the training titles.
    valid_article_path : str
        Path of the file holding the validation articles.
    valid_title_path : str
        Path of the file holding the validation titles.
    path_save : str
        Base directory; forwarded to `tokenizer_texts`.
    article_max_len : int, default = 150
        Maximum length of an input article.
    summary_max_len : int, default = 15
        Maximum length of an output summary.
    verbose : bool
        If true, print progress information.

    Returns
    -------
    dataset : tf.data.Dataset
        Shuffled, batched (article, summary) id sequences of the training set.
    VALID : pandas.DataFrame
        Validation texts with columns 'Input' and 'Summary' (token-wrapped
        strings, not tokenized).
    steps_per_epoch : int
        Number of training rows floor-divided by `BATCH_SIZE`.
    tokenizer : object
        Tokenizer returned by `tokenizer_texts`.
    vocab_size : int
        Vocabulary size returned by `tokenizer_texts`.
    """

    # 1. Load files
    opened_files = load_files(train_article_path, train_title_path,
                              valid_article_path, valid_title_path)
    article_train, summary_train, article_valid, summary_valid = opened_files

    # 2. Preprocess. The handles are only needed here, so close them as soon
    #    as they have been read (they used to stay open indefinitely).
    try:
        lines_article_train = preprocess(article_train)
        lines_summary_train = preprocess(summary_train)
        lines_article_valid = preprocess(article_valid)
        lines_summary_valid = preprocess(summary_valid)
    finally:
        for handle in opened_files:
            handle.close()

    # 3. Add end and start tokens
    lines_article_train = addTokens(lines_article_train, "article", summary_max_len, article_max_len, False)
    lines_summary_train = addTokens(lines_summary_train, "summary", summary_max_len, article_max_len, False)
    lines_article_valid = addTokens(lines_article_valid, "article", summary_max_len, article_max_len, False)
    lines_summary_valid = addTokens(lines_summary_valid, "summary", summary_max_len, article_max_len, False)

    # The vocabulary is built from the training texts plus the unknown-word
    # token.
    all_text = lines_article_train + lines_summary_train + ['unktok']

    # 4. Build the tables (pandas raises if articles and summaries differ in
    #    length).
    TRAIN = pd.DataFrame({'Input': lines_article_train, 'Summary': lines_summary_train})
    VALID = pd.DataFrame({'Input': lines_article_valid, 'Summary': lines_summary_valid})

    # 5. Extract the training columns as arrays
    article_train = TRAIN.iloc[:, 0].values
    summary_train = TRAIN.iloc[:, 1].values

    # 6. Tokenizer
    vocab_size, tokenizer, article_train_token, summary_train_token = tokenizer_texts(
        mode, name_model, all_text, article_max_len,
        summary_max_len, article_train, summary_train,
        path_save=path_save, verbose=verbose)
    BUFFER_SIZE = len(article_train)
    steps_per_epoch = BUFFER_SIZE // BATCH_SIZE

    # 7. Create output; drop_remainder keeps every batch at BATCH_SIZE rows.
    dataset = tf.data.Dataset.from_tensor_slices(
        (article_train_token, summary_train_token)
    ).shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

    if verbose:
        print("Maximum input size: " + str(article_max_len))
        print("Maximum output abstract size: " + str(summary_max_len))
        print("Vocabulary size: " + str(vocab_size) + '\n')

    return dataset, VALID, steps_per_epoch, tokenizer, vocab_size

## Load Word Embeddings

In [None]:
def create_embeddings(
    tokenizer, article_max_len, vocab_size, path_save, number_model, verbose=False,
    glove_path='/content/drive/My Drive/Colab Notebooks/glove/glove.6B.300d.txt'):
    """
    Return the embedding matrix for the tokenizer's vocabulary.

    If `path_save + number_model + '/embedding_matrix.pickle'` exists it is
    loaded and returned. Otherwise the matrix is built from the GloVe text
    file, saved to that location and returned.

    Parameters
    ----------
    tokenizer : object
        Object exposing `word_index` (word -> integer id).
    article_max_len : int
        Maximum input length. Not used by the computation; kept for
        interface compatibility.
    vocab_size : int
        Vocabulary size; ids >= `vocab_size + 2` are skipped.
    path_save : str
        Base directory; concatenated directly with `number_model`.
    number_model : str
        Model identifier (sub-directory of `path_save`).
    verbose : bool
        If true, print progress information.
    glove_path : str
        Location of the GloVe vectors in text format (one word followed by
        its coefficients per line). Defaults to the path previously
        hard-coded in this function.

    Returns
    -------
    embedding_matrix : numpy.ndarray
        When built: shape (min(vocab_size + 2, len(word_index) + 1),
        embedding_dims), where `embedding_dims` is read from module scope.
        Rows of words missing from GloVe stay all-zero.
    """
    matrix_path = path_save + number_model + '/embedding_matrix.pickle'
    max_num_words = vocab_size + 2

    if verbose:
        print('Indexing word vectors.')

    if os.path.isfile(matrix_path):
        if verbose:
            print("Load embedding matrix")

        # NOTE(security): pickle.load can execute arbitrary code; only load
        # matrices from directories you trust.
        with open(matrix_path, 'rb') as handle:
            return pickle.load(handle)

    if verbose:
        print("Create a new embedding matrix")

    embeddings_index = {}
    # Explicit encoding: the default depends on the platform locale.
    with open(glove_path, encoding='utf-8') as f:
        for line in f:
            word, coefs = line.split(maxsplit=1)
            embeddings_index[word] = np.fromstring(coefs, 'f', sep=' ')

    if verbose:
        print('Found %s word vectors.' % len(embeddings_index))

    word_index = tokenizer.word_index

    if verbose:
        print('Found %s unique tokens.' % len(word_index))

    # Prepare the embedding matrix; row 0 is never assigned by the loop
    # below when word ids start at 1.
    num_words = min(max_num_words, len(word_index) + 1)
    embedding_matrix = np.zeros((num_words, embedding_dims))
    for word, i in word_index.items():
        if i >= max_num_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # Words not found in the embedding index stay all-zeros.
            embedding_matrix[i] = embedding_vector

    with open(matrix_path, 'wb') as handle:
        pickle.dump(embedding_matrix, handle, protocol=pickle.HIGHEST_PROTOCOL)

    return embedding_matrix

# Model

In [None]:
from tensorflow.keras.layers import  Bidirectional, LSTM, Concatenate

class Encoder(tf.keras.Model):
  """
  Encoder: a frozen embedding layer followed by two stacked recurrent layers.

  The recurrent layer type is selected by `name_layer` ("GRU", "LSTM",
  "BiLSTM" or "BiGRU"); only the layers of the selected type are created.
  """

  def __init__(self, vocab_size, embedding_matrix, embedding_dims=300, dropout=0.8, hidden_size=150, batch_size=64, name_layer='BiLSTM'):
    """
    Create the embedding layer and the two recurrent layers.

    Parameters
    ----------
    vocab_size : int
        Vocabulary size (`input_dim` of the embedding layer).
    embedding_matrix : array-like
        Initial embedding weights; the layer is created with trainable=False.
    embedding_dims : int, default 300
        Size of each embedding vector (`output_dim` of the embedding layer).
    dropout : float, default 0.8
        Passed as `dropout=` to the BiLSTM / BiGRU layers only; the plain
        GRU / LSTM layers do not receive it.
        NOTE(review): the original note describes this as the amount of
        neurons kept, whereas Keras documents `dropout` as the fraction
        dropped -- confirm 0.8 is intended.
    hidden_size : int, default 150
        Number of units of each recurrent layer.
    batch_size : int, default 64
        Batch size used by `initialize_hidden_state`.
    name_layer : str, default 'BiLSTM'
        Recurrent layer type: 'LSTM', 'GRU', 'BiGRU' or 'BiLSTM'. Any other
        value creates no recurrent layer at all.
    """
    super(Encoder, self).__init__()
    # NOTE: the string below follows the super() call, so Python treats it as
    # a bare expression rather than a docstring; it is kept unchanged.
    """
    Codificador do modelo. 

    Parameters
    ----------
    vocab_size : int
        Tamanho do vocabulário.
    embedding_matrix : list
        Matrix de embeddings.
    embedding_dims : int, default 300
        Dimensão do vetor de embeddings.
    dropout : float
        Quantidade de neurônios mantidos.
    hidden_size : int, default 150
        Quantidade de neuronios da camada oculta.
    batch_size : int, default 64
        Quantidade de batch.
    name_layer : str, default BiLSTM
        Tipo de rede utilizada, LSTM, GRU, BiGRU ou BiLSTM
    Returns
    -------
    processed : arquivo pré-processado sem caracteres especiais ou ruídos.
    """

    self.vocab_size=vocab_size
    self.batch_size=batch_size
    self.name_layer=name_layer
    self.dropout=dropout
    self.hidden_size=hidden_size
    self.embedding_matrix=embedding_matrix

    # Embedding initialised from `embedding_matrix` and frozen.
    self.encoder_embedding = tf.keras.layers.Embedding(input_dim=self.vocab_size, 
                                                           weights=[self.embedding_matrix],
                                                           output_dim=embedding_dims,
                                                           trainable=False)
    # Two stacked layers of the selected type; each returns both the full
    # output sequence and its final state(s).
    if(self.name_layer == "GRU"):
      self.gru1 = tf.keras.layers.GRU(self.hidden_size,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')
      self.gru2 = tf.keras.layers.GRU(self.hidden_size,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')
    elif(self.name_layer == "LSTM"):
      self.lstm1 = tf.keras.layers.LSTM(self.hidden_size,
                                   return_sequences=True,
                                   return_state=True)
      
      self.lstm2 = tf.keras.layers.LSTM(self.hidden_size,
                                   return_sequences=True,
                                   return_state=True)
    elif(self.name_layer == "BiLSTM"):
      self.BiLSTM1 = Bidirectional(LSTM(self.hidden_size,
                                       return_sequences=True,
                                       return_state=True,
                                       dropout=self.dropout))
      self.BiLSTM2 = Bidirectional(LSTM(self.hidden_size,
                                       return_sequences=True,
                                       return_state=True,
                                       dropout=self.dropout))
    elif(self.name_layer == "BiGRU"):
      self.BiGRU1 = Bidirectional(tf.keras.layers.GRU(self.hidden_size,
                                       return_sequences=True,
                                       return_state=True,
                                       dropout=self.dropout))
      self.BiGRU2 = Bidirectional(tf.keras.layers.GRU(self.hidden_size,
                                       return_sequences=True,
                                       return_state=True,
                                       dropout=self.dropout))

  def call(self, x, hidden):
    """
    Embed `x` and run it through the two recurrent layers.

    `hidden` is given as `initial_state` to the first layer only; the second
    layer starts from its default state. Returns
    `(encoder_outputs, encoder_states)` taken from the second layer. If
    `name_layer` matches none of the four branches, the return statement
    fails because neither name was assigned.
    """

    x = self.encoder_embedding(x)

    if(self.name_layer == "GRU"):
      # NOTE(review): initialize_hidden_state() returns two tensors for
      # "GRU"; a Keras GRU presumably carries a single state tensor --
      # confirm this call accepts the pair.
      encoder_outputs_1, *_ = self.gru1 (x, initial_state=hidden)
      encoder_outputs, encoder_states = self.gru2 (encoder_outputs_1)
    
    elif(self.name_layer == "LSTM"):
      encoder_outputs_1, *_ = self.lstm1 (x, initial_state = hidden)
      # Only the first returned state is kept; the remaining ones are
      # discarded by `*_`.
      encoder_outputs, encoder_states, *_ = self.lstm2 (encoder_outputs_1)
    
    elif(self.name_layer == "BiLSTM"):
      encoder_outputs_1, *_ = self.BiLSTM1 (x,initial_state = hidden)
      encoder_outputs, forward_h, forward_c, backward_h, backward_c, *_ = self.BiLSTM2 (encoder_outputs_1)
      # encoder_states = [forward_h, backward_h, forward_c, backward_c]
      # concatenated along the last axis.
      state_h = Concatenate()([forward_h, backward_h])
      state_c = Concatenate()([forward_c, backward_c])
      encoder_states = Concatenate()([state_h, state_c])
    
    elif(self.name_layer == "BiGRU"):
      encoder_outputs_1, *_ = self.BiGRU1 (x,initial_state = hidden)
      # NOTE(review): this unpacking needs at least five returned values
      # (copied from the BiLSTM branch); a bidirectional GRU presumably
      # returns only the output plus one state per direction -- confirm
      # this branch runs.
      encoder_outputs, forward_h, forward_c, backward_h, backward_c, *_ = self.BiGRU2 (encoder_outputs_1)
      state_h = Concatenate()([forward_h, backward_h])
      state_c = Concatenate()([forward_c, backward_c])
      encoder_states = Concatenate()([state_h, state_c])

    return encoder_outputs, encoder_states

  def initialize_hidden_state(self):
      """
      Return all-zero initial states of shape [batch_size, hidden_size].

      Two tensors for "LSTM"/"GRU", four for "BiLSTM"/"BiGRU"; implicitly
      returns None for any other `name_layer`.
      """
      if(self.name_layer == "LSTM" or self.name_layer == "GRU"):
        return (tf.zeros([self.batch_size, self.hidden_size]),
              tf.zeros([self.batch_size, self.hidden_size]))
      elif (self.name_layer == "BiLSTM" or self.name_layer == "BiGRU"):
        return (tf.zeros([self.batch_size, self.hidden_size]),
              tf.zeros([self.batch_size, self.hidden_size]),
              tf.zeros([self.batch_size, self.hidden_size]),
              tf.zeros([self.batch_size, self.hidden_size]))
        
class BahdanauAttention(tf.keras.layers.Layer):
  """Additive attention: scores every time step of `values` against `query`."""

  def __init__(self, units):
    super().__init__()
    # Projections of the query (W1) and of the values (W2), plus the final
    # one-unit scoring layer (V).
    self.W1 = tf.keras.layers.Dense(units)
    self.W2 = tf.keras.layers.Dense(units)
    self.V = tf.keras.layers.Dense(1)

  def call(self, query, values):
    """
    Return `(context_vector, attention_weights)`.

    Shapes as annotated by the original author:
      query  -> (batch_size, hidden size)
      values -> (batch_size, max_len, hidden size)
      attention_weights -> (batch_size, max_length, 1)
      context_vector    -> (batch_size, hidden_size)
    """
    # Add a time axis so the projected query broadcasts over every time step
    # of the projected values.
    expanded_query = tf.expand_dims(query, 1)
    projected = self.W1(expanded_query) + self.W2(values)

    # One score per time step (last axis has size 1 because of self.V).
    score = self.V(tf.nn.tanh(projected))

    # Normalise the scores along axis 1.
    attention_weights = tf.nn.softmax(score, axis=1)

    # Weighted sum of the values along axis 1.
    context_vector = tf.reduce_sum(attention_weights * values, axis=1)

    return context_vector, attention_weights

class Decoder(tf.keras.Model):
  """
  One-step decoder: attention over the encoder outputs, a frozen embedding,
  a single GRU or LSTM layer and a dense projection to vocabulary logits.
  """

  def __init__(self, vocab_size, embedding_matrix, embedding_dim, hidden_size, batch_size, name_layer, name_layer_encoder):
    """
    Parameters
    ----------
    vocab_size : int
        Vocabulary size (embedding `input_dim` and width of the output layer).
    embedding_matrix : array-like
        Initial embedding weights; the layer is created with trainable=False.
    embedding_dim : int
        Size of each embedding vector. Used as the embedding `output_dim`
        (the body previously read a module-level `embedding_dims` instead of
        this argument).
    hidden_size : int
        Encoder hidden size. The decoder uses it unchanged after an
        "LSTM"/"GRU" encoder and multiplied by 4 after a "BiLSTM"/"BiGRU"
        encoder.
    batch_size : int
        Stored on the instance; not used by `call`.
    name_layer : str
        Decoder recurrent layer type: "GRU" or "LSTM".
    name_layer_encoder : str
        Encoder layer type: "LSTM", "GRU", "BiLSTM" or "BiGRU".

    Raises
    ------
    ValueError
        If `name_layer` or `name_layer_encoder` is not one of the supported
        values. Previously this surfaced later as a missing attribute or an
        unassigned local inside `call`.
    """
    super(Decoder, self).__init__()

    if name_layer not in ("GRU", "LSTM"):
      raise ValueError(
          "name_layer must be 'GRU' or 'LSTM', got %r" % (name_layer,))

    self.vocab_size = vocab_size
    self.batch_size = batch_size
    self.name_layer = name_layer
    self.embedding_matrix = embedding_matrix

    if name_layer_encoder in ("LSTM", "GRU"):
      self.hidden_size = hidden_size
    elif name_layer_encoder in ("BiLSTM", "BiGRU"):
      self.hidden_size = hidden_size * 4
    else:
      raise ValueError(
          "name_layer_encoder must be 'LSTM', 'GRU', 'BiLSTM' or 'BiGRU', "
          "got %r" % (name_layer_encoder,))

    self.decoder_embedding = tf.keras.layers.Embedding(input_dim=self.vocab_size,
                                                       weights=[self.embedding_matrix],
                                                       output_dim=embedding_dim,
                                                       trainable=False)

    if self.name_layer == "GRU":
      self.gru = tf.keras.layers.GRU(self.hidden_size,
                                     return_sequences=True,
                                     return_state=True,
                                     recurrent_initializer='glorot_uniform')
    else:
      self.lstm = tf.keras.layers.LSTM(self.hidden_size,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')

    self.fc = tf.keras.layers.Dense(vocab_size)

    # Used for attention.
    self.attention = BahdanauAttention(self.hidden_size)

  def call(self, x, hidden, enc_output):
    """
    Run one decoding step.

    `hidden` is used only as the attention query; the recurrent layer is
    called without an explicit initial state. Returns
    `(logits, decoder_states, attention_weights)`.
    """
    # enc_output shape == (batch_size, max_length, hidden_size)
    context_vector, attention_weights = self.attention(hidden, enc_output)

    # x shape after passing through embedding == (batch_size, 1, embedding_dim)
    x = self.decoder_embedding(x)

    # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
    x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

    # Pass the concatenated vector through the recurrent layer.
    if self.name_layer == "GRU":
      decoder_outputs, decoder_states = self.gru(x)
    else:
      # Only the first returned state is kept.
      decoder_outputs, decoder_states, *_ = self.lstm(x)

    # output shape == (batch_size * 1, hidden_size)
    decoder_outputs = tf.reshape(decoder_outputs, (-1, decoder_outputs.shape[2]))

    # output shape == (batch_size, vocab)
    x = self.fc(decoder_outputs)

    return x, decoder_states, attention_weights

## Define the optimizer and the loss function

In [None]:
# Optimizer shared by the training step and the checkpoint.
optimizer = tf.keras.optimizers.Adam()

# reduction='none' keeps one loss value per element so that padding can be
# masked out in loss_function before averaging.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    reduction='none', from_logits=True)


def loss_function(real, pred):
  """
  Mean cross-entropy between `real` ids and `pred` logits, with the loss of
  every position whose target id is 0 zeroed out before the mean is taken.
  """
  per_element_loss = loss_object(real, pred)

  # True wherever the target is not 0.
  is_real_token = tf.math.logical_not(tf.math.equal(real, 0))
  per_element_loss *= tf.cast(is_real_token, dtype=per_element_loss.dtype)

  return tf.reduce_mean(per_element_loss)

@tf.function
def train_step(inp, targ, enc_hidden, encoder, decoder):
  """
  Run one teacher-forced training step on a batch and apply the gradients.

  Reads the module-level `tokenizer`, `BATCH_SIZE` and `optimizer`.
  Returns the accumulated loss divided by the target length.
  """
  target_len = targ.shape[1]
  loss = 0

  with tf.GradientTape() as tape:
    enc_output, dec_hidden = encoder(inp, enc_hidden)

    # Every sequence in the batch starts decoding from 'sostok'.
    start_ids = [tokenizer.word_index['sostok']] * BATCH_SIZE
    dec_input = tf.expand_dims(start_ids, 1)

    # Teacher forcing: the true token at step t is both the target of this
    # step and the decoder input of the next one.
    for t in range(1, target_len):
      expected = targ[:, t]
      predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
      loss += loss_function(expected, predictions)
      dec_input = tf.expand_dims(expected, 1)

  variables = encoder.trainable_variables + decoder.trainable_variables
  gradients = tape.gradient(loss, variables)
  optimizer.apply_gradients(zip(gradients, variables))

  return loss / int(target_len)

## Training


In [None]:
import time
import datetime


def model_training(vocab_size, embedding_matrix, embedding_dims, hidden_size, BATCH_SIZE, name_layer_encoder, name_layer_decoder, path_save, steps_per_epoch, dropout):
  """
  Build the encoder/decoder, resume from the latest checkpoint if there is
  one, train up to a total of 100 epochs and post a Slack notification.

  Besides its arguments, the function reads the module-level `number_model`,
  `dataset` and `optimizer`, and calls `train_step`. Checkpoints and log
  files are written under `path_save + number_model`.

  Parameters
  ----------
  vocab_size : int
      Vocabulary size.
  embedding_matrix : array-like
      Embedding weights shared by encoder and decoder.
  embedding_dims : int
      Size of each embedding vector.
  hidden_size : int
      Encoder hidden size.
  BATCH_SIZE : int
      Batch size passed to both models (and printed).
  name_layer_encoder : str
      Encoder recurrent layer type.
  name_layer_decoder : str
      Decoder recurrent layer type.
  path_save : str
      Base directory; concatenated directly with `number_model`.
  steps_per_epoch : int
      Number of batches taken from `dataset` per epoch.
  dropout : float
      Dropout value forwarded to the encoder.

  Returns
  -------
  None
  """
  total_epochs = 100
  model_dir = path_save + number_model

  # Create the models and the checkpoint machinery.
  encoder = Encoder(vocab_size, embedding_matrix, embedding_dims=embedding_dims, dropout=dropout, hidden_size=hidden_size, batch_size=BATCH_SIZE, name_layer=name_layer_encoder)
  decoder = Decoder(vocab_size, embedding_matrix, embedding_dims, hidden_size, BATCH_SIZE, name_layer_decoder, name_layer_encoder)

  checkpoint_dir = model_dir + '/'
  checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                   encoder=encoder,
                                   decoder=decoder)
  manager = tf.train.CheckpointManager(checkpoint, checkpoint_dir, max_to_keep=3)

  if manager.latest_checkpoint:
    print("Restored from {}".format(manager.latest_checkpoint))
    checkpoint.restore(manager.latest_checkpoint)
    # The numeric suffix of '<dir>/ckpt-N' is the number of saves so far;
    # one save happens per epoch, so N epochs (labels 0..N-1) are done.
    completed = int(manager.latest_checkpoint.split('/')[-1].split('-')[-1])
    EPOCHS = total_epochs - completed
    # The next epoch label is N (it used to be N + 1, which skipped a label
    # after every resume).
    passo = completed
  else:
    print("Initializing")
    EPOCHS = total_epochs
    passo = 0

  print("\nStarting Model Training\n")
  print("Number of epochs: " + str(EPOCHS))
  print("Batch size: " + str(BATCH_SIZE) + "\n\n")

  # Each appended record ends with a newline so successive runs and epochs
  # do not run together on a single line.
  with open(model_dir + '/time_hybrid.txt', 'a') as f:
    f.write('Model: {} Inicio: {}\n'.format(number_model, datetime.datetime.now()))

  start = time.time()
  for epoch in range(EPOCHS):
    epoch_start = time.time()
    epoch_label = epoch + passo

    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0

    for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
      batch_loss = train_step(inp, targ, enc_hidden, encoder, decoder)
      total_loss += batch_loss

      if batch % 400 == 0:
        print('Epoch {} Batch {} Loss {:.4f}'.format(epoch_label,
                                                     batch,
                                                     batch_loss.numpy()))

    # Save a checkpoint after every epoch.
    manager.save()

    print('Epoch {} Loss {:.4f}'.format(epoch_label,
                                        total_loss / steps_per_epoch))
    with open(model_dir + '/history.txt', 'a') as f:
      f.write('Epoch {} Loss {:.4f}\n'.format(epoch_label,
                                              total_loss / steps_per_epoch))
    # Measured from the start of this epoch (previously measured from the
    # start of training, so the printed value kept growing).
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - epoch_start))

  elapsed = time.time() - start
  print("Time: %s seconds" % (elapsed))

  # Send a message to Slack.
  # NOTE(security): the fallback URL below is a webhook secret committed to
  # source control; it should be rotated and supplied only through the
  # SLACK_WEBHOOK_URL environment variable. It is kept as the default solely
  # to preserve the existing behaviour.
  web_hook_url = os.environ.get(
      'SLACK_WEBHOOK_URL',
      'https://hooks.slack.com/services/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')
  slack_msg = {'text': 'Training was finished'}
  try:
    # The notification is best-effort: a network problem must not turn a
    # finished training run into an exception or an indefinite hang.
    requests.post(web_hook_url, data=json.dumps(slack_msg), timeout=10)
  except requests.RequestException as error:
    print("Slack notification failed: {}".format(error))

#model_training(vocab_size, embedding_matrix, embedding_dims, hidden_size, BATCH_SIZE, name_layer_encoder, name_layer_decoder)

## Generate Summaries

In [None]:
import pandas as pd

def evaluate(sentence, encoder, decoder):
  """
  Greedily decode a summary for one sentence and record attention weights.

  Reads the module-level `tokenizer`, `summary_max_len`, `article_max_len`,
  `hidden_size` and `name_layer_encoder`.

  Parameters
  ----------
  sentence : str
      Space-separated input text. 'sostok ' is prepended before encoding and
      words missing from the tokenizer vocabulary are replaced by 'unktok'.
  encoder : Encoder
      Trained encoder.
  decoder : Decoder
      Trained decoder.

  Returns
  -------
  result : str
      Predicted words, each followed by a space; decoding stops after
      'eostok' is produced or after `summary_max_len` steps.
  sentence : str
      The input sentence, unchanged.
  attention_plot : numpy.ndarray
      Array of shape (summary_max_len, article_max_len); row t holds the
      attention weights of decoding step t (rows of steps never reached stay
      zero).

  Raises
  ------
  ValueError
      If `name_layer_encoder` is not a supported encoder type.
  """
  attention_plot = np.zeros((summary_max_len, article_max_len))

  # Replace out-of-vocabulary words by the 'unktok' token.
  word_index = tokenizer.word_index
  words = ('sostok ' + sentence).split(' ')
  input_ids = [word_index[w] if w in word_index else word_index['unktok']
               for w in words]

  input_sequences = tf.keras.preprocessing.sequence.pad_sequences(
      [input_ids], maxlen=article_max_len, padding='post')
  inputs = tf.convert_to_tensor(input_sequences)

  # Zero initial encoder state for a batch of one, with as many tensors as
  # Encoder.initialize_hidden_state() returns for the same layer type.
  if name_layer_encoder in ("LSTM", "GRU"):
    state_count = 2
  elif name_layer_encoder in ("BiLSTM", "BiGRU"):
    state_count = 4
  else:
    raise ValueError(
        "Unsupported encoder layer type: %r" % (name_layer_encoder,))
  hidden = tuple(tf.zeros([1, hidden_size]) for _ in range(state_count))

  enc_out, enc_hidden = encoder(inputs, hidden)

  dec_hidden = enc_hidden
  # Decoding starts from 'sostok', the same first decoder input that
  # train_step uses (this used to be 'eostok').
  dec_input = tf.expand_dims([tokenizer.word_index['sostok']], 0)

  result = ''
  for t in range(summary_max_len):
    predictions, dec_hidden, attention_weights = decoder(dec_input,
                                                         dec_hidden,
                                                         enc_out)

    # Store the attention weights to plot later on.
    attention_weights = tf.reshape(attention_weights, (-1, ))
    attention_plot[t] = attention_weights.numpy()

    predicted_id = tf.argmax(predictions[0]).numpy()
    predicted_word = tokenizer.index_word[predicted_id]
    result += predicted_word + ' '

    if predicted_word == 'eostok':
      break

    # The predicted id is fed back into the model.
    dec_input = tf.expand_dims([predicted_id], 0)

  return result, sentence, attention_plot

def plot_attention(attention, article, summary):
  """
  Draw the attention weights as a heat map and show it.

  Inputs:
          attention: matrix of attention weights; its transpose is plotted
          article: list of labels for the y axis
          summary: list of labels for the x axis

  Outputs: None
  """
  label_font = {'fontsize': 14}

  figure = plt.figure(figsize=(30,30))
  axes = figure.add_subplot(1, 1, 1)
  axes.matshow(attention.T, cmap='viridis')

  # A leading empty label is added in front of each label list.
  axes.set_xticklabels([''] + summary, fontdict=label_font, rotation=90)
  axes.set_yticklabels([''] + article, fontdict=label_font)

  # One tick per unit on both axes (x first, then y).
  for axis in (axes.xaxis, axes.yaxis):
    axis.set_major_locator(ticker.MultipleLocator(1))

  plt.show()

def attention_vector(attention_matrix):
  """
  Sum the attention weights column by column.

  Input:
          attention_matrix = iterable of rows of weights, one row per output
          word and one column per input word
  Output: list with the accumulated weight of each column; the number of
          entries is the length of the first row
  """
  totals = []

  for row_number, row in enumerate(attention_matrix):
    if row_number == 0:
      # The first row seeds the running totals.
      totals.extend(row)
      continue

    # Every later row is added on top, position by position.
    for column, weight in enumerate(row):
      totals[column] = totals[column] + weight

  return totals

def get_max_attention(attention_vector,article,n=20):
  """
  Pick the article words sitting at the 'n' largest attention values.

  Input:
        attention_vector = attention value of each position
        article = text whose space-separated words are indexed by position
        n = number of positions selected; default = 20

  Output: sorted list, without duplicates, of the words found at those
          positions; the word "#" is left out
  """
  # Positions of the 'n' largest values.
  top_positions = pd.Series(attention_vector).nlargest(n).index.values.tolist()

  # Collect the words at those positions; a set removes the repetitions.
  selected_words = set()
  for position in top_positions:
    word = article.split(" ")[position]
    if word == "#":
      continue
    selected_words.add(word)

  return sorted(selected_words)

def abstractive_summary(article, encoder, decoder):
  """
  Generate the output summary for one article.

  Input: article text plus the encoder and decoder passed on to `evaluate`
  Output: (attention_plot, summary), where attention_plot is cropped to
          len(summary.split(' ')) rows and
          len(article_reference.split(' ')) columns
  """
  summary, article_reference, attention_plot = evaluate(article, encoder, decoder)

  row_count = len(summary.split(' '))
  column_count = len(article_reference.split(' '))
  cropped_plot = attention_plot[:row_count, :column_count]

  return cropped_plot, summary

def hybrid_summarizer(article,words_max_attention):
  """
  Return the sentence of `article` that matches the most attention words.

  The article is split on '.'; each sentence scores one point for every
  (word, attention word) pair that is equal. The first sentence with the
  highest score is returned exactly as it came out of the split.
  """
  sentences = article.split('.')

  # One score per sentence: number of equal (word, attention word) pairs.
  scores = [
      sum(1
          for word in sentence.strip().split(' ')
          for candidate in words_max_attention
          if candidate == word)
      for sentence in sentences
  ]

  best_position = scores.index(max(scores))
  return sentences[best_position]

def _remove_sequence_markers(text):
  '''Drop the 'eostok' / 'sostok' markers from `text` (no stripping).'''
  return text.replace('eostok','').replace('sostok','')


def generates_summaries(VALID, path_save, encoder, decoder, num_examples=8306):
  '''
      Generate abstractive and hybrid summaries for the validation set.

      For every example the abstractive model is run, the words with the
      highest attention are selected, and the sentence of the stop-word
      version of the article that shares most words with them is used as the
      hybrid summary. Three variants of that sentence are written, one per
      line, to <path_save>summaries/e{1,2,3}/<number_model>.txt:
        e1 = the whole sentence
        e2 = its first 15 words
        e3 = its first k words, k = 10% (rounded) of the input length,
             where the input length is capped at 150 words

      Input:
            VALID = DataFrame whose first column holds the model input texts
            path_save = base output directory (expected to end with '/')
            encoder, decoder = models forwarded to abstractive_summary()
            num_examples = maximum number of examples to process;
                           default = 8306 (the value previously hard-coded)

      Output: hybrid_e1, hybrid_e2, hybrid_e3, candidates_abstractive_summary
              (four lists with one entry per processed example)

      Reads the module-level names number_model, train_article_path and
      train_title_path.
  '''

  #Load validation dataset
  base_article_valid = VALID.iloc[:,0].values

  hybrid_e1 = []
  hybrid_e2 = []
  hybrid_e3 = []
  candidates_abstractive_summary = []

  print("Generating Summaries")

  valid_article_path = "/content/drive/My Drive/Colab Notebooks/sumdata/database_scim_extend/with_stopwords/abstract.valid.pp.txt"
  valid_title_path   = "/content/drive/My Drive/Colab Notebooks/sumdata/database_scim_extend/with_stopwords/title.valid.pp.txt"

  # load_files() hands back four open handles; only the validation articles
  # are needed here, but every handle is closed so none of them leaks.
  opened_files = load_files(train_article_path,
                            train_title_path,
                            valid_article_path,
                            valid_title_path)
  try:
    article_valid = opened_files[2].read().split('\n')
  finally:
    for handle in opened_files:
      handle.close()

  # Never index past the data that is actually available
  total = min(num_examples, len(base_article_valid), len(article_valid))
  if total < num_examples:
    print("Only {} of the {} requested examples are available".format(total, num_examples))

  # The context manager closes the three output files even if the model
  # fails in the middle of the run.
  with open(path_save + "summaries/e1/" + number_model + ".txt", "w") as file_hybrid_e1, \
       open(path_save + "summaries/e2/" + number_model + ".txt", "w") as file_hybrid_e2, \
       open(path_save + "summaries/e3/" + number_model + ".txt", "w") as file_hybrid_e3:

    for i in range(total):

      #Select sentence
      input_raw = _remove_sequence_markers(base_article_valid[i]).strip()
      #Abstractive summarization
      attention_matrix, candidate_abstractive_summary = abstractive_summary(input_raw, encoder, decoder)
      #Compute the attention vector
      attention_vector_result = attention_vector(attention_matrix)
      #Select the words with the highest attention values
      vet_words_max_attention = get_max_attention(attention_vector_result,input_raw,n=20)

      #Sentence of the original article that best matches those words
      best_sentence = hybrid_summarizer(article_valid[i],vet_words_max_attention)
      sentence_words = list(filter(None, best_sentence.strip().split(" ")))

      #Strategy 1: the whole sentence
      e1 = _remove_sequence_markers(' '.join(sentence_words)).strip()
      hybrid_e1.append(e1)
      file_hybrid_e1.write(e1 + "\n")

      #Strategy 2: first 15 words
      e2 = _remove_sequence_markers(' '.join(sentence_words[:15])).strip()
      hybrid_e2.append(e2)
      file_hybrid_e2.write(e2 + "\n")

      #Strategy 3: 10% (rounded) of the input length, input capped at 150 words
      len_output = int((len(input_raw.split(" ")[:150])*0.1)+0.5)
      e3 = _remove_sequence_markers(' '.join(sentence_words[:len_output])).strip()
      hybrid_e3.append(e3)
      file_hybrid_e3.write(e3 + "\n")

      candidates_abstractive_summary.append(_remove_sequence_markers(candidate_abstractive_summary))

  print("Fim da geração de resumos")

  #Send a message to Slack
  # NOTE(review): this webhook URL is a credential committed to source;
  # consider rotating it and loading it from configuration instead.
  web_hook_url = 'https://hooks.slack.com/services/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'
  slack_msg = {'text': 'The Generation of Sentences was finished'}
  try:
    # The notification is a courtesy: it must neither hang forever nor
    # throw away the summaries that were just generated.
    requests.post(web_hook_url,data = json.dumps(slack_msg), timeout=10)
  except requests.exceptions.RequestException as err:
    print("Slack notification failed: {}".format(err))

  return hybrid_e1, hybrid_e2, hybrid_e3, candidates_abstractive_summary


# Training


In [None]:
# Paths of the training / validation files (variant without stop words)
train_article_path = "/content/drive/My Drive/Colab Notebooks/sumdata/database_scim_extend/without_stopwords/abstract.train.pp.txt"
train_title_path   = "/content/drive/My Drive/Colab Notebooks/sumdata/database_scim_extend/without_stopwords/title.train.pp.txt"
valid_article_path = "/content/drive/My Drive/Colab Notebooks/sumdata/database_scim_extend/without_stopwords/abstract.valid.pp.txt"
valid_title_path   = "/content/drive/My Drive/Colab Notebooks/sumdata/database_scim_extend/without_stopwords/title.valid.pp.txt"

# Hyper-parameters; these module-level names are read by the cells below
BATCH_SIZE = 64
embedding_dims = 300
hidden_size  = 150
learning_rate = 0.001
# NOTE(review): the "+ 2" presumably reserves room for the sostok/eostok markers -- confirm
article_max_len = 150 + 2 #Maximum length of the input text
summary_max_len = 15 + 2 #Maximum length of the output text
# Identifier of this run; used as directory name and as output file name
number_model = 'model_14'
dropout=0.4
# Forwarded to data_preparation() as its first argument
mode='VALID'

# Layer names forwarded to the Encoder / Decoder constructors
name_layer_encoder = "BiLSTM"
name_layer_decoder = "LSTM"

# Base directory for everything this notebook writes
path_save = '/content/drive/My Drive/Colab Notebooks/Hybrid_Summ_App/approach_2/'

# Create the per-model directory on first use
if not os.path.exists(path_save + number_model):
    print("Create a new directory")
    os.mkdir(path_save + number_model)

# Build the dataset objects and tokenizer from the files above
dataset, VALID, steps_per_epoch, tokenizer, vocab_size = data_preparation(mode, number_model, train_article_path, train_title_path,
                 valid_article_path, valid_title_path, path_save=path_save, article_max_len=article_max_len,
                 summary_max_len=summary_max_len, verbose=True)

# Embedding matrix that is later handed to both Encoder and Decoder
embedding_matrix = create_embeddings(tokenizer, article_max_len, vocab_size, path_save=path_save, number_model=number_model, verbose=False)


#model_training(vocab_size, embedding_matrix, embedding_dims, hidden_size, BATCH_SIZE, name_layer_encoder, name_layer_decoder, path_save, steps_per_epoch, dropout)

#Plot attention weights
#attention_plot = attention_matrix
#plot_attention(attention_plot, input_raw.split(' '), candidate_abstractive_summary.split(' '))"""

## Restore the latest checkpoint and test

In [None]:
# Rebuild the models with the same hyper-parameters used for training
encoder = Encoder(vocab_size, embedding_matrix, embedding_dims, dropout, hidden_size, BATCH_SIZE, name_layer_encoder)
decoder = Decoder(vocab_size, embedding_matrix, embedding_dims, hidden_size, BATCH_SIZE, name_layer_decoder, name_layer_encoder)
checkpoint_dir = path_save + number_model + '/'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
# NOTE(review): `optimizer` is not created in this cell; it must already
# exist in the notebook session before this cell runs.
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)
manager = tf.train.CheckpointManager(checkpoint, checkpoint_dir, max_to_keep=3)

# Restore first and report afterwards; when the directory holds no
# checkpoint, say so instead of printing "Restored from None".
if manager.latest_checkpoint:
    checkpoint.restore(manager.latest_checkpoint)
    print("Restored from {}".format(manager.latest_checkpoint))
else:
    print("No checkpoint found in {}; nothing was restored".format(checkpoint_dir))

In [None]:
# Qualitative check on one validation example (row 1212 is hard-coded)
base_article_valid = VALID.iloc[:,0].values
input_raw = base_article_valid[1212].replace('eostok','').replace('sostok','').strip()
print(input_raw)
#Abstractive summarization
attention_matrix, candidate_abstractive_summary = abstractive_summary(input_raw, encoder, decoder)
#Compute the attention vector
attention_vector_result = attention_vector(attention_matrix)
# Words located at the 20 highest-attention positions
vet_words_max_attention = get_max_attention(attention_vector_result,input_raw,n=20)
print(vet_words_max_attention)
# Plot the attention matrix against the input words and the summary words
plot_attention(attention_matrix, input_raw.split(' '), candidate_abstractive_summary.split(' '))

In [None]:
#hybrid_e1, hybrid_e2, hybrid_e3, candidates_abstractive_summary = generates_summaries(VALID, path_save, encoder, decoder)                                  

# Evaluate model using ROUGE and NUBIA metrics

In [None]:
cd /content/drive/My Drive/Colab Notebooks

In [None]:
# The BLEURT setup below is wrapped in a string literal, so it is NOT executed.
# NOTE(review): it would bind the scorer to the name `bleurt`, while eval()
# calls `scorer.score(...)` -- confirm the intended name before enabling it.
'''
!git clone https://github.com/google-research/bleurt.git
os.chdir('bleurt')
!pip install .
from bleurt import score
tf.compat.v1.flags.DEFINE_string('f','','')
checkpoint = "bleurt/test_checkpoint"
bleurt = score.BleurtScorer(checkpoint)
'''

# NUBIA setup: the repository clone is commented out, so a `nubia` directory
# is expected to exist already under the current working directory.
#!git clone https://github.com/wl-research/nubia.git
os.chdir('nubia')
!pip install -r requirements.txt
from nubia import Nubia
# Module-level scorer instance used by eval() for the "NUBIA" metric
nubia = Nubia()

# sumeval provides RougeCalculator, imported in the next cell
!pip install sumeval
!python -m spacy download en

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize
from sumeval.metrics.rouge import RougeCalculator
from nubia import Nubia
from xml.etree import ElementTree
from xml.dom import minidom
from functools import reduce
from xml.etree.ElementTree import Element, SubElement, Comment


def eval(
    reference_summary, model_summary, metrics=["ROUGE_1", "ROUGE_2", "ROUGE_L", "NUBIA", "BLEURT"]):
    '''
        Score one generated summary against its reference.

        Input:
              reference_summary = reference (gold) summary text
              model_summary = generated summary text
              metrics = names of the metrics to compute, any of
                        "ROUGE_1", "ROUGE_2", "ROUGE_L", "NUBIA", "BLEURT"

        Output: (rouge_1, rouge_2, rouge_l, nubia_score, bleurt_score);
                every metric that was not requested is returned as None.
                When BLEURT is requested, bleurt_score is a one-element list.

        NOTE(review): this function shadows the builtin eval(). It also relies
        on the module-level names `nubia` and `scorer`; `scorer` is not defined
        in the visible part of the notebook, so requesting "BLEURT" needs it
        to be created first.
    '''

    calculator = RougeCalculator(stopwords=True, lang="en")

    rouge_1 = (calculator.rouge_n(summary=model_summary, references=reference_summary, n=1)
               if "ROUGE_1" in metrics else None)

    rouge_2 = (calculator.rouge_n(summary=model_summary, references=[reference_summary], n=2)
               if "ROUGE_2" in metrics else None)

    rouge_l = (calculator.rouge_l(summary=model_summary, references=[reference_summary])
               if "ROUGE_L" in metrics else None)

    nubia_score = (nubia.score(reference_summary, model_summary)
                   if "NUBIA" in metrics else None)

    bleurt_score = None
    if "BLEURT" in metrics:
      bleurt_score = scorer.score([reference_summary], [model_summary])
      # Exactly one candidate was scored, so exactly one value is expected
      assert type(bleurt_score) == list and len(bleurt_score) == 1

    return rouge_1, rouge_2, rouge_l, nubia_score, bleurt_score

def prettify(elem):
      """Return a pretty-printed XML string for the Element.
      """
      rough_string = ElementTree.tostring(elem, 'utf-8')
      reparsed = minidom.parseString(rough_string)
      return reparsed.toprettyxml(indent="  ")

def create_report_valid(
    summary_array, references_summary, article, name_file,
     metrics=["ROUGE_1", "ROUGE_2", "ROUGE_L", "NUBIA", "BLEURT"]):
  """
  Score every generated summary and write an XML validation report.

  Parameters
  ----------
  summary_array : generated summaries, one per example.
  references_summary : reference summaries, indexed like summary_array.
  article : source articles, indexed like summary_array.
  name_file : path of the XML report to write (overwritten if it exists).
  metrics : names of the metrics to compute; forwarded to eval().

  The report holds one <example> element per summary (article, reference,
  summary and an <eval> element with one child per computed metric). The
  mean of each metric is stored as an attribute of the root element.

  An empty summary is not sent to eval(); it receives a score of 0 for every
  requested metric. Metrics that were not requested are left out.
  """

  rouge_1_arr  = []
  rouge_2_arr  = []
  rouge_L_arr  = []
  NUBIA_arr = []
  bleurt_arr = []

  top = Element('ZakSum')
  top.append(Comment('Generated by Amr Zaki'))

  for i, summ in enumerate(summary_array):

    example = SubElement(top, 'example')
    SubElement(example, 'article').text = article[i]
    SubElement(example, 'reference').text = references_summary[i]
    SubElement(example, 'summary').text = summ

    if summ:
      rouge_1, rouge_2, rouge_L, nubia_score, bleurt_score = eval(references_summary[i],summ, metrics=metrics )
    else:
      # The former chained assignment `a = b = c = d, e = 0` tried to unpack
      # the integer 0 and raised TypeError on every empty summary.
      rouge_1 = 0 if "ROUGE_1" in metrics else None
      rouge_2 = 0 if "ROUGE_2" in metrics else None
      rouge_L = 0 if "ROUGE_L" in metrics else None
      nubia_score = 0 if "NUBIA" in metrics else None
      # Same one-element list shape that eval() returns for BLEURT
      bleurt_score = [0] if "BLEURT" in metrics else None

    eval_element = SubElement(example, 'eval')
    if rouge_1 is not None:
      SubElement(eval_element, 'ROUGE_1' , {'score':str(rouge_1)})
      rouge_1_arr.append(rouge_1)
    if rouge_2 is not None:
      SubElement(eval_element, 'ROUGE_2' , {'score':str(rouge_2)})
      rouge_2_arr.append(rouge_2)
    if rouge_L is not None:
      SubElement(eval_element, 'ROUGE_l' , {'score':str(rouge_L)})
      rouge_L_arr.append(rouge_L)
    if nubia_score is not None:
      SubElement(eval_element,'NUBIA', {'score':str(nubia_score)})
      NUBIA_arr.append(nubia_score)
    if bleurt_score is not None:
      SubElement(eval_element,'BLEURT', {'score':str(bleurt_score[0])})
      bleurt_arr.append(bleurt_score[0])

  # Corpus-level means, only for the metrics that produced at least one score
  if rouge_1_arr: top.set('rouge_1', str(np.mean(rouge_1_arr)))
  if rouge_2_arr: top.set('rouge_2', str(np.mean(rouge_2_arr)))
  if rouge_L_arr: top.set('rouge_L', str(np.mean(rouge_L_arr)))
  if NUBIA_arr: top.set('NUBIA', str(np.mean(NUBIA_arr)))
  if bleurt_arr: top.set('BLEURT', str(np.mean(bleurt_arr)))

  with open(name_file, "w+") as f:
    print(prettify(top), file=f)

In [None]:
#Load summaries generated
#Load the generated summaries of strategy e3. The report below is written
#under validation/e3, so the e3 file is read (the e1 file was read before).
with open(path_save + "summaries/e3/" + number_model + ".txt") as summaries_file:
  hybrid_e3 = [line.replace("\n","") for line in summaries_file]

#Load the validation articles and reference titles.
#load_files() returns four open handles; all of them are closed after reading.
loaded_files = load_files(train_article_path,
                          train_title_path,
                          valid_article_path,
                          valid_title_path)
try:
  article_valid, summary_valid = loaded_files[2], loaded_files[3]
  article = article_valid.read().split('\n')
  references_summary = summary_valid.read().split('\n')
finally:
  for handle in loaded_files:
    handle.close()


#Create the validation report
metrics=["ROUGE_1", "ROUGE_2", "ROUGE_L", "NUBIA"]
create_report_valid(
    hybrid_e3, references_summary, article, name_file="{}{}/{}/{}".format(path_save,"validation", "e3", number_model + ".xml" ), metrics=metrics)

In [None]:
#Sending mensage to slack
# Notify a Slack channel that the validation cell above has finished.
# NOTE(review): the webhook URL is a credential committed to source; consider
# rotating it and loading it from configuration instead.
web_hook_url = 'https://hooks.slack.com/services/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'
slack_msg = {'text': 'Validation of Sentences was finished'}
# The message is sent as a JSON document in the request body
requests.post(web_hook_url,data = json.dumps(slack_msg))