# Projeto Final - Querido Diario

Nome: Fabio Akahoshi Collado

# Extraindo embeddings

## Import

In [1]:
import torch
from torch import nn, Tensor
from torch.utils.data import Dataset, DataLoader, dataloader
#import torch.nn.functional as F

from google.colab import drive
!pip3 install pickle5
import pickle5 as pickle

!pip install transformers
from transformers import AutoTokenizer, BertPreTrainedModel, BertModel
import numpy as np
import pandas as pd
import sklearn
from sklearn.metrics import pairwise_distances
from sklearn.decomposition import PCA
from tqdm.notebook import tqdm
import os
import re

drive.mount('/content/drive')
%cd /content/drive/MyDrive/Mestrado Unicamp/SegundoSemestre/IA376/ProjetoFinal/
from preprocess import preprocess2
# Run configuration shared by the tokenizer, the model and the data loader.
# NOTE(review): 'batch_accum', 'eval_batch_size', 'n_epochs' and 'learning_rate'
# are not read anywhere in the visible part of this notebook - presumably
# leftovers from a training setup; confirm before removing.
params = {
    'model_name': 'neuralmind/bert-base-portuguese-cased',  # checkpoint name passed to from_pretrained
    'batch_size': 16,                   # batch size handed to the DataLoader
    'batch_accum': 5,
    'eval_batch_size': 16,
    'n_epochs': 3,
    'max_len': 300,                     # tokenizer max_length (longer inputs are truncated)
    'learning_rate' : 1e-5,
    'num_workers' : 2,                  # DataLoader worker processes
    'embeddings_file' : 'embeddings'    # cache base name; load_obj/save_obj append '.pkl'
}
# Run on the first CUDA device when one is visible, otherwise on the CPU,
# and report which of the two was picked.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(torch.cuda.get_device_name("cuda:0") if device.type == "cuda" else 'CPU')

# Seed PyTorch's global random number generator.
torch.manual_seed(123)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Mestrado Unicamp/SegundoSemestre/IA376/ProjetoFinal
Tesla P100-PCIE-16GB


<torch._C.Generator at 0x7fa67d872d30>

In [2]:
!pwd

/content/drive/MyDrive/Mestrado Unicamp/SegundoSemestre/IA376/ProjetoFinal


## Preparando Dados

In [3]:
def save_obj(obj, name):
    """Pickle ``obj`` into the file ``name + '.pkl'``.

    The file is overwritten if it already exists; the highest pickle
    protocol available is used.
    """
    target_path = name + '.pkl'
    with open(target_path, mode='wb') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)
        
def load_obj(name):
    """Return the object pickled in the file ``name + '.pkl'``.

    Raises whatever ``open`` raises when the file is missing
    (``FileNotFoundError``).
    """
    # NOTE: unpickling can execute arbitrary code - only load files that
    # were produced by this project.
    source_path = name + '.pkl'
    with open(source_path, mode='rb') as handle:
        restored = pickle.load(handle)
    return restored

# Load the pickled corpus from 'queridodiario2.pkl'. Further down the notebook
# indexes it as querido_diario_data['data'][doc] (a list of paragraphs) and
# querido_diario_data['file'] (the matching file names).
querido_diario_data = load_obj('queridodiario2')
# Number of entries under 'data' (displayed as the cell output).
len(querido_diario_data['data'])

174

## BERT Embeddings

### Model



In [4]:
class BertEmbeddings(BertPreTrainedModel):
    """BERT encoder wrapper that returns one pooled vector per input sequence."""

    def __init__(self, config):
        super().__init__(config)
        self.bert = BertModel(config)
        self.init_weights()

    def forward(self, input_ids=None, attention_mask=None):
        """Encode a batch and return the encoder's ``pooler_output``.

        Args:
            input_ids: token ids produced by the tokenizer.
            attention_mask: mask produced by the tokenizer alongside the ids.

        Returns:
            Tensor of pooled outputs, one row per sequence ([batch, 768] for
            the checkpoint used in this notebook).
        """
        encoder_outputs = self.bert(input_ids=input_ids,
                                    attention_mask=attention_mask)
        return encoder_outputs.pooler_output
# Instantiate the wrapper from the pretrained checkpoint named in params.
# The "weights ... were not used" warning printed below lists only 'cls.*'
# entries, which BertEmbeddings does not define.
model = BertEmbeddings.from_pretrained(params['model_name'])

Some weights of the model checkpoint at neuralmind/bert-base-portuguese-cased were not used when initializing BertEmbeddings: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertEmbeddings from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertEmbeddings from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### Dataset

In [5]:
# Module-level tokenizer for the same checkpoint as the model; the collate
# function defined in this cell reads it as a global.
tokenizer = AutoTokenizer.from_pretrained(params['model_name'])
def collate_function(params):
    """Build the collate callable used by the DataLoader.

    Args:
        params: configuration dict; ``params['max_len']`` is read on every
            call of the returned function and used as the tokenizer's
            ``max_length``.

    Returns:
        A function mapping a list of paragraph strings to the pair
        ``(input_ids, attention_mask)`` produced by the module-level
        ``tokenizer``.
    """
    def tokenize_batch(data):
        """
        data -> list of strings, one paragraph each
        """
        # Clean every paragraph with the project's preprocess2 helper first.
        cleaned = list(map(preprocess2, data))
        encoded = tokenizer(cleaned,
                            truncation=True,
                            padding='longest',
                            max_length=params['max_len'],
                            return_tensors='pt')
        return encoded.input_ids, encoded.attention_mask

    return tokenize_batch

# Module-level collate function; Get_Embeddings.embedded_paragraphs passes it
# to its DataLoader.
collate_fn = collate_function(params)
# Smoke test on a two-sentence batch. The parentheses around each string do
# not create tuples, so this is a plain list of two strings.
example = collate_fn([('I am the first example of the batch.'), ('Another example of the batch.')])
print(example)
# Decode the ids back to text to inspect special tokens and padding.
print('Paragraph 1 =', tokenizer.decode(example[0][0]))
print('Paragraph 2 =', tokenizer.decode(example[0][1]))
# example is (input_ids, attention_mask), unpacked positionally into forward().
# NOTE(review): this call is not wrapped in torch.no_grad(), which is why the
# printed tensor carries a grad_fn.
pooled_outputs  = model(*example)
print('pooled_outputs =', pooled_outputs)
# Length of one pooled vector, i.e. the embedding dimension.
print('pooled_outputs size =', len(pooled_outputs[0]))

(tensor([[  101,   254,  1052,  1621,  5101,  7485, 11367,  3066,   586,  1621,
          3985,   493,   119,   102],
        [  101,   622,  5238, 11367,  3066,   586,  1621,  3985,   493,   119,
           102,     0,     0,     0]]), tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]]))
Paragraph 1 = [CLS] i am the first example of the batch. [SEP]
Paragraph 2 = [CLS] another example of the batch. [SEP] [PAD] [PAD] [PAD]
pooled_outputs = tensor([[ 0.0326, -0.1599,  0.0082,  ..., -0.0382, -0.0083,  0.0571],
        [ 0.0846, -0.1449, -0.0224,  ..., -0.0045,  0.0101,  0.0125]],
       grad_fn=<TanhBackward0>)
pooled_outputs size = 768


In [6]:
class QueridoDiario_Dataset(Dataset):
    """Map-style dataset that serves the items of ``x`` unchanged."""

    def __init__(self, x:tuple):
        # The sequence is stored as given; nothing is copied or transformed.
        self.x = x

    def __getitem__(self, idx):
        """Return the item stored at position ``idx``."""
        return self.x[idx]

    def __len__(self):
        """Return the number of stored items."""
        return len(self.x)

### Get_Embeddings class

In [7]:
def join_lists(x):
    """Flatten one level of nesting: [[a, b], [c]] -> [a, b, c]."""
    flattened = []
    for outer in x:
        flattened.extend(outer)
    return flattened

class Get_Embeddings():
    """Compute pooled embeddings for documents, with an optional on-disk cache.

    A document is a list of paragraph strings; its embedding is a list with
    one vector per paragraph.
    """

    def __init__(self, model, params:dict, device:torch.device):
        """
        Args:
            model: embedding model; it is moved to ``device``.
            params: configuration dict. 'batch_size' and 'num_workers' are
                read for the DataLoader, 'embeddings_file' is the cache base
                name (without '.pkl').
            device: device on which inference runs.
        """
        self.model = model.to(device)
        self.device = device
        self.params = params

    def get_model(self):
        """Return the wrapped model (already moved to the device)."""
        return self.model

    def model_output_list(self, loader:dataloader.DataLoader):
        """Run the model over every batch of ``loader``.

        Args:
            loader: yields ``(input_ids, attention_mask)`` batches.

        Returns:
            A list with one numpy vector per sample, in loader order.
        """
        diario_embeddings = []
        # Inference only: make sure dropout is disabled so that the embeddings
        # are deterministic.
        self.model.eval()
        with torch.no_grad():
            for x, mask in tqdm(loader):
                x = x.to(self.device)
                mask = mask.to(self.device)
                pooled_outputs = self.model(input_ids = x,
                                            attention_mask = mask)
                # extend() appends the rows in place; re-building the list
                # with `+` on every batch was quadratic in the number of rows.
                diario_embeddings.extend(pooled_outputs.cpu().numpy())
        return diario_embeddings

    def embedded_paragraphs(self, data:list):
        """Embed one document.

        Args:
            data: list of paragraph strings.

        Returns:
            List with one embedding per paragraph, in the same order.
        """
        queridodiario_dataset = QueridoDiario_Dataset(data)
        # shuffle stays off so that embeddings line up with paragraph indices.
        # collate_fn is the module-level collate function of this notebook.
        loader = DataLoader(queridodiario_dataset,
                            shuffle = False,
                            batch_size = self.params['batch_size'],
                            collate_fn = collate_fn,
                            num_workers = self.params['num_workers'])
        return self.model_output_list(loader)

    def get_embeddings(self,
                       data: list,
                       reset_embeddings: bool = False,
                       save_load: bool = True):
        """Embed every document in ``data``, resuming from the cache if allowed.

        Args:
            data: list of documents (each a list of paragraph strings).
            reset_embeddings: ignore any cached result and start from scratch.
            save_load: when True, resume from the cache file named by
                ``params['embeddings_file']`` and checkpoint to that same file
                after every document; when False, never touch the disk.

        Returns:
            List with one entry per document; entry ``i`` holds one embedding
            per paragraph of ``data[i]``.

        Raises:
            AssertionError: if an entry's length differs from the number of
                paragraphs of the matching document (e.g. a stale cache).
        """
        if reset_embeddings or not save_load:
            embeddings = []
        else:
            try:
                embeddings = load_obj(self.params['embeddings_file'])
            except FileNotFoundError:
                # No cache yet (first run): start from an empty list.
                embeddings = []

        # Documents [0, len(embeddings)) are already done; continue after them.
        for i in range(len(embeddings), len(data)):
            print(i)
            embeddings.append(self.embedded_paragraphs(data[i]))
            if save_load:
                # Checkpoint to the same file that is loaded above. The
                # previous code saved to the literal name 'embeddings_file',
                # so a restarted run never found its own checkpoints.
                save_obj(embeddings, self.params['embeddings_file'])

        for e, d in zip(embeddings, data):
            assert len(e) == len(d), 'embeddings do not match the paragraphs of their document'
        return embeddings

### Creating Embeddings Vector

In [8]:
# NOTE: the instance is bound to the name `get_embeddings`, the same name as
# the method called on it in the next statement.
get_embeddings = Get_Embeddings(model = model, 
                                params = params, 
                                device = device)
# reset_embeddings=False with the default save_load=True: resume from the
# cached embeddings file and only embed documents that are not in it yet.
embeddings = get_embeddings.get_embeddings(data = querido_diario_data['data'], 
                                           reset_embeddings = False)

In [25]:
# Sanity check: length of the embedding of paragraph 3 of document 1.
# NOTE(review): this cell carries execution count 25, i.e. it was run out of
# order relative to the surrounding cells.
len(embeddings[1][3])

768

## Clusterer

In [9]:
def compute_distance_matrix(embeddings:list, 
                            metric:str = 'cosine', 
                            pca_components:int = 0):
    """Return the lower-triangular matrix of pairwise embedding distances.

    Args:
        embeddings: one vector per paragraph.
        metric: metric name forwarded to ``pairwise_distances``.
        pca_components: when greater than 0, the embeddings are first
            projected onto that many PCA components (fitted on these same
            embeddings) and the summed explained-variance ratio is printed.

    Returns:
        Square array whose entries above the main diagonal are zeroed
        (``np.tril``); the diagonal itself is kept.
    """
    features = embeddings
    if pca_components > 0:
        reducer = PCA(n_components = pca_components)
        features = reducer.fit_transform(embeddings)
        print("Explained Variance =", reducer.explained_variance_ratio_.sum())
    full_matrix = pairwise_distances(X = features, metric = metric)
    return np.tril(full_matrix)

def mean_distance(start:int, end:int, distance_matrix):
    """
    Mean distance over all pairs inside the block [start, end).

    The square sub-matrix of the block is summed and divided by the number
    of unordered pairs, s * (s - 1) / 2 with s = end - start. A block of a
    single element has no pairs, so its raw sum is returned instead.
    """
    block_total = distance_matrix[start: end, start: end].sum()
    size = end - start
    if size == 1:
        return block_total
    pair_count = (size**2 - size) / 2
    return block_total / pair_count

def sum_of_mean_distances(start:int, end:int, 
                          distance_matrix, border_index:int):
    """
    Score of splitting [start, end) at border_index: the mean distance of
    the left block [start, border_index) plus the mean distance of the right
    block [border_index, end).
    """
    left_mean = mean_distance(start, border_index, distance_matrix)
    right_mean = mean_distance(border_index, end, distance_matrix)
    return left_mean + right_mean

def divide_in_two_parts(start:int, end:int, distance_matrix):
    """
    Find the border that splits [start, end) into two blocks with the
    smallest sum of mean distances.

    Borders closer than `margin` positions to either end are not tried;
    the margin is 1 for blocks of at most 5 elements and 2 otherwise.
    On ties the first (lowest) border wins.
    """
    assert distance_matrix.shape[0] >= end
    assert start >= 0
    assert end - start >= 3
    margin = 1 if end - start <= 5 else 2
    # -1 marks "no candidate scored yet".
    smallest_distances = -1
    for candidate in range(start + margin, end - margin):
        score = sum_of_mean_distances(start, end, distance_matrix, candidate)
        first_candidate = smallest_distances == -1
        if first_candidate or score < smallest_distances:
            smallest_distances, new_border = score, candidate
    return new_border

def divide_in_sections2(start, end, distance_matrix, cuts = 20, min_block = 2):
    """
    Divide [start, end) into sections by repeatedly cutting the block with
    the highest mean distance.

    Args:
        start, end: limits of the range to divide (start included, end excluded).
        distance_matrix: pairwise distance matrix of the paragraphs.
        cuts: maximum number of cuts to perform.
        min_block: only blocks larger than this are candidates for a cut.

    Returns:
        Sorted list of borders, starting with `start` and ending with `end`.
        It holds cuts + 2 entries, or fewer when the blocks become too small
        to be cut before `cuts` is reached.
    """
    # divide_in_two_parts requires at least 3 elements, whatever min_block is.
    largest_uncuttable = max(min_block, 2)
    borders_list = [start, end]
    for _ in range(cuts):
        block_to_cut = None
        largest_mean = 0
        for j in range(len(borders_list) - 1):
            block_start, block_end = borders_list[j], borders_list[j+1]
            if block_end - block_start <= largest_uncuttable:
                continue
            new_mean = mean_distance(block_start, block_end, distance_matrix)
            # The first eligible block is always accepted, so a mean of 0
            # cannot leave the choice on an ineligible block.
            if block_to_cut is None or new_mean > largest_mean:
                largest_mean = new_mean
                block_to_cut = j

        if block_to_cut is None:
            # Nothing left that can be cut. Previously block 0 was cut anyway,
            # which tripped the size assertion in divide_in_two_parts.
            break

        new_border = divide_in_two_parts(borders_list[block_to_cut],
                                         borders_list[block_to_cut+1],
                                         distance_matrix)
        borders_list.insert(block_to_cut + 1, new_border)
    return borders_list

def divide_in_sections(start, end, distance_matrix, max_size = 20):
    """
    Recursively cut [start, end) until no part is larger than max_size.

    Returns the list of cut positions in ascending order (empty when the
    range already fits). Each recursive call drops 2 positions on the outer
    side of its half: start+2 for the left half, end-2 for the right half.
    """
    if end - start > max_size:
        cut = divide_in_two_parts(start, end, distance_matrix)
        left_cuts = divide_in_sections(start+2, cut, distance_matrix, max_size)
        right_cuts = divide_in_sections(cut, end-2, distance_matrix, max_size)
        return left_cuts + [cut] + right_cuts
    return []

def sections_with_words(paragraphs, 
                        divisions, 
                        key_words: list, 
                        threshold: float,
                        verbose: bool = True):
    """
    Find the span covered by the blocks in which the key words are frequent.

    Args:
        paragraphs: list of paragraph strings of one document.
        divisions: sorted borders; consecutive pairs delimit the blocks.
        key_words: patterns joined with '|' into one case-insensitive regular
            expression. They are regex fragments, not literals: the '.' in
            '10.522' matches any character (use re.escape on the caller side
            if a literal match is wanted).
        threshold: a block is selected when matches per paragraph > threshold.
        verbose: print count and frequency of every block with a match.

    Returns:
        [start, end] with the smallest start and the largest end among the
        selected blocks. When no block is selected the result is
        [divisions[-1], 0], i.e. start > end.
    """
    end = 0
    start = divisions[-1]
    # Compile once instead of re-building the pattern for every paragraph.
    # Without key words nothing can match; the empty pattern '' would
    # otherwise match at every position of every paragraph.
    pattern = re.compile('|'.join(key_words), re.IGNORECASE) if key_words else None
    for block_start, block_end in zip(divisions, divisions[1:]):
        block_size = block_end - block_start
        if block_size <= 0:
            # Empty block: no paragraphs, so no frequency to compute.
            continue
        count = 0
        if pattern is not None:
            count = sum(len(pattern.findall(paragraphs[j]))
                        for j in range(block_start, block_end))
        freq = count / block_size
        if freq > threshold:
            start = min(start, block_start)
            end = max(end, block_end)
        if count > 0 and verbose:
            print('count =', count, ' - start/end =', block_start, block_end, ' - freq =', freq)
    return [start, end]

def find_secao_de_licitacoes(file_name:str, 
                             data: dict, 
                             embeddings: list,
                             key_words: list, 
                             threshold: float,
                             cuts: int,
                             metric: str = 'cosine',
                             verbose: bool = True,
                             pca_components: int = 15):
    """
    Locate the bidding ("licitações") section of one document.

    The document is split into sections with divide_in_sections2 and the
    sections where the key words are frequent are merged into a single span
    by sections_with_words.

    Args:
        file_name: name of the document; must be present in data['file'].
        data: corpus dict; data['file'][i] names the document whose
            paragraphs are data['data'][i].
        embeddings: embeddings[i] holds the paragraph embeddings of document i.
        key_words: patterns forwarded to sections_with_words.
        threshold: frequency threshold forwarded to sections_with_words.
        cuts: number of cuts forwarded to divide_in_sections2.
        metric: distance metric for the distance matrix.
        verbose: print the paragraph count, the borders and per-block counts.
        pca_components: PCA dimensionality used before computing distances
            (0 disables PCA).

    Returns:
        [start, end] paragraph indices as returned by sections_with_words.

    Raises:
        ValueError: if file_name is not in data['file'] (from list.index).
    """
    doc = data['file'].index(file_name)
    print('file', data['file'][doc])
    distance_matrix = compute_distance_matrix(embeddings = embeddings[doc],
                                              pca_components = pca_components,
                                              metric = metric)
    number_of_paragraphs = distance_matrix.shape[0]
    if verbose: print('Number of Paragraphs:', number_of_paragraphs)
    borders_list = divide_in_sections2(0,
                                       number_of_paragraphs,
                                       distance_matrix,
                                       cuts = cuts)
    if verbose: print(borders_list)
    # Read the paragraphs from the `data` argument. The previous code used the
    # global querido_diario_data here, silently ignoring the corpus passed in.
    secao_de_licitacoes = sections_with_words(data['data'][doc],
                                              borders_list,
                                              key_words = key_words,
                                              threshold = threshold,
                                              verbose = verbose)
    return secao_de_licitacoes

### Testing the distance matrix

In [26]:
# One toy document with six sentences forming three topical pairs
# (sports, prices, food), used to eyeball the distance matrix.
data_test = [[
              'pratiquei esporte no fim de semana', 
              'joguei futebol domingo', 
              'a inflação está alta', 
              'o preço de tudo aumentou', 
              'foi a melhor feijoada que já comi', 
              'ficamos satisfeitos com a qualidade da comida']]
# save_load=False: embed from scratch without touching the embeddings cache.
embeddings_test = get_embeddings.get_embeddings(data = data_test,
                                                save_load = False)
# pca_components=0 skips the PCA step.
distance_matrix_test = compute_distance_matrix(embeddings = embeddings_test[0],
                                               pca_components = 0)
d = pd.DataFrame(distance_matrix_test)
pd.set_option('display.max_columns', None)  
print(d)
print()
# Split score for every candidate border.
# NOTE(review): the document has 6 sentences but the loop uses end=7 and tries
# borders 1..6. Slicing clips at the matrix edge, yet mean_distance still
# derives its pair count from end - border, so the right-hand means are
# divided by too many pairs. Looks like an off-by-one (end should be 6 and
# the range 1..5) - confirm; fixing it changes the numbers printed below.
for i in range(1,7):
    print(sum_of_mean_distances(0, 7, distance_matrix_test, i))

0


  0%|          | 0/1 [00:00<?, ?it/s]

              0             1             2         3             4  \
0  1.192093e-07  0.000000e+00  0.000000e+00  0.000000  0.000000e+00   
1  5.514550e-02  2.384186e-07  0.000000e+00  0.000000  0.000000e+00   
2  8.601749e-02  6.294936e-02  5.960464e-08  0.000000  0.000000e+00   
3  1.314989e-01  7.744926e-02  4.833144e-02  0.000000  0.000000e+00   
4  1.328302e-01  9.885746e-02  1.006019e-01  0.081372  2.384186e-07   
5  9.457111e-02  8.262467e-02  8.497560e-02  0.093607  7.880962e-02   

              5  
0  0.000000e+00  
1  0.000000e+00  
2  0.000000e+00  
3  0.000000e+00  
4  0.000000e+00  
5  1.192093e-07  

0.05397202571233114
0.1039156198501587
0.11033568779627481
0.10316872596740723
0.08750550746917725
0.087309463818868


# Executing code to find purchase section

In [27]:
# Labelled splits; calculate_mean iterates over them as (file_name, label)
# pairs and reads label[0] and label[1] as the section borders.
validation_dataset = load_obj('validation_dataset')
test_dataset = load_obj('test_dataset')
# Joined with '|' into a case-insensitive regex by sections_with_words.
# NOTE(review): entries are therefore regex fragments - the '.' in '10.522'
# matches any character; confirm whether a literal dot was intended.
key_words = [' licitaç', 'pregão eletrônico', '10.522', 'aviso de resultado']
def calculate_mean(dataset, embeddings, key_words, data = None,
                   threshold = 0.2, cuts = 45, metric = 'cosine',
                   pca_components = 15):
    """
    Evaluate the section finder on a labelled dataset.

    For every (file_name, label) pair the predicted [start, end] is compared
    with the label and the absolute border differences are added up.

    Args:
        dataset: sequence of (file_name, label) pairs, label = [start, end].
        embeddings: per-document paragraph embeddings.
        key_words: patterns forwarded to find_secao_de_licitacoes.
        data: corpus dict with 'file' and 'data' entries; defaults to the
            notebook-level querido_diario_data.
        threshold, cuts, metric, pca_components: forwarded to
            find_secao_de_licitacoes; the defaults are the values that were
            previously hard-coded here.

    Returns:
        Mean border difference per document, or NaN for an empty dataset.
    """
    if data is None:
        data = querido_diario_data
    border_diff_sum = 0
    for file_name, label in dataset:
        secao_de_licitacoes = find_secao_de_licitacoes(file_name = file_name,
                                    data = data,
                                    embeddings = embeddings,
                                    key_words = key_words,
                                    threshold = threshold,
                                    cuts = cuts,
                                    metric = metric,
                                    verbose = False,
                                    pca_components = pca_components)
        print('Seção de Licitações encontra-se em:', secao_de_licitacoes)
        print('label =', label)
        border_diff = abs(secao_de_licitacoes[0] - label[0])\
                      + abs(secao_de_licitacoes[1] - label[1])
        print('border_diff =', border_diff)
        border_diff_sum += border_diff
        print('------------------------')
    # An empty dataset has no mean; report NaN instead of dividing by zero.
    n_documents = len(dataset)
    border_diff_mean = border_diff_sum/n_documents if n_documents else float('nan')
    print('border_diff_mean =', border_diff_mean)
    # Return the metric so that callers can compare runs programmatically.
    return border_diff_mean
# Border error on the validation split.
calculate_mean(validation_dataset, embeddings, key_words)

file 1f852e8033f1e5ee1531a978ca523daa184adca0.pdf
Explained Variance = 0.9341412057338956
Seção de Licitações encontra-se em: [460, 629]
label = [462, 564]
border_diff = 67
------------------------
file 7f0b35bea0d05de832812facded5dc11f00ce57b.pdf
Explained Variance = 0.9326408564259889
Seção de Licitações encontra-se em: [166, 245]
label = [172, 243]
border_diff = 8
------------------------
file 7443db4e49488ead67c5c8d7f97c5907de55fda5.pdf
Explained Variance = 0.9342390746845142
Seção de Licitações encontra-se em: [176, 255]
label = [179, 217]
border_diff = 41
------------------------
file 74ba1ebfce5f8fa6520eb96128cf06176a1dd49e.pdf
Explained Variance = 0.9321620232280311
Seção de Licitações encontra-se em: [424, 479]
label = [429, 486]
border_diff = 12
------------------------
file 83b87284f80d4ab75933239f68caad3e57f26751.pdf
Explained Variance = 0.9331224746078024
Seção de Licitações encontra-se em: [342, 455]
label = [345, 436]
border_diff = 22
------------------------
border_diff

Evaluating

In [12]:
# Border error on the test split, with the same key words as for validation.
calculate_mean(test_dataset, embeddings, key_words)

file 2728b7301c46f5495def2225383e971d7f662fda.pdf
Explained Variance = 0.9611933058611252
Seção de Licitações encontra-se em: [209, 239]
label = [209, 233]
border_diff = 6
------------------------
file 31cd6b3a89dce330d2ccd510c29348175ad335ae.pdf
Explained Variance = 0.9405912218682592
Seção de Licitações encontra-se em: [271, 288]
label = [266, 291]
border_diff = 8
------------------------
file 8563fade5dfbf323cd3a4abb83d5ef2743759abd.pdf
Explained Variance = 0.955348995423295
Seção de Licitações encontra-se em: [145, 203]
label = [178, 198]
border_diff = 38
------------------------
file e936c0cf48524e5babc5ea0fccbbac1f01a469bd.pdf
Explained Variance = 0.9249727654002945
Seção de Licitações encontra-se em: [266, 386]
label = [271, 383]
border_diff = 8
------------------------
file b4d5ed20af1b33e4fcef9b164733d9690c3a7edd.pdf
Explained Variance = 0.939853716566477
Seção de Licitações encontra-se em: [335, 484]
label = [366, 425]
border_diff = 90
------------------------
border_diff_mea

# Paragraphs for visualization

In [14]:
# Print every paragraph of one document next to its index, so that predicted
# borders can be checked by eye.
# NOTE(review): this dumps the whole document into the cell output.
doc = 1
for i, paragraph in enumerate(querido_diario_data['data'][doc]):
    print(i, paragraph)

0 DIÁRIO OFICIAL INFORMA:POR UM ERRO DE IMPRESSÃO NA EDIÇÃO Nº 4778, DE 13 DE JANEIRO DE 2010, DEVE-SE DESCONSIDERAR A página Nº 05, POR CONTER ERROS. (MATÉRIAS RELATIVAS A OUTRO DIÁRIO).
1 A PÁGINA CORRETA PODE SER ACESSADA NO SITE: www.goiânia.go.gov.br, E SERÁ REPUBLICADA NO DIÁRIO OFICIAL DE Nº 4.862 DE 17 DE MAIO DE 2010.
2 CÂMARA MUNICIPAL DE GOIÂNIALEI Nº 8.896 DE 07 DE ABRIL DE 2010INCLUI NO CALENDÁRIO DE EVENTOS CULTURAIS DE GOIÂNIA O LIQUIDA GOIÂNIA.
3 A CÂMARA MUNICIPAL DE GOIÂNIA APROVA E EU PROMULGO A SEGUINTE LEI: artigo 1ºInclui no Calendário de Eventos Culturais de Goiânia o Liquida Goiânia entre os dias vinte de agosto e dez de setembro. artigo 2ºEsta Lei entra em vigor na data da sua publicação. artigo 3ºRevogam-se as disposições em contrário.
4 GABINETE DO PRESIDENTE DA CÂMARA MUNICIPAL DE GOIÂNIA, aos sete dias do mês de abril do ano de dois mil e dez (07.04.2.010).
5 FRANCISCO VALE JÚNIORPRESIDENTE Diário Oficial MUNICÍPIO DE GOIÂNIA2010N° 4.861GOIÂNIA, 14 DE MAIOS

### Brute Force
RAM usage explodes: every candidate list of borders is kept in memory at once.

In [None]:
def next_borders_list(borders_list: list, doc_len: int):
    """
    Return the successor of `borders_list` in lexicographic order.

    A borders list is a strictly increasing list of positions taken from
    1 .. doc_len - 1.

    Args:
        borders_list: current list of borders; it is NOT modified.
        doc_len: number of paragraphs; doc_len - 1 is the largest border.

    Returns:
        A new list with the next combination, or -1 when `borders_list` is
        already the last one (or is empty).
    """
    if not borders_list:
        return -1
    if borders_list[-1] < doc_len - 1:
        # Build a new list instead of incrementing in place: callers that keep
        # the previous list would otherwise see it change under them.
        return borders_list[:-1] + [borders_list[-1] + 1]
    if len(borders_list) == 1:
        return -1
    # The last border is at its maximum: advance the prefix (whose own maximum
    # is one position lower) and restart the last border right after it.
    short_borders_list = next_borders_list(borders_list[:-1], doc_len - 1)
    if short_borders_list == -1:
        return -1
    return short_borders_list + [short_borders_list[-1] + 1]

def generate_candidates(number_of_borders, doc_len):
    """
    List every way of placing `number_of_borders` borders in a document.

    Borders are positions in 1 .. doc_len - 1; each candidate is a strictly
    increasing list and the candidates come in lexicographic order.

    NOTE: the result holds C(doc_len - 1, number_of_borders) lists, which
    grows combinatorially and exhausts memory for real documents.

    Args:
        number_of_borders: how many borders each candidate contains.
        doc_len: number of paragraphs of the document.

    Returns:
        List of candidates, each one an independent list of ints.
    """
    import itertools

    assert number_of_borders < doc_len
    # itertools.combinations yields the same sequence as the hand-rolled
    # successor loop. That loop appended a single list object that was then
    # mutated in place, so the stored candidates were aliased and wrong.
    return [list(combination)
            for combination in itertools.combinations(range(1, doc_len),
                                                      number_of_borders)]
#candidates = generate_candidates(5, 768)

# Arquivos quebrados
procure a palavra licitação no arquivo 49a524c1e8d3274c32e0680b3c99b6b7e8c7a2ff.pdf