# Introduction
This file was developed as part of the project reported in the paper below. If you use our work, please cite our paper.

- Title: UlyssesNER-Br: a Corpus of Brazilian Legislative Documents for Named Entity Recognition
- Authors: Hidelberg O. Albuquerque, Rosimeire Costa, Gabriel Silvestre, Ellen Souza, Nádia F. F. da Silva, Douglas Vitório, Gyovana Moriyama, Lucas Martins, Luiza Soezima, Augusto Nunes, Felipe Siqueira, João P. Tarrega, Joao V. Beinotti, Marcio Dias, Matheus Silva, Miguel Gardini, Vinicius Silva, André C. P. L. F. de Carvalho and Adriano L. I. Oliveira.
- In: International Conference on the Computational Processing of Portuguese ― PROPOR 2022 (March 2022)

# Exploratory Data Analysis - Phases 1 and 2

In [None]:
!pip install sklearn-crfsuite

In [None]:
import operator
import os
import random
import functools
import collections
import numpy as np
import matplotlib.pyplot as plt
import nltk
import sklearn_crfsuite
import pandas as pd
import joblib

from nltk.tag.hmm import HiddenMarkovModelTrainer
from sklearn.model_selection import KFold
from itertools import chain
from nltk.corpus import PlaintextCorpusReader 
from nltk import sent_tokenize, word_tokenize, pos_tag 
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK 'punkt' resource (presumably required by the sent_tokenize /
# word_tokenize helpers imported above — confirm where they are used).
nltk.download('punkt')

# Seed Python's `random` module so that later random.shuffle calls are repeatable.
random.seed(1999)

In [None]:
# DIR is the path to a folder containing all files in .conll format.
DIR = '/caminho/'
# os.listdir returns entries in arbitrary order; sort them so that the seeded
# shuffle of the loaded sentences gives the same result on every machine.
all_files = sorted(os.listdir(DIR))


In [None]:
# convert files in .conll format into nested lists with the following hierarchy:
# Level 1: Sentences
# Level 2: Tuples of type (Token, Tag)

def process_conll_file(location:str)->list:
    """Read one CoNLL-style file and return its sentences as (token, tag) pairs.

    Sentences are separated by a blank line. Each non-blank line is split on
    single spaces; the first field is taken as the token and the last field
    as the tag.

    Args:
        location: Path of the file to read (decoded as UTF-8).

    Returns:
        A list of sentences, each sentence being a list of ``(token, tag)``
        tuples. An empty file yields an empty list.
    """
    with open(location, "r", encoding="utf-8") as f:
        data = f.read()

    sentences = []
    for chunk in data.split("\n\n"):
        pairs = []
        for line in chunk.split("\n"):
            # Skip blank lines instead of emitting ("", "") pairs for them.
            if not line.strip():
                continue
            fields = line.split(" ")
            pairs.append((fields[0], fields[-1]))
        # Keep only chunks that hold at least one pair. This replaces the
        # unconditional pop() of the last chunk, which discarded a real
        # sentence whenever the file did not end with a blank line.
        if pairs:
            sentences.append(pairs)
    return sentences

In [None]:
# Merge the outputs of process_conll_file: instead of the sentences of a single file, the returned list contains the sentences of all files in DIR.
def combine_files(locations:list)->list:
    """Load every file named in *locations* and pool their sentences.

    Args:
        locations: File names, relative to the module-level ``DIR`` folder.

    Returns:
        One list holding the sentences of all files, in the order the files
        are given. Each file contributes whatever ``process_conll_file``
        returns for it.
    """
    extended = []
    for name in locations:
        # os.path.join adds the separator only when DIR lacks one, so DIR
        # works with or without a trailing slash.
        extended.extend(process_conll_file(os.path.join(DIR, name)))
    return extended

In [None]:
#main1
# Load the sentences of every file listed in all_files, then shuffle them in
# place using Python's `random` module.
all_data = combine_files(all_files)
random.shuffle(all_data)
# NOTE(review): len(all_data) counts the items returned by combine_files
# (sentences, per the comment on combine_files), although the label says
# "Documentos" — confirm the intended wording.
f"Número Total de Documentos: {len(all_data)}"

## Estatísticas sobre as sentenças

In [None]:
# Box plot of the sentence lengths (number of (token, tag) pairs per sentence).
tamanhos_sent = [len(sentence) for sentence in all_data]
plt.boxplot(tamanhos_sent, labels=[""])
plt.show()

In [None]:
# Point estimation of the average number of tokens per sentence, with a
# normal-approximation confidence interval: mean ± z * std / sqrt(n).
media_sent = np.mean(tamanhos_sent)
# NOTE(review): np.std defaults to ddof=0 (population standard deviation);
# confirm whether the sample estimate (ddof=1) was intended here.
std_sent = np.std(tamanhos_sent)
# 1.96 is the z-value used for the "alpha = 5%" interval printed below.
z_alpha = 1.96
# Half-width of the interval.
rng = (z_alpha * std_sent) / np.sqrt(len(tamanhos_sent))
print(f"Número Médio de Tokens por sentença: {media_sent}")
print(f"Intervalo de Confiança (alpha = 5%): {(media_sent-rng, media_sent+rng)}")

## Estatísticas sobre os tokens e tags

In [None]:
#removes the hierarchy of sentences, that is, the tokens become independent of each other.
def to_list(data:list)->list:
    """Flatten *data* by one level.

    Args:
        data: A list whose items are themselves iterables (e.g. sentences).

    Returns:
        A new list with the items of every inner iterable, in order.
    """
    flat = []
    for inner in data:
        flat.extend(inner)
    return flat

#returns two lists: One containing all dataset tokens and another containing their respective tags respecting the order.
def split_words_n_tags(data:list)->tuple:
    """Split a list of (token, tag) pairs into two parallel lists.

    Args:
        data: A list of 2-item pairs.

    Returns:
        ``(words, tags)``: the first items and the second items of the pairs,
        each as a list, in the original order. Empty input gives ``([], [])``.
    """
    # zip(*[]) yields nothing, so unpacking it into two names would raise
    # ValueError; handle the empty case explicitly.
    if not data:
        return [], []
    words, tags = map(list, zip(*data))
    return words, tags

In [None]:
# Flatten the sentences into a single list of (token, tag) pairs, then split
# that list into parallel token and tag lists.
all_pairs = to_list(all_data)
all_words, all_tags = split_words_n_tags(all_pairs)
# The cell's value is a tuple of two strings: the total token count and the
# number of distinct tokens.
f"Número de tokens no dataset: {len(all_words)}", f"Tamanho do Vocabulário: {len(set(all_words))}"

In [None]:
# Removal of the "I-" and "B-" prefixes (to improve the model): each prefixed
# tag is reduced to its bare category and "O" is kept unchanged. Any tag that
# is neither "O" nor prefixed is left out.
all_tags_limpo = [
    tag if tag == "O" else tag[2:]
    for tag in all_tags
    if tag == "O" or tag.startswith(("B-", "I-"))
]

f"Número de Categorias ou Tipos: {len(set(all_tags_limpo))}"

In [None]:
# Count how many times each cleaned tag occurs; the bare name on the last
# line is the cell's displayed value.
tag_hist = collections.Counter(all_tags_limpo)
tag_hist 

In [None]:
# Point estimation of the proportion (in %) of each tag.
# The grand total is computed once instead of once per tag.
total_tags = sum(tag_hist.values())
prop = {key: (val / total_tags) * 100 for key, val in tag_hist.items()}
# Order the tags by ascending proportion.
prop = dict(sorted(prop.items(), key=lambda item: item[1]))
for key, val in prop.items():
    # One LaTeX table row per tag, ending in a literal "\%" and "\\".
    # The backslash before "%" is escaped because "\%" is not a valid Python
    # escape sequence; the printed text is unchanged.
    print(f"{key} & {val:.2f}\\%\\\\")

In [None]:
# Horizontal bar chart of the tag counts, with the "O" tag excluded.
# pop with a default keeps this cell re-runnable: after the first run "O" is
# already gone from tag_hist, and a plain pop("O") would raise KeyError.
tag_hist.pop("O", None)
# Rebind tag_hist to a plain dict ordered by ascending count.
tag_hist = dict(sorted(tag_hist.items(), key=lambda item: item[1]))
keys = tag_hist.keys()
vals = tag_hist.values()

fig, ax = plt.subplots()
# One bar per tag, positioned at 0..len(keys)-1 and labelled with the tag name.
ax.barh(range(len(keys)), vals)
ax.set_yticks(range(len(keys)))
ax.set_yticklabels(keys)
plt.show()

In [None]:
# Conservative (worst-case p = 0.5) margin of error for an estimated proportion:
# z * sqrt(p * (1 - p) / n) with p = 0.5 reduces to z * sqrt(1 / (4 * n)).
# Uses z_alpha from the sentence-statistics cell and n = len(all_words).
eps = z_alpha * np.sqrt(1/(4*len(all_words)))
print(f"Erro ao estimar a proporção (abordagem conservativa, alpha = 5%): {(eps * 100):.2f}%")