# Bibliotecas necessárias

In [None]:
# Mount Google Drive at /content/drive so the shared-drive paths used later in the
# notebook are reachable. force_remount=True is passed so the mount is redone even
# when a previous session already mounted it.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

In [None]:
!pip install PyMuPDF
!pip install plac

!pip install unidecode
from unidecode import unidecode

import fitz  # Módulo PyMuPDF
import json
import os
import re
from itertools import chain
import sys
import spacy
from __future__ import unicode_literals, print_function
import plac
import random
from pathlib import Path
from tqdm import tqdm
import pandas as pd

import csv
csv.field_size_limit(sys.maxsize)

!python -m spacy download pt_core_news_sm
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.stem import *
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [None]:
# Add the project's Scripts folder to the import path so that the local
# analise_edit module (used later through analise_edit.extract_data) can be imported.
sys.path.append("/content/drive/Shareddrives/IA 2023 - Projeto 1 Grupo 3/Scripts")
import analise_edit

# Funções necessárias

In [None]:
# Shared spaCy pipeline. Every listed component is disabled at load time and only
# "senter" is switched back on, because this notebook uses NLP solely to iterate
# over sentences (see get_sentences).
NLP = spacy.load(
    "pt_core_news_sm",
    disable=[
        "tok2vec",
        "tagger",
        "parser",
        "attribute_ruler",
        "lemmatizer",
        "ner",
    ],
)
NLP.enable_pipe("senter")

# Normalizes the whitespace of a text
def remove_formatting(text):
    """Flatten line breaks and tabs into spaces and collapse runs of spaces.

    Args:
        text: The string to clean.

    Returns:
        The text with every newline and tab turned into a space and every run of
        two or more spaces reduced to a single space. Leading/trailing single
        spaces are kept; other whitespace characters are left untouched.
    """
    # Map "\n" and "\t" to a plain space in a single pass.
    flattened = text.translate(str.maketrans("\n\t", "  "))
    # Any run of 2+ spaces becomes exactly one space.
    return re.sub(r" {2,}", " ", flattened)

# Lists every file under a directory, optionally keeping only some extensions
def listdir_drive(dir_path: str, formats=('pdf',)) -> list:
    """Recursively list the files under ``dir_path``.

    Args:
        dir_path: Root directory to walk.
        formats: Iterable of extensions without the leading dot (e.g. ``['pdf']``).
            A file is kept when its path ends with ``.<extension>`` for any of
            them (case-sensitive). An empty iterable or ``None`` disables the
            filter and returns every file.

    Returns:
        A list with the full path of each matching file, in ``os.walk`` order.
    """
    # Build the suffix tuple once; str.endswith accepts a tuple of candidates.
    suffixes = tuple(f".{extension}" for extension in (formats or ()))

    file_paths = []
    for root, _directories, files in os.walk(dir_path):
        for file_name in files:
            path = os.path.join(root, file_name)
            # Each path is tested exactly once, as it is discovered, instead of
            # re-filtering the whole accumulated list for every directory visited.
            if not suffixes or path.endswith(suffixes):
                file_paths.append(path)
    return file_paths

# Extracts the text of a PDF
def extract_text(pdf_path: str) -> str:
    """Return the text of every page of a PDF as one whitespace-normalized string.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF (``fitz``).

    Returns:
        The page texts joined with a single space and passed through
        ``remove_formatting``.
    """
    doc = fitz.open(pdf_path)
    try:
        page_texts = [doc[n].get_text() for n in range(doc.page_count)]
    finally:
        # Release the document even when reading a page raises, so a broken PDF
        # does not leave its handle open.
        doc.close()
    return remove_formatting(" ".join(page_texts))

# Returns the publications found in the text of a PDF
def get_publications(text: str):
    """Split ``text`` into publications on the "Código Identificador:<word>" marker.

    Args:
        text: Full text to split.

    Returns:
        A list with the stripped, non-empty pieces that come after each marker.
        Whatever precedes the first marker is discarded, so a text without any
        marker yields an empty list.
    """
    pieces = re.split(r'Código Identificador:\w+', text)
    # pieces[0] is the text before the first marker and is not returned.
    stripped_pieces = (piece.strip() for piece in pieces[1:])
    return [piece for piece in stripped_pieces if piece]

# Extracts the sentences of a text using spaCy
def get_sentences(text: str):
    """Split ``text`` into sentences with the module-level ``NLP`` pipeline.

    Args:
        text: Text to segment.

    Returns:
        A list with the stripped text of each sentence, skipping sentences that
        are empty after stripping.
    """
    sentences = []
    for span in NLP(text).sents:
        cleaned = span.text.strip()
        if cleaned:
            sentences.append(cleaned)
    return sentences

In [None]:
class HiddenPrints:
    """Context manager that discards everything written to ``sys.stdout``.

    On entry ``sys.stdout`` is replaced by a handle to ``os.devnull``; on exit the
    stream that was active at entry is restored. Exceptions raised inside the
    block are not suppressed.
    """

    def __enter__(self):
        self._original_stdout = sys.stdout
        # Keep our own reference to the devnull handle so __exit__ closes exactly
        # this file, whatever sys.stdout points to by then.
        self._devnull = open(os.devnull, 'w')
        sys.stdout = self._devnull
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Restore first so stdout is back in place even if closing fails.
        sys.stdout = self._original_stdout
        self._devnull.close()

In [None]:
# Strips accents and lowercases a word
def formatar(palavra):
    """Return ``palavra`` converted to ``str``, lowercased and passed through unidecode."""
    return unidecode(str(palavra).lower())


# Shared Portuguese stemmer and stopword list used by extract_stem_tokens.
STEMMER = SnowballStemmer("portuguese", ignore_stopwords=True)
STOPWORDS = stopwords.words('portuguese')

def extract_stem_tokens(sentence: str):
    """Tokenize a sentence and return the stems of its meaningful words.

    Each token is lowercased and stripped of accents with ``formatar``. Stopwords
    and tokens that are not purely alphabetic are dropped, and the remaining
    tokens are stemmed with ``STEMMER``.

    Args:
        sentence: Sentence to process.

    Returns:
        A list of stems, in the order the tokens appear in the sentence.
    """
    stems = []
    for raw_token in word_tokenize(sentence):
        lowered = str(raw_token).lower()
        normalized = formatar(raw_token)
        # STOPWORDS holds accented forms (e.g. "não", "também"), so the accented
        # lowercase token must be tested too: once the accents are stripped those
        # words no longer match the list and would slip through the filter.
        if lowered in STOPWORDS or normalized in STOPWORDS:
            continue
        if not normalized.isalpha():
            continue
        stems.append(STEMMER.stem(normalized))
    return stems

# Salvando a base de dados a partir dos PDFs

In [None]:
# Collect the PDFs and pick a reproducible random sample of them.
# The list is sorted before shuffling because os.walk gives no ordering guarantee,
# and a dedicated seeded generator is used so the same PDFs are selected on every
# run without altering the global `random` state.
pdf_list = sorted(listdir_drive("/content/drive/Shareddrives/IA 2023 - Projeto 1 Grupo 3/Dados/PDFs", ['pdf']))
random.Random(42).shuffle(pdf_list)

# One CSV row is written per sentence that yields at least one stem.
with open("/content/drive/Shareddrives/IA 2023 - Projeto 1 Grupo 3/Dados/dataset_full.csv", 'w', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)
    # Only the first 100 PDFs of the shuffled list are processed.
    for pdf in tqdm(pdf_list[:100], desc="Extraindo publicações dos PDFs", position=0, leave=True):
        # Anything the helpers print to stdout while processing this PDF is discarded.
        with HiddenPrints():
            for publication in get_publications(extract_text(pdf)):
                for sentence in get_sentences(publication):
                    tokens = extract_stem_tokens(sentence)
                    # Skip sentences that leave no token after filtering
                    if tokens:
                        data = analise_edit.extract_data(sentence)
                        # Columns: original sentence, space-joined stems, then the
                        # "modalidade", "tipo_publicacao" and "festividade" values
                        csv_writer.writerow([sentence, " ".join(tokens), data["modalidade"], data["tipo_publicacao"], data["festividade"]])

Extraindo publicações dos PDFs: 100%|██████████| 100/100 [19:35<00:00, 11.76s/it]


In [None]:
# Load every row of the generated CSV into memory.
# newline='' lets the csv module handle line endings itself, so quoted fields that
# contain "\r" or "\n" are read back exactly as they were written.
with open("/content/drive/Shareddrives/IA 2023 - Projeto 1 Grupo 3/Dados/dataset_full.csv", 'r', newline='') as csv_file:
    csv_reader = csv.reader(csv_file)
    sentence_data = list(tqdm(csv_reader, desc="Carregando dataset", position=0, leave=True))

Carregando publicações: 498219it [00:03, 135741.22it/s]


In [None]:
# Print the first loaded row as a quick sanity check of the dataset contents.
print(sentence_data[0])

['GABINETE DO PREFEITO DECRETO', 'gabinet prefeit decret', 'N.A.', 'Decreto', 'False']
