# 0. Imports

In [1]:
import re
import spacy
import nltk
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
#from tensorflow.keras.models import Sequential
#from tensorflow.keras.layers import Dense, Dropout
#from tensorflow.keras.optimizers import Adam
#from tensorflow.keras.utils import to_categorical
from sklearn.feature_extraction.text import TfidfVectorizer
#from gensim.models import Word2Vec, FastText
from transformers import BertTokenizer, BertModel
import torch
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm
2025-02-13 09:50:22.980238: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1739451022.998703   11534 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1739451023.004049   11534 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-13 09:50:23.023627: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## 0.1 - Funções

In [2]:
# Only reach for the network when the stopword corpus is not already present
# in a local nltk_data directory; this keeps the cell working offline and
# avoids a download attempt on every run.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

# Portuguese stopwords as a set for O(1) membership tests during filtering.
stopwords = set(nltk.corpus.stopwords.words('portuguese'))

# spaCy pipeline used for tokenisation and lemmatisation.
nlp = spacy.load("pt_core_news_sm")

# Ordered (pattern, placeholder) pairs used to mask identifiers. The order is
# significant: the CEP pattern has no word boundaries and also matches digit
# runs inside other identifiers (e.g. "98765-432" within a phone number), so
# it is applied last, after the more specific patterns have been replaced.
_PADROES_ANONIMIZACAO = (
    (re.compile(r"\b\d{3}\.\d{3}\.\d{3}-\d{2}\b"), "[CPF]"),
    (re.compile(r"\b\d{2}\.\d{3}\.\d{3}/\d{4}-\d{2}\b"), "[CNPJ]"),
    (re.compile(r"\b\d{2}\)?\s?\d{4,5}-?\d{4}\b"), "[TELEFONE]"),
    (re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"), "[EMAIL]"),
    (re.compile(r'\d{7}-\d{2}\.\d{4}\.\d\.\d{2}\.\d{4}'), "[PROCESSO]"),
    (re.compile(r'\d{5}-?\d{3}'), "[CEP]"),
)


def preprocessar_texto(text):
    """Mask identifiers, remove stopwords and short tokens, and lemmatise.

    Steps:
      1. Replace CPF, CNPJ, phone, e-mail, lawsuit-number and CEP patterns
         with bracketed placeholders.
      2. Run the module-level spaCy pipeline ``nlp`` on the masked text.
      3. Keep the lemma of every token that is purely alphabetic, longer than
         two characters, and not in the module-level ``stopwords`` set.

    Args:
        text: Raw document text. Non-string values (e.g. ``None`` or the
            ``NaN`` pandas uses for missing cells) are treated as empty.

    Returns:
        A single lower-cased string with the kept lemmas joined by spaces;
        ``""`` when ``text`` is not a string.
    """
    # Missing cells reach this function as float NaN / None when it is applied
    # over a DataFrame column; the regex substitutions would raise TypeError.
    if not isinstance(text, str):
        return ""

    for padrao, marcador in _PADROES_ANONIMIZACAO:
        text = padrao.sub(marcador, text)

    # The text is deliberately NOT lower-cased before spaCy sees it; only the
    # final output is lower-cased.
    doc = nlp(text)

    # `stopwords` holds lower-case entries, so the comparison must be done on
    # the lower-cased token; otherwise capitalised or upper-case stopwords
    # ("Para", "COMO", ...) would pass the filter.
    tokens = [
        token.lemma_
        for token in doc
        if token.is_alpha
        and len(token.text) > 2
        and token.text.lower() not in stopwords
    ]

    return " ".join(tokens).lower()

def representacao_texto(metodo: str, df: pd.DataFrame) -> np.ndarray:
    """Turn the ``texto_limpo`` column of ``df`` into a dense feature matrix.

    Args:
        metodo: Representation to build. Supported values:
            ``"TF-IDF"`` - TF-IDF weights with at most 5000 features, keeping
            terms whose document frequency is between 0.5% and 40%.
            ``"BERT"`` - for each document, the final hidden state at the
            first token position of ``neuralmind/bert-base-portuguese-cased``
            (inputs truncated to 512 tokens).
        df: Frame containing a ``texto_limpo`` column of pre-processed text.

    Returns:
        A 2-D NumPy array with one row per document in ``df``.

    Raises:
        ValueError: If ``metodo`` is not one of the supported values.
    """
    if metodo == "TF-IDF":
        vectorizer = TfidfVectorizer(max_features=5000, min_df=0.005, max_df=0.4)
        return vectorizer.fit_transform(df["texto_limpo"]).toarray()

    # Disabled alternatives (the gensim import is commented out at the top of
    # the notebook); kept for reference.
    #elif metodo == "Word2Vec":
    #    model_w2v = Word2Vec(sentences=[text.split() for text in df["texto_limpo"]], vector_size=100, window=5, min_count=1)
    #    X = np.array([np.mean([model_w2v.wv[word] for word in text.split() if word in model_w2v.wv] or [np.zeros(100)], axis=0) for text in df["texto_limpo"]])

    #elif metodo == "FastText":
    #    model_ft = FastText(sentences=[text.split() for text in df["texto_limpo"]], vector_size=100, window=5, min_count=1)
    #    X = np.array([np.mean([model_ft.wv[word] for word in text.split() if word in model_ft.wv] or [np.zeros(100)], axis=0) for text in df["texto_limpo"]])

    if metodo == "BERT":
        tokenizer = BertTokenizer.from_pretrained("neuralmind/bert-base-portuguese-cased")
        model_bert = BertModel.from_pretrained("neuralmind/bert-base-portuguese-cased")

        def embed_bert(text):
            """Return the first-position hidden state for one document as a (1, hidden) array."""
            inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
            # Inference only: skip autograd bookkeeping so no graph is kept
            # alive for each of the (potentially thousands of) documents.
            with torch.no_grad():
                outputs = model_bert(**inputs)
            return outputs.last_hidden_state[:, 0, :].cpu().numpy()

        return np.vstack([embed_bert(text) for text in df["texto_limpo"]])

    # Previously an unknown method fell through to `return X` with X unbound,
    # surfacing as an opaque UnboundLocalError.
    raise ValueError(
        f"Unsupported metodo {metodo!r}; expected 'TF-IDF' or 'BERT'."
    )


[nltk_data] Error loading stopwords: <urlopen error [Errno 104]
[nltk_data]     Connection reset by peer>


# 1. Data Loading

In [3]:
# Read the cleaned PGM dataset from the local data directory, then preview
# the first three rows as the cell's rich output.
df = pd.read_csv(filepath_or_buffer='./data/pgm-dataset-v6-clean.csv')
df.head(n=3)

Unnamed: 0,teorTexto,setorDestino,tipoAviso,orgaoJulgador,assuntos,documentos,anexos,classeProcesso,qtd_sentenca,qtd_acordao,qtd_transito_julgado,novoSetorDestino
0,PODER JUDICIÁRIO DO ESTADO DO RIO GRANDE DO NO...,APOIO FISCAL,IntimaÆo,5ª Vara de Execução Fiscal e Tributária de Natal,5952,Despacho;Petição;Intimação;Diligência;Penhora;...,0243943-10.2007.8.20.0001 Ext DA;Endereço da e...,1116,0,0,0,APOIO FISCAL
1,PODER JUDICIÁRIO DO ESTADO DO RIO GRANDE DO NO...,APOIO FISCAL,IntimaÆo,4ª Vara de Execução Fiscal e Tributária de Natal,10536;5952,Certidão Trânsito em Julgado;Sentença;Petição ...,0865696-23.2018.8.20.5001 Ext DA;Rcda - extrat...,1116,1,0,1,APOIO FISCAL
2,PODER JUDICIÁRIO DO ESTADO DO RIO GRANDE DO NO...,APOIO FISCAL,IntimaÆo,4ª Vara de Execução Fiscal e Tributária de Natal,5951,Decisão;Diligência;Mandado;Despacho;Certidão;D...,0508032-92.2006;0508032-92.2006 - EXT;0508032-...,1116,0,0,0,APOIO FISCAL


## 1.1 - Aplicando pré-processamento

In [4]:
# Register tqdm's `progress_apply` on pandas objects so the per-document
# pre-processing below reports its progress.
tqdm.pandas()

# Store the pre-processed version of every raw document in a new column.
df['texto_limpo'] = df.loc[:, 'teorTexto'].progress_apply(preprocessar_texto)

100%|██████████| 6327/6327 [06:35<00:00, 16.01it/s]


# 2. Seleção da Representação do Texto

In [5]:
# Dense TF-IDF feature matrix built from the pre-processed text column.
X_tfidf = representacao_texto('TF-IDF', df)

In [24]:
# BERT-based feature matrix built from the pre-processed text column.
X_bert = representacao_texto('BERT', df)

ImportError: 
BertModel requires the PyTorch library but it was not found in your environment. Checkout the instructions on the
installation page: https://pytorch.org/get-started/locally/ and follow the ones that match your environment.
Please note that you may need to restart your runtime after installation.
