In [1]:
import pandas as pd

path = 'data/processed/df_avisos_completos.csv'
df = pd.read_csv(path)

df['label'] = pd.Series([1 if setor_destino == 'Apoio Fiscal' else 0 for setor_destino in df['setor_destino']])

df.head()

Unnamed: 0,id_aviso,data_hora,teor_texto,setor_destino,label
0,18612632.0,2024-06-21 05:00:40,PODER JUDICIÁRIO ESTADO DO RIO GRANDE DO NORTE...,Procuradoria da Saúde,0
1,18629258.0,2024-06-25 23:59:59,PODER JUDICIÁRIO DO ESTADO DO RIO GRANDE DO NO...,Procuradoria Administrativa,0
2,18629422.0,2024-06-18 09:27:16,PODER JUDICIÁRIO DO ESTADO DO RIO GRANDE DO NO...,Cartorio,0
3,18629665.0,2024-07-30 23:59:59,PODER JUDICIÁRIO DO ESTADO DO RIO GRANDE DO NO...,Apoio Fiscal,1
4,18629671.0,2024-06-18 05:01:05,PODER JUDICIÁRIO DO ESTADO DO RIO GRANDE DO NO...,Procuradoria Fiscal,0


# 1. Sample

## Separando em 3 conjuntos de dados (Treinamento/Teste/OOT(Out of Time))

In [17]:
from sklearn.model_selection import train_test_split

# OOT (últimos 10% dos avisos do Dataframe ordenado por data)
df_oot = df.sort_values(by='data_hora').tail(int(df.shape[0] * 0.1)).copy()
df_train_test = df.drop(df_oot.index) # Excluindo os dados do oot para a separação de treino e teste

features = 'teor_texto'
target = 'label'

X, y = df_train_test[features], df_train_test[target]

# Treino e Teste
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print(f'Taxa de variável resposta para o Treinamento: {y_train.mean()} ({len(X_train)} amostras totais)')
print(f'Taxa de variável resposta para o Teste: {y_test.mean()} ({len(X_test)} amostras totais)')
print(f'Taxa de variável resposta para o OOT: {df_oot['label'].mean()} ({len(df_oot)} amostras totais)')

Taxa de variável resposta para o Treinamento: 0.29275706745516344 (36187 amostras totais)
Taxa de variável resposta para o Teste: 0.29280424450093956 (9047 amostras totais)
Taxa de variável resposta para o OOT: 0.22945273631840796 (5025 amostras totais)


# 2. Explore

## Análisando a importância das features pós TF-IDF (Sem pré-processamento)

In [91]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import tree

tfidf_vec = TfidfVectorizer(max_features=3000, lowercase=True, ngram_range=(1,3))
features_train = tfidf_vec.fit_transform(X_train)

decision_tree = tree.DecisionTreeClassifier(min_samples_leaf=20, random_state=42)
decision_tree.fit(features_train, y_train)

In [92]:
features_test = tfidf_vec.transform(X_test)

decision_tree.score(features_test, y_test)

0.9587708632695922

In [100]:
feature_importance = (pd.Series(decision_tree.feature_importances_, index=tfidf_vec.get_feature_names_out())
                        .sort_values(ascending=False)
                        .reset_index())

feature_importance['acum.'] = feature_importance[0].cumsum()
feature_importance[feature_importance['acum.'] < 1]

Unnamed: 0,index,0,acum.
0,de execução fiscal,0.675517,0.675517
1,sentença,0.070234,0.745751
2,exceção,0.041166,0.786917
3,petição,0.030551,0.817468
4,2024 20,0.017934,0.835403
...,...,...,...
193,rio grande,0.000004,0.999989
194,fazer,0.000003,0.999992
195,se,0.000003,0.999995
196,nacional,0.000003,0.999997


In [94]:
eval_df = pd.DataFrame(data={
                        'true':y_test.values,
                        'pred':decision_tree.predict(features_test),
                        'pred_proba':decision_tree.predict_proba(features_test)[:,-1]
                    },
                    index=X_test.index)

In [95]:
from sklearn import metrics

print(f'Acurácia: {metrics.accuracy_score(eval_df['true'], eval_df['pred'])}')
print(f'Precisão: {metrics.precision_score(eval_df['true'], eval_df['pred'])}')
print(f'Recall: {metrics.recall_score(eval_df['true'], eval_df['pred'])}')
print(f'F1-Score: {metrics.f1_score(eval_df['true'], eval_df['pred'])}')
print(f'Especificidade: {eval_df[eval_df['true'] == 0].shape[0] / (eval_df[eval_df['true'] == 0].shape[0]+(eval_df[(eval_df['true'] == 0) & (eval_df['pred'] == 1)].shape[0]))}')

Acurácia: 0.9587708632695922
Precisão: 0.9317147192716236
Recall: 0.9271423178557946
F1-Score: 0.9294228949858089
Especificidade: 0.9726360595925814


# 3. Pipeline de Classificação

In [115]:
import nltk
from unidecode import unidecode
import string
import re

# Baixa as stopwords
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('portuguese')
stopwords.extend(['art', 'i', 'ii', 'iii', 'iv', 'v', 'vi', 'vii'])

def normalize_text(text):
    """
    Normaliza texto com etapas básicas de NLP.
    """

    text = text.lower()       # Minúsculas
    text = unidecode(text)    # Remove acentuação
    text = text.translate(str.maketrans('', '', string.punctuation)) # Remove Pontuação
    text = re.sub(r'\d+', '', text)
    tokens = text.split()

    clean_tokens = [token for token in tokens if token not in stopwords and token.isprintable()]

    return ' '.join(clean_tokens)


[nltk_data] Downloading package stopwords to /home/esdras-
[nltk_data]     daniel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
from sklearn.pipeline import Pipeline

clf_pipe = Pipeline(steps=[
    ('feature-extraction', TfidfVectorizer(preprocessor=normalize_text)),
    ('classificator', tree.DecisionTreeClassifier(min_samples_leaf=20, random_state=42))
])



Score Teste: 0.9606499392063668


In [123]:
import mlflow

mlflow.set_tracking_uri('http://127.0.0.1:5000/')
mlflow.set_experiment(experiment_name='apoio_fiscal_exp')

with mlflow.start_run():
    mlflow.sklearn.autolog()
    clf_pipe.fit(X_train, y_train)



🏃 View run serious-jay-478 at: http://127.0.0.1:5000/#/experiments/605581821824525917/runs/cc24c206397f488981742fc86cecbdac
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/605581821824525917


In [124]:
eval_train_df = pd.DataFrame(data={
                        'true':y_train.values,
                        'pred':clf_pipe.predict(X_train),
                        'pred_proba':clf_pipe.predict_proba(X_train)[:,-1]
                    },
                    index=X_train.index)

eval_test_df = pd.DataFrame(data={
                        'true':y_test.values,
                        'pred':clf_pipe.predict(X_test),
                        'pred_proba':clf_pipe.predict_proba(X_test)[:,-1]
                    },
                    index=X_test.index)



In [125]:
print(f'Acurácia: {metrics.accuracy_score(eval_train_df['true'], eval_train_df['pred'])}')
print(f'Precisão: {metrics.precision_score(eval_train_df['true'], eval_train_df['pred'])}')
print(f'Recall: {metrics.recall_score(eval_train_df['true'], eval_train_df['pred'])}')
print(f'F1-Score: {metrics.f1_score(eval_train_df['true'], eval_train_df['pred'])}')
print(f'Especificidade: {eval_train_df[eval_train_df['true'] == 0].shape[0] / (eval_train_df[eval_train_df['true'] == 0].shape[0]+(eval_train_df[(eval_train_df['true'] == 0) & (eval_train_df['pred'] == 1)].shape[0]))}')

print(f'Acurácia: {metrics.accuracy_score(eval_test_df['true'], eval_test_df['pred'])}')
print(f'Precisão: {metrics.precision_score(eval_test_df['true'], eval_test_df['pred'])}')
print(f'Recall: {metrics.recall_score(eval_test_df['true'], eval_test_df['pred'])}')
print(f'F1-Score: {metrics.f1_score(eval_test_df['true'], eval_test_df['pred'])}')
print(f'Especificidade: {eval_test_df[eval_test_df['true'] == 0].shape[0] / (eval_test_df[eval_test_df['true'] == 0].shape[0]+(eval_test_df[(eval_test_df['true'] == 0) & (eval_test_df['pred'] == 1)].shape[0]))}')

Acurácia: 0.970320833448476
Precisão: 0.9454426352236571
Recall: 0.9536530111383802
F1-Score: 0.94953007518797
Especificidade: 0.9777276894865525
Acurácia: 0.9606499392063668
Precisão: 0.9337873628452517
Recall: 0.931672329180823
F1-Score: 0.9327286470143613
Especificidade: 0.9733759318423855


In [126]:
clf_pipe.score(X_train, y_train)

0.970320833448476

In [129]:
metrics.precision_score(y_train.values, clf_pipe.predict(X_train), average='weighted')



0.9704105865654786