In [31]:
import numpy as np
import pandas as pd

from collections import Counter

import warnings
warnings.filterwarnings('ignore')

# 0. Inicializando Ambiente Django no Notebook

In [32]:
import os
import sys

# Put the parent directory at the front of the import path; the notebook's
# project-local imports are resolved from there (presumably the repository
# root — confirm against the folder layout).
project_root = os.path.abspath(os.pardir)
sys.path.insert(0, project_root)

from django_for_jupyter import init_django

# Initialise Django through the project helper before any model is imported.
# The argument is presumably the Django project/settings package name.
init_django('app')

# 1. Lendo do Banco de Dados e Salvando em um DataFrame Pandas

In [33]:
from api.models import TextosJuridicosTreinamento

# Columns pulled from the training table: the four fields consumed by the
# pipelines further down plus the target label ('setor_destino').
colunas = ['teor_texto', 'orgao_julgador', 'classe_processo', 'assuntos', 'setor_destino']

qs = TextosJuridicosTreinamento.objects.all()
data = qs.values(*colunas)

# One row per record, one column per requested field.
df = pd.DataFrame(data)
df.head(3)

Unnamed: 0,teor_texto,orgao_julgador,classe_processo,assuntos,setor_destino
0,PODER JUDICIÁRIO ESTADO DO RIO GRANDE DO NORTE...,2ª VIJ da Com. Natal,1706,12485;12494,Saúde
1,PODER JUDICIÁRIO DO ESTADO DO RIO GRANDE DO NO...,3º JFP da Com. Natal,14695,10715,Administrativa
2,PODER JUDICIÁRIO DO ESTADO DO RIO GRANDE DO NO...,5ª VEFT de Natal,156,9518,Fiscal


In [34]:
# Number of rows per target class ('setor_destino'); used to inspect class imbalance.
Counter(df['setor_destino'])

Counter({'Fiscal': 6686,
         'Administrativa': 5403,
         'Contabilidade': 1338,
         'Judicial': 1137,
         'Saúde': 443,
         'Meio Ambiente': 155,
         'Patrimonial': 121})

# 2. Separando dados de treinamento/teste

## 2.1 - Separando os dados de forma estratificada

In [35]:
from sklearn.model_selection import train_test_split

# Target labels; also passed as the stratification key so that both splits
# keep the class proportions of the full dataset.
y = df.loc[:, 'setor_destino'].to_numpy()

# The whole frame (target column included) is split as X; later steps select
# the feature columns they need by name.
X_train, X_test, y_train, y_test = train_test_split(
    df,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)
print(f'Shape dos dados de treinamento: {X_train.shape}')
print(f'Shape dos dados de teste: {X_test.shape}')

Shape dos dados de treinamento: (10698, 5)
Shape dos dados de teste: (4585, 5)


## 2.2 - *Upsampling* e *Downsampling* dos dados de treinamento

In [36]:
def balance_df(
    df: pd.DataFrame,
    balance_on: str = 'setor_destino',
    n_samples: int = 500,
    random_state=None,
) -> pd.DataFrame:
    """Return a shuffled frame with exactly ``n_samples`` rows per class.

    Classes with at least ``n_samples`` rows are downsampled without
    replacement; smaller classes are upsampled with replacement (so their
    rows are duplicated).

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; it is not modified.
    balance_on : str
        Name of the column holding the class label.
    n_samples : int
        Number of rows to draw for every distinct value of ``balance_on``.
    random_state : int, numpy.random.RandomState or None
        Seed (or generator) controlling both the per-class sampling and the
        final shuffle. ``None`` keeps the previous, non-deterministic
        behaviour.

    Returns
    -------
    pd.DataFrame
        ``n_samples * n_classes`` rows in random order, keeping the original
        index labels (duplicated for upsampled rows). An empty input yields
        an empty frame.
    """
    # A single generator is shared by every draw: each call advances it, so the
    # classes get different samples while the whole procedure stays
    # reproducible for a given seed.
    if isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    partes = []
    for classe in df[balance_on].unique():
        # Filter once per class instead of re-evaluating the mask for the
        # length check and again for the sampling.
        subset = df[df[balance_on] == classe]
        precisa_repor = len(subset) < n_samples
        partes.append(subset.sample(n=n_samples, replace=precisa_repor, random_state=rng))

    if not partes:
        # No classes at all (empty input): pd.concat([]) would raise.
        return df.iloc[0:0].copy()

    df_balanceado = pd.concat(partes)

    # Shuffle so that rows of the same class are not contiguous.
    return df_balanceado.sample(frac=1, replace=False, random_state=rng)

# Only the training split is rebalanced; the test split is left as produced by
# the stratified split. The labels are re-derived from the balanced frame so
# that features and targets stay aligned.
# NOTE(review): this cell overwrites X_train, so running it a second time
# resamples the already balanced frame.
X_train = balance_df(df=X_train, n_samples=500, balance_on='setor_destino')
y_train = X_train.loc[:, 'setor_destino']

In [37]:
# Sanity check of the balanced training set against the untouched test set.
print(f'Shape dos dados de treinamento: {X_train.shape} | {y_train.shape}')

# The column name uses double quotes inside the replacement field: reusing the
# outer quote character inside an f-string expression is only valid from
# Python 3.12 (PEP 701) onwards and is a SyntaxError on earlier versions.
print(f'\n\nDISTRIBUIÇÃO DOS DADOS DE TREINAMENTO\n{X_train["setor_destino"].value_counts()}')
print(f'\n\nDISTRIBUIÇÃO DOS DADOS DE TESTE\n{X_test["setor_destino"].value_counts()}')

Shape dos dados de treinamento: (3500, 5) | (3500,)


DISTRIBUIÇÃO DOS DADOS DE TREINAMENTO
setor_destino
Judicial          500
Administrativa    500
Saúde             500
Meio Ambiente     500
Patrimonial       500
Contabilidade     500
Fiscal            500
Name: count, dtype: int64


DISTRIBUIÇÃO DOS DADOS DE TESTE
setor_destino
Fiscal            2006
Administrativa    1621
Contabilidade      401
Judicial           341
Saúde              133
Meio Ambiente       47
Patrimonial         36
Name: count, dtype: int64


# 3. Definindo os Transformadores

In [38]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer

from api.utils.transformers import AssuntosPipeline, CategoricalPipeline


# NOTE(review): everything below is a single triple-quoted string expression,
# not executable class definitions — evaluating the cell only echoes the
# string's repr as output. The AssuntosPipeline and CategoricalPipeline used by
# the pipelines come from the `api.utils.transformers` import above; this text
# is presumably an earlier inline copy of those transformers — confirm and
# remove it if it is redundant.
'''# --- Transformadores personalizados ---

class StringToListTransformer(BaseEstimator, TransformerMixin):
    """Converte string em lista (usado para assuntos e campos categóricos com um valor só)"""
    def __init__(self, sep=';'):
        self.sep = sep

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X.apply(lambda val: val.split(self.sep) if isinstance(val, str) else []).values.reshape(-1, 1)

class MultiLabelBinarizerWrapper(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.mlb = MultiLabelBinarizer()

    def fit(self, X, y=None):
        X_flat = [x[0] for x in X]
        return self.mlb.fit(X_flat)

    def transform(self, X):
        X_flat = [x[0] for x in X]
        return self.mlb.transform(X_flat)

class CategoricalPipeline(BaseEstimator, TransformerMixin):
    """Pipeline para orgao_julgador e classe_processo com MLB"""
    def fit(self, X, y=None):
        self.to_list = StringToListTransformer(sep=';')
        X_list = self.to_list.transform(X)
        self.binarizer = MultiLabelBinarizerWrapper()
        self.binarizer.fit(X_list)
        return self

    def transform(self, X):
        X_list = self.to_list.transform(X)
        return self.binarizer.transform(X_list)

class AssuntosPipeline(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.splitter = StringToListTransformer(sep=';')
        X_split = self.splitter.transform(X)
        self.binarizer = MultiLabelBinarizerWrapper()
        self.binarizer.fit(X_split)
        return self

    def transform(self, X):
        X_split = self.splitter.transform(X)
        return self.binarizer.transform(X_split)'''

'# --- Transformadores personalizados ---\n\nclass StringToListTransformer(BaseEstimator, TransformerMixin):\n    """Converte string em lista (usado para assuntos e campos categóricos com um valor só)"""\n    def __init__(self, sep=\';\'):\n        self.sep = sep\n\n    def fit(self, X, y=None):\n        return self\n\n    def transform(self, X):\n        return X.apply(lambda val: val.split(self.sep) if isinstance(val, str) else []).values.reshape(-1, 1)\n\nclass MultiLabelBinarizerWrapper(BaseEstimator, TransformerMixin):\n    def __init__(self):\n        self.mlb = MultiLabelBinarizer()\n\n    def fit(self, X, y=None):\n        X_flat = [x[0] for x in X]\n        return self.mlb.fit(X_flat)\n\n    def transform(self, X):\n        X_flat = [x[0] for x in X]\n        return self.mlb.transform(X_flat)\n\nclass CategoricalPipeline(BaseEstimator, TransformerMixin):\n    """Pipeline para orgao_julgador e classe_processo com MLB"""\n    def fit(self, X, y=None):\n        self.to_list = Str

# 4. Pipeline Final

In [39]:
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer

# --- Individual pipelines, one per base classifier ---
#
# Each pipeline receives the full DataFrame and selects its own columns through
# a ColumnTransformer. Columns that are not listed (including the target
# 'setor_destino', which is still present in the frame) are dropped, because
# ColumnTransformer defaults to remainder='drop'.

# Seed shared by the stochastic estimators so that refitting on the same data
# reproduces the same model (same value as the train/test split).
RANDOM_STATE = 42

# Both text-based pipelines use the same TF-IDF configuration; keeping it in
# one place prevents the two copies from drifting apart.
TFIDF_PARAMS = dict(min_df=0.001, max_df=0.5, max_features=3500)

# 1. LinearSVC: teor_texto + assuntos
svc_preprocessor = ColumnTransformer(transformers=[
    ('texto', TfidfVectorizer(**TFIDF_PARAMS), 'teor_texto'),
    ('assuntos', AssuntosPipeline(), 'assuntos')
])
svc_pipeline = Pipeline([
    ('preprocessor', svc_preprocessor),
    ('classifier', LinearSVC(random_state=RANDOM_STATE))
])

# 2. RandomForest: teor_texto + assuntos + classe_processo + orgao_julgador
rf_preprocessor = ColumnTransformer(transformers=[
    ('texto', TfidfVectorizer(**TFIDF_PARAMS), 'teor_texto'),
    ('assuntos', AssuntosPipeline(), 'assuntos'),
    ('classe', CategoricalPipeline(), 'classe_processo'),
    ('orgao', CategoricalPipeline(), 'orgao_julgador')
])
rf_pipeline = Pipeline([
    ('preprocessor', rf_preprocessor),
    ('classifier', RandomForestClassifier(random_state=RANDOM_STATE))
])

# 3. LogisticRegression: assuntos + classe_processo + orgao_julgador (no text)
lr_preprocessor = ColumnTransformer(transformers=[
    ('assuntos', AssuntosPipeline(), 'assuntos'),
    ('classe', CategoricalPipeline(), 'classe_processo'),
    ('orgao', CategoricalPipeline(), 'orgao_julgador')
])
lr_pipeline = Pipeline([
    ('preprocessor', lr_preprocessor),
    ('classifier', LogisticRegression(max_iter=1000))
])

# --- StackingClassifier combining the three pipelines ---
# The final LogisticRegression is trained on cross-validated predictions of the
# base pipelines (scikit-learn's default internal CV).
# NOTE(review): the training frame was upsampled with replacement, so duplicated
# rows can land in different internal folds — consider resampling inside the
# folds if the meta-learner looks over-confident.
stacking_clf = StackingClassifier(
    estimators=[
        ('svc', svc_pipeline),
        ('rf', rf_pipeline),
        ('lr', lr_pipeline)
    ],
    final_estimator=LogisticRegression(max_iter=1000)
)

In [40]:
# Fit the base pipelines and the final estimator on the training frame and its labels.
stacking_clf.fit(X_train, y_train)

In [41]:
# Mean accuracy on the held-out test split, which was not rebalanced and so
# keeps the original class proportions.
stacking_clf.score(X_test, y_test)

0.931515812431843

In [42]:
from sklearn.metrics import confusion_matrix, classification_report

# Rows are true classes and columns are predicted classes, both in sorted
# label order.
y_pred = stacking_clf.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[1475,   20,   18,   99,    7,    0,    2],
       [   5,  390,    4,    2,    0,    0,    0],
       [  23,    1, 1968,   10,    2,    1,    1],
       [  34,    2,    3,  279,    8,   11,    4],
       [   0,    1,    4,   21,   21,    0,    0],
       [   2,    1,    0,   15,    1,   17,    0],
       [   3,    0,    0,    9,    0,    0,  121]])

In [43]:
# Per-class precision / recall / F1 on the test split.
relatorio = classification_report(y_test, stacking_clf.predict(X_test))
print(relatorio)

                precision    recall  f1-score   support

Administrativa       0.96      0.91      0.93      1621
 Contabilidade       0.94      0.97      0.96       401
        Fiscal       0.99      0.98      0.98      2006
      Judicial       0.64      0.82      0.72       341
 Meio Ambiente       0.54      0.45      0.49        47
   Patrimonial       0.59      0.47      0.52        36
         Saúde       0.95      0.91      0.93       133

      accuracy                           0.93      4585
     macro avg       0.80      0.79      0.79      4585
  weighted avg       0.94      0.93      0.93      4585



# 5. Salvando o modelo

In [44]:
import os
import pickle
import joblib

# Resolve the output directory relative to the parent directory (the same '..'
# root that was added to sys.path at the top of the notebook) instead of a
# user-specific absolute path, so the notebook also runs on other machines.
# NOTE(review): assumes the notebook is executed from its own folder, one level
# below the project root that contains the `api` package — confirm.
models_path = os.path.abspath(os.path.join(os.pardir, 'api', 'models_clf'))
os.makedirs(models_path, exist_ok=True)

model_name = 'stacking_clf_V2.pkl'

# joblib serialises with pickle: loading the file requires the custom
# transformer classes to be importable, and it must only be loaded from a
# trusted source (unpickling can execute arbitrary code).
joblib.dump(stacking_clf, os.path.join(models_path, model_name))

['/home/esdras-daniel/Documentos/Python/Django/PGM-Text_Classificator/api/models_clf/stacking_clf_V2.pkl']