In [2]:
import numpy as np
import pandas as pd

from collections import Counter

import warnings
warnings.filterwarnings('ignore')

# 0. Inicializando Ambiente Django no Notebook

In [3]:
import os, sys
sys.path.insert(0, os.path.abspath('..'))

from django_for_jupyter import init_django
init_django('app')

# 1. Lendo do Banco de Dados e Salvando em um DataFrame Pandas

In [4]:
from api.models import TextosJuridicosTreinamento

qs = TextosJuridicosTreinamento.objects.all()
data = qs.values('teor_texto', 'orgao_julgador', 'classe_processo', 'assuntos', 'setor_destino')

df = pd.DataFrame(data)
df.head(3)

Unnamed: 0,teor_texto,orgao_julgador,classe_processo,assuntos,setor_destino
0,PODER JUDICIÁRIO ESTADO DO RIO GRANDE DO NORTE...,2ª VIJ da Com. Natal,1706,12485;12494,Saúde
1,PODER JUDICIÁRIO DO ESTADO DO RIO GRANDE DO NO...,3º JFP da Com. Natal,14695,10715,Administrativa
2,PODER JUDICIÁRIO DO ESTADO DO RIO GRANDE DO NO...,5ª VEFT de Natal,156,9518,Fiscal


In [5]:
Counter(df['setor_destino'])

Counter({'Fiscal': 6686,
         'Administrativa': 5403,
         'Contabilidade': 1338,
         'Judicial': 1137,
         'Saúde': 443,
         'Meio Ambiente': 155,
         'Patrimonial': 121})

# 2. Separando dados de treinamento/teste

## 2.1 - Separando os dados de forma estratificada

In [5]:
from sklearn.model_selection import train_test_split

y = df['setor_destino'].to_numpy()

X_train_unbalanced, X_test, y_train_unbalacend, y_test = train_test_split(df, y, test_size=0.3, stratify=y, random_state=42)
print(f'Shape dos dados de treinamento: {X_train_unbalanced.shape}')
print(f'Shape dos dados de teste: {X_test.shape}')

Shape dos dados de treinamento: (10698, 5)
Shape dos dados de teste: (4585, 5)


## 2.2 - *Upsampling* e *Downsampling* dos dados de treinamento

In [6]:
def balance_df(df: pd.DataFrame, balance_on:str = 'setor_destino', n_samples:int = 500):
    unique_classes = df[balance_on].unique()
    
    dfs_list = []

    for classe in unique_classes:
        if len(df[df[balance_on] == classe]) >= n_samples:
            dfs_list.append(df[df[balance_on] == classe].sample(n=n_samples, replace=False))
        else:
            dfs_list.append(df[df[balance_on] == classe].sample(n=n_samples, replace=True))
    
    df_balanceado = pd.concat(dfs_list)

    return df_balanceado.sample(frac=1, replace=False)

X_train_balanced = balance_df(X_train_unbalanced, balance_on='setor_destino', n_samples=500)
y_train_balanced = X_train_balanced['setor_destino']

In [7]:
print(f'\n\nDISTRIBUIÇÃO DOS DADOS DE TREINAMENTO BALANCEADOS\n{X_train_balanced['setor_destino'].value_counts()}')
print(f'\n\nDISTRIBUIÇÃO DOS DADOS DE TREINAMENTO DESBALANCEADOS\n{X_train_unbalanced['setor_destino'].value_counts()}')
print(f'\n\nDISTRIBUIÇÃO DOS DADOS DE TESTE\n{X_test['setor_destino'].value_counts()}')



DISTRIBUIÇÃO DOS DADOS DE TREINAMENTO BALANCEADOS
setor_destino
Contabilidade     500
Patrimonial       500
Saúde             500
Administrativa    500
Fiscal            500
Judicial          500
Meio Ambiente     500
Name: count, dtype: int64


DISTRIBUIÇÃO DOS DADOS DE TREINAMENTO DESBALANCEADOS
setor_destino
Fiscal            4680
Administrativa    3782
Contabilidade      937
Judicial           796
Saúde              310
Meio Ambiente      108
Patrimonial         85
Name: count, dtype: int64


DISTRIBUIÇÃO DOS DADOS DE TESTE
setor_destino
Fiscal            2006
Administrativa    1621
Contabilidade      401
Judicial           341
Saúde              133
Meio Ambiente       47
Patrimonial         36
Name: count, dtype: int64


# 3. Definindo os Transformadores

In [16]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer

from api.utils.transformers import AssuntosPipeline, CategoricalPipeline


'''# --- Transformadores personalizados ---

class StringToListTransformer(BaseEstimator, TransformerMixin):
    """Converte string em lista (usado para assuntos e campos categóricos com um valor só)"""
    def __init__(self, sep=';'):
        self.sep = sep

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X.apply(lambda val: val.split(self.sep) if isinstance(val, str) else []).values.reshape(-1, 1)

class MultiLabelBinarizerWrapper(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.mlb = MultiLabelBinarizer()

    def fit(self, X, y=None):
        X_flat = [x[0] for x in X]
        return self.mlb.fit(X_flat)

    def transform(self, X):
        X_flat = [x[0] for x in X]
        return self.mlb.transform(X_flat)

class CategoricalPipeline(BaseEstimator, TransformerMixin):
    """Pipeline para orgao_julgador e classe_processo com MLB"""
    def fit(self, X, y=None):
        self.to_list = StringToListTransformer(sep=';')
        X_list = self.to_list.transform(X)
        self.binarizer = MultiLabelBinarizerWrapper()
        self.binarizer.fit(X_list)
        return self

    def transform(self, X):
        X_list = self.to_list.transform(X)
        return self.binarizer.transform(X_list)

class AssuntosPipeline(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.splitter = StringToListTransformer(sep=';')
        X_split = self.splitter.transform(X)
        self.binarizer = MultiLabelBinarizerWrapper()
        self.binarizer.fit(X_split)
        return self

    def transform(self, X):
        X_split = self.splitter.transform(X)
        return self.binarizer.transform(X_split)'''

'# --- Transformadores personalizados ---\n\nclass StringToListTransformer(BaseEstimator, TransformerMixin):\n    """Converte string em lista (usado para assuntos e campos categóricos com um valor só)"""\n    def __init__(self, sep=\';\'):\n        self.sep = sep\n\n    def fit(self, X, y=None):\n        return self\n\n    def transform(self, X):\n        return X.apply(lambda val: val.split(self.sep) if isinstance(val, str) else []).values.reshape(-1, 1)\n\nclass MultiLabelBinarizerWrapper(BaseEstimator, TransformerMixin):\n    def __init__(self):\n        self.mlb = MultiLabelBinarizer()\n\n    def fit(self, X, y=None):\n        X_flat = [x[0] for x in X]\n        return self.mlb.fit(X_flat)\n\n    def transform(self, X):\n        X_flat = [x[0] for x in X]\n        return self.mlb.transform(X_flat)\n\nclass CategoricalPipeline(BaseEstimator, TransformerMixin):\n    """Pipeline para orgao_julgador e classe_processo com MLB"""\n    def fit(self, X, y=None):\n        self.to_list = Str

# 4. Pipeline Final

In [17]:
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer

# --- Pipelines individuais para os classificadores ---

# 1. LinearSVC: teor_texto + assuntos
svc_preprocessor = ColumnTransformer(transformers=[
    ('texto', TfidfVectorizer(min_df=0.001, max_df=0.5, max_features=3500), 'teor_texto'),
    ('assuntos', AssuntosPipeline(), 'assuntos')
])
svc_pipeline = Pipeline([
    ('preprocessor', svc_preprocessor),
    ('classifier', LinearSVC())
])

# 2. RandomForest: teor_texto + assuntos + classe_processo + orgao_julgador
rf_preprocessor = ColumnTransformer(transformers=[
    ('texto', TfidfVectorizer(min_df=0.001, max_df=0.5, max_features=3500), 'teor_texto'),
    ('assuntos', AssuntosPipeline(), 'assuntos'),
    ('classe', CategoricalPipeline(), 'classe_processo'),
    ('orgao', CategoricalPipeline(), 'orgao_julgador')
])
rf_pipeline = Pipeline([
    ('preprocessor', rf_preprocessor),
    ('classifier', RandomForestClassifier())
])

# 3. LogisticRegression: assuntos + classe_processo + orgao_julgador
lr_preprocessor = ColumnTransformer(transformers=[
    ('assuntos', AssuntosPipeline(), 'assuntos'),
    ('classe', CategoricalPipeline(), 'classe_processo'),
    ('orgao', CategoricalPipeline(), 'orgao_julgador')
])
lr_pipeline = Pipeline([
    ('preprocessor', lr_preprocessor),
    ('classifier', LogisticRegression(max_iter=1000))
])

# --- StackingClassifier com todos os pipelines ---
stacking_clf = StackingClassifier(
    estimators=[
        ('svc', svc_pipeline),
        ('rf', rf_pipeline),
        ('lr', lr_pipeline)
    ],
    final_estimator=LogisticRegression(max_iter=1000)
)

In [20]:
stacking_clf.fit(X_train_balanced, y_train_balanced)

In [21]:
stacking_clf.score(X_test, y_test)

0.9391494002181026

In [22]:
from sklearn.metrics import confusion_matrix, classification_report

confusion_matrix(y_test, stacking_clf.predict(X_test))

array([[1521,   21,   13,   62,    2,    1,    1],
       [   6,  392,    2,    1,    0,    0,    0],
       [  28,    5, 1950,   20,    2,    1,    0],
       [  38,    1,    1,  284,    6,    6,    5],
       [   3,    1,    3,   17,   21,    2,    0],
       [   3,    1,    0,   15,    0,   17,    0],
       [   2,    0,    0,   10,    0,    0,  121]])

In [23]:
print(classification_report(y_test, stacking_clf.predict(X_test)))

                precision    recall  f1-score   support

Administrativa       0.95      0.94      0.94      1621
 Contabilidade       0.93      0.98      0.95       401
        Fiscal       0.99      0.97      0.98      2006
      Judicial       0.69      0.83      0.76       341
 Meio Ambiente       0.68      0.45      0.54        47
   Patrimonial       0.63      0.47      0.54        36
         Saúde       0.95      0.91      0.93       133

      accuracy                           0.94      4585
     macro avg       0.83      0.79      0.81      4585
  weighted avg       0.94      0.94      0.94      4585



# 5. Salvando o modelo

In [24]:
import os
import pickle
import joblib

models_path = '/home/esdras-daniel/Documentos/Python/Django/PGM-Text_Classificator/api/models_clf'

model_name = 'stacking_clf_V2.pkl'
joblib.dump(stacking_clf, os.path.join(models_path, model_name))

['/home/esdras-daniel/Documentos/Python/Django/PGM-Text_Classificator/api/models_clf/stacking_clf_V2.pkl']

# 6. Usando BERT para classificação de textos

In [6]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.functional import softmax
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

MAX_LEN = 512
STRIDE = 64
POOLING = 'mean' # ou 'max'
BATCH_SIZE = 16
EPOCHS = 500
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
BERT_MODEL = 'rufimelo/Legal-BERTimbau-base'

tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL)
model = AutoModel.from_pretrained(BERT_MODEL).to(DEVICE)

Some weights of BertModel were not initialized from the model checkpoint at rufimelo/Legal-BERTimbau-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## 6.1 - Extraindo as embeddings

In [None]:
        

# EXTRAIR TODOS OS EMBEDDINGS
all_embeddings = []
for text in tqdm(df['teor_texto'], desc="Extraindo Embeddings"):
    embedding = get_embedding(text, pooling=POOLING)
    all_embeddings.append(embedding)


Extraindo Embeddings: 100%|██████████| 15283/15283 [5:43:02<00:00,  1.35s/it]  


In [14]:
#save_dir = '/home/esdras-daniel/Documentos/Python/Django/PGM-Text_Classificator/notebooks/embeddings'

# CONVERTER PARA TENSOR
#embeddings_tensor = torch.stack(all_embeddings)
#torch.save(embeddings_tensor, os.path.join(save_dir, f'embeddings_{POOLING}.pt'))

# SALVAR RÓTULOS ENCODADOS
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
labels_encoded = le.fit_transform(df['setor_destino'])
#torch.save(torch.tensor(labels_encoded), os.path.join(save_dir, f'labels.pt'))

In [52]:
le.classes_

array(['Administrativa', 'Contabilidade', 'Fiscal', 'Judicial',
       'Meio Ambiente', 'Patrimonial', 'Saúde'], dtype=object)

## 6.2 - Criando o DataSet e Classificador

In [8]:
class EmbeddingsDataset(Dataset):
    def __init__(self, embedding, labels):
        self.embeddings = embedding
        self.labels = labels
    
    def __len__(self):
        return len(self.embeddings)
    
    def __getitem__(self, idx):
        return self.embeddings[idx], self.labels[idx]

#  --------- CLASSIFICADOR ---------

class BERTClassifier(nn.Module):
    def __init__(self, input_dim=768, num_classes=7):
        super().__init__()
        self.linear = nn.Linear(input_dim, num_classes)
    
    def forward(self, x):
        return self.linear(x)

## 6.3 - Separação entre Treino e Teste

In [9]:
load_dir = '/home/esdras-daniel/Documentos/Python/Django/PGM-Text_Classificator/notebooks/embeddings'
embeddings_file_name = 'embeddings_mean.pt'

X = torch.load(os.path.join(load_dir, embeddings_file_name))
y = torch.load(os.path.join(load_dir, 'labels.pt'))

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, stratify=y, random_state=42)

# Calculando os pesos das classes (Usando apenas o conjunto de treinamento)
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train.numpy())
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float).to(DEVICE)

In [11]:
train_dataset = EmbeddingsDataset(X_train, y_train)
test_dataset = EmbeddingsDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

## 6.4 - Treinamento

### 6.4.1 - Treinamento Desbalanceado (Usando pesos)

In [12]:
from sklearn.metrics import accuracy_score

def train_model(model, criterion, optimizer, train_loader, test_loader):
    for epoch in range(1, EPOCHS + 1):
        model.train()
        total_loss_train = 0
        all_preds_train = []
        all_labels_train = []

        for inputs, targets in train_loader:
            inputs, targets = inputs.to(DEVICE), targets.to(DEVICE)
            outputs = model(inputs)
            loss = criterion(outputs, targets)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss_train += loss.item()
            preds = torch.argmax(softmax(outputs, dim=1), dim=1)
            all_preds_train.extend(preds.cpu().numpy())
            all_labels_train.extend(targets.cpu().numpy())

        if epoch % 25 == 0:
            acc_train = accuracy_score(all_labels_train, all_preds_train)

            model.eval()
            total_loss_test = 0
            all_preds_test = []
            all_labels_test = []

            with torch.no_grad():
                for inputs, targets in test_loader:
                    inputs, targets = inputs.to(DEVICE), targets.to(DEVICE)
                    outputs = model(inputs)
                    loss = criterion(outputs, targets)
                    preds = torch.argmax(softmax(outputs, dim=1), dim=1)

                    total_loss_test += loss.item()
                    all_preds_test.extend(preds.cpu().numpy())
                    all_labels_test.extend(targets.cpu().numpy())

            acc_test = accuracy_score(all_labels_test, all_preds_test)

            print(f"Epoch {epoch} | "
                f"Train Loss: {total_loss_train:.4f} | Train Acc: {acc_train:.4f} | "
                f"Test Loss: {total_loss_test:.4f} | Test Acc: {acc_test:.4f}")


model_unbalanced = BERTClassifier()
criterion = nn.CrossEntropyLoss(weight=class_weights_tensor)
optimizer = torch.optim.Adam(model_unbalanced.parameters(), lr=2e-5)

train_model(model_unbalanced, criterion, optimizer, train_loader, test_loader)

Epoch 25 | Train Loss: 894.2429 | Train Acc: 0.7113 | Test Loss: 383.4937 | Test Acc: 0.7165
Epoch 50 | Train Loss: 795.7108 | Train Acc: 0.7693 | Test Loss: 343.2343 | Test Acc: 0.7690
Epoch 75 | Train Loss: 739.8344 | Train Acc: 0.7852 | Test Loss: 320.3176 | Test Acc: 0.7869
Epoch 100 | Train Loss: 702.7317 | Train Acc: 0.7949 | Test Loss: 305.1813 | Test Acc: 0.7935
Epoch 125 | Train Loss: 675.8239 | Train Acc: 0.7985 | Test Loss: 294.3215 | Test Acc: 0.7976
Epoch 150 | Train Loss: 655.0837 | Train Acc: 0.8006 | Test Loss: 286.0688 | Test Acc: 0.8007
Epoch 175 | Train Loss: 638.3706 | Train Acc: 0.8012 | Test Loss: 279.5248 | Test Acc: 0.8024
Epoch 200 | Train Loss: 624.4485 | Train Acc: 0.8027 | Test Loss: 274.1663 | Test Acc: 0.8028
Epoch 225 | Train Loss: 612.5566 | Train Acc: 0.8035 | Test Loss: 269.6694 | Test Acc: 0.8035
Epoch 250 | Train Loss: 602.2002 | Train Acc: 0.8041 | Test Loss: 265.8229 | Test Acc: 0.8055
Epoch 275 | Train Loss: 593.0424 | Train Acc: 0.8052 | Test Los

In [15]:
from sklearn.metrics import confusion_matrix, classification_report

# Avaliação final do modelo
model_unbalanced.eval()
all_preds_final = []
all_labels_final = []

with torch.no_grad():
    for inputs, targets in test_loader:
        inputs, targets = inputs.to(DEVICE), targets.to(DEVICE)
        outputs = model_unbalanced(inputs)
        preds = torch.argmax(softmax(outputs, dim=1), dim=1)
        all_preds_final.extend(preds.cpu().numpy())
        all_labels_final.extend(targets.cpu().numpy())

# Matriz de confusão
cm = confusion_matrix(all_labels_final, all_preds_final)
print('Matriz de confusão\n', cm)

# Classification report
report = classification_report(all_labels_final, all_preds_final, target_names=le.classes_)
print('\nClassification Report\n', report)

Matriz de confusão
 [[1192   27   68  208   21   37   68]
 [   6  383   11    0    0    0    1]
 [ 105    0 1844   16   14    9   18]
 [  90    2   15  198    3    8   25]
 [  10    1    7   12   12    2    3]
 [   6    2    4    6    2   13    3]
 [  14    0    7    7    1    1  103]]

Classification Report
                 precision    recall  f1-score   support

Administrativa       0.84      0.74      0.78      1621
 Contabilidade       0.92      0.96      0.94       401
        Fiscal       0.94      0.92      0.93      2006
      Judicial       0.44      0.58      0.50       341
 Meio Ambiente       0.23      0.26      0.24        47
   Patrimonial       0.19      0.36      0.25        36
         Saúde       0.47      0.77      0.58       133

      accuracy                           0.82      4585
     macro avg       0.57      0.65      0.60      4585
  weighted avg       0.84      0.82      0.82      4585



In [16]:
model_unbalanced.eval()
with torch.no_grad():
    inputs, targets = next(iter(test_loader))
    print(targets)
    out = model_unbalanced(inputs)
    print(out)

tensor([0, 3, 0, 0, 3, 0, 0, 0, 2, 1, 0, 0, 2, 2, 0, 2])
tensor([[-1.3257e+00, -3.8422e+00, -8.5472e-02, -1.1962e+00, -1.5299e+00,
         -1.7927e+00,  1.0070e-01],
        [-7.6147e-01, -4.4241e+00,  4.4673e-02, -9.3418e-01, -1.7202e+00,
         -1.4550e+00, -5.8641e-01],
        [ 1.0196e+00, -3.1369e+00, -3.0099e+00,  6.8396e-01, -7.5353e-01,
         -1.1246e+00, -1.8783e+00],
        [ 2.6138e-01, -3.5752e+00, -2.2318e+00, -3.3441e-01, -7.5324e-01,
         -1.4542e+00, -1.6968e+00],
        [ 1.2223e+00, -2.8791e+00, -1.9233e+00,  1.3024e+00, -1.0842e+00,
         -1.8190e+00, -2.4832e+00],
        [-1.2187e+00, -5.2623e+00, -8.8672e-01, -1.9242e+00, -1.0336e+00,
         -8.5677e-01, -2.5916e-01],
        [ 2.6634e-01, -3.0978e+00, -8.9702e-01, -5.5766e-02, -1.2550e+00,
         -1.2639e+00, -1.4869e+00],
        [ 1.1649e+00, -7.7129e+00, -2.7089e+00, -5.5475e-01,  3.2430e-01,
         -1.4303e+00, -3.0564e-02],
        [ 3.7152e-03, -2.9457e+00,  6.1277e+00, -1.9253e+00, -9

### 6.4.2 - Treinamento Balanceado (RandomUnderSampler e SMOTE)

In [20]:
Counter(y_train.numpy())

Counter({np.int64(2): 4680,
         np.int64(0): 3782,
         np.int64(1): 937,
         np.int64(3): 796,
         np.int64(6): 310,
         np.int64(4): 108,
         np.int64(5): 85})

In [21]:
dict_under_sampling = {2: 750,
                       0: 750,
                       1: 750,
                       3: 750}

In [22]:
from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler(sampling_strategy=dict_under_sampling, random_state=42)
X_train_res, y_train_res = rus.fit_resample(X_train, y_train)

In [23]:
Counter(y_train_res)

Counter({np.int64(0): 750,
         np.int64(1): 750,
         np.int64(2): 750,
         np.int64(3): 750,
         np.int64(6): 310,
         np.int64(4): 108,
         np.int64(5): 85})

In [24]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train_res, y_train_res)

In [25]:
Counter(y_train_res)

Counter({np.int64(0): 750,
         np.int64(1): 750,
         np.int64(2): 750,
         np.int64(3): 750,
         np.int64(4): 750,
         np.int64(5): 750,
         np.int64(6): 750})

In [26]:
train_dataset = EmbeddingsDataset(X_train_res, y_train_res)
# test_dataset = EmbeddingsDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE)
# test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

In [29]:
EPOCHS = 1000

model_balanced = BERTClassifier()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model_balanced.parameters(), lr=2e-5)

train_model(model_balanced, criterion, optimizer, train_loader, test_loader)

Epoch 25 | Train Loss: 571.9347 | Train Acc: 0.3531 | Test Loss: 513.9794 | Test Acc: 0.4268
Epoch 50 | Train Loss: 524.4705 | Train Acc: 0.4050 | Test Loss: 472.1254 | Test Acc: 0.4768
Epoch 75 | Train Loss: 493.1247 | Train Acc: 0.4486 | Test Loss: 442.2420 | Test Acc: 0.5067
Epoch 100 | Train Loss: 469.7796 | Train Acc: 0.4829 | Test Loss: 419.2922 | Test Acc: 0.5429
Epoch 125 | Train Loss: 451.1320 | Train Acc: 0.5122 | Test Loss: 400.7156 | Test Acc: 0.5721
Epoch 150 | Train Loss: 435.6381 | Train Acc: 0.5375 | Test Loss: 385.2096 | Test Acc: 0.5952
Epoch 175 | Train Loss: 422.4529 | Train Acc: 0.5541 | Test Loss: 372.0256 | Test Acc: 0.6131
Epoch 200 | Train Loss: 411.0531 | Train Acc: 0.5750 | Test Loss: 360.6779 | Test Acc: 0.6266
Epoch 225 | Train Loss: 401.0825 | Train Acc: 0.5886 | Test Loss: 350.8210 | Test Acc: 0.6360
Epoch 250 | Train Loss: 392.2816 | Train Acc: 0.5998 | Test Loss: 342.1934 | Test Acc: 0.6434
Epoch 275 | Train Loss: 384.4511 | Train Acc: 0.6093 | Test Los

In [34]:
# Avaliação final do modelo
model_balanced.eval()
all_preds_final = []
all_labels_final = []

with torch.no_grad():
    for inputs, targets in test_loader:
        inputs, targets = inputs.to(DEVICE), targets.to(DEVICE)
        outputs = model_balanced(inputs)
        preds_prob = softmax(outputs, dim=1)
        preds = torch.argmax(softmax(outputs, dim=1), dim=1)
        all_preds_final.extend(preds.cpu().numpy())
        all_labels_final.extend(targets.cpu().numpy())

# Matriz de confusão
cm = confusion_matrix(all_labels_final, all_preds_final)
print('Matriz de confusão\n', cm)

# Classification report
report = classification_report(all_labels_final, all_preds_final, target_names=le.classes_)
print('\nClassification Report\n', report)

Matriz de confusão
 [[ 675   23   54  258  351  200   60]
 [   1  383   10    1    4    1    1]
 [  17    0 1789   17  104   59   20]
 [  23    2   12  173   44   66   21]
 [   0    1    4   11   23    7    1]
 [   0    1    2    4    5   21    3]
 [   4    2    7    6    8    6  100]]

Classification Report
                 precision    recall  f1-score   support

Administrativa       0.94      0.42      0.58      1621
 Contabilidade       0.93      0.96      0.94       401
        Fiscal       0.95      0.89      0.92      2006
      Judicial       0.37      0.51      0.43       341
 Meio Ambiente       0.04      0.49      0.08        47
   Patrimonial       0.06      0.58      0.11        36
         Saúde       0.49      0.75      0.59       133

      accuracy                           0.69      4585
     macro avg       0.54      0.66      0.52      4585
  weighted avg       0.87      0.69      0.74      4585



## 6.5 - RandomForest Classifier

In [41]:
# Transofrmando os tensores em arrays
X_train, X_test, y_train, y_test = X_train.numpy(), X_test.numpy(), y_train.numpy(), y_test.numpy()

### 6.5.1 - Desbalanceado

In [48]:
from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier(class_weight='balanced')
rf_clf.fit(X_train, y_train)
print(rf_clf.score(X_test, y_test))

0.8669574700109052


In [50]:
confusion_matrix(y_true=y_test, y_pred=rf_clf.predict(X_test))

array([[1486,   26,   77,   32,    0,    0,    0],
       [  13,  376,   12,    0,    0,    0,    0],
       [  93,    2, 1909,    1,    0,    0,    1],
       [ 179,    2,   18,  137,    0,    1,    4],
       [  25,    1,   12,    9,    0,    0,    0],
       [  23,    1,    5,    4,    0,    3,    0],
       [  54,    0,   11,    4,    0,    0,   64]])

In [51]:
print(classification_report(y_test, y_pred=rf_clf.predict(X_test)))

              precision    recall  f1-score   support

           0       0.79      0.92      0.85      1621
           1       0.92      0.94      0.93       401
           2       0.93      0.95      0.94      2006
           3       0.73      0.40      0.52       341
           4       0.00      0.00      0.00        47
           5       0.75      0.08      0.15        36
           6       0.93      0.48      0.63       133

    accuracy                           0.87      4585
   macro avg       0.72      0.54      0.58      4585
weighted avg       0.86      0.87      0.85      4585



In [47]:
10698 / (7 * np.bincount(y_train))

array([ 0.40409458,  1.63104132,  0.32655678,  1.91995693, 14.15079365,
       17.97983193,  4.92995392])

In [44]:
class_weights

array([ 0.40409458,  1.63104132,  0.32655678,  1.91995693, 14.15079365,
       17.97983193,  4.92995392])