@inproceedings{souza2020bertimbau,
  author    = {F{\'a}bio Souza and
               Rodrigo Nogueira and
               Roberto Lotufo},
  title     = {{BERT}imbau: pretrained {BERT} models for {B}razilian {P}ortuguese},
  booktitle = {9th Brazilian Conference on Intelligent Systems, {BRACIS}, Rio Grande do Sul, Brazil, October 20-23 (to appear)},
  year      = {2020}
}


In [None]:
!pip install transformers evaluate accelerate
import pandas as pd
import numpy as np
import evaluate
from pathlib import Path
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)

Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=2.0.0->a

In [None]:
import torch
import torch.nn as nn

# Define um modelo simples
class Model(nn.Module):
    """Small fully connected network: Linear -> ReLU -> Linear.

    Args:
        in_features: Size of each input sample (default 10).
        hidden_features: Width of the hidden layer (default 50).
        out_features: Size of each output sample (default 1).

    The defaults reproduce the original fixed 10 -> 50 -> 1 architecture, so
    ``Model()`` behaves exactly as before.
    """

    def __init__(self, in_features=10, hidden_features=50, out_features=1):
        super().__init__()
        # Kept under the attribute name ``layer`` so parameter names
        # (layer.0.weight, layer.2.bias, ...) stay the same.
        self.layer = nn.Sequential(
            nn.Linear(in_features, hidden_features),
            nn.ReLU(),
            nn.Linear(hidden_features, out_features),
        )

    def forward(self, x):
        """Run ``x`` through the stack; its last dimension must be ``in_features``."""
        return self.layer(x)

# Instantiate the model, pick the compute device (GPU when CUDA is available,
# CPU otherwise) and move the model's parameters onto it.
model = Model()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Usando dispositivo: {device}")
model = model.to(device)

# Smoke test: one random batch of 32 samples with 10 features each.
# The inputs must live on the same device as the model.
dados = torch.randn(32, 10).to(device)
saida = model(dados)
print(saida.shape)  # expected: torch.Size([32, 1])

Usando dispositivo: cuda
torch.Size([32, 1])


# **1. Carregar Dataset Processado**

In [None]:
from torch.utils.data import Dataset as TorchDataset
# Load the already-processed dataset.
file_path = '/content/drive/MyDrive/Pesquisa 2025/dataset/denuncias_balanceadas.xlsx'
df = pd.read_excel(file_path)

# Stratified 70/15/15 split in two stages: first hold out 30% of the rows,
# then cut that hold-out in half to obtain validation and test sets.
train_df, temp_df = train_test_split(
    df,
    test_size=0.3,
    stratify=df['classe'],
    random_state=42,
)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    stratify=temp_df['classe'],
    random_state=42,
)

print(f"Treino: {len(train_df)}, Validação: {len(val_df)}, Teste: {len(test_df)}")

Treino: 117, Validação: 25, Teste: 26


# **2. Tokenização com BERTimbau**

In [None]:
model_path = "neuralmind/bert-base-portuguese-cased"
tokenizer = AutoTokenizer.from_pretrained(model_path)


def tokenize_function(batch):
    """Tokenize the ``"texto"`` entry of ``batch`` with the module-level tokenizer.

    Sequences are truncated to 512 tokens and padded to that same fixed
    length; the encodings are returned as PyTorch tensors.
    """
    tokenizer_options = dict(
        truncation=True,
        max_length=512,
        padding='max_length',
        return_tensors='pt',
    )
    return tokenizer(batch["texto"], **tokenizer_options)


# Tokenize each split straight from its DataFrame (converted to a
# column -> list mapping), without going through datasets.Dataset.
train_encodings, val_encodings, test_encodings = (
    tokenize_function(frame.to_dict('list'))
    for frame in (train_df, val_df, test_df)
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/43.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/647 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

# **3. Configurar o Modelo com congelamento ajustado**

In [None]:
# Class-id <-> class-name lookup tables (label2id is the inverse of id2label).
id2label = {0: "invasao_domicilio", 1: "violencia_fisica"}
label2id = {label: idx for idx, label in id2label.items()}

# Load the checkpoint with a 2-way sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

# Freeze every parameter whose name mentions neither the classifier nor the
# pooler, so only those two parts keep receiving gradient updates.
for name, param in model.named_parameters():
    if not ('classifier' in name or 'pooler' in name):
        param.requires_grad = False

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# **4. MÉTRICAS AVANÇADAS (Acurácia + AUC-ROC + F1 por classe)**

In [None]:
# Load the three metric implementations used by compute_metrics, in order:
# accuracy, ROC-AUC and F1.
accuracy, auc_score, f1_metric = (
    evaluate.load(metric_name) for metric_name in ("accuracy", "roc_auc", "f1")
)

def compute_metrics(eval_pred):
    """Compute accuracy, ROC-AUC and per-class F1 from logits and labels.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` is treated
            as a 2-D array of logits (one row per example, one column per
            class); column 1 is used as the positive-class score for the AUC.

    Returns:
        dict with the keys ``accuracy``, ``auc``, ``f1_invasao`` and
        ``f1_violencia``, each rounded to 4 decimals. If anything raises while
        computing them, the error is printed and all four values are 0.0.
    """
    try:
        predictions, labels = eval_pred

        # Numerically stable softmax: subtracting the row-wise maximum leaves
        # the result mathematically unchanged but keeps np.exp from
        # overflowing to inf (and the ratio from becoming NaN) on large logits.
        logits = np.asarray(predictions, dtype=np.float64)
        shifted = logits - logits.max(axis=-1, keepdims=True)
        exp_shifted = np.exp(shifted)
        probabilities = exp_shifted / exp_shifted.sum(axis=-1, keepdims=True)
        positive_class_probs = probabilities[:, 1]

        # Basic metrics
        preds = np.argmax(predictions, axis=1)
        acc = accuracy.compute(predictions=preds, references=labels)["accuracy"]
        auc = auc_score.compute(prediction_scores=positive_class_probs, references=labels)["roc_auc"]

        # Per-class F1 (average=None returns one score per entry in `labels=`)
        f1_results = f1_metric.compute(
            predictions=preds,
            references=labels,
            average=None,
            labels=[label2id["invasao_domicilio"], label2id["violencia_fisica"]]
        )

        return {
            "accuracy": round(acc, 4),
            "auc": round(auc, 4),
            "f1_invasao": round(f1_results["f1"][0], 4),
            "f1_violencia": round(f1_results["f1"][1], 4)
        }
    except Exception as e:
        # Best-effort: report the problem and return zeros instead of raising.
        print(f"Erro no cálculo de métricas: {str(e)}")
        return {"accuracy": 0.0, "auc": 0.0, "f1_invasao": 0.0, "f1_violencia": 0.0}

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

# **5. TREINAMENTO COM EARLY STOPPING**

In [None]:
training_args = TrainingArguments(
    output_dir="./bertimbau-denuncias",

    # Basic settings
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    # NOTE: with only 3 epochs (3 evaluations) the EarlyStoppingCallback with
    # early_stopping_patience=3 registered on the Trainer can never trigger;
    # raise the epoch count if early stopping is meant to take effect.
    num_train_epochs=3,

    # Strategies (argument is named `eval_strategy` in v4.53.2;
    # `evaluation_strategy` does not exist in that version)
    eval_strategy="epoch",
    save_strategy="epoch",

    # Best-checkpoint selection: lowest validation loss wins
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,

    # Logging: report the training loss once per epoch. A fixed interval of
    # 100 steps is longer than this whole run, which leaves the
    # "Training Loss" column empty ("No log").
    logging_dir="./logs",
    logging_strategy="epoch",
    disable_tqdm=False,
    report_to="none",

    # Optimisations: mixed precision only when a CUDA device is present,
    # since fp16 training is rejected on CPU-only runtimes.
    fp16=torch.cuda.is_available(),
    seed=42,
    gradient_accumulation_steps=1,

    # v4.53.2-specific parameters
    remove_unused_columns=True,
    label_names=["labels"]
)

# Data collator intended for dynamic (per-batch) padding.
# NOTE(review): this object is not passed to the Trainer built further down,
# and the encodings are already padded to max_length=512 at tokenization time,
# so it currently appears to have no effect — confirm whether it should be
# wired in or removed.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Custom Dataset (para trabalhar com DataFrames)
from torch.utils.data import Dataset as TorchDataset
class CustomDataset(TorchDataset):
    """Map-style dataset pairing tokenizer encodings with integer class labels.

    Args:
        encodings: Mapping from feature name to an indexable collection with
            one entry per example (anything exposing ``.items()``).
        labels: Integer class ids, one per example. A pandas Series, list,
            NumPy array or tensor is accepted; access is always positional.

    Raises:
        ValueError: If any label is missing (NaN/None, e.g. produced by
            ``Series.map`` for a class absent from the mapping), or if a
            feature does not have exactly one row per label.
    """

    def __init__(self, encodings, labels):
        label_array = np.asarray(labels)
        # Fail early: a missing label would otherwise reach the loss function
        # as a float/NaN target and surface as an obscure training error.
        if pd.isna(label_array).any():
            raise ValueError(
                "Rótulos ausentes (NaN/None) em `labels`: verifique o mapeamento das classes."
            )

        self.encodings = encodings
        # Stored once as a long tensor so every item carries an integer label.
        self.labels = torch.as_tensor(label_array.astype(np.int64), dtype=torch.long)

        n_labels = len(self.labels)
        for key, val in self.encodings.items():
            if len(val) != n_labels:
                raise ValueError(
                    f"'{key}' tem {len(val)} exemplos, mas há {n_labels} rótulos."
                )

    def __getitem__(self, idx):
        """Return the features of example ``idx`` plus its label under ``'labels'``."""
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        """Number of examples, i.e. the number of labels."""
        return len(self.labels)

# Build one dataset per split: pair each split's encodings with its 'classe'
# column translated to integer ids through label2id.
train_dataset, val_dataset, test_dataset = (
    CustomDataset(split_encodings, split_df['classe'].map(label2id))
    for split_encodings, split_df in (
        (train_encodings, train_df),
        (val_encodings, val_df),
        (test_encodings, test_df),
    )
)

# Trainer
from transformers import Trainer, EarlyStoppingCallback

# Early-stopping callback with a patience of 3.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

# Wire model, arguments, train/validation data, callback and metric function
# into the Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)
# Train, report the training metrics, then evaluate on the held-out test set.
# Failures are reported rather than raised so the save step below still runs.
try:
    print("\n🚀 Iniciando treinamento...")
    train_result = trainer.train()

    # Training metrics
    metrics = train_result.metrics
    print("\n📊 Métricas finais de treino:")
    print(f"Loss: {metrics['train_loss']:.4f}")
    print(f"Tempo total: {metrics['train_runtime']:.2f}s")

    # Evaluate on the test set, reusing the dataset built above instead of
    # constructing a second identical CustomDataset.
    print("\n🧪 Avaliação no conjunto de teste...")
    test_metrics = trainer.evaluate(test_dataset)
    print(f"Acurácia: {test_metrics['eval_accuracy']:.4f}")
    print(f"AUC: {test_metrics['eval_auc']:.4f}")

except Exception as e:
    print(f"\n❌ Erro durante o treinamento: {str(e)}")
    # Add any additional error-handling logic here
finally:
    # Best-effort save of whatever state the model is in.
    try:
        trainer.save_model("./modelo_final")
        print("\n💾 Modelo salvo com sucesso!")
    except Exception as save_error:
        # Catch Exception (not a bare except) so KeyboardInterrupt/SystemExit
        # still propagate, and show why the save failed.
        print(f"\n⚠️ Erro ao salvar o modelo: {save_error}")


🚀 Iniciando treinamento...


Epoch,Training Loss,Validation Loss,Accuracy,Auc,F1 Invasao,F1 Violencia
1,No log,0.602432,0.76,0.8782,0.7692,0.75
2,No log,0.591055,0.76,0.8974,0.7692,0.75
3,No log,0.581641,0.76,0.9167,0.75,0.7692
4,No log,0.574043,0.8,0.9231,0.8,0.8
5,No log,0.566006,0.8,0.9231,0.8,0.8
6,No log,0.558545,0.8,0.9231,0.8,0.8
7,0.591500,0.554102,0.8,0.9167,0.8,0.8
8,0.591500,0.551841,0.76,0.9103,0.75,0.7692
9,0.591500,0.549526,0.8,0.9167,0.8,0.8
10,0.591500,0.548931,0.8,0.9167,0.8,0.8



📊 Métricas finais de treino:
Loss: 0.5828
Tempo total: 63.46s

🧪 Avaliação no conjunto de teste...


Acurácia: 0.6538
AUC: 0.6982

💾 Modelo salvo com sucesso!


# **6. AVALIAÇÃO FINAL NO TESTE**

In [None]:
# Run prediction on the test dataset and print the metrics attached to the
# returned prediction output.
print("\nAvaliação no conjunto de teste:")
test_results = trainer.predict(test_dataset)
print(test_results.metrics)


# **7. SALVAR MODELO**

In [None]:
# Persist the model and then its tokenizer into the same Google Drive folder.
for artifact in (model, tokenizer):
    artifact.save_pretrained("/content/drive/MyDrive/Pesquisa 2025/modelo_bertimbau_final")
print("Modelo salvo no Google Drive!")