@inproceedings{souza2020bertimbau,
  author    = {F{\'a}bio Souza and
               Rodrigo Nogueira and
               Roberto Lotufo},
  title     = {{BERT}imbau: pretrained {BERT} models for {B}razilian {P}ortuguese},
  booktitle = {9th Brazilian Conference on Intelligent Systems, {BRACIS}, Rio Grande do Sul, Brazil, October 20-23 (to appear)},
  year      = {2020}
}


In [6]:
# Install nbstripout and run it on the notebook file stored on Google Drive
# (nbstripout removes the saved cell outputs from the .ipynb file).
# NOTE(review): this path uses the folder "Pesquisa-2025", while the dataset and
# model paths elsewhere in this notebook use "Pesquisa 2025" — confirm which one exists.
!pip install nbstripout
!nbstripout /content/drive/MyDrive/Pesquisa-2025/fine-tuning_Bertimbau.ipynb



In [None]:
!pip install transformers evaluate accelerate
import pandas as pd
import numpy as np
import evaluate
from pathlib import Path
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)

In [None]:
import torch
import torch.nn as nn

# Define a simple model
class Model(nn.Module):
    """Two-layer perceptron: 10 inputs -> 50 hidden units (ReLU) -> 1 output."""

    def __init__(self):
        super().__init__()
        # Layers are created in the same order they are stacked.
        hidden = nn.Linear(10, 50)
        activation = nn.ReLU()
        head = nn.Linear(50, 1)
        self.layer = nn.Sequential(hidden, activation, head)

    def forward(self, x):
        """Run ``x`` through the stacked layers and return the result."""
        output = self.layer(x)
        return output

# Build the model first so its weights are initialised before any other random draw
model = Model()

# Pick the device (GPU when available, CPU otherwise)
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Usando dispositivo: {device}")

# Move the model onto the selected device
model = model.to(device)

# Usage example: the input batch has to live on the same device as the model
dados = torch.randn(32, 10)
dados = dados.to(device)
saida = model(dados)
print(saida.shape)  # Expected: torch.Size([32, 1])

# **1. Carregar Dataset Processado**

In [None]:
from torch.utils.data import Dataset as TorchDataset
# Load the already-processed dataset
file_path = '/content/drive/MyDrive/Pesquisa 2025/dataset/denuncias_balanceadas.xlsx'
df = pd.read_excel(file_path)

# Stratified split on the 'classe' column: 70% train, then the remaining 30%
# is halved into validation (15%) and test (15%). Both splits share one seed.
split_seed = 42
train_df, temp_df = train_test_split(
    df,
    test_size=0.3,
    random_state=split_seed,
    stratify=df['classe'],
)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    random_state=split_seed,
    stratify=temp_df['classe'],
)

print(f"Treino: {len(train_df)}, Validação: {len(val_df)}, Teste: {len(test_df)}")

# **2. Tokenização com BERTimbau**

In [None]:
# Identifier of the pretrained checkpoint; the same value is reused further down
# to load the sequence-classification model.
model_path = "neuralmind/bert-base-portuguese-cased"
# Tokenizer matching that checkpoint; tokenize_function reads it as a global.
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Tokenization helper
def tokenize_function(batch):
    """Tokenize the ``texto`` entry of ``batch`` with the module-level tokenizer.

    Sequences are truncated and padded to a fixed length of 512 tokens and the
    result is requested as PyTorch tensors.
    """
    encode_options = {
        "truncation": True,
        "max_length": 512,
        "padding": 'max_length',
        "return_tensors": 'pt',
    }
    return tokenizer(batch["texto"], **encode_options)

# Tokenize the DataFrames directly (without going through datasets.Dataset):
# each split is turned into a column -> list dict and encoded in one call.
train_encodings, val_encodings, test_encodings = (
    tokenize_function(split_frame.to_dict('list'))
    for split_frame in (train_df, val_df, test_df)
)

# **3. Configurar o Modelo com congelamento ajustado**

In [None]:
# Class-id -> class-name mapping and its inverse, both handed to the model config
id2label = {0: "invasao_domicilio", 1: "violencia_fisica"}
label2id = dict(zip(id2label.values(), id2label.keys()))

# Load the pretrained checkpoint with a classification head sized to the label set
classification_config = dict(
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)
model = AutoModelForSequenceClassification.from_pretrained(model_path, **classification_config)

# Freeze every parameter whose name mentions neither the pooler nor the classifier
trainable_tags = ('classifier', 'pooler')
for param_name, weight in model.named_parameters():
    if any(tag in param_name for tag in trainable_tags):
        continue
    weight.requires_grad = False

# **4. MÉTRICAS AVANÇADAS (Acurácia + AUC-ROC + F1)**

In [None]:
# Load the accuracy, ROC-AUC and F1 metric objects through the `evaluate` library;
# compute_metrics below reads them as module-level globals.
accuracy = evaluate.load("accuracy")
auc_score = evaluate.load("roc_auc")
f1_metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    """Compute accuracy, ROC-AUC and per-class F1 from the model's logits.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` holds the raw
            logits indexed as ``[sample, class]``; ``labels`` holds the reference
            class ids.

    Returns:
        dict with the keys ``accuracy``, ``auc``, ``f1_invasao`` and
        ``f1_violencia``, each rounded to 4 decimals. ``auc`` is ``nan`` when
        the ROC-AUC metric rejects the inputs with a ``ValueError``. If anything
        else fails, the error is printed and every metric is reported as 0.0.
    """
    try:
        predictions, labels = eval_pred

        # Numerically stable softmax: subtracting the row-wise maximum keeps
        # np.exp from overflowing to inf (and the ratio from becoming nan) on
        # large logits, without changing the resulting probabilities.
        logits = np.asarray(predictions, dtype=np.float64)
        shifted = logits - logits.max(axis=-1, keepdims=True)
        exp_shifted = np.exp(shifted)
        probabilities = exp_shifted / exp_shifted.sum(axis=-1, keepdims=True)
        positive_class_probs = probabilities[:, 1]

        # Basic metrics
        preds = np.argmax(logits, axis=1)
        acc = accuracy.compute(predictions=preds, references=labels)["accuracy"]

        # AUC is isolated so that a failure here does not wipe out the other
        # metrics. NOTE(review): presumably this fires when only one class is
        # present in `labels` — confirm against the installed metric backend.
        try:
            auc = auc_score.compute(
                prediction_scores=positive_class_probs, references=labels
            )["roc_auc"]
        except ValueError as auc_error:
            print(f"AUC indefinida para este conjunto: {auc_error}")
            auc = float("nan")

        # Per-class F1, reported in the fixed order [invasao_domicilio, violencia_fisica]
        f1_results = f1_metric.compute(
            predictions=preds,
            references=labels,
            average=None,
            labels=[label2id["invasao_domicilio"], label2id["violencia_fisica"]]
        )

        return {
            "accuracy": round(acc, 4),
            "auc": round(auc, 4),
            "f1_invasao": round(f1_results["f1"][0], 4),
            "f1_violencia": round(f1_results["f1"][1], 4)
        }
    except Exception as e:
        # Best-effort fallback kept from the original: report the problem and
        # return placeholder values instead of aborting the evaluation loop.
        print(f"Erro no cálculo de métricas: {str(e)}")
        return {"accuracy": 0.0, "auc": 0.0, "f1_invasao": 0.0, "f1_violencia": 0.0}

# **5. TREINAMENTO COM EARLY STOPPING**

In [None]:
# Hyper-parameters and run configuration for the Trainer.
training_args = TrainingArguments(
    output_dir="./bertimbau-denuncias",

    # Basic settings
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    num_train_epochs=3,

    # Strategies (argument naming used by transformers v4.53.2:
    # `eval_strategy` replaces the older `evaluation_strategy`)
    eval_strategy="epoch",
    save_strategy="epoch",

    # Model selection: keep the checkpoint with the lowest validation loss
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,

    # Logging
    logging_dir="./logs",
    logging_steps=100,
    disable_tqdm=False,
    report_to="none",

    # Optimisations
    # Mixed precision is only enabled when a CUDA device is present, so the
    # notebook still runs on the CPU fallback instead of failing at start-up.
    fp16=torch.cuda.is_available(),
    seed=42,
    gradient_accumulation_steps=1,

    # Settings specific to v4.53.2
    remove_unused_columns=True,
    label_names=["labels"]
)

# Data collator for dynamic padding, built around the notebook's tokenizer.
# NOTE(review): tokenize_function already pads every sample to max_length=512,
# so there is nothing left to pad dynamically — confirm the intended padding strategy.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Custom Dataset (para trabalhar com DataFrames)
from torch.utils.data import Dataset as TorchDataset
class CustomDataset(TorchDataset):
    """Map-style dataset pairing pre-computed encodings with integer class ids.

    Args:
        encodings: Mapping (anything exposing ``.items()``) from feature name to
            an indexable, sized container holding one entry per sample.
        labels: Iterable of class ids, one per sample (pandas Series, list,
            NumPy array, ...). Values must be integral.

    Raises:
        ValueError: If a label is missing or not an integer value (for example
            the NaN left behind by an unmapped class), or if any feature does
            not contain exactly one entry per label.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels  # kept as given for backward compatibility
        # Plain Python ints, validated once up front, so that bad labels fail
        # here with a clear message instead of deep inside the loss function.
        self._label_ids = self._to_label_ids(labels)

        expected = len(self._label_ids)
        for key, values in encodings.items():
            if len(values) != expected:
                raise ValueError(
                    f"Feature '{key}' has {len(values)} entries but {expected} labels were given"
                )

    @staticmethod
    def _to_label_ids(labels):
        """Convert ``labels`` to a list of ints, rejecting missing/non-integral values."""
        label_ids = []
        for position, value in enumerate(labels):
            try:
                as_int = int(value)
            except (TypeError, ValueError) as exc:
                raise ValueError(
                    f"Invalid label at position {position}: {value!r}"
                ) from exc
            if as_int != value:
                raise ValueError(
                    f"Non-integer label at position {position}: {value!r}"
                )
            label_ids.append(as_int)
        return label_ids

    def __getitem__(self, idx):
        """Return the features of sample ``idx`` plus its class id under ``'labels'``."""
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self._label_ids[idx]
        return item

    def __len__(self):
        """Number of samples (one per label)."""
        return len(self._label_ids)

# Build one CustomDataset per split, mapping the 'classe' column to class ids
train_dataset, val_dataset, test_dataset = (
    CustomDataset(split_encodings, split_frame['classe'].map(label2id))
    for split_encodings, split_frame in (
        (train_encodings, train_df),
        (val_encodings, val_df),
        (test_encodings, test_df),
    )
)

# Trainer
from transformers import Trainer, EarlyStoppingCallback

# Wire the model, arguments, datasets, collator, callbacks and metrics together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # Batch with the DataCollatorWithPadding defined above rather than the
    # Trainer's default collator.
    data_collator=data_collator,
    # NOTE(review): training_args sets num_train_epochs=3 with one evaluation
    # per epoch, so a patience of 3 likely has no practical effect — confirm
    # the intended epoch budget.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    compute_metrics=compute_metrics
)
# Train, report the training metrics, evaluate on the test split and save the model.
training_completed = False
try:
    print("\n🚀 Iniciando treinamento...")
    train_result = trainer.train()
    training_completed = True

    # Report the training metrics
    metrics = train_result.metrics
    print("\n📊 Métricas finais de treino:")
    print(f"Loss: {metrics['train_loss']:.4f}")
    print(f"Tempo total: {metrics['train_runtime']:.2f}s")

    # Evaluate on the test split, reusing the dataset object built earlier
    print("\n🧪 Avaliação no conjunto de teste...")
    test_metrics = trainer.evaluate(test_dataset)
    print(f"Acurácia: {test_metrics['eval_accuracy']:.4f}")
    print(f"AUC: {test_metrics['eval_auc']:.4f}")

except Exception as e:
    print(f"\n❌ Erro durante o treinamento: {str(e)}")
    # Re-raise so the notebook stops here: the following cells would otherwise
    # evaluate and export a model whose training did not finish.
    raise
finally:
    # Best-effort save of whatever weights the trainer currently holds.
    try:
        trainer.save_model("./modelo_final")
        if training_completed:
            print("\n💾 Modelo salvo com sucesso!")
        else:
            print("\n⚠️ Treinamento não concluído: pesos atuais salvos em ./modelo_final")
    except Exception as save_error:
        # `except Exception` (not a bare `except`) so KeyboardInterrupt and
        # SystemExit are not swallowed; the cause is reported.
        print(f"\n⚠️ Erro ao salvar o modelo: {save_error}")

# **6. AVALIAÇÃO FINAL NO TESTE**

In [None]:
# Run the trainer's prediction loop over the test split and print the metrics
# attached to the returned result.
print("\nAvaliação no conjunto de teste:")
test_results = trainer.predict(test_dataset)
print(test_results.metrics)


# **7. SALVAR MODELO**

In [None]:
# Export the model and the tokenizer side by side into the same Drive folder
drive_model_dir = "/content/drive/MyDrive/Pesquisa 2025/modelo_bertimbau_final"
for artifact in (model, tokenizer):
    artifact.save_pretrained(drive_model_dir)
print("Modelo salvo no Google Drive!")