<a href="https://colab.research.google.com/github/AugustoGuimaraesUFRJ/Algoritmo-Grafos/blob/main/TCC_C%C3%B3digo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
# Logging in first is what allows the private datasets requested by the
# download calls below to be fetched.
import kagglehub
kagglehub.login()


In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

# One download per Kaggle dataset; each result is bound to a `*_path` name.
# NOTE(review): the visible cells never read these variables -- they load from
# hard-coded "/kaggle/input/..." locations instead, which presumably only
# exist when running on Kaggle itself. Confirm before running elsewhere.
augustoguimaraes_bertimbau_tcc_model_path = kagglehub.dataset_download('augustoguimaraes/bertimbau-tcc-model')
augustoguimaraes_miniml_path = kagglehub.dataset_download('augustoguimaraes/miniml')
augustoguimaraes_treinamento_e_teste_path = kagglehub.dataset_download('augustoguimaraes/treinamento-e-teste')
augustoguimaraes_treino_mlm_path = kagglehub.dataset_download('augustoguimaraes/treino-mlm')
augustoguimaraes_treinamentoad_path = kagglehub.dataset_download('augustoguimaraes/treinamentoad')

print('Data source import complete.')


In [None]:
import pandas as pd
import torch
import random
import re
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import (
    BertTokenizer, BertForMaskedLM,
    AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
)
from datasets import Dataset
from sentence_transformers import SentenceTransformer, util
import os
import unicodedata
import string
from sklearn.model_selection import train_test_split
os.environ["WANDB_DISABLED"] = "true"
import matplotlib.pyplot as plt

import logging

# Intended to turn off every tqdm progress bar and to reduce the transformers /
# sentence-transformers loggers to errors only.
# NOTE(review): the assignment below only sets an attribute on the tqdm class.
# The recorded cell outputs still show tqdm and `Map` progress bars, so it does
# not appear to disable anything -- confirm, and use a supported mechanism if
# silent runs are really wanted.
tqdm.disable = True
logging.getLogger("transformers").setLevel(logging.ERROR)
logging.getLogger("sentence_transformers").setLevel(logging.ERROR)

In [None]:
# Local BERTimbau checkpoint shared by the tokenizer and the masked-LM model.
model_name = "/kaggle/input/bertimbau-tcc-model/bert-base-portuguese-cased"

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = BertTokenizer.from_pretrained(model_name)
# Load the masked-LM weights, place them on the chosen device and switch the
# module to evaluation mode.
model = BertForMaskedLM.from_pretrained(model_name).to(device)
model.eval()

print(f"‚úÖ Modelo carregado no dispositivo: {device}")

‚úÖ Modelo carregado no dispositivo: cuda


In [None]:
# File locations of the two splits
train_path = "/kaggle/input/treinamento-e-teste/train_df.csv"
test_path = "/kaggle/input/treinamento-e-teste/test_df.csv"

# Read each split and expose its target column under the name "labels"
train_final = pd.read_csv(train_path).rename(columns={"class": "labels"})
test_df = pd.read_csv(test_path).rename(columns={"class": "labels"})

# Clean both frames in place: drop rows missing text or label, force the text
# to str, then drop rows whose text is blank after stripping whitespace.
for name, df_ in (("train", train_final), ("test", test_df)):
    df_.dropna(subset=["text", "labels"], inplace=True)
    df_["text"] = df_["text"].astype(str)
    df_.loc[df_["text"].str.strip().eq(""), "text"] = np.nan
    df_.dropna(subset=["text"], inplace=True)
    print(f" {name}: {len(df_)} linhas")

#  Tokeniza√ß√£o
def tokenize_function(examples):
    """Tokenize the ``"text"`` entry of ``examples`` with the module-level tokenizer.

    The tokenizer is called with ``padding="max_length"``, ``truncation=True``
    and ``max_length=128``; its result is returned unchanged.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    return tokenizer(examples["text"], **encode_options)

def compute_metrics(p):
    """Return accuracy, precision, recall and F1 for a prediction object.

    ``p.predictions`` is reduced to class ids with an argmax over axis 1 and
    compared against ``p.label_ids``; precision/recall/F1 use
    ``average="binary"``.
    """
    y_true = p.label_ids
    y_pred = np.argmax(p.predictions, axis=1)
    scores = precision_recall_fscore_support(y_true, y_pred, average="binary")
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": scores[0],
        "recall": scores[1],
        "f1": scores[2],
    }

# Wrap the cleaned DataFrames as `datasets.Dataset` objects.
train_dataset = Dataset.from_pandas(train_final)
test_dataset = Dataset.from_pandas(test_df)

# Tokenize both splits in batches with tokenize_function.
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Return torch tensors and expose only the three listed columns.
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# ============================================================
# 9 Training and evaluation
# ============================================================
# Rebinds `model` (the masked-LM loaded earlier) to a 2-label sequence
# classifier created from the same checkpoint directory.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# One epoch, evaluated at the end of the epoch, with no checkpoints saved.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="no",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50
)

# NOTE(review): the test split is used as the evaluation set during training.
# NOTE(review): the recorded output shows a bare `trainer = Trainer(` line,
# which suggests a warning was emitted by this call -- possibly about the
# `tokenizer=` argument; confirm against the installed transformers version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()
metrics = trainer.evaluate()
print(metrics)

# ============================================================
# 9. Error analysis
# ============================================================
preds_output = trainer.predict(test_dataset)
preds = np.argmax(preds_output.predictions, axis=1)

# Attach the predicted class to a copy of the test frame
result_df = test_df.copy()
result_df["preds"] = preds

# Build each error mask once and reuse it for both the counts and the samples
fp_mask = (result_df["preds"] == 1) & (result_df["labels"] == 0)
fn_mask = (result_df["preds"] == 0) & (result_df["labels"] == 1)
falsos_positivos = result_df[fp_mask]
falsos_negativos = result_df[fn_mask]

total = len(result_df)
acertos = result_df["labels"].eq(result_df["preds"]).sum()
erros = total - acertos
fp = fp_mask.sum()
fn = fn_mask.sum()

print("\n=== RESULTADOS DE ERRO ===")
print(f"Total: {total}")
print(f" Acertos: {acertos}")
print(f" Erros: {erros}")
print(f"  ‚Ü≥ Falsos Positivos (prev√™ ofensa mas n√£o √©): {fp}")
print(f"  ‚Ü≥ Falsos Negativos (n√£o detectou ofensa real): {fn}")

# Show up to 30 example texts for each kind of error
for heading, frame in (
    ("\n===  FALSOS POSITIVOS ===", falsos_positivos),
    ("\n===  FALSOS NEGATIVOS ===", falsos_negativos),
):
    print(heading)
    for t in frame["text"].head(30):
        print("‚Ä¢", t)

 train: 5322 linhas
 test: 1331 linhas


Map:   0%|          | 0/5322 [00:00<?, ? examples/s]

Map:   0%|          | 0/1331 [00:00<?, ? examples/s]

  trainer = Trainer(


{'loss': 0.5415, 'grad_norm': 16.501218795776367, 'learning_rate': 1.705705705705706e-05, 'epoch': 0.15015015015015015}
{'loss': 0.3196, 'grad_norm': 8.104938507080078, 'learning_rate': 1.4054054054054055e-05, 'epoch': 0.3003003003003003}
{'loss': 0.2922, 'grad_norm': 8.398772239685059, 'learning_rate': 1.1051051051051051e-05, 'epoch': 0.45045045045045046}
{'loss': 0.273, 'grad_norm': 2.8173561096191406, 'learning_rate': 8.048048048048048e-06, 'epoch': 0.6006006006006006}
{'loss': 0.2734, 'grad_norm': 21.57377052307129, 'learning_rate': 5.045045045045045e-06, 'epoch': 0.7507507507507507}
{'loss': 0.2606, 'grad_norm': 2.817598581314087, 'learning_rate': 2.0420420420420424e-06, 'epoch': 0.9009009009009009}
{'eval_loss': 0.23450876772403717, 'eval_accuracy': 0.9120961682945155, 'eval_precision': 0.9159159159159159, 'eval_recall': 0.9090909090909091, 'eval_f1': 0.912490650710546, 'eval_runtime': 6.9917, 'eval_samples_per_second': 190.369, 'eval_steps_per_second': 12.014, 'epoch': 1.0}
{'tr



{'eval_loss': 0.23450876772403717, 'eval_accuracy': 0.9120961682945155, 'eval_precision': 0.9159159159159159, 'eval_recall': 0.9090909090909091, 'eval_f1': 0.912490650710546, 'eval_runtime': 7.3556, 'eval_samples_per_second': 180.951, 'eval_steps_per_second': 11.42, 'epoch': 1.0}
{'eval_loss': 0.23450876772403717, 'eval_accuracy': 0.9120961682945155, 'eval_precision': 0.9159159159159159, 'eval_recall': 0.9090909090909091, 'eval_f1': 0.912490650710546, 'eval_runtime': 7.3556, 'eval_samples_per_second': 180.951, 'eval_steps_per_second': 11.42, 'epoch': 1.0}





=== RESULTADOS DE ERRO ===
Total: 1331
 Acertos: 1214
 Erros: 117
  ‚Ü≥ Falsos Positivos (prev√™ ofensa mas n√£o √©): 56
  ‚Ü≥ Falsos Negativos (n√£o detectou ofensa real): 61

===  FALSOS POSITIVOS ===
‚Ä¢ Cada dia se queimando mais
‚Ä¢ √â a treva.
‚Ä¢ SENSACIONAL √â BEM P√îR AI MESMOOO, ELE EST√Å PREOCUPADO PORQU√ä SERA, QUEM √ë DEVE √ë TEME, SIMPLES ASSIM!!!
‚Ä¢ O Brasil lutando para sair da Grota e alguns preocupado com a tal de Greta. Vamos ficar at√© quando nessa?
‚Ä¢ Dupla imbat√≠vel nas argumenta√ß√µes inteligentes contra esse governo desgovernado que estamos pagando....aff
‚Ä¢ E n√≥s que podemos ter perdido uma Greta Thunberg cruelmente assasinada por witizelassassino witizelgenocida. √Åghata nossa Menina Maravilha tinha apenas 8 anos seu discurso era apenas a boneca M√¥nica, mas quem poder√° afirmar ao contr√°rio que aos 16 anos trocaria M√¥nica pela bandeira de um mundo mais justo e sustent√°vel.
‚Ä¢ Nem era pra ter sido preso! LulaInocente
‚Ä¢ Uma menina chega as capas aos

In [None]:
train_path = "/kaggle/input/treinamento-e-teste/train_df.csv"
train_df = pd.read_csv(train_path)

# Capture the rows whose text is not a string BEFORE filtering. Taking this
# sample from the already-filtered frame can never show anything, because the
# filter below removes exactly those rows.
wrong_type_rows = train_df[~train_df['text'].apply(lambda x: isinstance(x, str))]

# Keep only rows whose text is a non-blank string.
train_df = train_df[train_df['text'].apply(lambda x: isinstance(x, str) and x.strip() != '')]
print("üìä Linhas totais:", len(train_df))
# DataFrame.info() prints its report itself and returns None, so it is called
# directly instead of being wrapped in print() (which emitted a stray "None").
train_df.info()
print("\nAmostra de tipos incorretos:")
print(wrong_type_rows.head())

üìä Linhas totais: 5322
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5322 entries, 0 to 5321
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5322 non-null   object
 1   class   5322 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 83.3+ KB
None

Amostra de tipos incorretos:
Empty DataFrame
Columns: [text, class]
Index: []


In [None]:
# Local BERTimbau checkpoint shared by the tokenizer and the masked-LM model.
model_name = "/kaggle/input/bertimbau-tcc-model/bert-base-portuguese-cased"

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = BertTokenizer.from_pretrained(model_name)
# Load the masked-LM weights, place them on the chosen device and switch the
# module to evaluation mode.
model = BertForMaskedLM.from_pretrained(model_name).to(device)
model.eval()

print(f"‚úÖ Modelo carregado no dispositivo: {device}")

def mlm_augment(text, idx=None, prob=0.15, n_aug=2):
    """
    Generate augmented sentences via Masked Language Modeling (BERTimbau)
    and associate each one with the index of the original text.

    Every alphabetic token is independently replaced by the mask token with
    probability ``prob``. The masked sequence is scored by the module-level
    ``model`` and each masked slot is filled with its highest-scoring token.
    Uses the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        text (str): Original text.
        idx (int, optional): Index of the original sentence in the dataset.
        prob (float): Probability of masking a token.
        n_aug (int): Number of augmented versions to generate.

    Returns:
        list[dict]: List of {"orig_idx", "text_original", "text_augmentada"},
            one entry per augmented version. Empty when ``text`` is not a
            non-blank string.
    """
    if not isinstance(text, str) or text.strip() == "":
        return []

    # Keep the encoded input (tokens + special tokens) within the model's
    # position-embedding limit; longer inputs would fail inside the model.
    max_positions = getattr(model.config, "max_position_embeddings", 512)
    max_tokens = max_positions - tokenizer.num_special_tokens_to_add(pair=False)
    tokens = tokenizer.tokenize(text)[:max_tokens]

    mask_token = tokenizer.mask_token
    mask_token_id = tokenizer.mask_token_id
    augmented_samples = []

    for _ in range(n_aug):
        # random.random() is drawn for every token, alphabetic or not.
        masked_tokens = [
            mask_token if (random.random() < prob and tok.isalpha()) else tok
            for tok in tokens
        ]
        result_tokens = list(masked_tokens)
        masked_positions = [
            i for i, tok in enumerate(masked_tokens) if tok == mask_token
        ]

        # Nothing to predict when no token was masked in this round.
        if masked_positions:
            # Encode the masked token list directly instead of re-tokenizing
            # a detokenized string, so model positions map 1:1 onto tokens.
            input_ids = tokenizer.build_inputs_with_special_tokens(
                tokenizer.convert_tokens_to_ids(masked_tokens)
            )
            # Locate the masks in the *encoded* sequence. The special tokens
            # shift every position, so indexing the predictions with the raw
            # token index would read the prediction for a neighbouring token.
            mask_slots = [
                j for j, tok_id in enumerate(input_ids) if tok_id == mask_token_id
            ]

            with torch.no_grad():
                logits = model(
                    input_ids=torch.tensor([input_ids], device=device)
                ).logits

            predicted_ids = logits[0].argmax(dim=-1).tolist()
            for tok_pos, slot in zip(masked_positions, mask_slots):
                result_tokens[tok_pos] = tokenizer.convert_ids_to_tokens(
                    predicted_ids[slot]
                )

        augmented_samples.append({
            "orig_idx": idx,
            "text_original": text,
            "text_augmentada": tokenizer.convert_tokens_to_string(result_tokens),
        })

    return augmented_samples


# ============================================================
# Load dataset
# ============================================================
train_path = "/kaggle/input/treinamento-e-teste/train_df.csv"
train_df = pd.read_csv(train_path)

augmented_rows = []

# Walk the training rows; every augmented sample inherits its source row's label
for idx, row in tqdm(train_df.iterrows(), total=len(train_df)):
    # idx is the row's index in the training frame; 0.15 masking probability,
    # two augmented versions requested per sentence
    new_samples = mlm_augment(text=row["text"], idx=idx, prob=0.15, n_aug=2)

    augmented_rows.extend(
        {
            "orig_idx": sample["orig_idx"],
            "text_original": sample["text_original"],
            "text_augmentada": sample["text_augmentada"],
            "class": row["class"],
        }
        for sample in new_samples
    )

# Collect the augmented texts into a DataFrame
df_aug = pd.DataFrame(augmented_rows)

‚úÖ Modelo carregado no dispositivo: cuda


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5322/5322 [01:33<00:00, 56.81it/s]


In [None]:
# Keep only the augmented text and its label, renaming the text column to "text".
# NOTE(review): df_aug is overwritten here, so re-running this cell without
# rebuilding df_aug would presumably fail on the missing "text_augmentada" column.
df_aug= df_aug[["text_augmentada", "class"]].rename(columns={"text_augmentada": "text"})

print(df_aug.shape)
df_aug.head()

(10644, 2)


Unnamed: 0,text,class
0,Parab√©ns meu presidente,0
1,Parab√©ns meu meu,0
2,Valeuu analise .,0
3,Valeu seu seue .,0
4,PT roubando o o povo ! !,1


In [None]:
# Reload the original training split from train_path and preview its first rows.
train_df = pd.read_csv(train_path)
train_df.head()

Unnamed: 0,text,class
0,Parab√©ns meu presidente,0
1,Valeu seu analise.,0
2,PT roubando o pa√≠s inteiro!!,1
3,Lula livre.,0
4,Ele esqueceu de amadurecer,0


In [None]:
# Stack the original training rows and the augmented rows under a fresh 0..n-1 index,
# then preview the first rows.
train_final = pd.concat([train_df, df_aug], ignore_index=True)
train_final.head()

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,text,labels,class
0,Parab√©ns meu presidente,0.0,
1,Valeu seu analise.,0.0,
2,PT roubando o pa√≠s inteiro!!,1.0,
3,Lula livre.,0.0,
4,Ele esqueceu de amadurecer,0.0,


In [None]:
# File locations of the two splits
train_path = "/kaggle/input/treinamento-e-teste/train_df.csv"
test_path = "/kaggle/input/treinamento-e-teste/test_df.csv"

# Read both splits; the test labels are exposed under the name "labels"
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path).rename(columns={"class": "labels"})

# Training data = original rows followed by the augmented rows. Rows whose
# text contains the literal "[UNK]" marker are discarded, then texts that
# repeat an earlier row are dropped (first occurrence kept).
train_final = (
    pd.concat([train_df, df_aug], ignore_index=True)
    .rename(columns={"class": "labels"})
    .loc[lambda frame: ~frame["text"].str.contains(r"\[UNK\]", regex=True, na=False)]
    .drop_duplicates(subset=["text"], keep="first")
    .reset_index(drop=True)
)

# Clean both frames in place: drop rows missing text or label, force the text
# to str, then drop rows whose text is blank after stripping whitespace.
for name, df_ in (("train", train_final), ("test", test_df)):
    df_.dropna(subset=["text", "labels"], inplace=True)
    df_["text"] = df_["text"].astype(str)
    df_.loc[df_["text"].str.strip().eq(""), "text"] = np.nan
    df_.dropna(subset=["text"], inplace=True)
    print(f" {name}: {len(df_)} linhas")

#  Tokeniza√ß√£o
def tokenize_function(examples):
    """Tokenize the ``"text"`` entry of ``examples`` with the module-level tokenizer.

    The tokenizer is called with ``padding="max_length"``, ``truncation=True``
    and ``max_length=128``; its result is returned unchanged.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    return tokenizer(examples["text"], **encode_options)

def compute_metrics(p):
    """Return accuracy, precision, recall and F1 for a prediction object.

    ``p.predictions`` is reduced to class ids with an argmax over axis 1 and
    compared against ``p.label_ids``; precision/recall/F1 use
    ``average="binary"``.
    """
    y_true = p.label_ids
    y_pred = np.argmax(p.predictions, axis=1)
    scores = precision_recall_fscore_support(y_true, y_pred, average="binary")
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": scores[0],
        "recall": scores[1],
        "f1": scores[2],
    }

# Wrap the cleaned DataFrames as `datasets.Dataset` objects.
train_dataset = Dataset.from_pandas(train_final)
test_dataset = Dataset.from_pandas(test_df)

# Tokenize both splits in batches with tokenize_function.
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Return torch tensors and expose only the three listed columns.
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# ============================================================
# 9 Training and evaluation
# ============================================================
# Rebinds `model` (the masked-LM used for augmentation) to a 2-label sequence
# classifier created from the same checkpoint directory.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# One epoch, evaluated at the end of the epoch, with no checkpoints saved.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="no",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50
)

# NOTE(review): the test split is used as the evaluation set during training.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()
metrics = trainer.evaluate()
print(metrics)

# ============================================================
# 9. Error analysis
# ============================================================
preds_output = trainer.predict(test_dataset)
preds = np.argmax(preds_output.predictions, axis=1)

# Attach the predicted class to a copy of the test frame
result_df = test_df.copy()
result_df["preds"] = preds

# Build each error mask once and reuse it for both the counts and the samples
fp_mask = (result_df["preds"] == 1) & (result_df["labels"] == 0)
fn_mask = (result_df["preds"] == 0) & (result_df["labels"] == 1)
falsos_positivos = result_df[fp_mask]
falsos_negativos = result_df[fn_mask]

total = len(result_df)
acertos = result_df["labels"].eq(result_df["preds"]).sum()
erros = total - acertos
fp = fp_mask.sum()
fn = fn_mask.sum()

print("\n=== RESULTADOS DE ERRO ===")
print(f"Total: {total}")
print(f" Acertos: {acertos}")
print(f" Erros: {erros}")
print(f"  ‚Ü≥ Falsos Positivos (prev√™ ofensa mas n√£o √©): {fp}")
print(f"  ‚Ü≥ Falsos Negativos (n√£o detectou ofensa real): {fn}")

# Show up to 30 example texts for each kind of error
for heading, frame in (
    ("\n===  FALSOS POSITIVOS ===", falsos_positivos),
    ("\n===  FALSOS NEGATIVOS ===", falsos_negativos),
):
    print(heading)
    for t in frame["text"].head(30):
        print("‚Ä¢", t)

 train: 13232 linhas
 test: 1331 linhas


Map:   0%|          | 0/13232 [00:00<?, ? examples/s]

Map:   0%|          | 0/1331 [00:00<?, ? examples/s]

  trainer = Trainer(


{'loss': 0.5433, 'grad_norm': 10.164219856262207, 'learning_rate': 1.8814993954050786e-05, 'epoch': 0.060459492140266025}
{'loss': 0.3297, 'grad_norm': 10.76231575012207, 'learning_rate': 1.760580411124547e-05, 'epoch': 0.12091898428053205}
{'loss': 0.2985, 'grad_norm': 8.488628387451172, 'learning_rate': 1.6396614268440147e-05, 'epoch': 0.18137847642079807}
{'loss': 0.2696, 'grad_norm': 28.817798614501953, 'learning_rate': 1.5187424425634827e-05, 'epoch': 0.2418379685610641}
{'loss': 0.301, 'grad_norm': 6.734292984008789, 'learning_rate': 1.3978234582829506e-05, 'epoch': 0.3022974607013301}
{'loss': 0.2726, 'grad_norm': 7.549954414367676, 'learning_rate': 1.2769044740024186e-05, 'epoch': 0.36275695284159615}
{'loss': 0.2785, 'grad_norm': 14.627341270446777, 'learning_rate': 1.1559854897218865e-05, 'epoch': 0.42321644498186217}
{'loss': 0.2576, 'grad_norm': 8.157594680786133, 'learning_rate': 1.0350665054413544e-05, 'epoch': 0.4836759371221282}
{'loss': 0.2148, 'grad_norm': 4.024790763



{'eval_loss': 0.25742781162261963, 'eval_accuracy': 0.9143501126972201, 'eval_precision': 0.908957415565345, 'eval_recall': 0.9225037257824144, 'eval_f1': 0.9156804733727811, 'eval_runtime': 7.2263, 'eval_samples_per_second': 184.188, 'eval_steps_per_second': 11.624, 'epoch': 1.0}
{'eval_loss': 0.25742781162261963, 'eval_accuracy': 0.9143501126972201, 'eval_precision': 0.908957415565345, 'eval_recall': 0.9225037257824144, 'eval_f1': 0.9156804733727811, 'eval_runtime': 7.2263, 'eval_samples_per_second': 184.188, 'eval_steps_per_second': 11.624, 'epoch': 1.0}





=== RESULTADOS DE ERRO ===
Total: 1331
 Acertos: 1217
 Erros: 114
  ‚Ü≥ Falsos Positivos (prev√™ ofensa mas n√£o √©): 62
  ‚Ü≥ Falsos Negativos (n√£o detectou ofensa real): 52

===  FALSOS POSITIVOS ===
‚Ä¢ Cada dia se queimando mais
‚Ä¢ √â a treva.
‚Ä¢ Isto mesmo. Esses filhos do Sr. Presidente deveriam ficar mais falados. V√£o colocar tudo a perder. Uma l√°stima.
‚Ä¢ Jesus, √© cada uma desse cidad√£o. √â de assustar.
‚Ä¢ SENSACIONAL √â BEM P√îR AI MESMOOO, ELE EST√Å PREOCUPADO PORQU√ä SERA, QUEM √ë DEVE √ë TEME, SIMPLES ASSIM!!!
‚Ä¢ A√™ num aguento!
‚Ä¢ Aro
‚Ä¢ Dupla imbat√≠vel nas argumenta√ß√µes inteligentes contra esse governo desgovernado que estamos pagando....aff
‚Ä¢ E n√≥s que podemos ter perdido uma Greta Thunberg cruelmente assasinada por witizelassassino witizelgenocida. √Åghata nossa Menina Maravilha tinha apenas 8 anos seu discurso era apenas a boneca M√¥nica, mas quem poder√° afirmar ao contr√°rio que aos 16 anos trocaria M√¥nica pela bandeira de um mundo mais justo e s

In [None]:
# Local BERTimbau checkpoint shared by the tokenizer and the masked-LM model.
model_name = "/kaggle/input/bertimbau-tcc-model/bert-base-portuguese-cased"

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = BertTokenizer.from_pretrained(model_name)
# Load the masked-LM weights, place them on the chosen device and switch the
# module to evaluation mode.
model = BertForMaskedLM.from_pretrained(model_name).to(device)
model.eval()

print(f"‚úÖ Modelo carregado no dispositivo: {device}")

def mlm_augment(text, idx=None, prob=0.20, n_aug=5):
    """
    Generate augmented sentences via Masked Language Modeling (BERTimbau)
    and associate each one with the index of the original text.

    Every alphabetic token is independently replaced by the mask token with
    probability ``prob``. The masked sequence is scored by the module-level
    ``model`` and each masked slot is filled with its highest-scoring token.
    Uses the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        text (str): Original text.
        idx (int, optional): Index of the original sentence in the dataset.
        prob (float): Probability of masking a token.
        n_aug (int): Number of augmented versions to generate.

    Returns:
        list[dict]: List of {"orig_idx", "text_original", "text_augmentada"},
            one entry per augmented version. Empty when ``text`` is not a
            non-blank string.
    """
    if not isinstance(text, str) or text.strip() == "":
        return []

    # Keep the encoded input (tokens + special tokens) within the model's
    # position-embedding limit; longer inputs would fail inside the model.
    max_positions = getattr(model.config, "max_position_embeddings", 512)
    max_tokens = max_positions - tokenizer.num_special_tokens_to_add(pair=False)
    tokens = tokenizer.tokenize(text)[:max_tokens]

    mask_token = tokenizer.mask_token
    mask_token_id = tokenizer.mask_token_id
    augmented_samples = []

    for _ in range(n_aug):
        # random.random() is drawn for every token, alphabetic or not.
        masked_tokens = [
            mask_token if (random.random() < prob and tok.isalpha()) else tok
            for tok in tokens
        ]
        result_tokens = list(masked_tokens)
        masked_positions = [
            i for i, tok in enumerate(masked_tokens) if tok == mask_token
        ]

        # Nothing to predict when no token was masked in this round.
        if masked_positions:
            # Encode the masked token list directly instead of re-tokenizing
            # a detokenized string, so model positions map 1:1 onto tokens.
            input_ids = tokenizer.build_inputs_with_special_tokens(
                tokenizer.convert_tokens_to_ids(masked_tokens)
            )
            # Locate the masks in the *encoded* sequence. The special tokens
            # shift every position, so indexing the predictions with the raw
            # token index would read the prediction for a neighbouring token.
            mask_slots = [
                j for j, tok_id in enumerate(input_ids) if tok_id == mask_token_id
            ]

            with torch.no_grad():
                logits = model(
                    input_ids=torch.tensor([input_ids], device=device)
                ).logits

            predicted_ids = logits[0].argmax(dim=-1).tolist()
            for tok_pos, slot in zip(masked_positions, mask_slots):
                result_tokens[tok_pos] = tokenizer.convert_ids_to_tokens(
                    predicted_ids[slot]
                )

        augmented_samples.append({
            "orig_idx": idx,
            "text_original": text,
            "text_augmentada": tokenizer.convert_tokens_to_string(result_tokens),
        })

    return augmented_samples


# ============================================================
# Load dataset
# ============================================================
train_path = "/kaggle/input/treinamento-e-teste/train_df.csv"
train_df = pd.read_csv(train_path)

augmented_rows = []

# Iterate over every row of the training set
for idx, row in tqdm(train_df.iterrows(), total=len(train_df)):
    # Generate new versions of the sentence
    new_samples = mlm_augment(
        text=row["text"],
        idx=idx,           # original index of the row in the training set
        prob=0.20,         # masking probability passed to mlm_augment (0.20)
        n_aug=5            # five augmented versions requested per sentence
    )

    # Add the results to the final dataset, carrying over the source row's label
    for sample in new_samples:
        new_text = sample["text_augmentada"]

        augmented_rows.append({
            "orig_idx": sample["orig_idx"],
            "text_original": sample["text_original"],
            "text_augmentada": new_text,
            "class": row["class"]
        })

# Build a DataFrame from the augmented texts
df_aug_2 = pd.DataFrame(augmented_rows)

‚úÖ Modelo carregado no dispositivo: cuda


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5322/5322 [03:49<00:00, 23.21it/s]


In [None]:
# Keep only the augmented text and its label, renaming the text column to "text".
# NOTE(review): df_aug_2 is overwritten here, so re-running this cell without
# rebuilding df_aug_2 would presumably fail on the missing "text_augmentada" column.
df_aug_2= df_aug_2[["text_augmentada", "class"]].rename(columns={"text_augmentada": "text"})

print(df_aug_2.shape)
df_aug_2.head()

(26610, 2)


Unnamed: 0,text,class
0,Parab√©ns meu presidente,0
1,Parab√©ns meu meu,0
2,Parab√©ns√©ns presidente,0
3,Parab√©ns meu meu,0
4,Parab√©ns√©ns presidente,0


In [None]:
# File locations of the two splits
train_path = "/kaggle/input/treinamento-e-teste/train_df.csv"
test_path = "/kaggle/input/treinamento-e-teste/test_df.csv"

# Read both splits; the test labels are exposed under the name "labels"
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path).rename(columns={"class": "labels"})

# Training data = original rows followed by the second batch of augmented
# rows. Rows whose text contains the literal "[UNK]" marker are discarded,
# then texts that repeat an earlier row are dropped (first occurrence kept).
train_final = (
    pd.concat([train_df, df_aug_2], ignore_index=True)
    .rename(columns={"class": "labels"})
    .loc[lambda frame: ~frame["text"].str.contains(r"\[UNK\]", regex=True, na=False)]
    .drop_duplicates(subset=["text"], keep="first")
    .reset_index(drop=True)
)

# Clean both frames in place: drop rows missing text or label, force the text
# to str, then drop rows whose text is blank after stripping whitespace.
for name, df_ in (("train", train_final), ("test", test_df)):
    df_.dropna(subset=["text", "labels"], inplace=True)
    df_["text"] = df_["text"].astype(str)
    df_.loc[df_["text"].str.strip().eq(""), "text"] = np.nan
    df_.dropna(subset=["text"], inplace=True)
    print(f" {name}: {len(df_)} linhas")

#  Tokeniza√ß√£o
def tokenize_function(examples):
    """Tokenize the ``"text"`` entry of ``examples`` with the module-level tokenizer.

    The tokenizer is called with ``padding="max_length"``, ``truncation=True``
    and ``max_length=128``; its result is returned unchanged.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    return tokenizer(examples["text"], **encode_options)

def compute_metrics(p):
    """Return accuracy, precision, recall and F1 for a prediction object.

    ``p.predictions`` is reduced to class ids with an argmax over axis 1 and
    compared against ``p.label_ids``; precision/recall/F1 use
    ``average="binary"``.
    """
    y_true = p.label_ids
    y_pred = np.argmax(p.predictions, axis=1)
    scores = precision_recall_fscore_support(y_true, y_pred, average="binary")
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": scores[0],
        "recall": scores[1],
        "f1": scores[2],
    }

# Wrap the cleaned DataFrames as `datasets.Dataset` objects.
train_dataset = Dataset.from_pandas(train_final)
test_dataset = Dataset.from_pandas(test_df)

# Tokenize both splits in batches with tokenize_function.
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Return torch tensors and expose only the three listed columns.
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# ============================================================
# 9 Training and evaluation
# ============================================================
# Rebinds `model` (the masked-LM used for augmentation) to a 2-label sequence
# classifier created from the same checkpoint directory.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# One epoch, evaluated at the end of the epoch, with no checkpoints saved.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="no",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50
)

# NOTE(review): the test split is used as the evaluation set during training.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()
metrics = trainer.evaluate()
print(metrics)

# ============================================================
# 9. Error analysis
# ============================================================
preds_output = trainer.predict(test_dataset)
preds = np.argmax(preds_output.predictions, axis=1)

# Attach the predicted class to a copy of the test frame
result_df = test_df.copy()
result_df["preds"] = preds

# Build each error mask once and reuse it for both the counts and the samples
fp_mask = (result_df["preds"] == 1) & (result_df["labels"] == 0)
fn_mask = (result_df["preds"] == 0) & (result_df["labels"] == 1)
falsos_positivos = result_df[fp_mask]
falsos_negativos = result_df[fn_mask]

total = len(result_df)
acertos = result_df["labels"].eq(result_df["preds"]).sum()
erros = total - acertos
fp = fp_mask.sum()
fn = fn_mask.sum()

print("\n=== RESULTADOS DE ERRO ===")
print(f"Total: {total}")
print(f" Acertos: {acertos}")
print(f" Erros: {erros}")
print(f"  ‚Ü≥ Falsos Positivos (prev√™ ofensa mas n√£o √©): {fp}")
print(f"  ‚Ü≥ Falsos Negativos (n√£o detectou ofensa real): {fn}")

# Show up to 30 example texts for each kind of error
for heading, frame in (
    ("\n===  FALSOS POSITIVOS ===", falsos_positivos),
    ("\n===  FALSOS NEGATIVOS ===", falsos_negativos),
):
    print(heading)
    for t in frame["text"].head(30):
        print("‚Ä¢", t)

 train: 24088 linhas
 test: 1331 linhas


Map:   0%|          | 0/24088 [00:00<?, ? examples/s]

Map:   0%|          | 0/1331 [00:00<?, ? examples/s]

  trainer = Trainer(


{'loss': 0.5505, 'grad_norm': 1.7914618253707886, 'learning_rate': 1.9349269588313415e-05, 'epoch': 0.033200531208499334}
{'loss': 0.3775, 'grad_norm': 9.333806037902832, 'learning_rate': 1.868525896414343e-05, 'epoch': 0.06640106241699867}
{'loss': 0.3199, 'grad_norm': 10.904051780700684, 'learning_rate': 1.8021248339973442e-05, 'epoch': 0.099601593625498}
{'loss': 0.2919, 'grad_norm': 16.351192474365234, 'learning_rate': 1.7357237715803456e-05, 'epoch': 0.13280212483399734}
{'loss': 0.2494, 'grad_norm': 2.6490745544433594, 'learning_rate': 1.669322709163347e-05, 'epoch': 0.16600265604249667}
{'loss': 0.2621, 'grad_norm': 1.1895049810409546, 'learning_rate': 1.602921646746348e-05, 'epoch': 0.199203187250996}
{'loss': 0.2262, 'grad_norm': 15.565313339233398, 'learning_rate': 1.5365205843293494e-05, 'epoch': 0.23240371845949534}
{'loss': 0.2849, 'grad_norm': 11.781869888305664, 'learning_rate': 1.4701195219123507e-05, 'epoch': 0.2656042496679947}
{'loss': 0.2267, 'grad_norm': 12.0898742



{'eval_loss': 0.29196697473526, 'eval_accuracy': 0.9166040570999249, 'eval_precision': 0.9154302670623146, 'eval_recall': 0.9195230998509687, 'eval_f1': 0.9174721189591077, 'eval_runtime': 7.5798, 'eval_samples_per_second': 175.598, 'eval_steps_per_second': 11.082, 'epoch': 1.0}
{'eval_loss': 0.29196697473526, 'eval_accuracy': 0.9166040570999249, 'eval_precision': 0.9154302670623146, 'eval_recall': 0.9195230998509687, 'eval_f1': 0.9174721189591077, 'eval_runtime': 7.5798, 'eval_samples_per_second': 175.598, 'eval_steps_per_second': 11.082, 'epoch': 1.0}





=== RESULTADOS DE ERRO ===
Total: 1331
 Acertos: 1220
 Erros: 111
  ‚Ü≥ Falsos Positivos (prev√™ ofensa mas n√£o √©): 57
  ‚Ü≥ Falsos Negativos (n√£o detectou ofensa real): 54

===  FALSOS POSITIVOS ===
‚Ä¢ Cada dia se queimando mais
‚Ä¢ √â a treva.
‚Ä¢ Isto mesmo. Esses filhos do Sr. Presidente deveriam ficar mais falados. V√£o colocar tudo a perder. Uma l√°stima.
‚Ä¢ Jesus, √© cada uma desse cidad√£o. √â de assustar.
‚Ä¢ SENSACIONAL √â BEM P√îR AI MESMOOO, ELE EST√Å PREOCUPADO PORQU√ä SERA, QUEM √ë DEVE √ë TEME, SIMPLES ASSIM!!!
‚Ä¢ O Brasil lutando para sair da Grota e alguns preocupado com a tal de Greta. Vamos ficar at√© quando nessa?
‚Ä¢ Aro
‚Ä¢ A t√° ,a esquerda pode falar que vai incendiar o pa√≠s e ningu√©m fala nada.
‚Ä¢ E n√≥s que podemos ter perdido uma Greta Thunberg cruelmente assasinada por witizelassassino witizelgenocida. √Åghata nossa Menina Maravilha tinha apenas 8 anos seu discurso era apenas a boneca M√¥nica, mas quem poder√° afirmar ao contr√°rio que aos 16 anos 

In [None]:
import torch
import random
import re
import nltk
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer, util

# Make sure the NLTK stopword corpus is available, then build the Portuguese set.
# NOTE(review): in the visible part of the function below, the stopword set is
# rebuilt locally rather than read from `stopwords_pt` -- check whether this
# global is used anywhere else.
nltk.download("stopwords")
stopwords_pt = set(stopwords.words("portuguese"))

# Load the SBERT model (used to measure semantic similarity)
sbert = SentenceTransformer('/kaggle/input/miniml/paraphrase-multilingual-MiniLM-L12-v2')


def debug_mlm_augment_conditional_one_token_dataset(
    text,
    tokenizer,
    model,
    device,
    n_aug=10,
    top_k=10,
    max_tries=20,
    min_similarity=0.7
):
    """
    Verbose MLM augmenter: swap one word of `text` for masked-LM suggestions.

    One candidate token at a time is replaced by the tokenizer's mask token,
    the `top_k` highest-scoring vocabulary entries at that position are tried
    in random order, and a rewritten sentence is kept when its cosine
    similarity to the original (embeddings from the module-level `sbert`) is
    at least `min_similarity`. Every acceptance and rejection is printed.

    Candidate tokens are alphabetic, are not WordPiece fragments (`##...`),
    are not followed by a fragment, and are not in the module-level
    `stopwords_pt` set.

    Args:
        text: Sentence to augment.
        tokenizer: Tokenizer exposing `tokenize`, `mask_token`,
            `mask_token_id`, `convert_tokens_to_string` and
            `convert_ids_to_tokens`, and callable on a string.
        model: Masked-LM whose output exposes `.logits`.
            NOTE(review): this function never calls `model.eval()`;
            presumably the caller passes a model already in eval mode.
        device: Device the tokenized inputs are moved to.
        n_aug: Maximum number of augmented sentences to return.
        top_k: Number of replacement tokens considered per masked position.
        max_tries: Maximum number of positions that will be masked.
        min_similarity: Lowest accepted cosine similarity to the original.

    Returns:
        List of unique accepted sentences (at most `n_aug`). Empty when the
        text has fewer than 3 tokens or contains no maskable word.
    """
    augmented_samples = []

    print("=" * 100)
    print(f"📝 Texto original: {text}")
    print("-" * 100)

    # Replace typographic quotes with ASCII ones. The code points are written
    # as escapes so each character class holds exactly the two intended
    # characters no matter how this file is encoded or re-saved; a
    # mis-encoded literal class would also match letters such as "ô".
    text = re.sub("[\u2018\u2019]", "'", text)
    text = re.sub("[\u201c\u201d]", '"', text)

    tokens = tokenizer.tokenize(text)
    if len(tokens) < 3:
        print(" Frase muito curta (menos de 3 tokens).")
        print("=" * 100)
        return []

    def is_full_word(tokens, idx):
        """Return True when the token is a whole word, not part of a split one."""
        if tokens[idx].startswith("##"):
            return False
        if idx + 1 < len(tokens) and tokens[idx + 1].startswith("##"):
            return False
        return True

    # Reuse the stop-word set built once at module level instead of
    # re-reading the NLTK corpus on every call.
    candidate_indices = [
        i for i, t in enumerate(tokens)
        if t.isalpha() and is_full_word(tokens, i) and t.lower() not in stopwords_pt
    ]

    if not candidate_indices:
        print(" Nenhuma palavra candidata para mascarar.")
        print("=" * 100)
        return []

    print(f" Tokens candidatos ({len(candidate_indices)}): {[tokens[i] for i in candidate_indices]}")

    # Embedding of the (normalised) original text, computed once.
    orig_emb = sbert.encode(text, convert_to_tensor=True, show_progress_bar=False)

    # Visit each candidate position at most once, in random order. Picking
    # positions with replacement re-ran the same forward pass and the same
    # SBERT encodings for a position that had already been exhausted.
    n_positions = max(0, min(max_tries, len(candidate_indices)))
    mask_order = random.sample(candidate_indices, k=n_positions)

    total_attempts = 0
    for mask_idx in mask_order:
        if len(augmented_samples) >= n_aug:
            break

        masked_tokens = tokens.copy()
        masked_tokens[mask_idx] = tokenizer.mask_token
        masked_text = tokenizer.convert_tokens_to_string(masked_tokens)

        inputs = tokenizer(masked_text, return_tensors="pt").to(device)
        mask_positions = (inputs["input_ids"] == tokenizer.mask_token_id).nonzero(as_tuple=False)
        if mask_positions.size(0) == 0:
            print(" Nenhum [MASK] encontrado após tokenização.")
            continue
        # Column 1 of the first match is the mask's position in the sequence.
        mask_pos = mask_positions[0, 1].item()

        with torch.no_grad():
            outputs = model(**inputs)
        logits = outputs.logits[0, mask_pos]
        top_k_ids = torch.topk(logits, k=min(top_k, logits.size(0))).indices.tolist()
        # Shuffle so the accepted replacements are not always the top-ranked ones.
        random.shuffle(top_k_ids)

        for chosen_id in top_k_ids:
            total_attempts += 1
            new_token = tokenizer.convert_ids_to_tokens([chosen_id])[0]
            if new_token == "[UNK]" or new_token.startswith("##"):
                print(f" Rejeitado: token inválido ({new_token})")
                continue

            new_tokens = tokens.copy()
            new_tokens[mask_idx] = new_token
            new_text = tokenizer.convert_tokens_to_string(new_tokens).strip()

            if "[UNK]" in new_text:
                print(f" Rejeitado: contém [UNK] → {new_text}")
                continue

            if new_text.lower() == text.strip().lower():
                print(f" Rejeitado: igual ao original → {new_text}")
                continue

            # Semantic similarity between the original and the rewritten text.
            new_emb = sbert.encode(new_text, convert_to_tensor=True, show_progress_bar=False)
            sim = util.cos_sim(orig_emb, new_emb).item()

            if sim < min_similarity:
                print(f" Rejeitado (sim={sim:.3f} < {min_similarity}) → {new_text}")
                continue

            if new_text in augmented_samples:
                print(f" Já gerado → {new_text}")
                continue

            augmented_samples.append(new_text)
            print(f"✅ Aceito (sim={sim:.3f}) → {new_text}")

            if len(augmented_samples) >= n_aug:
                break

    print("-" * 100)
    print(f" Total aceitas: {len(augmented_samples)} / {n_aug} (em {total_attempts} tentativas)")
    print("=" * 100)
    return augmented_samples


# Exemplo de uso com seu dataset
import pandas as pd

train_path = "/kaggle/input/treinamento-e-teste/train_df.csv"
train_df = pd.read_csv(train_path)

# Run the verbose augmenter over the first 50 training texts; only the printed
# trace matters here, the returned list is overwritten on each iteration.
debug_settings = {
    "tokenizer": tokenizer,
    "model": model,
    "device": device,
    "n_aug": 10,
    "top_k": 10,
    "max_tries": 20,
    "min_similarity": 0.7,
}
for texto in train_df["text"].head(50):
    novas = debug_mlm_augment_conditional_one_token_dataset(text=texto, **debug_settings)

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


📝 Texto original: Parabéns meu presidente
----------------------------------------------------------------------------------------------------
 Tokens candidatos (1): ['presidente']
✅ Aceito (sim=0.715) → Parabéns meu !
 Rejeitado (sim=0.688 < 0.7) → Parabéns meu filho
 Rejeitado (sim=0.655 < 0.7) → Parabéns meu amor
 Rejeitado (sim=0.664 < 0.7) → Parabéns meu pai
 Rejeitado (sim=0.680 < 0.7) → Parabéns meu neto
✅ Aceito (sim=0.725) → Parabéns meu caro
 Rejeitado (sim=0.671 < 0.7) → Parabéns meu amigo
✅ Aceito (sim=0.743) → Parabéns meu .
 Rejeitado (sim=0.591 < 0.7) → Parabéns meu coração
 Rejeitado (sim=0.624 < 0.7) → Parabéns meu irmão
 Rejeitado (sim=0.688 < 0.7) → Parabéns meu filho
 Já gerado → Parabéns meu .
 Rejeitado (sim=0.655 < 0.7) → Parabéns meu amor
 Já gerado → Parabéns meu caro
 Rejeitado (sim=0.664 < 0.7) → Parabéns meu pai
 Rejeitado (sim=0.671 < 0.7) → Parabéns meu amigo
 Já gerado → Parabéns meu !
 R

In [None]:
import torch
import random
import re
import nltk
import pandas as pd
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer, util

# ============================================================
# 1. Setup
# ============================================================
# Make sure the NLTK stop-word corpus is available, then keep the Portuguese
# list as a set.
nltk.download("stopwords")
stopwords_pt = {*stopwords.words("portuguese")}

# Sentence-BERT model, loaded from the local Kaggle dataset directory.
sbert = SentenceTransformer(
    "/kaggle/input/miniml/paraphrase-multilingual-MiniLM-L12-v2"
)

# ============================================================
# 2. Data-augmentation function (no prints)
# ============================================================
def mlm_augment_clean(
    text,
    tokenizer,
    model,
    device,
    n_aug=10,
    top_k=10,
    max_tries=20,
    min_similarity=0.7,
    max_similarity=1.0
):
    """
    Silent MLM augmenter: swap one word of `text` for masked-LM suggestions.

    One candidate token at a time is replaced by the tokenizer's mask token,
    the `top_k` highest-scoring vocabulary entries at that position are tried
    in random order, and a rewritten sentence is kept when its cosine
    similarity to the original (embeddings from the module-level `sbert`)
    lies in `[min_similarity, max_similarity]`.

    Candidate tokens are alphabetic, are not WordPiece fragments (`##...`),
    are not followed by a fragment, and are not in the module-level
    `stopwords_pt` set.

    Args:
        text: Sentence to augment.
        tokenizer: Tokenizer exposing `tokenize`, `mask_token`,
            `mask_token_id`, `convert_tokens_to_string` and
            `convert_ids_to_tokens`, and callable on a string.
        model: Masked-LM whose output exposes `.logits`.
            NOTE(review): this function never calls `model.eval()`;
            presumably the caller passes a model already in eval mode.
        device: Device the tokenized inputs are moved to.
        n_aug: Maximum number of augmented sentences to return.
        top_k: Number of replacement tokens considered per masked position.
        max_tries: Maximum number of positions that will be masked.
        min_similarity: Lowest accepted cosine similarity (inclusive).
        max_similarity: Highest accepted cosine similarity (inclusive).

    Returns:
        List of unique accepted sentences (at most `n_aug`). Empty when the
        text has fewer than 3 tokens or contains no maskable word.
    """
    augmented_samples = []

    # Replace typographic quotes with ASCII ones. The code points are written
    # as escapes so each character class holds exactly the two intended
    # characters no matter how this file is encoded or re-saved; a
    # mis-encoded literal class would also match letters such as "ô".
    text = re.sub("[\u2018\u2019]", "'", text)
    text = re.sub("[\u201c\u201d]", '"', text)

    tokens = tokenizer.tokenize(text)
    if len(tokens) < 3:
        return []

    def is_full_word(tokens, idx):
        """Return True when the token is a whole word, not part of a split one."""
        if tokens[idx].startswith("##"):
            return False
        if idx + 1 < len(tokens) and tokens[idx + 1].startswith("##"):
            return False
        return True

    # Reuse the stop-word set built once at module level instead of
    # re-reading the NLTK corpus on every call.
    candidate_indices = [
        i for i, t in enumerate(tokens)
        if t.isalpha() and is_full_word(tokens, i) and t.lower() not in stopwords_pt
    ]
    if not candidate_indices:
        return []

    # Embedding of the (normalised) original text, computed once.
    orig_emb = sbert.encode(text, convert_to_tensor=True, show_progress_bar=False)
    original_lower = text.strip().lower()

    # Visit each candidate position at most once, in random order. Picking
    # positions with replacement re-ran the same forward pass and the same
    # SBERT encodings for a position that had already been exhausted.
    n_positions = max(0, min(max_tries, len(candidate_indices)))
    mask_order = random.sample(candidate_indices, k=n_positions)

    for mask_idx in mask_order:
        if len(augmented_samples) >= n_aug:
            break

        masked_tokens = tokens.copy()
        masked_tokens[mask_idx] = tokenizer.mask_token
        masked_text = tokenizer.convert_tokens_to_string(masked_tokens)

        inputs = tokenizer(masked_text, return_tensors="pt").to(device)
        mask_positions = (inputs["input_ids"] == tokenizer.mask_token_id).nonzero(as_tuple=False)
        if mask_positions.size(0) == 0:
            continue
        # Column 1 of the first match is the mask's position in the sequence.
        mask_pos = mask_positions[0, 1].item()

        with torch.no_grad():
            outputs = model(**inputs)
        logits = outputs.logits[0, mask_pos]
        top_k_ids = torch.topk(logits, k=min(top_k, logits.size(0))).indices.tolist()
        # Shuffle so the accepted replacements are not always the top-ranked ones.
        random.shuffle(top_k_ids)

        for chosen_id in top_k_ids:
            new_token = tokenizer.convert_ids_to_tokens([chosen_id])[0]
            if new_token == "[UNK]" or new_token.startswith("##"):
                continue

            new_tokens = tokens.copy()
            new_tokens[mask_idx] = new_token
            new_text = tokenizer.convert_tokens_to_string(new_tokens).strip()

            # new_text is stripped, so compare against the stripped original.
            if "[UNK]" in new_text or new_text.lower() == original_lower:
                continue

            new_emb = sbert.encode(new_text, convert_to_tensor=True, show_progress_bar=False)
            sim = util.cos_sim(orig_emb, new_emb).item()

            if not (min_similarity <= sim <= max_similarity):
                continue

            if new_text in augmented_samples:
                continue

            augmented_samples.append(new_text)

            if len(augmented_samples) >= n_aug:
                break

    return augmented_samples

# ============================================================
# 3. Build the five augmented training sets
# ============================================================
# Output-file tag -> (min, max) cosine-similarity window passed to the
# augmenter.
intervalos = {
    "sim_0.65_1.00": (0.65, 1.00),
    "sim_0.70_1.00": (0.70, 1.00),
    "sim_0.65_0.90": (0.65, 0.90),
    "sim_0.70_0.90": (0.70, 0.90),
    "sim_0.70_0.85": (0.70, 0.85),
}

train_path = "/kaggle/input/treinamento-e-teste/train_df.csv"
train_df = pd.read_csv(train_path)

# One pass over the training set per similarity window; each pass writes the
# original rows plus the accepted augmentations to its own CSV.
for nome, (min_sim, max_sim) in intervalos.items():
    print(f" Gerando dataset {nome} (intervalo {min_sim}-{max_sim})")

    aug_texts, aug_labels = [], []
    # Iterate the two needed columns directly; iterrows() builds a Series
    # per row only to read these two values back out.
    for texto, rotulo in zip(train_df["text"], train_df["class"]):
        if not isinstance(texto, str):
            # Empty CSV cells are read as NaN (a float), which the
            # augmenter's re.sub cannot process; there is nothing to augment.
            continue
        novas = mlm_augment_clean(
            text=texto,
            tokenizer=tokenizer,
            model=model,
            device=device,
            n_aug=10,
            top_k=10,
            max_tries=20,
            min_similarity=min_sim,
            max_similarity=max_sim,
        )
        # Every augmented sentence inherits the label of its source row.
        aug_texts.extend(novas)
        aug_labels.extend([rotulo] * len(novas))

    df_aug = pd.DataFrame({"text": aug_texts, "class": aug_labels})
    df_final = pd.concat([train_df, df_aug], ignore_index=True)
    out_path = f"/kaggle/working/train_aug_{nome}.csv"
    df_final.to_csv(out_path, index=False)

    print(f" Salvo: {out_path} — {len(df_final)} linhas totais\n")

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


 Gerando dataset sim_0.65_1.00 (intervalo 0.65-1.0)
 Salvo: /kaggle/working/train_aug_sim_0.65_1.00.csv — 43250 linhas totais

 Gerando dataset sim_0.70_1.00 (intervalo 0.7-1.0)
 Salvo: /kaggle/working/train_aug_sim_0.70_1.00.csv — 42459 linhas totais

 Gerando dataset sim_0.65_0.90 (intervalo 0.65-0.9)
 Salvo: /kaggle/working/train_aug_sim_0.65_0.90.csv — 32193 linhas totais

 Gerando dataset sim_0.70_0.90 (intervalo 0.7-0.9)
 Salvo: /kaggle/working/train_aug_sim_0.70_0.90.csv — 30551 linhas totais

 Gerando dataset sim_0.70_0.85 (intervalo 0.7-0.85)
 Salvo: /kaggle/working/train_aug_sim_0.70_0.85.csv — 22515 linhas totais



In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch
import os

# ============================================================
# 1) Configuration
# ============================================================
model_name = "/kaggle/input/bertimbau-tcc-model/bert-base-portuguese-cased"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained(model_name)

base_input = "/kaggle/input/treinamentoad"

# Run label -> augmented training CSV written by the generation step.
datasets_paths = {
    tag: f"/kaggle/working/train_aug_{tag}.csv"
    for tag in (
        "sim_0.65_1.00",
        "sim_0.70_1.00",
        "sim_0.65_0.90",
        "sim_0.70_0.90",
        "sim_0.70_0.85",
    )
}

# Held-out test split; its label column is renamed to "labels" to match the
# column name selected for the torch-formatted datasets.
test_path = "/kaggle/input/treinamento-e-teste/test_df.csv"
test_df = pd.read_csv(test_path).rename(columns={"class": "labels"})

# ============================================================
# 2) Utility functions
# ============================================================
def tokenize_function(examples):
    """Tokenize `examples["text"]` with the module-level `tokenizer`.

    Sequences are padded to, and truncated at, 128 tokens. Used as the
    callback for `Dataset.map(..., batched=True)`.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    return tokenizer(examples["text"], **encode_options)

def compute_metrics(p):
    """Return accuracy, precision, recall and F1 for a prediction object.

    `p.predictions` holds per-class scores (argmax over axis 1 gives the
    predicted class) and `p.label_ids` the reference labels. Precision,
    recall and F1 use scikit-learn's ``average="binary"``.
    """
    y_true = p.label_ids
    y_pred = np.argmax(p.predictions, axis=1)
    prf = precision_recall_fscore_support(y_true, y_pred, average="binary")
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": prf[0],
        "recall": prf[1],
        "f1": prf[2],
    }

# ============================================================
# 3) Train and evaluate one classifier per augmented dataset
# ============================================================
results = []

# The test split is the same for every run, so tokenize and format it once
# instead of repeating the work inside the loop.
test_dataset = Dataset.from_pandas(test_df).map(tokenize_function, batched=True)
test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

for name, path in datasets_paths.items():
    print(f"\n Treinando modelo para dataset: {name}")

    # Load the augmented training set and align the label column name.
    train_df = pd.read_csv(path)
    if "class" in train_df.columns:
        train_df = train_df.rename(columns={"class": "labels"})

    # Clean-up: drop repeated texts (keeping the first occurrence) and rows
    # with a missing text or label.
    train_df = (
        train_df.drop_duplicates(subset=["text"], keep="first")
        .dropna(subset=["text", "labels"])
        .reset_index(drop=True)
    )
    train_df["text"] = train_df["text"].astype(str)
    # A label column that contained NaN is parsed as float; cast it back so
    # the labels are integer class ids after the rows above were dropped.
    train_df["labels"] = train_df["labels"].astype(int)

    # Tokenization
    train_dataset = Dataset.from_pandas(train_df).map(tokenize_function, batched=True)
    train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

    # A fresh pretrained model per dataset so the runs do not share weights.
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)

    training_args = TrainingArguments(
        output_dir=f"./results_{name}",
        eval_strategy="epoch",
        save_strategy="no",
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=1,
        weight_decay=0.01,
        logging_dir=f"./logs_{name}",
        logging_steps=100,
        report_to="none"  # no WandB logging
    )

    # `tokenizer=` is deprecated on Trainer in favour of `processing_class=`.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        processing_class=tokenizer,
        compute_metrics=compute_metrics
    )

    trainer.train()
    metrics = trainer.evaluate()

    results.append({
        "dataset": name,
        "accuracy": metrics["eval_accuracy"],
        "precision": metrics["eval_precision"],
        "recall": metrics["eval_recall"],
        "f1": metrics["eval_f1"]
    })

# ============================================================
# 4) Comparative summary
# ============================================================
# One row per dataset, best F1 first.
results_df = (
    pd.DataFrame(results)
    .sort_values(by="f1", ascending=False)
    .reset_index(drop=True)
)

print("\n RESULTADOS FINAIS")
print(results_df)

# Persist the comparison table next to the generated datasets.
results_df.to_csv("/kaggle/working/bert_results_comparativo.csv", index=False)
print("\n Resultados salvos em /kaggle/working/bert_results_comparativo.csv")


 Treinando modelo para dataset: sim_0.65_1.00


Map:   0%|          | 0/43042 [00:00<?, ? examples/s]

Map:   0%|          | 0/1331 [00:00<?, ? examples/s]

  trainer = Trainer(


{'loss': 0.4966, 'grad_norm': 9.776448249816895, 'learning_rate': 1.9264214046822744e-05, 'epoch': 0.03716090672612412}
{'loss': 0.2716, 'grad_norm': 9.232985496520996, 'learning_rate': 1.8520995912300262e-05, 'epoch': 0.07432181345224824}
{'loss': 0.2242, 'grad_norm': 14.672833442687988, 'learning_rate': 1.7777777777777777e-05, 'epoch': 0.11148272017837235}
{'loss': 0.2181, 'grad_norm': 9.53431224822998, 'learning_rate': 1.7034559643255298e-05, 'epoch': 0.14864362690449648}
{'loss': 0.2009, 'grad_norm': 11.961630821228027, 'learning_rate': 1.6291341508732813e-05, 'epoch': 0.18580453363062058}
{'loss': 0.1617, 'grad_norm': 1.8552415370941162, 'learning_rate': 1.554812337421033e-05, 'epoch': 0.2229654403567447}
{'loss': 0.1348, 'grad_norm': 7.390620231628418, 'learning_rate': 1.480490523968785e-05, 'epoch': 0.2601263470828688}
{'loss': 0.1331, 'grad_norm': 0.32121601700782776, 'learning_rate': 1.4061687105165367e-05, 'epoch': 0.29728725380899296}
{'loss': 0.1296, 'grad_norm': 19.9239177



{'eval_loss': 0.47083353996276855, 'eval_accuracy': 0.9098422238918107, 'eval_precision': 0.8998548621190131, 'eval_recall': 0.9239940387481371, 'eval_f1': 0.911764705882353, 'eval_runtime': 7.4974, 'eval_samples_per_second': 177.529, 'eval_steps_per_second': 11.204, 'epoch': 1.0}

 Treinando modelo para dataset: sim_0.70_1.00


Map:   0%|          | 0/42297 [00:00<?, ? examples/s]

Map:   0%|          | 0/1331 [00:00<?, ? examples/s]

  trainer = Trainer(


{'loss': 0.4894, 'grad_norm': 11.112591743469238, 'learning_rate': 1.9251134644478065e-05, 'epoch': 0.037821482602118005}
{'loss': 0.279, 'grad_norm': 12.085649490356445, 'learning_rate': 1.8494704992435706e-05, 'epoch': 0.07564296520423601}
{'loss': 0.2349, 'grad_norm': 3.029345750808716, 'learning_rate': 1.7738275340393343e-05, 'epoch': 0.11346444780635401}
{'loss': 0.2203, 'grad_norm': 22.723520278930664, 'learning_rate': 1.6981845688350985e-05, 'epoch': 0.15128593040847202}
{'loss': 0.1687, 'grad_norm': 7.264078140258789, 'learning_rate': 1.6225416036308626e-05, 'epoch': 0.18910741301059}
{'loss': 0.1779, 'grad_norm': 5.5061564445495605, 'learning_rate': 1.5468986384266263e-05, 'epoch': 0.22692889561270801}
{'loss': 0.1414, 'grad_norm': 21.42954444885254, 'learning_rate': 1.4712556732223904e-05, 'epoch': 0.264750378214826}
{'loss': 0.1227, 'grad_norm': 3.3844833374023438, 'learning_rate': 1.3956127080181545e-05, 'epoch': 0.30257186081694404}
{'loss': 0.1291, 'grad_norm': 11.4685287



{'eval_loss': 0.5128214359283447, 'eval_accuracy': 0.9098422238918107, 'eval_precision': 0.9021897810218978, 'eval_recall': 0.9210134128166915, 'eval_f1': 0.911504424778761, 'eval_runtime': 7.9681, 'eval_samples_per_second': 167.04, 'eval_steps_per_second': 10.542, 'epoch': 1.0}

 Treinando modelo para dataset: sim_0.65_0.90


Map:   0%|          | 0/32022 [00:00<?, ? examples/s]

Map:   0%|          | 0/1331 [00:00<?, ? examples/s]

  trainer = Trainer(


{'loss': 0.4423, 'grad_norm': 6.67209529876709, 'learning_rate': 1.901098901098901e-05, 'epoch': 0.04995004995004995}
{'loss': 0.2673, 'grad_norm': 9.299182891845703, 'learning_rate': 1.8011988011988013e-05, 'epoch': 0.0999000999000999}
{'loss': 0.2117, 'grad_norm': 28.370769500732422, 'learning_rate': 1.7012987012987013e-05, 'epoch': 0.14985014985014986}
{'loss': 0.2007, 'grad_norm': 18.101728439331055, 'learning_rate': 1.6013986013986016e-05, 'epoch': 0.1998001998001998}
{'loss': 0.2008, 'grad_norm': 0.616071343421936, 'learning_rate': 1.5014985014985016e-05, 'epoch': 0.24975024975024976}
{'loss': 0.1685, 'grad_norm': 0.3248981237411499, 'learning_rate': 1.4015984015984017e-05, 'epoch': 0.2997002997002997}
{'loss': 0.1692, 'grad_norm': 0.8367102146148682, 'learning_rate': 1.3016983016983018e-05, 'epoch': 0.34965034965034963}
{'loss': 0.1542, 'grad_norm': 20.012964248657227, 'learning_rate': 1.201798201798202e-05, 'epoch': 0.3996003996003996}
{'loss': 0.1501, 'grad_norm': 11.327555656



{'eval_loss': 0.40883952379226685, 'eval_accuracy': 0.9135987978963186, 'eval_precision': 0.9161676646706587, 'eval_recall': 0.9120715350223547, 'eval_f1': 0.9141150112023899, 'eval_runtime': 7.5127, 'eval_samples_per_second': 177.167, 'eval_steps_per_second': 11.181, 'epoch': 1.0}

 Treinando modelo para dataset: sim_0.70_0.90


Map:   0%|          | 0/30425 [00:00<?, ? examples/s]

Map:   0%|          | 0/1331 [00:00<?, ? examples/s]

  trainer = Trainer(


{'loss': 0.4668, 'grad_norm': 6.30980110168457, 'learning_rate': 1.8958990536277605e-05, 'epoch': 0.052576235541535225}
{'loss': 0.2816, 'grad_norm': 12.72677993774414, 'learning_rate': 1.79074658254469e-05, 'epoch': 0.10515247108307045}
{'loss': 0.2451, 'grad_norm': 26.101152420043945, 'learning_rate': 1.6855941114616193e-05, 'epoch': 0.15772870662460567}
{'loss': 0.2084, 'grad_norm': 5.195542335510254, 'learning_rate': 1.580441640378549e-05, 'epoch': 0.2103049421661409}
{'loss': 0.1751, 'grad_norm': 7.388441562652588, 'learning_rate': 1.4752891692954785e-05, 'epoch': 0.2628811777076761}
{'loss': 0.1635, 'grad_norm': 12.211200714111328, 'learning_rate': 1.370136698212408e-05, 'epoch': 0.31545741324921134}
{'loss': 0.1539, 'grad_norm': 2.483710527420044, 'learning_rate': 1.2649842271293376e-05, 'epoch': 0.36803364879074657}
{'loss': 0.1283, 'grad_norm': 0.6179376244544983, 'learning_rate': 1.159831756046267e-05, 'epoch': 0.4206098843322818}
{'loss': 0.1296, 'grad_norm': 2.9041883945465



{'eval_loss': 0.43323007225990295, 'eval_accuracy': 0.9075882794891059, 'eval_precision': 0.9041297935103245, 'eval_recall': 0.9135618479880775, 'eval_f1': 0.9088213491475168, 'eval_runtime': 7.507, 'eval_samples_per_second': 177.3, 'eval_steps_per_second': 11.19, 'epoch': 1.0}

 Treinando modelo para dataset: sim_0.70_0.85


Map:   0%|          | 0/22416 [00:00<?, ? examples/s]

Map:   0%|          | 0/1331 [00:00<?, ? examples/s]

  trainer = Trainer(


{'loss': 0.4582, 'grad_norm': 8.00192642211914, 'learning_rate': 1.858672376873662e-05, 'epoch': 0.07137758743754462}
{'loss': 0.2877, 'grad_norm': 5.8521952629089355, 'learning_rate': 1.7159172019985725e-05, 'epoch': 0.14275517487508924}
{'loss': 0.2215, 'grad_norm': 0.45973750948905945, 'learning_rate': 1.5731620271234832e-05, 'epoch': 0.21413276231263384}
{'loss': 0.2057, 'grad_norm': 11.597918510437012, 'learning_rate': 1.430406852248394e-05, 'epoch': 0.28551034975017847}
{'loss': 0.1933, 'grad_norm': 6.175741195678711, 'learning_rate': 1.287651677373305e-05, 'epoch': 0.35688793718772305}
{'loss': 0.1831, 'grad_norm': 13.074370384216309, 'learning_rate': 1.1448965024982157e-05, 'epoch': 0.4282655246252677}
{'loss': 0.1498, 'grad_norm': 16.43013572692871, 'learning_rate': 1.0021413276231265e-05, 'epoch': 0.49964311206281226}
{'loss': 0.1335, 'grad_norm': 0.10346709191799164, 'learning_rate': 8.593861527480372e-06, 'epoch': 0.5710206995003569}
{'loss': 0.0973, 'grad_norm': 0.07456655



{'eval_loss': 0.39392223954200745, 'eval_accuracy': 0.9075882794891059, 'eval_precision': 0.9053254437869822, 'eval_recall': 0.9120715350223547, 'eval_f1': 0.9086859688195991, 'eval_runtime': 7.5033, 'eval_samples_per_second': 177.39, 'eval_steps_per_second': 11.195, 'epoch': 1.0}

 RESULTADOS FINAIS
         dataset  accuracy  precision    recall        f1
0  sim_0.65_0.90  0.913599   0.916168  0.912072  0.914115
1  sim_0.65_1.00  0.909842   0.899855  0.923994  0.911765
2  sim_0.70_1.00  0.909842   0.902190  0.921013  0.911504
3  sim_0.70_0.90  0.907588   0.904130  0.913562  0.908821
4  sim_0.70_0.85  0.907588   0.905325  0.912072  0.908686

 Resultados salvos em /kaggle/working/bert_results_comparativo.csv
