<a href="https://colab.research.google.com/github/AlcilenySilva/Bert-musicas/blob/main/treinar_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Notebook shell command: install the third-party packages listed below into
# the running environment (no versions are pinned, so pip resolves them itself).
!pip install torch transformers datasets scikit-learn pandas





In [None]:
import pandas as pd
import torch
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import os

# Turn off the Weights & Biases integration for this session.
os.environ["WANDB_DISABLED"] = "true"

# Read the train/test splits uploaded to the Colab filesystem; the explicit
# copies give each name its own independent frame before columns are rewritten.
train_df = pd.read_csv("/content/train.csv").copy()
test_df = pd.read_csv("/content/test.csv").copy()

# Rows without an `explicit` value are given label 0, and the column is
# stored as integers in both splits.
for _split in (train_df, test_df):
    _split["explicit"] = _split["explicit"].fillna(0).astype(int)

# Checkpoint name shared by the tokenizer and the model, and the tokenizer itself.
MODEL_NAME = "neuralmind/bert-base-portuguese-cased"
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

# Preparação do dataset
class MusicDataset(Dataset):
    """Map-style dataset that tokenizes song lyrics on access.

    Each item is a dict with the tokenizer's output tensors (leading batch
    dimension removed) plus a ``labels`` entry holding the integer class as a
    ``torch.long`` scalar tensor.

    Args:
        dataframe: Table with a ``lyrics`` column (text) and an ``explicit``
            column whose values are convertible to ``int``.
        tokenizer: Callable used to encode a single text. When ``None`` (the
            default) the module-level ``tokenizer`` is looked up each time an
            item is accessed, which is what the class did before this
            parameter existed.
        max_length: Length every example is padded/truncated to.

    Raises:
        KeyError: If ``dataframe`` lacks the ``lyrics`` or ``explicit`` column.
        ValueError: If ``explicit`` contains values that cannot be cast to int
            (for example missing values).
    """

    def __init__(self, dataframe, tokenizer=None, max_length=512):
        # Missing lyrics become "" instead of the literal text "nan" that a
        # bare ``astype(str)`` would produce and feed to the model as content.
        self.texts = dataframe["lyrics"].fillna("").astype(str).tolist()
        self.labels = dataframe["explicit"].astype(int).tolist()
        self._tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return its tensors together with the label."""
        # Resolve the fallback lazily so the global tokenizer only has to exist
        # by the time items are actually read.
        encode = self._tokenizer if self._tokenizer is not None else tokenizer
        encoding = encode(
            self.texts[idx],
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        # The tokenizer is asked for batched ("pt") tensors; drop the batch
        # dimension so each item is a single example.
        item = {key: val.squeeze(0) for key, val in encoding.items()}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# Wrap both dataframes in the tokenizing dataset.
train_dataset = MusicDataset(train_df)
test_dataset = MusicDataset(test_df)

# Load the pretrained checkpoint with a two-class sequence-classification head.
model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

# Training configuration: evaluate and checkpoint once per epoch.
training_args = TrainingArguments(
    output_dir="./results",
    # `evaluation_strategy` was renamed to `eval_strategy` (transformers >= 4.41)
    # and current releases reject the old keyword; the install cell does not pin
    # a version, so the current name is used here.
    eval_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=100,
    # Supported way to switch off logging integrations such as W&B; the
    # WANDB_DISABLED environment variable is reported as deprecated by the library.
    report_to="none",
)

# Função de avaliação
def compute_metrics(pred):
    """Compute accuracy, precision, recall and F1 for one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (reference labels) and
            ``predictions`` (per-class scores; the predicted class is the
            argmax over axis 1).

    Returns:
        Dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
        Precision/recall/F1 use scikit-learn's ``average="binary"`` mode.
    """
    y_true = pred.label_ids
    y_pred = np.argmax(pred.predictions, axis=1)

    results = {"accuracy": accuracy_score(y_true, y_pred)}
    prf_and_support = precision_recall_fscore_support(y_true, y_pred, average="binary")
    # zip stops after three names, so the trailing support value is dropped.
    results.update(zip(("precision", "recall", "f1"), prf_and_support))
    return results

# Assemble the Trainer from the model, arguments, datasets and metric function.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

# Fine-tune the model.
trainer.train()

# Run a final evaluation on the eval dataset and show the raw metrics dict.
metrics = trainer.evaluate()
print(metrics)

# Human-readable summary, printed one line at a time in this order.
report = [
    "\n===== Resultados de Avaliação =====",
    f"Epoch: {metrics['epoch']:.1f}",
    f"Loss de Validação: {metrics['eval_loss']:.4f}",
    f"Acurácia: {metrics['eval_accuracy'] * 100:.2f}%",
    f"Precisão: {metrics['eval_precision'] * 100:.2f}%",
    f"Recall: {metrics['eval_recall'] * 100:.2f}%",
    f"F1 Score: {metrics['eval_f1'] * 100:.2f}%",
    f"Tempo de Execução: {metrics['eval_runtime']:.2f} segundos",
    f"Amostras por Segundo: {metrics['eval_samples_per_second']:.2f}",
    f"Passos por Segundo: {metrics['eval_steps_per_second']:.2f}",
]
for line in report:
    print(line)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/43.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/210k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/647 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.400105,0.841584,0.8,0.869565,0.833333
2,0.455900,0.433335,0.811881,0.728814,0.934783,0.819048
3,0.455900,0.44078,0.841584,0.777778,0.913043,0.84
4,0.200800,0.428507,0.861386,0.82,0.891304,0.854167
5,0.200800,0.44814,0.861386,0.82,0.891304,0.854167


{'eval_loss': 0.44813984632492065, 'eval_accuracy': 0.8613861386138614, 'eval_precision': 0.82, 'eval_recall': 0.8913043478260869, 'eval_f1': 0.8541666666666666, 'eval_runtime': 3.5375, 'eval_samples_per_second': 28.552, 'eval_steps_per_second': 3.675, 'epoch': 5.0}

===== Resultados de Avaliação =====
Epoch: 5.0
Loss de Validação: 0.4481
Acurácia: 86.14%
Precisão: 82.00%
Recall: 89.13%
F1 Score: 85.42%
Tempo de Execução: 3.54 segundos
Amostras por Segundo: 28.55
Passos por Segundo: 3.67


In [None]:
# Mount Google Drive in the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')