In [44]:
from datasets import load_dataset, Dataset
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding
import torch

# Configurações 
device = "cpu"
model_name = "neuralmind/bert-base-portuguese-cased"
csv_file = "dataset.csv"

# Carregar dataset
dataset = load_dataset("csv", data_files=csv_file)['train']

# Converter colunas para listas
texts = list(dataset['text'])
labels = list(dataset['label'])

# Separar treino e validação
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42
)

# Criar Dataset Hugging Face
train_dataset = Dataset.from_dict({"text": train_texts, "label": train_labels})
val_dataset = Dataset.from_dict({"text": val_texts, "label": val_labels})

label_map = {"improdutivo": 0, "produtivo": 1}

def encode_labels(batch):
    batch["label"] = label_map[batch["label"]]
    return batch

train_dataset = train_dataset.map(encode_labels)
val_dataset = val_dataset.map(encode_labels)

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/688 [00:00<?, ? examples/s]

Map:   0%|          | 0/172 [00:00<?, ? examples/s]

In [45]:
# Tokenização
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    return tokenizer(batch['text'], padding=True, truncation=True, max_length=128)

train_dataset = train_dataset.map(tokenize, batched=True)
val_dataset = val_dataset.map(tokenize, batched=True)

Map:   0%|          | 0/688 [00:00<?, ? examples/s]

Map:   0%|          | 0/172 [00:00<?, ? examples/s]

In [46]:
# Preparar modelo para classificação
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    id2label={0: "improdutivo", 1: "produtivo"},
    label2id={"improdutivo": 0, "produtivo": 1}
)

# Data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [47]:
# Treinamento
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    use_cpu=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator
)

trainer.train()

Epoch,Training Loss,Validation Loss
1,0.1234,0.252836
2,0.0811,0.217498
3,0.0009,0.263434


TrainOutput(global_step=258, training_loss=0.11281332519815065, metrics={'train_runtime': 1174.1942, 'train_samples_per_second': 1.758, 'train_steps_per_second': 0.22, 'total_flos': 56215321421760.0, 'train_loss': 0.11281332519815065, 'epoch': 3.0})

In [2]:
from transformers import pipeline

# Avaliação com pipeline
clf_pipeline = pipeline(
    task="text-classification",
    model=model,
    tokenizer=tokenizer,
    device="cpu"
)

# Teste
print(clf_pipeline("Preciso de uma explicação do seu documento."))

NameError: name 'model' is not defined

In [49]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report

# Prever no dataset de validação
val_texts_list = list(val_dataset['text'])
val_labels_list = list(val_dataset['label'])

# pipeline já treinado
predictions = clf_pipeline(val_texts_list, batch_size=8)

# Extrair apenas os labels previstos
pred_labels = [label_map[p['label']] if p['label'] in label_map else int(p['label'].split("_")[-1]) for p in predictions]

# Métricas
acc = accuracy_score(val_labels_list, pred_labels)
f1 = f1_score(val_labels_list, pred_labels)
cm = confusion_matrix(val_labels_list, pred_labels)
report = classification_report(val_labels_list, pred_labels, target_names=["improdutivo", "produtivo"])

print(f"Acurácia: {acc:.4f}")
print(f"F1-score: {f1:.4f}")
print("Matriz de Confusão:")
print(cm)
print("\nRelatório Completo:")
print(report)

Acurácia: 0.9477
F1-score: 0.9581
Matriz de Confusão:
[[ 60   5]
 [  4 103]]

Relatório Completo:
              precision    recall  f1-score   support

 improdutivo       0.94      0.92      0.93        65
   produtivo       0.95      0.96      0.96       107

    accuracy                           0.95       172
   macro avg       0.95      0.94      0.94       172
weighted avg       0.95      0.95      0.95       172



In [3]:
# salvar modelo
model.config.id2label = {0: "improdutivo", 1: "produtivo"}
model.config.label2id = {"improdutivo": 0, "produtivo": 1}

model.save_pretrained("models/saved_model")
tokenizer.save_pretrained("models/saved_model")

('saved_model/tokenizer_config.json',
 'saved_model/special_tokens_map.json',
 'saved_model/vocab.txt',
 'saved_model/added_tokens.json',
 'saved_model/tokenizer.json')