<a href="https://colab.research.google.com/github/CleissonVieira/fake-reviews-bert-ptbr/blob/main/script-completo-testeDataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

In [2]:
# Load the dataset
url = 'https://raw.githubusercontent.com/CleissonVieira/fake-reviews-bert-ptbr/main/datasets/portuguese/data-set-teste.csv'
df = pd.read_csv(url)

# Keep only the review text ('content') and the label ('fake_review').
# Rows with a missing text or label are dropped: a NaN text cannot be
# tokenized and a NaN label cannot be cast to int. The explicit .copy() makes
# `df` an independent frame, so the column assignment below does not raise
# pandas' SettingWithCopyWarning.
df = df[['content', 'fake_review']].dropna().copy()

# Cast the label column to integers (True/False become 1/0)
df['fake_review'] = df['fake_review'].astype(int)

# Split into train and test (80/20). Stratifying on the label keeps the
# fake/genuine proportion the same in both splits.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['content'],
    df['fake_review'],
    test_size=0.2,
    random_state=42,
    stratify=df['fake_review'],
)

# Build the Hugging Face datasets. Plain Python lists are passed instead of
# pandas Series so that only the values (not the shuffled index) are used.
train_dataset = Dataset.from_dict({
    'content': train_texts.tolist(),
    'labels': train_labels.tolist(),
})
test_dataset = Dataset.from_dict({
    'content': test_texts.tolist(),
    'labels': test_labels.tolist(),
})

In [3]:
# Load the DistilBERT tokenizer.
# NOTE(review): this checkpoint is English and uncased, while the dataset URL
# points to a Portuguese folder -- consider a multilingual or Portuguese
# checkpoint. The classification model must be loaded from the same checkpoint
# as the tokenizer, so change both together.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')


def tokenize_function(examples):
    """Tokenize a batch of reviews into fixed-length model inputs.

    Args:
        examples: a batch from `Dataset.map(batched=True)`; the review texts
            are read from `examples['content']`.

    Returns:
        The tokenizer output for the batch, truncated and padded to exactly
        512 tokens per example.
    """
    # padding='max_length' (instead of padding=True) pads every row to the
    # same length. padding=True only pads to the longest text of the current
    # map batch, so rows coming from different map batches could end up with
    # different lengths and fail to stack into one tensor at training time.
    return tokenizer(examples['content'], truncation=True, padding='max_length', max_length=512)

# Tokenize each split, then discard the raw 'content' text column,
# which is no longer needed once the token ids exist.
train_dataset = train_dataset.map(tokenize_function, batched=True).remove_columns(['content'])
test_dataset = test_dataset.map(tokenize_function, batched=True).remove_columns(['content'])

# Expose the model inputs and labels as PyTorch tensors on both splits
tensor_columns = ['input_ids', 'attention_mask', 'labels']
for split in (train_dataset, test_dataset):
    split.set_format('torch', columns=tensor_columns)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]



Map:   0%|          | 0/499 [00:00<?, ? examples/s]

Map:   0%|          | 0/125 [00:00<?, ? examples/s]

In [4]:
# Load DistilBERT with a sequence-classification head for 2 labels
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

# Training configuration
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` -- confirm against the installed version.
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",  # evaluate once at the end of every epoch
    logging_dir='./logs',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Warm up over the first 10% of the optimizer steps. The previous fixed
    # warmup_steps=500 was longer than the entire run (499 training examples
    # at batch size 16 for 3 epochs is roughly 96 steps on a single device),
    # so the learning rate never finished ramping up.
    warmup_ratio=0.1,
    weight_decay=0.01,
    logging_steps=10,
    # eval_steps was dropped: it only applies to step-based evaluation and
    # has no effect when evaluating per epoch.
)

# Build the Trainer; the test split doubles as the evaluation set
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Train the model using the Trainer configured in the previous cell
trainer.train()

Epoch,Training Loss,Validation Loss
