<a href="https://colab.research.google.com/github/CleissonVieira/fake-reviews-bert-ptbr/blob/main/script-completo-testeDataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
# REQUIREMENTS (install before running)
# pip install pandas
# pip install scikit-learn
# pip install datasets
# pip install transformers[torch]
# pip install accelerate -U

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

def compute_metrics(eval_pred):
    """Compute accuracy, F1, precision and recall for binary predictions.

    Args:
        eval_pred: A pair ``(logits, labels)``. The predicted class for each
            example is the argmax of ``logits`` over the last axis.

    Returns:
        dict: Keys ``'accuracy'``, ``'f1'``, ``'precision'`` and ``'recall'``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # zero_division=0 keeps the reported value at 0 when the model predicts no
    # positives (or the split contains none) without emitting a warning on
    # every evaluation pass.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='binary', zero_division=0
    )
    acc = accuracy_score(labels, predictions)

    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [14]:
# DATA PREPARATION AND SPLIT
url = 'https://raw.githubusercontent.com/CleissonVieira/fake-reviews-bert-ptbr/main/datasets/portuguese/yelp-fake-reviews-dataset-pt.csv'
df = pd.read_csv(url)

N_SAMPLES = 100  # Upper bound on the number of rows used in this run

# Build the working frame in one chain so the column conversion is applied to a
# fresh frame instead of being assigned onto a slice (which can trigger
# SettingWithCopyWarning). The sample size is capped at len(df) because
# DataFrame.sample raises when n exceeds the number of rows.
df_sampled = (
    df.sample(n=min(N_SAMPLES, len(df)), random_state=42)
    .loc[:, ['content', 'fake_review']]  # 'content' = review text, 'fake_review' = label
    .assign(fake_review=lambda frame: frame['fake_review'].astype(int))  # True/False -> 1/0
)

# Split into train and test (80/20)
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df_sampled['content'], df_sampled['fake_review'], test_size=0.2, random_state=42
)

# Load the DistilBERT tokenizer.
# NOTE(review): this checkpoint name says English/uncased while the dataset path
# says Portuguese — confirm this is intended. If it is changed, the model loaded
# in the fine-tuning cell must use the same checkpoint.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
def tokenize_function(examples):
    """Tokenize a batch of examples with the notebook-level ``tokenizer``.

    Args:
        examples: Mapping with a ``'content'`` entry holding the texts.

    Returns:
        The tokenizer output for ``examples['content']``, truncated and padded
        to exactly 512 tokens.
    """
    # Pad to a fixed length rather than to the longest text of the current
    # batch: Dataset.map tokenizes in chunks, so per-batch padding would give
    # rows of different lengths once the split spans more than one chunk, and
    # those rows cannot be stacked into a single tensor batch at training time.
    return tokenizer(examples['content'], truncation=True, padding='max_length', max_length=512)

def _tokenized_split(texts, labels):
    """Build one tokenized HuggingFace ``Dataset`` from texts and labels.

    The raw ``'content'`` column is dropped after tokenization, leaving the
    ``'labels'`` column plus whatever columns ``tokenize_function`` produced.
    """
    return (
        Dataset.from_dict({'content': texts, 'labels': labels})
        .map(tokenize_function, batched=True)
        .remove_columns(['content'])
    )


# Same pipeline for both splits: wrap, tokenize, drop the raw text.
train_dataset = _tokenized_split(train_texts, train_labels)
test_dataset = _tokenized_split(test_texts, test_labels)



[A[A

Map: 100%|██████████| 80/80 [00:00<00:00, 559.42 examples/s]


Map: 100%|██████████| 20/20 [00:00<00:00, 540.54 examples/s]


In [None]:
# FINE-TUNING
# Expose only the columns the model consumes, as PyTorch tensors.
TENSOR_COLUMNS = ['input_ids', 'attention_mask', 'labels']
train_dataset.set_format('torch', columns=TENSOR_COLUMNS)
test_dataset.set_format('torch', columns=TENSOR_COLUMNS)

# Load DistilBERT with a 2-class classification head.
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

NUM_TRAIN_EPOCHS = 2   # Number of epochs; try values between 5 and 50
TRAIN_BATCH_SIZE = 16  # Per-device training batch size
EVAL_BATCH_SIZE = 16   # Per-device evaluation batch size

# Size the warm-up from the real number of optimizer steps. A fixed 500-step
# warm-up is longer than the whole run on a small sample (e.g. 80 examples /
# batch 16 * 2 epochs = 10 steps), which would keep the learning rate near zero
# for the entire training.
# Assumes a single device and no gradient accumulation — adjust if that changes.
steps_per_epoch = -(-len(train_dataset) // TRAIN_BATCH_SIZE)  # ceiling division
total_train_steps = steps_per_epoch * NUM_TRAIN_EPOCHS
warmup_steps = total_train_steps // 10  # roughly the first 10% of training

training_args = TrainingArguments(  # Training hyper-parameters
    output_dir='./results',
    # NOTE(review): recent transformers releases rename this argument to
    # `eval_strategy`; switch if the installed version rejects this keyword.
    evaluation_strategy="epoch",  # evaluate once per epoch, so no eval_steps is needed
    logging_dir='./logs',
    num_train_epochs=NUM_TRAIN_EPOCHS,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=EVAL_BATCH_SIZE,
    warmup_steps=warmup_steps,
    weight_decay=0.01,
    logging_steps=10
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics  # Custom metrics function defined in the first cell
)

trainer.train()

In [None]:
# RESULTS PRESENTATION AND COMPARISON
# Run a final evaluation with the trainer and print the returned metrics.
results = trainer.evaluate()
print("Resultados: {}".format(results))
