In [3]:
import os
import time
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd
import numpy as np
import warnings

# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Load the dataset (CSV fetched over HTTP) and keep only the review text
# and its boolean fake/real flag.
url = 'https://raw.githubusercontent.com/CleissonVieira/fake-reviews-bert-ptbr/main/datasets/yelp-fake-reviews-dataset-pt.csv'
df = pd.read_csv(url)
df = df[['content', 'fake_review']]

# Build a balanced subset: 1250 rows per class, sampled with a fixed seed.
# `sample(n=1250)` raises if a class has fewer than 1250 rows.
df_real = df[df.fake_review == False].sample(n=1250, random_state=42)
df_fakes = df[df.fake_review == True].sample(n=1250, random_state=42)
df_balanceado = pd.concat([df_real, df_fakes]) 
# Encode the boolean flag as an integer label (False -> 0, True -> 1).
df_balanceado['fake_review'] = df_balanceado['fake_review'].astype(int)

# Module-level list; it is not populated anywhere in the visible code.
results_real_fake = []

# Initialise the tokenizer and a 2-label sequence-classification model from
# the same multilingual BERT checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-uncased', num_labels=2)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Tokenization helper
def tokenize_data(texts, labels, max_len=128):
    """Tokenize ``texts`` and attach ``labels`` to the resulting encoding.

    Args:
        texts: Array-like of strings exposing ``.tolist()`` (e.g. a NumPy
            array or a pandas Series).
        labels: Labels aligned with ``texts``; converted with
            ``torch.tensor``.
        max_len: Maximum sequence length passed to the tokenizer; longer
            inputs are truncated.

    Returns:
        The tokenizer output (PyTorch tensors, ``padding=True``) with an
        extra ``'labels'`` entry.
    """
    encoded = tokenizer(
        texts.tolist(),
        max_length=max_len,
        padding=True,
        truncation=True,
        return_tensors="pt",
    )
    label_tensor = torch.tensor(labels)
    encoded['labels'] = label_tensor
    return encoded

# Custom dataset wrapper
class FakeReviewsDataset(Dataset):
    """Index-addressable view over a mapping of per-sample sequences.

    ``encodings`` is a mapping whose values are indexable along the sample
    axis and which contains an ``'input_ids'`` entry; the dataset length is
    the length of that entry.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        # Pick the idx-th element of every field and return them together.
        sample = {}
        for field_name, field_values in self.encodings.items():
            sample[field_name] = field_values[idx]
        return sample

# Build the optimizer and the learning-rate scheduler
def configure_optimizer_scheduler(model, train_loader, epochs=5):
    """Create an AdamW optimizer and a linear schedule without warmup.

    Args:
        model: Model whose ``parameters()`` are optimized.
        train_loader: Sized iterable of training batches; its length is the
            number of optimizer steps per epoch.
        epochs: Number of epochs the schedule is spread over.

    Returns:
        ``(optimizer, scheduler)``.
    """
    # NOTE(review): `AdamW` here is the one imported from `transformers`;
    # confirm it is still provided by the installed version.
    num_training_steps = epochs * len(train_loader)
    adamw = AdamW(model.parameters(), lr=1e-5)
    linear_schedule = get_linear_schedule_with_warmup(
        adamw,
        num_warmup_steps=0,
        num_training_steps=num_training_steps,
    )
    return adamw, linear_schedule

# Training function
def train_one_epoch(model, train_loader, optimizer, scheduler, device):
    """Run one pass over ``train_loader`` and return the mean batch loss.

    Args:
        model: Callable model; invoked as ``model(**batch)`` and expected to
            return an object with a ``loss`` attribute.
        train_loader: Sized iterable yielding dict batches whose values
            expose ``.to(device)``.
        optimizer: Object exposing ``zero_grad()`` and ``step()``.
        scheduler: Object exposing ``step()``; advanced once per batch.
        device: Target passed to every ``.to(...)`` call.

    Returns:
        Sum of the per-batch loss values divided by ``len(train_loader)``.
    """
    model.train()
    batch_losses = []

    for raw_batch in train_loader:
        optimizer.zero_grad()

        # Move every field of the batch to the target device.
        on_device = {name: value.to(device) for name, value in raw_batch.items()}

        loss = model(**on_device).loss
        batch_losses.append(loss.item())

        loss.backward()
        optimizer.step()
        scheduler.step()

    return sum(batch_losses) / len(train_loader)

# Evaluation function
def evaluate_model(model, val_loader, device):
    """Predict over ``val_loader`` and compute classification metrics.

    Args:
        model: Model invoked as ``model(**batch)``; its output must expose
            ``logits``.
        val_loader: Iterable of dict batches containing a ``'labels'`` entry;
            every value must expose ``.to(device)``.
        device: Target passed to every ``.to(...)`` call.

    Returns:
        A 10-tuple ``(accuracy, precision, recall, f1, precision_real,
        precision_fake, recall_real, recall_fake, f1_real, f1_fake)``.

        ``precision``, ``recall`` and ``f1`` are scikit-learn's default
        binary scores, i.e. computed for the positive class (label 1), so
        they are identical to the ``*_fake`` values. They are kept in the
        tuple for backward compatibility.
    """
    model.eval()
    preds, true_labels = [], []

    # Inference only: no gradients are needed.
    with torch.no_grad():
        for batch in val_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            logits = model(**batch).logits
            preds.extend(torch.argmax(logits, dim=-1).cpu().numpy())
            true_labels.extend(batch['labels'].cpu().numpy())

    accuracy = accuracy_score(true_labels, preds)

    # Per-class metrics: label 0 is the "real" class, label 1 the "fake" one.
    precision_real = precision_score(true_labels, preds, pos_label=0)
    precision_fake = precision_score(true_labels, preds, pos_label=1)
    recall_real = recall_score(true_labels, preds, pos_label=0)
    recall_fake = recall_score(true_labels, preds, pos_label=1)
    f1_real = f1_score(true_labels, preds, pos_label=0)
    f1_fake = f1_score(true_labels, preds, pos_label=1)

    # The default scikit-learn call (average='binary', pos_label=1) yields
    # exactly the label-1 scores, so reuse them instead of recomputing.
    precision, recall, f1 = precision_fake, recall_fake, f1_fake

    return accuracy, precision, recall, f1, precision_real, precision_fake, recall_real, recall_fake, f1_real, f1_fake

def TreinoEValidacao(dfUtilizado, mensagem, *, n_splits=5, epochs=5, batch_size=8):
    """Fine-tune and validate BERT with stratified k-fold cross-validation.

    For every fold a *fresh* model is loaded from the pretrained checkpoint,
    trained on the training split and scored on the held-out split. The
    module-level ``model`` is deliberately not used: fine-tuning one shared
    instance across folds would let later folds validate on samples the
    model has already been trained on, inflating every score.

    The per-fold scores are aggregated (mean, sample variance, min, max) and
    appended as one row to ``bert_final_results.csv``; the header is written
    only when the file does not exist yet.

    Args:
        dfUtilizado: DataFrame with a ``content`` text column and an integer
            ``fake_review`` label column.
        mensagem: Label printed in the log header.
        n_splits: Number of cross-validation folds.
        epochs: Training epochs per fold (also used to size the scheduler).
        batch_size: Batch size for both the training and validation loaders.

    Returns:
        None.
    """
    print(f'\n{mensagem} | {len(dfUtilizado)} registros')

    # Same checkpoint as the module-level tokenizer; also recorded in the CSV.
    checkpoint = 'bert-base-multilingual-uncased'
    file_path = 'bert_final_results.csv'

    # Configure cross-validation
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Per-fold scores; the order matches the tuple returned by evaluate_model.
    metric_names = (
        'Accuracy', 'Precision', 'Recall', 'F1_Score',
        'Precision_Real', 'Precision_Fake',
        'Recall_Real', 'Recall_Fake',
        'F1_Score_Real', 'F1_Score_Fake',
    )
    metrics = {name: [] for name in metric_names}

    X = dfUtilizado['content'].values
    y = dfUtilizado['fake_review'].values

    for fold, (train_index, test_index) in enumerate(skf.split(X, y)):
        print(f"\nFold {fold + 1}")

        # Start every fold from the pretrained weights so that no fold is
        # evaluated by a model that has already seen its validation samples.
        fold_model = BertForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
        fold_model.to(device)

        # Tokenize the two splits and wrap them in loaders
        train_encodings = tokenize_data(X[train_index], y[train_index])
        val_encodings = tokenize_data(X[test_index], y[test_index])
        train_loader = DataLoader(FakeReviewsDataset(train_encodings), batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(FakeReviewsDataset(val_encodings), batch_size=batch_size)

        # The scheduler length must match the number of epochs actually run.
        optimizer, scheduler = configure_optimizer_scheduler(fold_model, train_loader, epochs=epochs)

        start_time = time.time()
        for epoch in range(epochs):
            train_loss = train_one_epoch(fold_model, train_loader, optimizer, scheduler, device)
            print(f"Epoch {epoch + 1}, Loss: {train_loss:.4f}")
        train_time = time.time() - start_time
        print(f"Fold {fold + 1} training time: {train_time:.1f}s")

        # Evaluate on the held-out split and record every score
        fold_scores = evaluate_model(fold_model, val_loader, device)
        print(f"Fold {fold + 1} Accuracy: {fold_scores[0]:.4f}")
        for name, score in zip(metric_names, fold_scores):
            metrics[name].append(score)

        # Release this fold's model before the next one is loaded.
        del fold_model, optimizer, scheduler
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    # Aggregate the fold scores. Key insertion order defines the CSV column
    # order and must stay stable because rows are appended without a header.
    final_results = {
        'tamanho_dataLoader': len(dfUtilizado),
        'scenario': 'Review',
        'classifier': checkpoint,
        'features_used': 'content',
    }
    for name in ('Precision', 'F1_Score', 'Recall', 'Accuracy'):
        if name != 'Accuracy':
            final_results[f'{name}_Real'] = np.mean(metrics[f'{name}_Real'])
            final_results[f'{name}_Fake'] = np.mean(metrics[f'{name}_Fake'])
        scores = metrics[name]
        prefix = name.lower()
        final_results[name] = np.mean(scores)
        final_results[f'{prefix}_variance'] = np.var(scores, ddof=1)
        final_results[f'{prefix}_min'] = np.min(scores)
        final_results[f'{prefix}_max'] = np.max(scores)

    results_df = pd.DataFrame([final_results]).round(5)

    # Append to the results file, emitting the header only on first creation.
    results_df.to_csv(file_path, mode='a', header=not os.path.isfile(file_path), index=False)



# Run the cross-validated training/validation on the balanced dataset; the
# second argument is the label printed in the log header.
TreinoEValidacao(df_balanceado, '1 - df_balanceado')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



1 - df_balanceado | 2500 registros

Fold 1
Epoch 1, Loss: 0.6296
Epoch 2, Loss: 0.5669
Epoch 3, Loss: 0.4935


KeyboardInterrupt: 