In [None]:
# Importações necessárias
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, classification_report
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler

# 1. Load the dataset from the local CSV file
caminho_arquivo = r"C:\Users\secad\Downloads\treino_re8\re8.csv"
dados = pd.read_csv(caminho_arquivo)

# Sanity check: show the first rows
print(dados.head())

# 2. Split into train (70%), validation (10%) and test (20%).
# First hold out 30% of the rows, then cut that hold-out roughly 1/3 vs 2/3.
train_df, temp_df = train_test_split(
    dados,
    test_size=0.3,
    random_state=42,
)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.6667,  # ~10% of the full data for validation, ~20% for test
    random_state=42,
)

print(
    "Treino: {}, Validação: {}, Teste: {}".format(
        len(train_df), len(val_df), len(test_df)
    )
)

# 3. Tokenization with BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def tokenize_texts(texts, max_length=512):
    """Tokenize a collection of texts with the module-level BERT tokenizer.

    Args:
        texts: A pandas Series, list, tuple or array of texts. Missing
            values (e.g. NaN from an empty CSV cell) are replaced by the
            empty string and every remaining entry is converted to ``str``
            so that only strings are handed to the tokenizer.
        max_length: Maximum number of tokens per text; longer texts are
            truncated to this length.

    Returns:
        The tokenizer output with PyTorch tensors (``return_tensors="pt"``),
        padded to the longest sequence in ``texts``.
    """
    # pd.Series() accepts both Series and plain sequences, so callers are no
    # longer required to pass an object that has a .tolist() method.
    cleaned_texts = pd.Series(texts).fillna("").astype(str).tolist()
    return tokenizer(
        cleaned_texts,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt"
    )

# Tokenize every split
train_encodings = tokenize_texts(train_df['text'])
val_encodings = tokenize_texts(val_df['text'])
test_encodings = tokenize_texts(test_df['text'])

# Build ONE class -> integer id mapping from the full dataset and reuse it for
# every split. Encoding each split on its own (astype('category') per split)
# derives the ids from that split's own set of classes, so a class missing
# from one split silently shifts the ids and the labels stop being comparable.
class_categories = dados['class'].astype('category').cat.categories

# dtype is given explicitly so that even an empty split yields integer labels.
train_labels = torch.tensor(
    pd.Categorical(train_df['class'], categories=class_categories).codes.tolist(),
    dtype=torch.long,
)
val_labels = torch.tensor(
    pd.Categorical(val_df['class'], categories=class_categories).codes.tolist(),
    dtype=torch.long,
)
test_labels = torch.tensor(
    pd.Categorical(test_df['class'], categories=class_categories).codes.tolist(),
    dtype=torch.long,
)

# Wrap the tensors in datasets and dataloaders
train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_labels)
val_dataset = TensorDataset(val_encodings['input_ids'], val_encodings['attention_mask'], val_labels)
test_dataset = TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'], test_labels)

# Training batches are shuffled; validation/test keep their original order.
train_loader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=16)
val_loader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=16)
test_loader = DataLoader(test_dataset, sampler=SequentialSampler(test_dataset), batch_size=16)

# 4. Train the BERT model
# The classification head is sized from the same mapping used for the labels,
# so every label id is guaranteed to be a valid output index.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(class_categories)
)

# Configure the optimizer.
# torch.optim.AdamW is used instead of the deprecated transformers.AdamW.
# eps and weight_decay are set explicitly to the values the transformers
# implementation used by default, so the optimisation behaviour is unchanged.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

# Training loop
def train(model, train_loader, val_loader, optimizer, epochs=3):
    """Fine-tune `model` and report the validation accuracy after each epoch.

    Args:
        model: Model called as ``model(input_ids, attention_mask=..., labels=...)``
            whose output exposes a ``loss`` attribute.
        train_loader: Iterable of ``(input_ids, attention_mask, labels)`` batches
            used for the gradient updates.
        val_loader: Iterable of batches passed to ``evaluate`` after each epoch.
        optimizer: Optimizer already bound to the model parameters.
        epochs: Number of passes over ``train_loader``.

    Returns:
        None. Progress (average loss and validation accuracy) is printed.
    """
    # Batches are moved to wherever the model lives, so training also works
    # when the model has been placed on a GPU.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    for epoch in range(epochs):
        print(f"Época {epoch + 1}/{epochs}")

        # evaluate() puts the model in eval mode at the end of every epoch;
        # switching back here keeps dropout active in epochs after the first.
        model.train()

        total_loss = 0.0
        for batch in train_loader:
            if device is not None:
                batch = tuple(tensor.to(device) for tensor in batch)
            input_ids, attention_mask, labels = batch

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        # Guard against an empty loader instead of raising ZeroDivisionError.
        num_batches = len(train_loader)
        avg_loss = total_loss / num_batches if num_batches else float("nan")
        print(f"Perda média: {avg_loss}")

        # The validation pass used to be computed and thrown away; report it.
        val_predictions, val_true = evaluate(model, val_loader)
        if len(val_true) > 0:
            hits = sum(int(pred == gold) for pred, gold in zip(val_predictions, val_true))
            print(f"Acurácia (validação): {hits / len(val_true):.4f}")

# Evaluation helper
def evaluate(model, dataloader):
    """Run inference over `dataloader` and collect predictions and gold labels.

    The model is switched to eval mode only for the duration of the call; its
    previous train/eval mode is restored afterwards, so calling this in the
    middle of training does not leave dropout disabled for later epochs.

    Args:
        model: Model called as ``model(input_ids, attention_mask=...)`` whose
            output exposes a ``logits`` attribute.
        dataloader: Iterable of ``(input_ids, attention_mask, labels)`` batches.

    Returns:
        A ``(predictions, true_labels)`` tuple of two lists of integer class
        ids, in the order the batches were produced.
    """
    was_training = model.training

    # Inputs are moved to the model's device; a model without parameters
    # gives no device to move to, so the batches are used as they are.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    predictions, true_labels = [], []
    model.eval()
    try:
        with torch.no_grad():
            for input_ids, attention_mask, labels in dataloader:
                if device is not None:
                    input_ids = input_ids.to(device)
                    attention_mask = attention_mask.to(device)
                logits = model(input_ids, attention_mask=attention_mask).logits
                # The highest-scoring class is the prediction for each row.
                predictions.extend(torch.argmax(logits, dim=1).tolist())
                true_labels.extend(labels.tolist())
    finally:
        # Restore whatever mode the caller had set, even if inference failed.
        model.train(was_training)
    return predictions, true_labels

# Fine-tune the model
train(model, train_loader, val_loader, optimizer, epochs=3)

# 5. Evaluate the fine-tuned model on the held-out test split
predictions, true_labels = evaluate(model, test_loader)

# Micro/macro F1, accuracy and confusion matrix for the test predictions
f1_micro, f1_macro = (
    f1_score(true_labels, predictions, average=averaging)
    for averaging in ('micro', 'macro')
)
accuracy = accuracy_score(true_labels, predictions)
conf_matrix = confusion_matrix(true_labels, predictions)

# One print call, one report line per argument
print(
    f"F1-score (Micro): {f1_micro}",
    f"F1-score (Macro): {f1_macro}",
    f"Acurácia: {accuracy}",
    "Matriz de Confusão:",
    conf_matrix,
    sep="\n",
)

      file_name                                               text class
0  acq.4342.txt  mcdowell me to merger with interpharm inc mcdo...   acq
1  acq.5302.txt  intermagnetics general inma completes buy inte...   acq
2  acq.8530.txt  tesco extends hillards offer tesco plc tsco l ...   acq
3  acq.3841.txt  healthvest hvt sells shares healthvest a maryl...   acq
4  acq.6302.txt  cooper canada said it received takeover offers...   acq
Treino: 5371, Validação: 767, Teste: 1536


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Época 1/3
