In [1]:
import pandas as pd

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import re

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score

import torch
from transformers import BertTokenizer, BertModel

# Fetch the NLTK resources this notebook relies on: the 'stopwords' corpus
# (read through stopwords.words further down) and the 'wordnet' / 'omw-1.4'
# packages (presumably required by WordNetLemmatizer - confirm against the
# installed NLTK version). Packages that are already present are reported as
# up-to-date rather than downloaded again.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/davigreco21/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/davigreco21/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/davigreco21/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

# Pré-processamento

In [10]:
def load_and_preprocess_tsv(file_path):
    """Load a headerless LIAR-style TSV file and return cleaned labels and statements.

    Column 1 of the file is read as the textual label and column 2 as the
    statement; every other column is ignored.

    Args:
        file_path: Path of the tab-separated file (no header row).

    Returns:
        A DataFrame with two columns:
          * ``label``: integer class id, 0 (``pants-fire``) to 5 (``true``).
          * ``statement``: lower-cased text with non-letter characters
            removed, English stopwords dropped and each remaining word
            lemmatized as a verb.

    Raises:
        ValueError: If the file contains a label that is not one of the six
            known classes. Previously such rows silently became NaN, which
            turned the column into floats and broke tensor creation later.
    """
    # Read only the two columns that are used; building the frame directly
    # (instead of slicing a wider one) avoids chained-assignment warnings.
    data = pd.read_csv(file_path, sep='\t', header=None, usecols=[1, 2])
    data = data.rename(columns={1: 'label', 2: 'statement'})

    # Missing statements become empty strings; everything is forced to str.
    data['statement'] = data['statement'].fillna('').astype(str)

    # Mapping from textual labels to numeric class ids.
    label_mapping = {
        'pants-fire': 0,
        'false': 1,
        'barely-true': 2,
        'half-true': 3,
        'mostly-true': 4,
        'true': 5
    }
    # Normalise before mapping: astype(str) also covers a column that pandas
    # parsed as booleans (a file containing only 'true' / 'false' labels).
    raw_labels = data['label'].astype(str).str.strip().str.lower()
    mapped_labels = raw_labels.map(label_mapping)
    unmapped = mapped_labels.isna()
    if unmapped.any():
        unknown = sorted(set(raw_labels[unmapped]))
        raise ValueError(f"Unknown label(s) in {file_path!r}: {unknown}")
    data['label'] = mapped_labels.astype('int64')

    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    # Compiled once instead of being re-parsed for every row.
    non_letters = re.compile(r'[^a-zA-Z\s]')

    def preprocess_text(text):
        # Lower-case, then strip everything that is neither a letter nor whitespace.
        text = non_letters.sub('', text.lower())
        # Drop stopwords and lemmatize the survivors in a single pass.
        return ' '.join(
            lemmatizer.lemmatize(word, pos='v')
            for word in text.split()
            if word not in stop_words
        )

    data['statement'] = data['statement'].apply(preprocess_text)
    return data


In [11]:
# Locations of the three LIAR splits.
train_path, valid_path, test_path = (
    './LIAR/train.tsv',
    './LIAR/valid.tsv',
    './LIAR/test.tsv',
)

# Load and clean every split with the shared preprocessing routine.
train_data, valid_data, test_data = (
    load_and_preprocess_tsv(split_path)
    for split_path in (train_path, valid_path, test_path)
)

# Separate the model inputs (statements) from the targets (labels).
X_train, X_valid, X_test = (
    split['statement'] for split in (train_data, valid_data, test_data)
)
y_train, y_valid, y_test = (
    split['label'] for split in (train_data, valid_data, test_data)
)

# Sample output
print("Exemplo de dados de treino:")
print(train_data.head())

Exemplo de dados de treino:
   label                                          statement
0      1  say annies list political group support thirdt...
1      3  decline coal start start natural gas take star...
2      4  hillary clinton agree john mccain vote give ge...
3      1  health care reform legislation likely mandate ...
4      3                 economic turnaround start end term


# Fine-Tuning

In [12]:
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

In [14]:
# Dataset adapted for fake-news classification
class FakeNewsDataset(Dataset):
    """Map-style dataset that tokenizes one statement per item.

    Args:
        texts: Sequence of statement strings (list, tuple, pandas Series, ...).
        labels: Sequence of integer class ids, aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding='max_length', max_length=..., return_tensors='pt')`` whose
            result exposes ``'input_ids'`` and ``'attention_mask'`` tensors
            with a leading dimension of size 1.
        max_length: Length every encoded sequence is padded/truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        # Materialise both inputs as plain lists so that __getitem__ always
        # indexes by position. A pandas Series with a non-default index (for
        # example a fold slice taken with .iloc) would otherwise be looked up
        # by label and raise KeyError or return the wrong row.
        self.texts = list(texts)
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(self.texts)} and {len(self.labels)})"
            )
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]  # numeric labels are used as-is

        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        # The tokenizer returns tensors with a leading batch dimension of 1;
        # squeeze it so the DataLoader can stack items into a batch itself.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(label, dtype=torch.long)
        }


In [15]:
# BERT encoder with a linear classification head
class BertClassifier(nn.Module):
    """Linear classifier on top of a BERT-style encoder.

    Args:
        bert_model: Encoder module. It is called as
            ``bert_model(input_ids=..., attention_mask=...)`` and must return
            an object exposing ``last_hidden_state``.
        num_classes: Number of output classes. Defaults to 6, the number of
            fake-news labels used in this notebook.

    The input width of the head is read from ``bert_model.config.hidden_size``
    when available and falls back to 768 otherwise, so encoders of other
    sizes can be plugged in without editing the class.
    """

    def __init__(self, bert_model, num_classes=6):
        super().__init__()
        self.bert = bert_model
        encoder_config = getattr(bert_model, 'config', None)
        hidden_size = getattr(encoder_config, 'hidden_size', 768)
        self.classifier = nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return one row of unnormalised class scores (logits) per input sequence."""
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Hidden state at the first sequence position (the [CLS] token when a
        # BERT tokenizer produced the ids) serves as the sequence summary.
        cls_output = outputs.last_hidden_state[:, 0, :]
        logits = self.classifier(cls_output)
        return logits

# Model setup: load the pretrained encoder and wrap it with the classification
# head, load the matching tokenizer, then move the model to the GPU when one
# is available (CPU otherwise).
model_tuning = BertClassifier(BertModel.from_pretrained('bert-base-uncased'))
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model_tuning.to(device)

BertClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [16]:
# Training function
def train_epoch(model, optimizer, train_loader, criterion):
    """Run one optimisation pass over ``train_loader`` and return the mean batch loss.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        optimizer: Optimizer that updates ``model``'s parameters.
        train_loader: Iterable of batches; each batch is a mapping with the
            keys ``'input_ids'``, ``'attention_mask'`` and ``'label'``.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.

    Returns:
        The average of the per-batch loss values as a float.

    Raises:
        ValueError: If ``train_loader`` yields no batches.
    """
    model.train()
    # Send batches to wherever the model's parameters live instead of relying
    # on a notebook-level `device` variable that may be stale or undefined.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    total_loss = 0.0
    num_batches = 0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(target_device)
        attention_mask = batch['attention_mask'].to(target_device)
        labels = batch['label'].to(target_device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

    if num_batches == 0:
        raise ValueError("train_loader yielded no batches")
    # Counting batches (rather than len(train_loader)) also supports loaders
    # that do not implement __len__.
    return total_loss / num_batches

# Evaluation function
def evaluate_epoch(model, val_loader, criterion):
    """Evaluate ``model`` on ``val_loader`` without updating its weights.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``; its
            output is reduced with ``argmax`` over dimension 1 to get the
            predicted class per sample.
        val_loader: Iterable of batches; each batch is a mapping with the
            keys ``'input_ids'``, ``'attention_mask'`` and ``'label'``.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.

    Returns:
        A tuple ``(mean_batch_loss, accuracy, weighted_f1)``.

    Raises:
        ValueError: If ``val_loader`` yields no batches.
    """
    model.eval()
    # Send batches to wherever the model's parameters live instead of relying
    # on a notebook-level `device` variable that may be stale or undefined.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    total_loss = 0.0
    num_batches = 0
    predictions, true_labels = [], []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(target_device)
            attention_mask = batch['attention_mask'].to(target_device)
            labels = batch['label'].to(target_device)

            outputs = model(input_ids, attention_mask)
            total_loss += criterion(outputs, labels).item()
            num_batches += 1

            # Collect plain Python ints for the metric functions.
            predictions.extend(torch.argmax(outputs, dim=1).cpu().tolist())
            true_labels.extend(labels.cpu().tolist())

    if num_batches == 0:
        raise ValueError("val_loader yielded no batches")

    accuracy = accuracy_score(true_labels, predictions)
    # zero_division=0 gives the same score as the default for classes that are
    # never predicted, but without emitting a warning on every call.
    f1 = f1_score(true_labels, predictions, average='weighted', zero_division=0)
    return total_loss / num_batches, accuracy, f1


In [None]:
# 5-fold stratified cross-validation followed by a final train/test run.
#
# Every fold starts from freshly loaded pretrained weights and a new
# optimizer. Reusing one model across folds lets it train on samples that
# later act as validation data, which inflates the metrics of every fold
# after the first and makes the cross-validation estimate meaningless.
torch.manual_seed(42)  # reproducible classifier-head init and batch shuffling

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
criterion = nn.CrossEntropyLoss()


def _build_model_and_optimizer():
    """Return a classifier with fresh pretrained BERT weights (on `device`) and its AdamW optimizer."""
    fresh_model = BertClassifier(BertModel.from_pretrained('bert-base-uncased')).to(device)
    return fresh_model, AdamW(fresh_model.parameters(), lr=2e-5)


def _run_epochs(model, optimizer, train_loader, val_loader, epochs=3):
    """Train for `epochs` epochs, print per-epoch metrics, return the last (val_loss, val_accuracy, val_f1)."""
    for epoch in range(epochs):
        train_loss = train_epoch(model, optimizer, train_loader, criterion)
        val_loss, val_accuracy, val_f1 = evaluate_epoch(model, val_loader, criterion)

        print(f"Epoch {epoch + 1}: Train Loss = {train_loss:.4f}, Val Loss = {val_loss:.4f}, "
              f"Val Accuracy = {val_accuracy:.4f}, Val F1 = {val_f1:.4f}")
    return val_loss, val_accuracy, val_f1


fold_metrics = []
for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train)):
    print(f"Fold {fold + 1}")
    train_texts, val_texts = X_train.iloc[train_idx], X_train.iloc[val_idx]
    train_labels, val_labels = y_train.iloc[train_idx], y_train.iloc[val_idx]

    train_dataset = FakeNewsDataset(train_texts.tolist(), train_labels.tolist(), tokenizer)
    val_dataset = FakeNewsDataset(val_texts.tolist(), val_labels.tolist(), tokenizer)

    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=16)

    # Drop the references to the previous model/optimizer first so their
    # memory can be reclaimed before the next pair is allocated.
    model_tuning = optimizer = None
    model_tuning, optimizer = _build_model_and_optimizer()

    fold_metrics.append(_run_epochs(model_tuning, optimizer, train_loader, val_loader))

# Average the last-epoch validation metrics over all folds.
cv_loss, cv_accuracy, cv_f1 = (sum(values) / len(values) for values in zip(*fold_metrics))
print(f"CV mean over {len(fold_metrics)} folds: Val Loss = {cv_loss:.4f}, "
      f"Val Accuracy = {cv_accuracy:.4f}, Val F1 = {cv_f1:.4f}")

# Final model: retrain from the pretrained weights on the full training split,
# monitoring the held-out validation split, so the test score comes from a
# single, well-defined model rather than from whichever fold ran last.
model_tuning = optimizer = None
model_tuning, optimizer = _build_model_and_optimizer()

train_loader = DataLoader(
    FakeNewsDataset(X_train.tolist(), y_train.tolist(), tokenizer),
    batch_size=16,
    shuffle=True,
)
valid_loader = DataLoader(
    FakeNewsDataset(X_valid.tolist(), y_valid.tolist(), tokenizer),
    batch_size=16,
)
print("Final training on the full training split")
_run_epochs(model_tuning, optimizer, train_loader, valid_loader)

# Final evaluation on the test set
test_dataset = FakeNewsDataset(X_test.tolist(), y_test.tolist(), tokenizer)
test_loader = DataLoader(test_dataset, batch_size=16)

test_loss, test_accuracy, test_f1 = evaluate_epoch(model_tuning, test_loader, criterion)
print(f"Final Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}, Test F1: {test_f1:.4f}")