In [2]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from tqdm import tqdm

# Load datasets
df_buzzfeed = pd.read_csv("/content/drive/MyDrive/buzzfeed_news_with_filenames.csv")
df_politifact = pd.read_csv("/content/drive/MyDrive/politifact_news_with_filenames.csv")
df = pd.concat([df_buzzfeed, df_politifact], ignore_index=True)
df['label'] = df['label'].map({'fake': 0, 'real': 1})


tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

class RobertaDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=512):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        text = str(row['title']) + " " + str(row['text'])
        encoding = self.tokenizer(
            text,
            truncation=True,
            max_length=self.max_length,
            padding='max_length',
            return_tensors='pt'
        )
        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)
        label = torch.tensor(row['label'], dtype=torch.long)
        return input_ids, attention_mask, label


train_df, val_df = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)
val_df, test_df = train_test_split(val_df, test_size=0.5, stratify=val_df['label'], random_state=42)

train_dataset = RobertaDataset(train_df, tokenizer)
val_dataset = RobertaDataset(val_df, tokenizer)
test_dataset = RobertaDataset(test_df, tokenizer)

def collate_fn(batch):
    input_ids = torch.stack([item[0] for item in batch])
    attention_masks = torch.stack([item[1] for item in batch])
    labels = torch.stack([item[2] for item in batch])
    return input_ids, attention_masks, labels

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=16, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=16, collate_fn=collate_fn)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)
model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

criterion = torch.nn.CrossEntropyLoss()

def train_model(model, train_loader, val_loader, epochs=5):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        correct = 0
        total = 0

        for input_ids, attention_mask, labels in tqdm(train_loader):
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item() * input_ids.size(0)
            preds = torch.argmax(logits, dim=1)
            correct += (preds == labels).sum().item()
            total += input_ids.size(0)
            train_acc = correct / total
            val_acc, val_prec, val_rec, val_f1 = evaluate(model, val_loader)

        print(f"Epoch {epoch+1} | "
                  f"Train Loss: {total_loss/total:.4f} | "
                  f"Train Acc: {train_acc:.4f} | "
                  f"Val Acc: {val_acc:.4f} | "

                  f"Rec: {val_rec:.4f} | "
                  f"F1: {val_f1:.4f}")

def evaluate(model, loader):
    model.eval()
    preds = []
    labels_all = []
    with torch.no_grad():
        for input_ids, attention_mask, labels in loader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            batch_preds = torch.argmax(logits, dim=1)
            preds.extend(batch_preds.cpu().numpy())
            labels_all.extend(labels.cpu().numpy())
    model.train()

    acc = accuracy_score(labels_all, preds)
    precision = precision_score(labels_all, preds)
    recall = recall_score(labels_all, preds)
    f1 = f1_score(labels_all, preds)
    return acc, precision, recall, f1

train_model(model, train_loader, val_loader, epochs=10)

test_acc, test_prec, test_rec, test_f1 = evaluate(model, test_loader)
print(f"Test Accuracy: {test_acc:.4f}")
print(f"Test Precision: {test_prec:.4f}")
print(f"Test Recall: {test_rec:.4f}")
print(f"Test F1 Score: {test_f1:.4f}")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
100%|██████████| 22/22 [01:06<00:00,  3.01s/it]


Epoch 1 | Train Loss: 0.6720 | Train Acc: 0.5312 | Val Acc: 0.8095 | Rec: 0.6190 | F1: 0.7647


100%|██████████| 22/22 [01:06<00:00,  3.04s/it]


Epoch 2 | Train Loss: 0.4971 | Train Acc: 0.7982 | Val Acc: 0.7619 | Rec: 0.7619 | F1: 0.7619


100%|██████████| 22/22 [01:04<00:00,  2.91s/it]


Epoch 3 | Train Loss: 0.3592 | Train Acc: 0.8398 | Val Acc: 0.8571 | Rec: 0.7143 | F1: 0.8333


100%|██████████| 22/22 [01:03<00:00,  2.90s/it]


Epoch 4 | Train Loss: 0.3710 | Train Acc: 0.8576 | Val Acc: 0.9048 | Rec: 0.8095 | F1: 0.8947


100%|██████████| 22/22 [01:03<00:00,  2.90s/it]


Epoch 5 | Train Loss: 0.2174 | Train Acc: 0.9080 | Val Acc: 0.8810 | Rec: 0.7619 | F1: 0.8649


100%|██████████| 22/22 [01:04<00:00,  2.91s/it]


Epoch 6 | Train Loss: 0.1969 | Train Acc: 0.9139 | Val Acc: 0.7381 | Rec: 1.0000 | F1: 0.7925


100%|██████████| 22/22 [01:03<00:00,  2.90s/it]


Epoch 7 | Train Loss: 0.2797 | Train Acc: 0.8694 | Val Acc: 0.6905 | Rec: 0.9048 | F1: 0.7451


100%|██████████| 22/22 [01:03<00:00,  2.91s/it]


Epoch 8 | Train Loss: 0.1066 | Train Acc: 0.9763 | Val Acc: 0.7619 | Rec: 0.9524 | F1: 0.8000


100%|██████████| 22/22 [01:03<00:00,  2.91s/it]


Epoch 9 | Train Loss: 0.1187 | Train Acc: 0.9585 | Val Acc: 0.8571 | Rec: 0.9048 | F1: 0.8636


100%|██████████| 22/22 [01:03<00:00,  2.90s/it]


Epoch 10 | Train Loss: 0.0332 | Train Acc: 0.9881 | Val Acc: 0.8095 | Rec: 0.9048 | F1: 0.8261
Test Accuracy: 0.8372
Test Precision: 0.7917
Test Recall: 0.9048
Test F1 Score: 0.8444
