In [140]:
import pandas as pd
import torch.nn as nn
from transformers import DistilBertModel
# Load the two news sources from the mounted Drive folder.
df_buzzfeed = pd.read_csv("/content/drive/MyDrive/buzzfeed_news_with_filenames.csv")
df_politifact = pd.read_csv("/content/drive/MyDrive/politifact_news_with_filenames.csv")

# Stack both sources into a single frame with a fresh 0..n-1 index.
df = pd.concat([df_buzzfeed, df_politifact], ignore_index=True)

# Encode the string labels as integers (fake -> 0, real -> 1).
# Series.map turns every value missing from the dict into NaN, which would
# silently become a NaN training target later on, so fail loudly instead.
label_codes = df['label'].map({'fake': 0, 'real': 1})
if label_codes.isna().any():
    unexpected = sorted(df.loc[label_codes.isna(), 'label'].astype(str).unique())
    raise ValueError(f"Unexpected values in 'label' column: {unexpected}")
df['label'] = label_codes

In [141]:
from transformers import DistilBertTokenizer
from torch.utils.data import Dataset, DataLoader
import torch

# WordPiece tokenizer loaded from the 'distilbert-base-uncased' checkpoint.
tokenizer = DistilBertTokenizer.from_pretrained(
    'distilbert-base-uncased',
)

class NewsDataset(Dataset):
    """Map-style dataset that tokenizes one news article per item.

    Each item is built from the ``'title'``, ``'text'`` and ``'label'``
    columns of the wrapped DataFrame (rows are addressed positionally).
    """

    def __init__(self, dataframe, tokenizer, max_len=512):
        """Keep references to the frame, the tokenizer and the sequence length."""
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and a float ``label`` for row ``idx``."""
        record = self.data.iloc[idx]

        # Cells that are not strings (e.g. NaN for a missing value) count as
        # empty text; title and body are then joined by a single space.
        pieces = [
            str(record[column]) if isinstance(record[column], str) else ""
            for column in ('title', 'text')
        ]
        article = " ".join(pieces)

        tokens = self.tokenizer(
            article,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # The tokenizer output carries a leading batch axis; flatten it away.
        item = {
            key: torch.flatten(tokens[key])
            for key in ('input_ids', 'attention_mask')
        }
        item['label'] = torch.tensor(record['label'], dtype=torch.float)
        return item

In [142]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing, keeping the label ratio equal in
# both parts; the fixed random_state makes the split repeatable.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)

# Wrap each part so rows are tokenized on access.
train_dataset, test_dataset = (
    NewsDataset(part, tokenizer) for part in (train_df, test_df)
)

# Only the training batches are shuffled.
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8)

In [143]:
class FakeNewsClassifier(nn.Module):
    """DistilBERT encoder followed by a single-logit linear head.

    The head emits one raw logit per input sequence (no sigmoid applied).

    Args:
        model_name: Name or path of the pretrained DistilBERT checkpoint to
            load. Defaults to ``'distilbert-base-uncased'``.
    """

    def __init__(self, model_name='distilbert-base-uncased'):
        super().__init__()
        self.distilbert = DistilBertModel.from_pretrained(model_name)
        # Size the head from the encoder's own hidden width (``config.dim``,
        # 768 for distilbert-base-uncased) instead of hard-coding it, so other
        # DistilBERT checkpoints work as well.
        self.classifier = nn.Linear(self.distilbert.config.dim, 1)

    def forward(self, input_ids, attention_mask):
        """Return logits of shape ``(batch, 1)`` for the given token batch."""
        outputs = self.distilbert(input_ids=input_ids, attention_mask=attention_mask)
        # Use the hidden state at sequence position 0 as the summary vector.
        cls_output = outputs.last_hidden_state[:, 0, :]
        return self.classifier(cls_output)

In [144]:
import torch.nn as nn

# Seed torch's global RNG before anything random happens in this cell, so the
# classifier head's initial weights and later draws from that RNG are
# repeatable when the notebook is re-run from here.
torch.manual_seed(42)

# Prefer the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = FakeNewsClassifier().to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
# The model outputs raw logits, so the sigmoid is folded into the loss.
criterion = nn.BCEWithLogitsLoss()

def train_model(model, loader, optimizer, criterion, num_epochs=5):
    """Train ``model`` on the batches from ``loader`` and print per-epoch metrics.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns one logit per sample with shape ``(batch, 1)``.
        loader: DataLoader yielding dict batches with the keys
            ``'input_ids'``, ``'attention_mask'`` and ``'label'``. Its
            ``dataset`` attribute must support ``len()``.
        optimizer: Optimizer that updates the parameters of ``model``.
        criterion: Loss called as ``criterion(logits, labels)``. The result is
            weighted by the batch size, which assumes a per-batch mean.
        num_epochs: Number of full passes over ``loader``; must be >= 1.

    Returns:
        Tuple ``(epoch_loss, epoch_acc)`` for the last epoch, both averaged
        over ``len(loader.dataset)``.

    Raises:
        ValueError: If ``num_epochs`` is smaller than 1.
    """
    if num_epochs < 1:
        raise ValueError(f"num_epochs must be >= 1, got {num_epochs}")

    # Move batches to wherever the model lives instead of relying on a
    # notebook-level ``device`` global.
    device = next(model.parameters()).device
    num_samples = len(loader.dataset)

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0
        correct = 0

        # Iterate the loader that was passed in, not a global one.
        for batch in loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            # Labels arrive as (batch,); add an axis to match the (batch, 1) logits.
            labels = batch['label'].unsqueeze(1).to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)

            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # Undo the per-batch mean so the epoch average is per sample.
            total_loss += loss.item() * input_ids.size(0)
            preds = torch.sigmoid(outputs) > 0.5
            correct += (preds == labels.bool()).sum().item()

        epoch_loss = total_loss / num_samples
        epoch_acc = correct / num_samples

        print(f"Epoch {epoch+1}/{num_epochs} | Loss: {epoch_loss:.4f} | Accuracy: {epoch_acc:.4f}")
    return epoch_loss, epoch_acc

# Run the training loop and report the metrics of the last epoch.
loss, acc = train_model(
    model=model,
    loader=train_loader,
    optimizer=optimizer,
    criterion=criterion,
)
print(f"Final Training Loss: {loss:.4f}")
print(f"Final Training Accuracy: {acc:.4f}")

Epoch 1/5 | Loss: 0.6119 | Accuracy: 0.6469
Epoch 2/5 | Loss: 0.4285 | Accuracy: 0.8071
Epoch 3/5 | Loss: 0.2230 | Accuracy: 0.9258
Epoch 4/5 | Loss: 0.1282 | Accuracy: 0.9555
Epoch 5/5 | Loss: 0.0647 | Accuracy: 0.9792
Final Training Loss: 0.0647
Final Training Accuracy: 0.9792


In [145]:
def evaluate_model(model, loader):
    """Return the accuracy of ``model`` over every sample served by ``loader``.

    The model is switched to eval mode and gradients are disabled. A sample
    counts as positive when the sigmoid of its logit is above 0.5. Batches are
    moved to the notebook-level ``device``.
    """
    model.eval()
    hits = 0

    with torch.no_grad():
        for batch in loader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            # Add an axis so the targets line up with the (batch, 1) logits.
            targets = batch['label'].unsqueeze(1).to(device)

            logits = model(ids, mask)
            predicted = torch.sigmoid(logits) > 0.5
            hits += predicted.eq(targets.bool()).sum().item()

    total = len(loader.dataset)
    return hits / total

# Score the trained model on the held-out split.
test_acc = evaluate_model(model=model, loader=test_loader)
print(f"Test Accuracy: {test_acc:.4f}")

Test Accuracy: 0.8118
