# Fake News Detector Using BERT Model

## Set Up Environment

In [1]:
import os
# Set in the first cell so the flag is already in the environment when the
# Hugging Face libraries are imported in the next cell.
# NOTE(review): going by its name, this silences the hub's symlink warning — confirm against the huggingface_hub docs.
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"

In [2]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments
)

In [3]:
from collections import Counter

# Load datasets (only the two text columns are read)
fake_df = pd.read_csv(
    r"C:\Users\colby\OneDrive\Desktop\School\Capstone\FakeNewsDataset.csv",
    usecols=["title", "text"],
    low_memory=False
)
real_df = pd.read_csv(
    r"C:\Users\colby\OneDrive\Desktop\School\Capstone\RealNewsDataset.csv",
    usecols=["title", "text"],
    low_memory=False
)

# Add labels: 0 = fake, 1 = real
fake_df["label"] = 0
real_df["label"] = 1

df = pd.concat([fake_df, real_df])
print("Original Distribution:", Counter(df["label"]))

# Balance dataset by downsampling to the size of the smaller class.
# The fake rows are always drawn with the same seed; the real rows are only
# downsampled when they are the larger class, so the result is unchanged when
# fake articles are the majority and no longer fails when they are not.
minority_count = df["label"].value_counts().min()
fake_df_balanced = df[df["label"] == 0].sample(n=minority_count, random_state=42)
real_df_balanced = df[df["label"] == 1]
if len(real_df_balanced) > minority_count:
    real_df_balanced = real_df_balanced.sample(n=minority_count, random_state=42)
df = pd.concat([fake_df_balanced, real_df_balanced]).sample(frac=1, random_state=42).reset_index(drop=True)
print("Balanced Distribution:", Counter(df["label"]))

# Combine text. Missing titles/bodies are treated as empty strings; without
# this, a NaN in either column would turn the whole "content" value into NaN.
df["content"] = df["title"].fillna("") + " " + df["text"].fillna("")
df = df[["content", "label"]]

Original Distribution: Counter({0: 23502, 1: 21417})
Balanced Distribution: Counter({0: 21417, 1: 21417})


In [4]:
# Preview the first 15 rows of the balanced, shuffled dataset
df.head(15)

Unnamed: 0,content,label
0,OBAMA FIGHTS TO KEEP RADICAL AGENDA ALIVE: Ask...,0
1,Investigators probe Trump knowledge of campaig...,1
2,MICHIGAN CONTROVERSY OVER GUN DEPICTED IN VETE...,0
3,Jennifer Lawrence Has Two VERY Choice Words F...,0
4,UNHINGED TRUMP PROTESTER Arrested For Slapping...,0
5,UNCENSORED VIDEO: Real New Yorkers’ Opinions O...,0
6,Factbox: Top agricultural exports vulnerable t...,1
7,Trump urges U.S. Congress to repeal Obamacare ...,1
8,Watch A GOP Member Of Science Committee Suffe...,0
9,BREAKING: PROTESTER JUMPS ON STAGE…Grabs Trump...,0


## Train and Test Split

In [None]:
# Hold out 20% of the articles for validation. Stratifying on the label keeps
# the fake/real ratio the same in both splits, so the validation metrics are
# computed on the same class balance the model was trained on.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["content"].tolist(),
    df["label"].tolist(),
    test_size=0.2,
    random_state=42,
    stratify=df["label"].tolist()
)

## Tokenize

In [None]:
from transformers import BertTokenizer

# Tokenizer that matches the "bert-base-uncased" checkpoint.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Both splits are encoded with identical settings: truncation and padding
# enabled, with sequences capped at 512 tokens.
encode_options = {"truncation": True, "padding": True, "max_length": 512}
train_encodings = tokenizer(train_texts, **encode_options)
val_encodings = tokenizer(val_texts, **encode_options)

## Convert to PyTorch Dataset

In [None]:
class FakeNewsDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with class labels.

    Args:
        encodings: Mapping from field name to a per-sample indexable sequence
            (every value is indexed with the sample index).
        labels: Sequence of labels, one per sample; its length defines the
            dataset length.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, with the label under "labels"."""
        item = {}
        for field, values in self.encodings.items():
            item[field] = torch.tensor(values[idx])
        item["labels"] = torch.tensor(self.labels[idx])
        return item

# Wrap each tokenized split together with its labels so it can be passed to the Trainer.
train_dataset = FakeNewsDataset(train_encodings, train_labels)
val_dataset = FakeNewsDataset(val_encodings, val_labels)

## Load BERT Model

In [None]:
# Load the "bert-base-uncased" checkpoint configured for two output labels (0 = fake, 1 = real).
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

In [None]:
import transformers
from transformers import TrainingArguments

training_args = TrainingArguments(
    # Output locations for checkpoints and logs.
    output_dir="./results",
    logging_dir="./logs",
    # Optimisation settings.
    num_train_epochs=6,
    learning_rate=1e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    # Evaluate and checkpoint on the same per-epoch schedule.
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    # At the end of training, reload the checkpoint with the lowest eval loss.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)



In [None]:
def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for an evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores; the argmax over the last axis is taken as the
            predicted class).

    Returns:
        Dict with the keys "accuracy", "f1", "precision" and "recall".
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    # The fourth element (support) is not reported.
    scores = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": scores[2],
        "precision": scores[0],
        "recall": scores[1],
    }


## Train the Model

In [None]:
import torch

# Report whether CUDA is usable and, if so, the name of device 0.
gpu_available = torch.cuda.is_available()
print(gpu_available)
if gpu_available:
    print(torch.cuda.get_device_name(0))
else:
    print("No GPU detected")

In [None]:
from transformers import EarlyStoppingCallback, Trainer

# Early stopping with a patience of two evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=2)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)
trainer.train()

In [None]:
def predict_fake_news(text):
    """Classify a single piece of news text with the module-level model.

    Args:
        text: The article text to classify (one string).

    Returns:
        The predicted class id as a Python int, using the labels assigned
        when the dataset was built: 0 = fake, 1 = real.
    """
    # Ensure inference mode: if the model was last used for a training step,
    # dropout would still be active and make predictions non-deterministic.
    model.eval()
    # Tokenize with the same truncation limit used for training and move the
    # tensors to whichever device the model lives on.
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512).to(model.device)
    with torch.no_grad():
        logits = model(**inputs).logits
    return logits.argmax(-1).item()

In [None]:
# Persist the model weights and the tokenizer files side by side in one directory.
save_dir = r"C:\Users\colby\OneDrive\Desktop\School\Capstone\fake_news_bert_model2"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

## Test the Model

In [None]:
# Hand-written sample intended to be classified as real news; since real
# articles were labelled 1, the hoped-for output here is 1.
real_text = """
President Joe Biden signed into law a major bipartisan infrastructure bill that includes $550 billion in new spending for bridges, roads, broadband, and utilities.
This marks a significant achievement for the administration and is expected to boost job creation and improve transportation across the U.S.
"""

print("Prediction (real):", predict_fake_news(real_text))

In [None]:
# Evaluate the trained model on the validation dataset and print the returned metrics.
metrics = trainer.evaluate(eval_dataset=val_dataset)
print(metrics)

In [None]:
# Spot-check the first five validation samples: actual vs. predicted label.
for i in range(5):
    sample, label = val_texts[i], val_labels[i]
    prediction = predict_fake_news(sample)
    actual_name = "Real" if label == 1 else "Fake"
    predicted_name = "Real" if prediction == 1 else "Fake"
    print(f"\nSample #{i + 1}")
    print("Actual Label:", actual_name)
    print("Predicted Label:", predicted_name)

In [None]:
# Number of rows per class in the balanced dataset (0 = fake, 1 = real)
df["label"].value_counts()

In [None]:
from sklearn.metrics import classification_report

# Predict on all validation texts.
# predict_fake_news returns the integer class id (0 = Fake, 1 = Real), so the
# predictions are already in the same encoding as val_labels. Comparing them
# with the string "Real" never matches and would mark every sample as Fake.
predictions = [predict_fake_news(text) for text in val_texts]
predicted_labels = predictions

# Generate classification report
print(classification_report(val_labels, predicted_labels, target_names=["Fake", "Real"]))

In [None]:
from collections import Counter

# Check label distribution: count how many samples of each class ended up in
# the training and validation splits.
print("Train Labels:", Counter(train_labels))
print("Validation Labels:", Counter(val_labels))