# Fake News Detector Using Google's BERT Model

## Set Up Environment

In [None]:
import os

# Configure the Hugging Face Hub through its environment variable
# (presumably silences the hub's symlink warning -- confirm against HF docs).
os.environ.update({"HF_HUB_DISABLE_SYMLINKS_WARNING": "1"})

In [None]:
import torch

# Report whether a CUDA device is visible and, if so, the name of device 0.
if torch.cuda.is_available():
    _gpu_name = torch.cuda.get_device_name(0)
else:
    _gpu_name = "No GPU Detected"
print("CUDA Available:", torch.cuda.is_available())
print("Device Name:", _gpu_name)

In [None]:
import pandas as pd
import torch
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.utils import resample
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    TrainingArguments,
    EarlyStoppingCallback
)


## Preprocessing Dataset

In [None]:
# Preprocessing Function
def clean_text(text):
    """Normalize one raw article string before tokenization.

    The text is lower-cased, URLs and HTML tags are removed, every character
    other than letters, digits, whitespace and ``. , ! ?`` is dropped, and
    runs of whitespace are collapsed to a single space.

    Args:
        text: Raw value. Non-string values are converted with ``str()``.
            ``None`` and float NaN are treated as a missing value.

    Returns:
        The cleaned, stripped string; ``""`` when the value is missing.
    """
    # A missing cell would otherwise be stringified into the words
    # "none" / "nan" and end up in the training text.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()
    text = re.sub(r"http\S+", "", text)              # Remove URLs
    text = re.sub(r"<.*?>", "", text)                # Remove HTML tags
    text = re.sub(r"[^A-Za-z0-9\s.,!?]", "", text)   # Remove unwanted characters
    text = re.sub(r"\s+", " ", text)                 # Normalize whitespace
    return text.strip()

In [None]:
# Load Dataset
# Only the "title" and "text" columns are read from each CSV.
fake_df = pd.read_csv(
    r"C:/Users/colby/OneDrive/Desktop/School/Capstone/FakeNewsDataset.csv",
    usecols=["title", "text"],
    low_memory=False
)
real_df = pd.read_csv(
    r"C:/Users/colby/OneDrive/Desktop/School/Capstone/RealNewsDataset.csv",
    usecols=["title", "text"],
    low_memory=False
)

# Add Labels (0 = fake, 1 = real)
fake_df["label"] = 0
real_df["label"] = 1

# Resample Real News (with replacement) to Match Fake News Count
# NOTE(review): sampling with replacement duplicates rows; any later
# train/validation split should keep identical articles on the same side.
real_upsampled = resample(real_df, replace=True, n_samples=len(fake_df), random_state=42)

# Combine and Shuffle
df = pd.concat([fake_df, real_upsampled]).sample(frac=1, random_state=42).reset_index(drop=True)

# Apply Preprocessing
# A missing title or text is NaN, and NaN + str stays NaN, which would turn
# the whole row into the literal string "nan". Fill missing fields with ""
# (and coerce to str) so the field that is present survives.
title_part = df["title"].fillna("").astype(str)
text_part = df["text"].fillna("").astype(str)
df["content"] = (title_part + " " + text_part).apply(clean_text)

# Final Dataset
df = df[["content", "label"]]

In [None]:
# Preview the first 15 rows of the prepared dataframe
df.head(15)

## Splitting Dataset Into Training and Validation Data

In [None]:
# Train/Validation Split
# The dataframe was balanced by sampling rows with replacement, so it
# contains exact duplicates. A plain row-level split can put copies of the
# same article in both train and validation, inflating validation scores.
# Splitting by content group keeps every copy of an article on one side.
from sklearn.model_selection import GroupShuffleSplit

content_groups = pd.factorize(df["content"])[0]  # one integer id per distinct article
splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, val_idx = next(splitter.split(df, groups=content_groups))

train_texts = df["content"].iloc[train_idx].tolist()
val_texts = df["content"].iloc[val_idx].tolist()
train_labels = df["label"].iloc[train_idx].tolist()
val_labels = df["label"].iloc[val_idx].tolist()

In [None]:
# Tokenization
# Both splits are encoded with the same settings: truncate to at most 512
# tokens and pad.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
_encode_options = {"truncation": True, "padding": True, "max_length": 512}
train_encodings = tokenizer(train_texts, **_encode_options)
val_encodings = tokenizer(val_texts, **_encode_options)

In [None]:
# Dataset class
class FakeNewsDataset(torch.utils.data.Dataset):
    """Dataset that pairs tokenizer encodings with their labels.

    Args:
        encodings: Mapping from field name to a per-sample indexable sequence.
        labels: Per-sample labels, aligned with ``encodings``.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of labelled samples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, including ``"labels"``."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings and labels in a FakeNewsDataset.
train_dataset = FakeNewsDataset(encodings=train_encodings, labels=train_labels)
val_dataset = FakeNewsDataset(encodings=val_encodings, labels=val_labels)


## Load the Model

In [None]:
# Load model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Freeze every parameter under model.bert; gradients are disabled for them.
for encoder_param in model.bert.parameters():
    encoder_param.requires_grad = False

# Move model to GPU if available
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)

## Training the Model

In [None]:
# Custom Trainer with Class Weights
from transformers import Trainer
import torch.nn.functional as F
from torch.nn import CrossEntropyLoss

class WeightedTrainer(Trainer):
    """Trainer whose loss is a class-weighted cross-entropy over the logits.

    Attributes are documented inline; override ``class_weights`` on a
    subclass or instance to change the weighting.
    """

    # Per-class loss weights, indexed by label id.
    class_weights = (1.0, 1.1)

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the weighted cross-entropy loss for one batch.

        Args:
            model: Model to run; called as ``model(**inputs)`` without labels.
            inputs: Batch dict; its ``"labels"`` entry is removed and used as
                the loss target.
            return_outputs: When true, return ``(loss, outputs)``.
            num_items_in_batch: Accepted for Trainer compatibility; unused.

        Returns:
            The loss tensor, or ``(loss, outputs)`` if ``return_outputs``.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        # Class weights
        # Place them on the logits' device: the `model` passed in may be a
        # wrapper (e.g. torch.nn.DataParallel) that does not expose `.device`.
        weights = torch.tensor(self.class_weights, device=logits.device)

        # Because this method replaces the default loss, the configured
        # label smoothing has to be applied here or it is silently ignored.
        loss_fn = CrossEntropyLoss(
            weight=weights,
            label_smoothing=self.args.label_smoothing_factor,
        )

        loss = loss_fn(logits, labels)

        return (loss, outputs) if return_outputs else loss

In [None]:
# Metrics
def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for an evaluation run.

    Args:
        pred: Object exposing ``label_ids`` and ``predictions``; the predicted
            class is the argmax over the last axis of ``predictions``.

    Returns:
        Dict with keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    scores = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": scores[2],
        "precision": scores[0],
        "recall": scores[1],
    }


In [None]:
# Training args
training_args = TrainingArguments(
    # Output locations
    output_dir="./results",
    logging_dir="./logs",
    # Evaluate and checkpoint once per epoch
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    # Best-checkpoint selection: lowest eval_loss wins
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    # Optimisation
    num_train_epochs=6,
    learning_rate=1e-5,
    weight_decay=0.01,
    label_smoothing_factor=0.1,
    # Batch sizes
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
)


In [None]:
# Trainer instance
# Early stopping is configured with a patience of 3.
_early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

trainer = WeightedTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[_early_stopping],
)

In [None]:
# Train the model
# Left as a bare expression so the notebook displays the value returned by
# trainer.train() as the cell output.
trainer.train()


## Testing the Model

In [None]:
def predict_fake_news(text):
    """Classify one piece of text with the notebook's ``tokenizer`` and ``model``.

    Args:
        text: A single text to classify. It is truncated to 512 tokens.
            The result is read with ``.item()``, so exactly one prediction
            must be produced.

    Returns:
        The integer index of the highest-scoring logit.
    """
    # Inference must run in eval mode; otherwise dropout may still be active
    # and the same text can yield different predictions.
    model.eval()

    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512).to(model.device)
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    predicted_class = logits.argmax(-1).item()
    return predicted_class

In [None]:
# Save the model and the tokenizer side by side in the same directory.
_full_freeze_dir = r"C:\Users\colby\OneDrive\Desktop\School\Capstone\fake_news_bert_model_full_freeze"
model.save_pretrained(_full_freeze_dir)
tokenizer.save_pretrained(_full_freeze_dir)

In [None]:
# Sample passage used as a quick check of predict_fake_news.
real_text = """
President Joe Biden signed into law a major bipartisan infrastructure bill that includes $550 billion in new spending for bridges, roads, broadband, and utilities.
This marks a significant achievement for the administration and is expected to boost job creation and improve transportation across the U.S.
"""

_real_text_prediction = predict_fake_news(real_text)
print("Prediction (real):", _real_text_prediction)

In [None]:
# Evaluate on the validation dataset and print the resulting metrics.
metrics = trainer.evaluate(val_dataset)
print(metrics)

In [None]:
# Spot-check up to the first five validation samples.
# Slicing (rather than indexing 0..4) avoids an IndexError when the
# validation split holds fewer than five items.
for i, (sample, label) in enumerate(zip(val_texts[:5], val_labels[:5]), start=1):
    prediction = predict_fake_news(sample)
    print(f"\nSample #{i}")
    print("Actual Label:", "Real" if label == 1 else "Fake")
    print("Predicted Label:", "Real" if prediction == 1 else "Fake")
In [None]:
from sklearn.metrics import classification_report

# Predict on all validation texts
predictions = [predict_fake_news(text) for text in val_texts]

# predict_fake_news returns the integer class id, so the predictions are
# already label ids. Comparing them to the string "Real" (as before) is
# never true and collapsed every prediction to 0.
predicted_labels = [int(p) for p in predictions]

# Generate classification report
# `labels` pins the row order to match `target_names` even if one class is
# absent from the predictions.
print(classification_report(val_labels, predicted_labels, labels=[0, 1], target_names=["Fake", "Real"]))

In [None]:
from collections import Counter

# Check label distribution
for _caption, _split_labels in (
    ("Train Labels:", train_labels),
    ("Validation Labels:", val_labels),
):
    print(_caption, Counter(_split_labels))

In [None]:
# Two hand-written headlines used as a quick check.
test_headlines = [
    "NASA Announces Launch Date for Artemis II Mission to the Moon",
    "Scientists Confirm Earth Is Flat After Secret NASA Meeting",
]

for headline in test_headlines:
    result = predict_fake_news(headline)
    verdict = "Real" if result == 1 else "Fake"
    print(f"\"{headline}\" → {verdict}")

In [1]:
# Hand-written headlines grouped by the label they are expected to receive.
_real_headlines = [
    "World Health Organization Declares End to Global COVID-19 Emergency",
    "Supreme Court Upholds Key Environmental Regulation on Clean Water",
    "Apple Unveils New MacBook Pro with M3 Chip at Annual Event",
    "U.S. Economy Adds 250,000 Jobs in March, Unemployment Steady at 3.8%",
    "UNICEF Launches Campaign to Provide Clean Water in Sub-Saharan Africa",
    "Biden Signs Bipartisan Infrastructure Bill into Law",
    "Tesla Reports Record Q4 Revenue as EV Demand Surges",
    "Olympics 2024 to Be Hosted in Paris with Enhanced Sustainability Focus",
    "FDA Approves First Pill for Postpartum Depression",
    "Japan Launches Lunar Probe in Historic Space Mission",
]
_fake_headlines = [
    "Bill Gates Microchips Children Through Ice Cream Truck Distribution",
    "Aliens Seen Voting in the U.S. Presidential Election, Eyewitness Claims",
    "Scientists Discover That Dinosaurs Still Roam a Hidden Island in the Pacific",
    "New Study Reveals Flat Earth is Supported by NASA Whistleblowers",
    "COVID-19 Vaccines Found to Contain DNA of Extinct Species",
    "Time Traveler from 2077 Arrested for Warning of Imminent Robot Uprising",
    "FBI Confirms Bigfoot Involvement in Area 51 Breach",
    "Government Admits Using Chemtrails to Control Weather and Minds",
    "Man Grows WiFi Antenna in Arm After 5G Booster Shot",
    "Queen Elizabeth Secretly Cloned in Underground Lab, Sources Say",
]
test_cases = {"Real": _real_headlines, "Fake": _fake_headlines}

# Run predictions
for label_type, headlines in test_cases.items():
    print(f"\n Testing {label_type} Headlines:\n")
    for headline in headlines:
        prediction = predict_fake_news(headline)
        if prediction == 1:
            predicted_label = "Real"
        else:
            predicted_label = "Fake"
        print(f"{headline}\n→ Predicted: {predicted_label}\n")



 Testing Real Headlines:



NameError: name 'predict_fake_news' is not defined

In [None]:
# Save the model and tokenizer into a directory relative to the working directory.
_relative_save_dir = "fake_news_bert_model_full_freeze"
model.save_pretrained(_relative_save_dir)
tokenizer.save_pretrained(_relative_save_dir)