# 1. Install Dependencies & Import Libraries

In [4]:
!pip install -q transformers datasets scikit-learn streamlit


In [42]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


# 2. Load Data & Preprocess It

In [43]:
# Expose Google Drive inside the Colab filesystem so the CSVs and outputs persist
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [44]:
# Raw article tables, one CSV per class, read from the project folder on Drive
PROJECT_DIR = '/content/drive/MyDrive/fake_news_detection'

fake_df_full = pd.read_csv(PROJECT_DIR + '/Fake.csv')
true_df_full = pd.read_csv(PROJECT_DIR + '/True.csv')

In [74]:
# Rows kept per class and the shuffle seed, named so they can be tuned in one place
N_SAMPLES_PER_CLASS = 100
SHUFFLE_SEED = 42


def _drop_blank_text(frame):
    """Return the rows of `frame` whose 'text' is neither NaN nor empty/whitespace-only."""
    non_null = frame.dropna(subset=["text"])
    return non_null[non_null["text"].str.strip() != ""]


# Remove rows where 'text' is NaN, empty string, or just whitespace
true_df_clean = _drop_blank_text(true_df_full)
fake_df_clean = _drop_blank_text(fake_df_full)

# Select the first N clean rows from each file (file order, not a random sample)
true_df = true_df_clean.head(N_SAMPLES_PER_CLASS).copy()
fake_df = fake_df_clean.head(N_SAMPLES_PER_CLASS).copy()

# Fail early: the stratified split further down needs examples of both classes
if true_df.empty or fake_df.empty:
    raise ValueError("Both True.csv and Fake.csv must contain at least one row with non-blank text")

# Add labels: 1 = true/real article, 0 = fake article
true_df["label"] = 1
fake_df["label"] = 0

# Combine and shuffle
df = pd.concat([true_df, fake_df]).sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

# Keep only necessary columns
df = df[["text", "label"]]

# Display confirmation
print(f"Loaded {len(df)} total samples (True: {len(true_df)}, Fake: {len(fake_df)})")


Loaded 200 total samples (True: 100, Fake: 100)


# 3. Tokenize and Prepare Dataset

In [56]:
from transformers import BertTokenizerFast
from datasets import Dataset
from sklearn.model_selection import train_test_split

# Load tokenizer
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

# One seed for both splits so the partition is reproducible
SPLIT_SEED = 42

# Step 1: First split into train (80%) and temp (20%), keeping the class ratio
train_df, temp_df = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=SPLIT_SEED)

# Step 2: Split temp into validation (10%) and test (10%)
val_df, test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df['label'], random_state=SPLIT_SEED)

# Convert to Hugging Face datasets.
# preserve_index=False: after the split each frame carries a shuffled pandas index,
# which would otherwise be stored as an extra "__index_level_0__" column.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
val_dataset = Dataset.from_pandas(val_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

# Tokenization function
def tokenize_function(example):
    """Tokenize the "text" field with the module-level `tokenizer`.

    Every sequence is truncated and padded to exactly 512 tokens. The function
    is applied through `Dataset.map(..., batched=True)` below.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 512,
    }
    return tokenizer(example["text"], **encode_options)

# Tokenize all three splits in batched mode, rebinding each name to its tokenized dataset
train_dataset, val_dataset, test_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
)


Map:   0%|          | 0/160 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

# 4. Safe Conversion (Avoid NumPy 2.0 Bug)

In [57]:
def _extract_encodings(examples):
    """Collect token ids, attention masks and labels from an iterable of example dicts.

    Each example must provide "input_ids", "attention_mask" and "label". The
    result is a dict of three parallel lists; the target is stored under
    "labels" (plural). The examples are traversed once.
    """
    encodings = {"input_ids": [], "attention_mask": [], "labels": []}
    for ex in examples:
        encodings["input_ids"].append(ex["input_ids"])
        encodings["attention_mask"].append(ex["attention_mask"])
        encodings["labels"].append(ex["label"])
    return encodings


# Convert to Python format (plain lists/ints instead of array types).
# NOTE: a later cell rebinds train_dataset/val_dataset/test_dataset to PyTorch
# datasets, so re-run the tokenization cell before re-running this one.
train_list = train_dataset.with_format("python")
val_list = val_dataset.with_format("python")
test_list = test_dataset.with_format("python")

# Extract fields
train_encodings = _extract_encodings(train_list)
val_encodings = _extract_encodings(val_list)
test_encodings = _extract_encodings(test_list)


# 5. Define PyTorch Dataset

In [58]:
import torch
# Aliased so it does not shadow `datasets.Dataset`, which earlier cells import under the same name
from torch.utils.data import Dataset as TorchDataset


class FakeNewsDataset(TorchDataset):
    """Map-style PyTorch dataset over pre-tokenized examples.

    Args:
        encodings: mapping with the keys "input_ids", "attention_mask" and
            "labels", each an indexable sequence with one entry per example.

    Raises:
        ValueError: if the three sequences do not have the same length.
    """

    def __init__(self, encodings):
        self.input_ids = encodings["input_ids"]
        self.attention_mask = encodings["attention_mask"]
        self.labels = encodings["labels"]

        # Misaligned fields would silently pair inputs with the wrong label
        sizes = (len(self.input_ids), len(self.attention_mask), len(self.labels))
        if len(set(sizes)) != 1:
            raise ValueError(
                "input_ids, attention_mask and labels must have the same length, "
                f"got {sizes[0]}, {sizes[1]} and {sizes[2]}"
            )

    def __len__(self):
        """Return the number of examples."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return example `idx` as a dict of int64 tensors."""
        return {
            "input_ids": torch.tensor(self.input_ids[idx], dtype=torch.long),
            "attention_mask": torch.tensor(self.attention_mask[idx], dtype=torch.long),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long)
        }

# Wrap each split's encodings in a FakeNewsDataset; the names are rebound from
# the Hugging Face datasets to these PyTorch datasets
train_dataset, val_dataset, test_dataset = (
    FakeNewsDataset(split_encodings)
    for split_encodings in (train_encodings, val_encodings, test_encodings)
)


# 6. Load Model and Define Metrics

In [59]:
from transformers import BertForSequenceClassification

# Record what each class index means in the model config so it is saved with the
# weights (labels were assigned as 1 = true/real, 0 = fake during preprocessing)
ID2LABEL = {0: "FAKE", 1: "REAL"}
LABEL2ID = {name: idx for idx, name in ID2LABEL.items()}

model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
    id2label=ID2LABEL,
    label2id=LABEL2ID,
)

from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    """Compute accuracy, precision, recall and F1 for one evaluation pass.

    Args:
        pred: object exposing `label_ids` (true labels) and `predictions`
            (per-class scores, or a tuple whose first element holds them).

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1".
        Precision/recall/F1 use binary averaging.
    """
    labels = pred.label_ids
    scores = pred.predictions
    # NOTE(review): some models return several outputs as a tuple; the logits are
    # assumed to come first — confirm if the model class changes.
    if isinstance(scores, tuple):
        scores = scores[0]
    preds = scores.argmax(-1)
    # zero_division=0 reports 0.0 without a warning when no positive class is predicted
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average="binary", zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# 7. Training Arguments

In [61]:
from transformers import TrainingArguments

# Fine-tuning configuration: evaluate and checkpoint after every epoch, then
# reload the checkpoint with the best validation F1 when training ends
training_config = {
    "output_dir": "./results",
    "eval_strategy": "epoch",
    "save_strategy": "epoch",
    "learning_rate": 2e-5,
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 8,
    "num_train_epochs": 3,
    "weight_decay": 0.01,
    "load_best_model_at_end": True,
    "metric_for_best_model": "f1",
    "report_to": "none",  # disables wandb/tensorboard
}
training_args = TrainingArguments(**training_config)


# 8. Train

In [62]:
from transformers import Trainer

# Train on the training split; the validation split is scored with
# compute_metrics at each evaluation
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": train_dataset,
    "eval_dataset": val_dataset,
    "compute_metrics": compute_metrics,
}
trainer = Trainer(**trainer_kwargs)

trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.417189,0.95,1.0,0.9,0.947368
2,No log,0.20223,1.0,1.0,1.0,1.0
3,No log,0.149032,1.0,1.0,1.0,1.0


TrainOutput(global_step=60, training_loss=0.36607751846313474, metrics={'train_runtime': 85.9096, 'train_samples_per_second': 5.587, 'train_steps_per_second': 0.698, 'total_flos': 126293306572800.0, 'train_loss': 0.36607751846313474, 'epoch': 3.0})

# 9. Evaluate on Test Set

In [63]:
# Score the held-out test split; the metrics dict is displayed as the cell output
trainer.evaluate(eval_dataset=test_dataset)


{'eval_loss': 0.18944798409938812,
 'eval_accuracy': 1.0,
 'eval_precision': 1.0,
 'eval_recall': 1.0,
 'eval_f1': 1.0,
 'eval_runtime': 0.6305,
 'eval_samples_per_second': 31.723,
 'eval_steps_per_second': 4.758,
 'epoch': 3.0}

# 10. Save Model

In [64]:
# Drive folder that receives the fine-tuned weights and the tokenizer files
model_path = "/content/drive/MyDrive/fake_news_detection/model"

# Model first, then tokenizer, both into the same folder
for artifact in (model, tokenizer):
    artifact.save_pretrained(model_path)

print(f"Model saved to: {model_path}")


Model saved to: /content/drive/MyDrive/fake_news_detection/model


# 11. App Code

In [75]:
# Source of the Streamlit app, written to Drive below. Compared with a naive
# script it (1) calls st.set_page_config before any other Streamlit call,
# (2) looks for the model in the "model" folder next to app.py, which is where
# the notebook saves it, and (3) caches the loaded model across reruns.
app_code = """import os
from pathlib import Path

import numpy as np
import streamlit as st
import torch
from transformers import BertForSequenceClassification, BertTokenizer

# Streamlit page settings (kept as the first Streamlit call in the script)
st.set_page_config(
    page_title="📰 Fake News Detector",
    layout="centered"
)

# Folder with the fine-tuned model: the "model" directory next to this file by
# default, or whatever FAKE_NEWS_MODEL_PATH points to
MODEL_PATH = os.environ.get(
    "FAKE_NEWS_MODEL_PATH",
    str(Path(__file__).resolve().parent / "model"),
)


@st.cache_resource
def load_model_and_tokenizer(model_path):
    # Cached so the weights are loaded once, not on every widget interaction
    model = BertForSequenceClassification.from_pretrained(model_path)
    tokenizer = BertTokenizer.from_pretrained(model_path)
    model.eval()
    return model, tokenizer


model, tokenizer = load_model_and_tokenizer(MODEL_PATH)

# Title
st.markdown("<h1 style='text-align: center;'>🤖 Fake News Detection Chatbot</h1>", unsafe_allow_html=True)
st.markdown(
    "<p style='text-align: center;'>Paste a news snippet below to check if it's <strong>Real ✅</strong> or <strong>Fake ❌</strong> using BERT!</p>",
    unsafe_allow_html=True
)

# Input area
user_input = st.text_area("🗞️ Enter News Article or Headline:", height=200)

# On click
if st.button("🔍 Check Now"):
    if user_input.strip() == "":
        st.warning("⚠️ Please enter some text!")
    else:
        inputs = tokenizer(user_input, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            outputs = model(**inputs)
            probs = torch.softmax(outputs.logits, dim=1).numpy()[0]
            pred = np.argmax(probs)
            confidence = float(np.max(probs) * 100)

        # Label with emoji (class 1 = real, class 0 = fake, as used in training)
        if pred == 1:
            label = "✅ **This looks like REAL news!**"
            emoji = "🟢"
        else:
            label = "❌ **This might be FAKE news!**"
            emoji = "🔴"

        # Display results
        st.markdown(f"### {emoji} {label}")
        st.markdown(f"📊 **Confidence:** `{confidence:.2f}%`")
"""

# Save to file; explicit UTF-8 because the source contains emoji
with open("/content/drive/MyDrive/fake_news_detection/app.py", "w", encoding="utf-8") as f:
    f.write(app_code)

print("App file is saved to drive")


App file is saved to drive


In [76]:
# Packages the Streamlit app imports at runtime, one per line
app_packages = ["streamlit", "transformers", "torch", "numpy"]
requirements = "\n" + "\n".join(app_packages) + "\n"

# Write the list (without the surrounding blank lines) to the project folder on Drive
requirements_path = "/content/drive/MyDrive/fake_news_detection/requirements.txt"
with open(requirements_path, "w") as req_file:
    req_file.write(requirements.strip())

print("requirements.txt created.")


requirements.txt created.
