# Fake News Detection Model

This notebook develops a fake news detection model using transfer learning.

## Approach
1. Load pre-trained DistilBERT model (lightweight transformer)
2. Combine Kaggle Fake News dataset with X/Twitter data (planned — the cells below currently load only the Kaggle/HuggingFace dataset)
3. Fine-tune on our combined dataset
4. Evaluate and optimize for inference speed

## Data Sources
- Kaggle Fake News Dataset
- X/Twitter API (trending topics)
- News API for verified sources

In [None]:
# Install dependencies
!pip install transformers datasets torch scikit-learn pandas numpy

In [None]:
import inspect
import os

import numpy as np
import pandas as pd
import torch
from datasets import load_dataset, Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)

## 1. Load and Prepare Dataset

In [None]:
# Load Kaggle Fake News dataset from HuggingFace
# Alternative: Download from Kaggle and load locally

# Local fallback location, used only when the HuggingFace download fails.
LOCAL_CSV_PATH = '../data/fake_news.csv'

try:
    # Try loading from HuggingFace datasets
    dataset = load_dataset('GonzaloA/fake_news', split='train')
    print(f"Loaded {len(dataset)} samples from HuggingFace")
except Exception as load_error:
    # Catch Exception (not a bare except) so Ctrl-C still interrupts the cell,
    # and report the cause instead of hiding it.
    print(f"HuggingFace load failed: {load_error!r}")
    print("Loading from local CSV...")
    if os.path.exists(LOCAL_CSV_PATH):
        dataset = Dataset.from_pandas(pd.read_csv(LOCAL_CSV_PATH))
        print(f"Loaded {len(dataset)} samples from {LOCAL_CSV_PATH}")
    else:
        # Bind the name explicitly so later cells never pick up a stale
        # `dataset` left over from a previous kernel run.
        dataset = None
        print(f"No local CSV found at {LOCAL_CSV_PATH}; `dataset` is None")

In [None]:
# Prepare dataset
def prepare_data(examples, max_text_chars=500):
    """Combine title and text into one classification input per example.

    Args:
        examples: Batch mapping with 'title', 'text' and 'label' columns of
            equal length (the shape `dataset.map(..., batched=True)` passes).
        max_text_chars: Maximum number of characters of the body text to keep;
            the title is never truncated. Defaults to 500.

    Returns:
        dict with 'text' (list of "<title>\\n\\n<truncated text>" strings) and
        'label' (the input labels, unchanged).
    """
    def _as_text(value):
        # Missing fields show up as None (datasets) or float NaN (pandas CSV);
        # treat both as empty instead of crashing on the slice or emitting
        # the literal strings "None"/"nan".
        if value is None or (isinstance(value, float) and value != value):
            return ""
        return str(value)

    texts = [
        f"{_as_text(title)}\n\n{_as_text(text)[:max_text_chars]}"
        for title, text in zip(examples['title'], examples['text'])
    ]
    return {'text': texts, 'label': examples['label']}

# Apply preprocessing
# processed_dataset = dataset.map(prepare_data, batched=True)

## 2. Load Pre-trained Model

In [None]:
# DistilBERT checkpoint used as the starting point for fine-tuning
MODEL_NAME = "distilbert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Two-class sequence-classification head: class 0 -> "real", class 1 -> "fake".
# NOTE(review): confirm this matches the label encoding of the training data
# (some fake-news datasets use 1 = real).
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
    id2label=dict(enumerate(("real", "fake"))),
    label2id=dict(real=0, fake=1),
)

print(f"Model loaded: {MODEL_NAME}")
print(f"Parameters: {model.num_parameters():,}")

In [None]:
# Tokenization function
def tokenize_function(examples):
    """Tokenize the 'text' column of a batch with the notebook-level `tokenizer`.

    Sequences are truncated and padded to a fixed length of 256 tokens.
    """
    # NOTE(review): padding to max_length here makes the DataCollatorWithPadding
    # in the (commented-out) Trainer cell a no-op; dynamic padding would be
    # cheaper — confirm before changing.
    encode_options = dict(padding='max_length', truncation=True, max_length=256)
    return tokenizer(examples['text'], **encode_options)

# tokenized_dataset = processed_dataset.map(tokenize_function, batched=True)

## 3. Training Configuration

In [None]:
# Compute metrics
def compute_metrics(eval_pred):
    """Turn an (outputs, labels) pair into accuracy / precision / recall / F1.

    The first element is reduced with argmax over axis 1 to obtain predicted
    class ids; precision, recall and F1 use sklearn's 'binary' averaging.
    """
    scores_per_class, labels = eval_pred
    predicted_ids = np.argmax(scores_per_class, axis=1)

    metrics = {'accuracy': accuracy_score(labels, predicted_ids)}

    # The fourth element returned by sklearn (support) is not reported.
    prf_support = precision_recall_fscore_support(
        labels, predicted_ids, average='binary'
    )
    metrics.update(zip(('precision', 'recall', 'f1'), prf_support[:3]))
    return metrics

In [None]:
# Training arguments optimized for efficiency
#
# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`
# (and later dropped the old name). The install cell is unpinned, so pick
# whichever keyword the installed version accepts instead of hard-coding one.
_eval_strategy_kwarg = (
    'eval_strategy'
    if 'eval_strategy' in inspect.signature(TrainingArguments).parameters
    else 'evaluation_strategy'
)

training_args = TrainingArguments(
    output_dir='../models/fake_news',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='../models/fake_news/logs',
    logging_steps=100,
    save_strategy='epoch',  # must match the evaluation strategy for load_best_model_at_end
    load_best_model_at_end=True,
    fp16=torch.cuda.is_available(),  # Use mixed precision if GPU available
    **{_eval_strategy_kwarg: 'epoch'},
)

## 4. Train Model

In [None]:
# Initialize trainer
# NOTE(review): `dataset` is loaded above with split='train' only, and no
# train/test split is created anywhere in the visible cells, so
# `tokenized_dataset['train']` / `['test']` will not exist as written.
# Split first, e.g.
#     tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
# — confirm before uncommenting.
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=tokenized_dataset['train'],
#     eval_dataset=tokenized_dataset['test'],
#     tokenizer=tokenizer,
#     compute_metrics=compute_metrics,
#     data_collator=DataCollatorWithPadding(tokenizer=tokenizer)
# )

# # Train
# trainer.train()

# Placeholder output: the training code above is intentionally left disabled.
print("Training code ready - uncomment to run with actual data")

## 5. Save and Export Model

In [None]:
# Save model
# Writes the model weights/config and the tokenizer files to the same
# directory so both can be reloaded together from that path.
# model.save_pretrained('../models/fake_news/final')
# tokenizer.save_pretrained('../models/fake_news/final')

# Placeholder output: the export code above is intentionally left disabled.
print("Model export code ready")

## 6. Quick Inference Test

In [None]:
def predict_fake_news(text):
    """Classify a single piece of text with the notebook-level model.

    Uses the notebook-level `tokenizer` and `model`. As a side effect the
    model is switched to evaluation mode.

    Args:
        text: The text to classify. It is truncated to 256 tokens.

    Returns:
        dict with:
            'is_fake': True when the predicted class id is 1.
            'confidence': softmax probability of the predicted class.
            'label': 'fake' or 'real'.
    """
    inputs = tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        max_length=256
    )

    # After training the model may sit on a GPU while the tokenizer always
    # produces CPU tensors; move them over to avoid a device-mismatch error.
    target_device = model.device
    inputs = {name: tensor.to(target_device) for name, tensor in inputs.items()}

    # Disable dropout so repeated calls on the same text give the same answer
    # (the model can still be in train mode after fine-tuning).
    model.eval()

    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.softmax(outputs.logits, dim=1)
        prediction = torch.argmax(probs, dim=1).item()
        confidence = probs[0][prediction].item()

    return {
        'is_fake': prediction == 1,
        'confidence': confidence,
        'label': 'fake' if prediction == 1 else 'real'
    }

# Test
# result = predict_fake_news("Breaking: Scientists discover cure for all diseases!")
# print(result)