# Fake News Detection with FakeNewsNet Dataset

This notebook demonstrates how to fine-tune a lightweight transformer model (TinyBERT) on the FakeNewsNet dataset for fake news detection. The fine-tuned model can then be integrated into the VeriFact Django application.

## Overview

1. Load and explore the FakeNewsNet dataset
2. Preprocess the text data
3. Set up TinyBERT for sequence classification
4. Finetune the model
5. Evaluate model performance
6. Export the model for Django integration

## 1. Install Required Packages

In [None]:
# Install required packages
!pip install transformers datasets torch scikit-learn pandas numpy matplotlib seaborn

## 2. Import Libraries

In [None]:
import os
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EvalPrediction
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import time
import gc
import json
import re
from tqdm.auto import tqdm

# Pin every random number generator used below so repeated runs match
SEED = 42

torch.manual_seed(SEED)
np.random.seed(SEED)
# Seed all visible GPUs explicitly when CUDA is present
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

## 3. Load and Explore FakeNewsNet Dataset

The FakeNewsNet dataset contains news articles labeled as real or fake for two domains: `politifact` (political news) and `gossipcop` (celebrity gossip). The loader below expects the dataset to be organized as follows:

```
FakeNewsNet/
├── politifact/
│   ├── fake/
│   └── real/
└── gossipcop/
    ├── fake/
    └── real/
```

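Each article is stored in its own sub-directory (named by its news ID) under `real/` or `fake/`; the loader reads the title and body text from that directory's `news content.json` file.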

In [None]:
# Root directory of the local FakeNewsNet copy; update this path to where
# your FakeNewsNet dataset is stored. It is expected to contain one
# sub-directory per domain (see the layout described above).
DATASET_PATH = "./FakeNewsNet/"

# Function to load dataset
def _read_article_text(news_json_path):
    """Return "<title> <text>" from one article JSON file, or None if unusable."""
    try:
        with open(news_json_path, 'r', encoding='utf-8') as f:
            news_data = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers both malformed JSON and undecodable bytes.
        print(f"Error loading {news_json_path}: {e}")
        return None

    if not isinstance(news_data, dict):
        print(f"Error loading {news_json_path}: expected a JSON object")
        return None

    # `or ''` also covers keys that are present but null, which would
    # otherwise break the concatenation and silently drop the article.
    title = news_data.get('title') or ''
    content = news_data.get('text') or ''
    text = f"{title} {content}"
    return text if text.strip() else None


def load_fakenewsnet(dataset_path, domain="politifact"):
    """
    Load articles from FakeNewsNet dataset for a specific domain.

    Reads ``<dataset_path>/<domain>/real/<news_id>/news content.json`` and the
    matching ``fake`` tree. Articles whose file is missing, unreadable, or
    contains no title/text are skipped (read errors are printed).

    Args:
        dataset_path: Root directory of the dataset.
        domain: Sub-directory to load, e.g. "politifact" or "gossipcop".

    Returns:
        pandas.DataFrame with columns ``text`` (title + " " + body) and
        ``label`` (0 for real, 1 for fake). Real articles come first.

    Raises:
        OSError: If the ``real`` or ``fake`` directory cannot be listed.
    """
    data = []
    labels = []

    for kind, label in (("real", 0), ("fake", 1)):
        label_dir = os.path.join(dataset_path, domain, kind)
        print(f"Loading {kind} news from {domain}...")

        # os.listdir returns entries in arbitrary order; sorting keeps the row
        # order (and therefore the seeded train/val/test split) reproducible.
        for news_id in tqdm(sorted(os.listdir(label_dir))):
            news_json_path = os.path.join(label_dir, news_id, "news content.json")
            if not os.path.exists(news_json_path):
                continue

            text = _read_article_text(news_json_path)
            if text is not None:
                data.append(text)
                labels.append(label)

    return pd.DataFrame({
        'text': data,
        'label': labels
    })

In [None]:
# Load both FakeNewsNet domains and merge them into one DataFrame (adjust this if needed)
try:
    # Try to load politifact data
    df_politifact = load_fakenewsnet(DATASET_PATH, domain="politifact")
    print(f"Loaded {len(df_politifact)} articles from politifact")

    # Try to load gossipcop data
    df_gossipcop = load_fakenewsnet(DATASET_PATH, domain="gossipcop")
    print(f"Loaded {len(df_gossipcop)} articles from gossipcop")

    # Combine datasets
    df = pd.concat([df_politifact, df_gossipcop], ignore_index=True)
    print(f"Combined dataset size: {len(df)}")
except Exception as e:
    print(f"Error loading dataset: {e}")
    print("If your dataset is structured differently, please adjust the loading code.")
    # Re-raise after printing the hint: every later cell needs `df`, so
    # continuing would only fail further down with an unrelated NameError.
    raise

In [None]:
# Basic dataset exploration
print("Dataset overview:")
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would add a stray "None" line to the output.
df.info()

print("\nLabel distribution:")
print(df['label'].value_counts())

print("\nSample articles:")
# Display a sample real article
real_sample = df[df['label'] == 0].iloc[0]
print(f"\nReal article sample (first 300 chars): \n{real_sample['text'][:300]}...")

# Display a sample fake article
fake_sample = df[df['label'] == 1].iloc[0]
print(f"\nFake article sample (first 300 chars): \n{fake_sample['text'][:300]}...")

# Text length distribution
df['text_length'] = df['text'].apply(len)

# Plot from a copy that carries readable class names, so seaborn builds the
# legend from the data. Overriding the legend afterwards with a positional
# list of names can attach "Real"/"Fake" to the wrong colors.
plot_df = df.assign(Label=df['label'].map({0: 'Real', 1: 'Fake'}))
plt.figure(figsize=(10, 6))
sns.histplot(
    data=plot_df,
    x='text_length',
    hue='Label',
    hue_order=['Real', 'Fake'],
    bins=30,
    log_scale=(False, True),
)
plt.title('Text Length Distribution by Label')
plt.xlabel('Text Length (characters)')
plt.ylabel('Count (log scale)')
plt.show()

## 4. Data Preprocessing

Let's clean the text data and prepare it for the model.

In [None]:
def preprocess_text(text):
    """
    Normalize one article for the model.

    Lowercases the input, strips http(s):// and www. links, and collapses
    every run of whitespace into a single space. Anything that is not a
    string yields an empty string.
    """
    if isinstance(text, str):
        # Lowercase first, then drop links
        without_links = re.sub(r'https?://\S+|www\.\S+', '', text.lower())
        # Squash whitespace runs and trim both ends
        return re.sub(r'\s+', ' ', without_links).strip()

    return ""

# Clean every article and keep the result in a new column
df['processed_text'] = df['text'].map(preprocess_text)

# Flag rows whose cleaned text is blank and report how many there are
blank_mask = df['processed_text'].map(lambda cleaned: not cleaned.strip())
empty_texts = blank_mask.sum()
print(f"Number of empty texts after preprocessing: {empty_texts}")

# Drop the blank rows, if there are any, and renumber the index
if empty_texts > 0:
    df = df[~blank_mask].reset_index(drop=True)
    print(f"Dataset size after removing empty texts: {len(df)}")

## 5. Split Data into Train, Validation, and Test Sets

In [None]:
# Stratified three-way split.
# Step 1: hold out 20% of all rows as the test set.
train_val_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=SEED,
    stratify=df['label'],
)

# Step 2: carve 15% of the remaining rows off as the validation set.
train_df, val_df = train_test_split(
    train_val_df,
    test_size=0.15,
    random_state=SEED,
    stratify=train_val_df['label'],
)

print(f"Training set size: {len(train_df)}")
print(f"Validation set size: {len(val_df)}")
print(f"Test set size: {len(test_df)}")

# Show the class proportions of each split
for header, frame in (
    ("\nLabel distribution in training set:", train_df),
    ("\nLabel distribution in validation set:", val_df),
    ("\nLabel distribution in test set:", test_df),
):
    print(header)
    print(frame['label'].value_counts(normalize=True))

## 6. Create PyTorch Dataset for the Model

In [None]:
class FakeNewsDataset(Dataset):
    """Map-style dataset that tokenizes one article per item.

    Args:
        texts: Sequence of article strings, indexed by position.
        labels: Sequence of class labels aligned with ``texts``; each one is
            converted to a ``torch.long`` tensor.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            max_length=..., padding='max_length', return_tensors='pt')``.
            Its result must provide ``'input_ids'`` and ``'attention_mask'``
            tensors with a leading batch dimension of size 1.
        max_length: Length every sequence is truncated/padded to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of articles."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize article ``idx`` and return the tensors under the keys
        ``input_ids``, ``attention_mask`` and ``labels``."""
        text = self.texts[idx]
        label = self.labels[idx]

        # Tokenize text
        encoding = self.tokenizer(
            text,
            truncation=True,
            max_length=self.max_length,
            padding='max_length',
            return_tensors='pt'
        )

        # Drop only the leading batch dimension the tokenizer adds. A bare
        # squeeze() would also remove the sequence dimension when
        # max_length == 1 and hand back 0-d tensors.
        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': torch.tensor(label, dtype=torch.long)
        }

## 7. Load TinyBERT Model and Tokenizer

In [None]:
# Checkpoint and batching settings
MODEL_NAME = "huawei-noah/TinyBERT_General_4L_312D"
MAX_LENGTH = 512
BATCH_SIZE = 8

# Tokenizer that matches the checkpoint
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Wrap each split in a FakeNewsDataset (same tokenizer and max length for all)
train_dataset, val_dataset, test_dataset = (
    FakeNewsDataset(
        split['processed_text'].tolist(),
        split['label'].tolist(),
        tokenizer,
        max_length=MAX_LENGTH,
    )
    for split in (train_df, val_df, test_df)
)

In [None]:
# Instantiate the checkpoint with a two-class classification head
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

# Report which checkpoint was loaded and how many parameters it has
print(f"Model: {MODEL_NAME}")
print(f"Number of parameters: {sum(p.numel() for p in model.parameters())}")

## 8. Define Training Arguments and Evaluation Metrics

In [None]:
def compute_metrics(pred):
    """
    Turn a prediction object into accuracy, F1, precision and recall.

    ``pred.predictions`` is reduced with argmax over its last axis and
    compared with ``pred.label_ids``; precision, recall and F1 use the
    'binary' average.
    """
    y_true = pred.label_ids
    y_hat = pred.predictions.argmax(-1)

    # Returned tuple is (precision, recall, f1, support)
    prf = precision_recall_fscore_support(y_true, y_hat, average='binary')

    return {
        'accuracy': accuracy_score(y_true, y_hat),
        'f1': prf[2],
        'precision': prf[0],
        'recall': prf[1]
    }

# Define training arguments
import inspect

# Newer transformers releases renamed `evaluation_strategy` to
# `eval_strategy`. Pick whichever keyword the installed version accepts so
# the cell runs on both old and new releases.
_eval_strategy_arg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    # Evaluate and checkpoint once per epoch; both schedules must match for
    # load_best_model_at_end to work.
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    **{_eval_strategy_arg: "epoch"},
)

## 9. Finetune the Model

In [None]:
# Wire the model, both datasets and the metric function into a Trainer
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)

# Fine-tune and report how long it took (wall-clock)
print("Starting model training...")
train_start = time.time()
trainer.train()
train_end = time.time()
train_time = train_end - train_start
print(f"Training completed in {train_time:.2f} seconds ({train_time/60:.2f} minutes)")

## 10. Evaluate Model on Test Set

In [None]:
# Evaluate on test set
print("Evaluating model on test set...")
# A single predict() pass yields both the raw predictions and the metrics,
# so the test set is not run through the model twice. The "eval" prefix
# keeps the "eval_*" key names that the metrics summary reads later.
test_pred_output = trainer.predict(test_dataset, metric_key_prefix="eval")
results = test_pred_output.metrics
print("Test results:")
for key, value in results.items():
    print(f"{key}: {value:.4f}")

# Predicted class = index of the largest logit
test_preds = test_pred_output.predictions.argmax(-1)
test_labels = test_pred_output.label_ids

# Create confusion matrix. labels=[0, 1] pins the matrix to 2x2 in the
# Real/Fake order of the tick labels even if one class never occurs.
cm = confusion_matrix(test_labels, test_preds, labels=[0, 1])
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Real', 'Fake'], yticklabels=['Real', 'Fake'])
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()

# Classification report (labels given explicitly so they always line up
# with target_names)
print("Classification Report:")
print(classification_report(test_labels, test_preds, labels=[0, 1], target_names=['Real', 'Fake']))

## 11. Memory Usage and Performance Analysis

In [None]:
# Measure memory usage
import psutil
import os
from transformers import pipeline

def get_memory_usage():
    """Return the resident set size (RSS) of the current process in MB."""
    bytes_per_mb = 1024 * 1024
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    return rss_bytes / bytes_per_mb

# Create pipeline for inference
classifier = pipeline('text-classification', model=model, tokenizer=tokenizer)

# Memory before
mem_before = get_memory_usage()

# Measure inference time on sample texts
sample_texts = test_df['processed_text'].head(50).tolist()
# perf_counter is a monotonic, high-resolution clock meant for intervals
start_time = time.perf_counter()
for text in sample_texts:
    # Let the tokenizer truncate to MAX_LENGTH tokens, as during training.
    # Slicing the string to 512 characters would feed the model only a
    # fraction of the text it was trained on.
    _ = classifier(text, truncation=True, max_length=MAX_LENGTH)
end_time = time.perf_counter()

# Memory after
mem_after = get_memory_usage()
mem_used = mem_after - mem_before

# Guard against an empty test sample
avg_inference_time = (end_time - start_time) / len(sample_texts) if sample_texts else 0.0

print(f"Average inference time: {avg_inference_time:.4f} seconds per sample")
print(f"Memory usage during inference: {mem_used:.2f} MB")

# Add these to a metrics dictionary
model_metrics = {
    "model_name": "TinyBERT",
    "accuracy": results["eval_accuracy"],
    "f1_score": results["eval_f1"],
    "precision": results["eval_precision"],
    "recall": results["eval_recall"],
    "avg_processing_time": avg_inference_time,
    "avg_memory_usage": mem_used,
    "parameter_count": sum(p.numel() for p in model.parameters())
}

## 12. Save Model and Metrics for Django Integration

In [None]:
# Persist the fine-tuned model together with its tokenizer files
MODEL_OUTPUT_DIR = "./models/tinybert_fakenewsnet"
trainer.save_model(MODEL_OUTPUT_DIR)
tokenizer.save_pretrained(MODEL_OUTPUT_DIR)
print(f"Model saved to {MODEL_OUTPUT_DIR}")

# Write the metrics dictionary as indented JSON
METRICS_OUTPUT_PATH = "./models/tinybert_fakenewsnet_metrics.json"
with open(METRICS_OUTPUT_PATH, 'w') as f:
    f.write(json.dumps(model_metrics, indent=4))
print(f"Model metrics saved to {METRICS_OUTPUT_PATH}")

## 13. Example of How to Use the Model in Django

In [None]:
# This code can be used in the Django services.py file
from functools import lru_cache


@lru_cache(maxsize=4)
def _load_classifier(model_dir):
    """Build the text-classification pipeline for a saved model, once per model_dir."""
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    model = AutoModelForSequenceClassification.from_pretrained(model_dir)
    return pipeline('text-classification', model=model, tokenizer=tokenizer)


def analyze_with_model(text, model_dir="./models/tinybert_fakenewsnet"):
    """
    Analyze text using the trained model.

    The model and tokenizer are loaded from disk on the first call for a
    given ``model_dir`` and reused afterwards, so a directory whose contents
    change on disk is not re-read within the same process.

    Args:
        text: Text to analyze
        model_dir: Path to the saved model directory

    Returns:
        dict: Detection results with the keys ``credibility_score``,
        ``category`` ("credible", "fake" or "mixed"), ``confidence``,
        ``model_name`` and ``processing_time`` (seconds spent in inference,
        excluding model loading).
    """
    classifier = _load_classifier(model_dir)

    # Measure performance. Truncation is done by the tokenizer (512 tokens,
    # the length used for training) instead of slicing the string to 512
    # characters, which would discard most of a long article.
    start_time = time.perf_counter()
    result = classifier(text, truncation=True, max_length=512)[0]
    processing_time = time.perf_counter() - start_time

    # Map the result
    label = result['label']
    score = result['score']

    # In this model, LABEL_0 = real, LABEL_1 = fake.
    # `score` is the confidence of the predicted label, so for a "fake"
    # prediction the credibility is its complement. Predictions at or below
    # 0.7 confidence are reported as "mixed".
    if label == 'LABEL_0':
        credibility_score = score
        category = "credible" if score > 0.7 else "mixed"
    else:
        credibility_score = 1 - score
        category = "fake" if score > 0.7 else "mixed"

    return {
        "credibility_score": credibility_score,
        "category": category,
        "confidence": score,
        "model_name": "TinyBERT",
        "processing_time": processing_time
    }

# Smoke-test the helper on a made-up headline using the model saved above
sample_text = "Breaking news: Scientists discover that drinking water cures all diseases. Pharmaceutical companies don't want you to know this secret."
result = analyze_with_model(sample_text, MODEL_OUTPUT_DIR)

# Print every field of the returned dictionary on its own line
print("Analysis result:")
for key, value in result.items():
    print(f"{key}: {value}")

## 14. Next Steps

1. Finetune a DistilBERT model using the same approach (see the next notebook)
2. Finetune models on the LIAR dataset (see the LIAR dataset notebook)
3. Compare model performance metrics
4. Integrate the best performing model(s) into your Django application

In your Django application, you can load the saved model and use it for real-time predictions as shown in section 13.