# Cross-Dataset Validation for Fake News Detection

This notebook implements cross-dataset validation by training a fake news detection model on the BuzzFeed dataset and testing it on the PolitiFact dataset.

This approach helps evaluate how well the model generalizes to new sources, which is crucial for real-world applications.

## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns

# Plot defaults shared by every figure in this notebook:
# seaborn's white grid theme and a 10x6 inch default canvas.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (10, 6)})

## 2. Load and Prepare Data

We'll load two datasets:
- BuzzFeed dataset for training
- PolitiFact dataset for testing

In [None]:
# Training corpus: BuzzFeed articles, one CSV per class (fake, then real)
bf_fake_df, bf_real_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/BuzzFeed_fake_news_content.csv",
        "../data/BuzzFeed_real_news_content.csv",
    )
)

# Held-out corpus: PolitiFact articles, same per-class layout
pf_fake_df, pf_real_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/PolitiFact_fake_news_content.csv",
        "../data/PolitiFact_real_news_content.csv",
    )
)

# Report how many rows each file contributed
print(f"BuzzFeed fake news: {len(bf_fake_df)} articles")
print(f"BuzzFeed real news: {len(bf_real_df)} articles")
print(f"PolitiFact fake news: {len(pf_fake_df)} articles")
print(f"PolitiFact real news: {len(pf_real_df)} articles")

## 3. Add Labels and Combine Datasets

In [None]:
def _stack_and_shuffle(fake_frame, real_frame):
    """Concatenate the fake and real frames, then shuffle the rows.

    The shuffle uses a fixed seed (42) and the index is rebuilt from 0 so the
    resulting row order is reproducible across runs.
    """
    stacked = pd.concat([fake_frame, real_frame], ignore_index=True)
    return stacked.sample(frac=1, random_state=42).reset_index(drop=True)


# Tag every article with its class: 1 = fake news, 0 = real news
for _frame, _class_id in (
    (bf_fake_df, 1),
    (bf_real_df, 0),
    (pf_fake_df, 1),
    (pf_real_df, 0),
):
    _frame['label'] = _class_id

# BuzzFeed becomes the training pool, PolitiFact the evaluation pool
bf_combined_df = _stack_and_shuffle(bf_fake_df, bf_real_df)
pf_combined_df = _stack_and_shuffle(pf_fake_df, pf_real_df)

## 4. Feature Engineering

Create a combined text feature that includes both the title and content.

In [None]:
def _title_plus_body(articles):
    """Join each article's title and text with a single space.

    Missing titles or texts are replaced by the empty string before joining.
    """
    return articles['title'].fillna('') + ' ' + articles['text'].fillna('')


# One free-text column per article, built the same way for both corpora
bf_combined_df['combined_text'] = _title_plus_body(bf_combined_df)
pf_combined_df['combined_text'] = _title_plus_body(pf_combined_df)

# Train on BuzzFeed, evaluate on PolitiFact
X_train, y_train = bf_combined_df['combined_text'], bf_combined_df['label']
X_test, y_test = pf_combined_df['combined_text'], pf_combined_df['label']

print(f"Training data size: {len(X_train)} samples")
print(f"Testing data size: {len(X_test)} samples")

## 5. TF-IDF Vectorization

We'll convert the text data into numerical vectors using TF-IDF.

In [None]:
# Vocabulary and IDF weights are learned from the BuzzFeed training text only;
# the PolitiFact text is then projected onto that same vocabulary, so nothing
# from the test corpus influences the features.
tfidf_vectorizer = TfidfVectorizer(
    max_features=1000,     # keep only the 1000 most frequent terms
    stop_words='english',  # drop common English function words
)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

print(f"Number of features: {X_train_tfidf.shape[1]}")

## 6. Train Logistic Regression Model

In [None]:
# Fit the classifier on the BuzzFeed TF-IDF matrix.
# `fit` returns the estimator itself, so construction and training chain.
model = LogisticRegression(C=1.0, max_iter=1000).fit(X_train_tfidf, y_train)

print("Model training completed!")

## 7. Evaluate Model Performance

In [None]:
# Score the PolitiFact articles with the BuzzFeed-trained model
y_pred = model.predict(X_test_tfidf)

# Overall fraction of PolitiFact articles classified correctly
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Per-class precision / recall / F1 (class 0 = real, class 1 = fake)
print("\nClassification Report:")
print(classification_report(y_true=y_test, y_pred=y_pred))

## 8. Confusion Matrix Visualization

In [None]:
# Confusion matrix
# Pin the class order explicitly (0 = real, 1 = fake). Without `labels=`,
# scikit-learn derives the classes from the values present in y_test/y_pred,
# so the matrix shape and ordering would only match the hard-coded tick
# labels below by coincidence of the data.
class_ids = [0, 1]
class_names = ['Real', 'Fake']
cm = confusion_matrix(y_test, y_pred, labels=class_ids)

# Rows are the actual classes, columns the predicted classes
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names,
            yticklabels=class_names,
            ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
plt.show()

## 9. Feature Importance Analysis

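Let's examine which words are most indicative of fake vs. real news. Because fake news is labeled `1` and real news `0`, words with large positive logistic-regression coefficients push a prediction toward "fake", while words with large negative coefficients push it toward "real".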

In [None]:
# Pair every vocabulary term with its logistic-regression coefficient
feature_names = tfidf_vectorizer.get_feature_names_out()
importance = model.coef_[0]

# Indices ordered from the most negative to the most positive coefficient
sorted_idx = np.argsort(importance)
fake_idx = sorted_idx[-10:]  # ten largest coefficients, ascending
real_idx = sorted_idx[:10]   # ten smallest coefficients, ascending

# Both lists stay in ascending coefficient order; later cells rely on that
top_fake_features = list(zip(feature_names[fake_idx], importance[fake_idx]))
top_real_features = list(zip(feature_names[real_idx], importance[real_idx]))

print("Top 10 features for fake news:")
for feature, weight in top_fake_features[::-1]:  # strongest first
    print(f"{feature}: {weight:.4f}")

print("\nTop 10 features for real news:")
for feature, weight in top_real_features:  # most negative first
    print(f"{feature}: {weight:.4f}")

## 10. Feature Importance Visualization

In [None]:
def _draw_weight_bars(ax, labels, bar_lengths, xlabel, title):
    """Draw one horizontal bar per feature on ``ax``.

    Bars are placed at y = 0, 1, 2, ... in the order given and each tick is
    labelled with the matching feature name.
    """
    slots = np.arange(len(labels))
    ax.barh(slots, bar_lengths, align='center')
    ax.set_yticks(slots)
    ax.set_yticklabels(labels)
    ax.set_xlabel(xlabel)
    ax.set_title(title)


# Two side-by-side panels: fake-news terms on the left, real-news terms on the right
fig, (fake_ax, real_ax) = plt.subplots(1, 2, figsize=(12, 8))

# Fake-news panel: strongest positive coefficient first
fake_strongest_first = top_fake_features[::-1]
_draw_weight_bars(
    fake_ax,
    [feature for feature, _ in fake_strongest_first],
    [weight for _, weight in fake_strongest_first],
    'Weight',
    'Top Features for Fake News',
)

# Real-news panel: coefficients are negative, so plot their magnitudes
_draw_weight_bars(
    real_ax,
    [feature for feature, _ in top_real_features],
    [abs(weight) for _, weight in top_real_features],
    'Weight (Absolute Value)',
    'Top Features for Real News',
)

plt.tight_layout()
plt.show()

## 11. Sentiment Analysis (Optional)

This section is optional and requires the `transformers` and `torch` libraries (the code also uses `scipy`). If they are not installed, run:
```
pip install transformers torch
```

In [None]:
# Optional dependency: load the sentiment model if transformers/torch are
# installed and the pretrained weights can be obtained. Any failure here only
# disables the optional sentiment section; it must not stop the notebook.
try:
    from transformers import AutoTokenizer, AutoModelForSequenceClassification
    from scipy.special import softmax
    import torch

    # Load the sentiment model
    MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
    tokenizer = AutoTokenizer.from_pretrained(MODEL)
    sentiment_model = AutoModelForSequenceClassification.from_pretrained(MODEL)

    print("Transformers library loaded successfully!")
except ImportError:
    print("Transformers library not found. Skipping sentiment analysis.")
    print("To install: pip install transformers torch")
except OSError as load_error:
    # `from_pretrained` raises OSError when the model files cannot be
    # downloaded or read (e.g. no network access, unknown model name).
    print(f"Could not load the sentiment model: {load_error}")
    print("Skipping sentiment analysis.")

In [None]:
def get_sentiment_scores(text):
    """Get sentiment scores for a text (returns [negative, neutral, positive]).

    The text is tokenized with truncation to at most 512 tokens, passed
    through ``sentiment_model`` without gradient tracking, and the first
    output row is converted to probabilities with ``softmax``.

    Best-effort: if anything fails for this particular text (for example the
    value is not a string), the error is printed and a balanced placeholder
    ``[0.33, 0.33, 0.33]`` is returned so one bad row does not abort the run.
    """
    try:
        # Truncate by tokens, not characters: slicing the string to 512
        # characters neither guarantees the 512-token limit nor keeps all
        # the text that would have fit within it.
        encoded_text = tokenizer(
            text, truncation=True, max_length=512, return_tensors='pt'
        )
        with torch.no_grad():
            output = sentiment_model(**encoded_text)
        scores = output[0][0].numpy()
        scores = softmax(scores)
        return scores
    except Exception as e:
        print(f"Error processing text: {e}")
        return np.array([0.33, 0.33, 0.33])  # Default to balanced scores if error


# Names the optional loader cell binds when the sentiment model is available.
# They are checked explicitly because the per-text error handler above would
# otherwise swallow the NameError for a missing model and silently fill every
# row with placeholder scores.
SENTIMENT_DEPENDENCIES = ('tokenizer', 'sentiment_model', 'torch', 'softmax')

if all(name in globals() for name in SENTIMENT_DEPENDENCIES):
    # Get sentiment scores for a sample of articles (for performance reasons)
    sample_size = min(50, len(bf_combined_df))

    # Sample from BuzzFeed training data (fixed seed for reproducibility)
    print(f"Analyzing sentiment for {sample_size} sample articles...")
    bf_sample = bf_combined_df.sample(sample_size, random_state=42)
    bf_sample['sentiment'] = bf_sample['title'].apply(get_sentiment_scores)

    # Extract sentiment components into one column per class
    bf_sample['negative_score'] = bf_sample['sentiment'].apply(lambda x: x[0])
    bf_sample['neutral_score'] = bf_sample['sentiment'].apply(lambda x: x[1])
    bf_sample['positive_score'] = bf_sample['sentiment'].apply(lambda x: x[2])

    print("Sentiment analysis completed!")
else:
    print("Skipping sentiment analysis (transformers not available)")

In [None]:
# Columns the sentiment-scoring cell adds to `bf_sample`
SENTIMENT_COLUMNS = ['negative_score', 'neutral_score', 'positive_score']

# Decide whether there is anything to plot BEFORE creating the figure, so a
# skipped run does not leave an empty figure behind, and so unrelated
# NameErrors (e.g. typos) are not silently reported as "not available".
if 'bf_sample' in globals() and set(SENTIMENT_COLUMNS).issubset(bf_sample.columns):
    # Compare sentiment distributions between real and fake news
    fig, (negative_ax, positive_ax) = plt.subplots(1, 2, figsize=(12, 5))

    sns.boxplot(x='label', y='negative_score', data=bf_sample, ax=negative_ax)
    negative_ax.set_title('Negative Sentiment Distribution')
    negative_ax.set_xlabel('Label (0=Real, 1=Fake)')
    negative_ax.set_ylabel('Negative Sentiment Score')

    sns.boxplot(x='label', y='positive_score', data=bf_sample, ax=positive_ax)
    positive_ax.set_title('Positive Sentiment Distribution')
    positive_ax.set_xlabel('Label (0=Real, 1=Fake)')
    positive_ax.set_ylabel('Positive Sentiment Score')

    plt.tight_layout()
    plt.show()

    # Print sentiment statistics: mean score per class (0 = real, 1 = fake)
    print("\nSentiment Analysis on Training Data Sample:")
    print(bf_sample.groupby('label')[SENTIMENT_COLUMNS].mean())
else:
    print("Skipping sentiment visualization (sentiment analysis not available)")

## 12. Conclusions

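This cross-dataset validation experiment measures how well our fake news detection model generalizes to new, unseen sources: the model is trained only on BuzzFeed articles, so the accuracy, classification report, and confusion matrix above reflect its performance on PolitiFact articles it never saw during training.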

