# Fake News Detection - Data Analysis & Model Training

This notebook provides a comprehensive analysis of the fake news detection project, including:
- Data exploration and visualization
- Model training and evaluation
- Performance comparison
- Interactive testing

## 1. Setup and Imports

In [None]:
import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Add project root to path
sys.path.append('..')

# Import our modules
from src.data_preprocessing import DataPreprocessor
from src.model_training import ModelTrainer
from src.prediction import FakeNewsDetector, ModelComparator
from src.utils import Config, create_directories

# Set style.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*';
# older releases only know the original 'seaborn' name. Pick whichever name
# this installation provides so the cell does not raise on a fresh kernel
# running an older Matplotlib. If neither exists, the default style is kept.
if 'seaborn-v0_8' in plt.style.available:
    plt.style.use('seaborn-v0_8')
elif 'seaborn' in plt.style.available:
    plt.style.use('seaborn')
sns.set_palette("husl")

# Create directories (helper imported from src.utils)
create_directories()

print("✅ Setup complete!")

## 2. Data Loading and Exploration

In [None]:
# Initialize preprocessor
preprocessor = DataPreprocessor()

# Load data (using sample data for demonstration)
df = preprocessor.get_sample_data()

print(f"Dataset shape: {df.shape}")
print("\nDataset info:")
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
df.info()
print("\nFirst few rows:")
# Last expression of the cell: rendered by the notebook as a rich table.
df.head()

In [None]:
# Data distribution: one panel for the labels, one for the subjects
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Label distribution (label 0 is plotted as "Fake", label 1 as "Real").
# .get(..., 0) keeps the cell working when one class is absent from the
# data; plain label_counts[0] / label_counts[1] would raise KeyError there.
label_counts = df['label'].value_counts()
n_fake = int(label_counts.get(0, 0))
n_real = int(label_counts.get(1, 0))
axes[0].bar(['Fake', 'Real'], [n_fake, n_real], color=['red', 'green'])
axes[0].set_title('Label Distribution')
axes[0].set_ylabel('Count')

# Subject distribution
subject_counts = df['subject'].value_counts()
axes[1].bar(subject_counts.index, subject_counts.values)
axes[1].set_title('Subject Distribution')
axes[1].set_ylabel('Count')
axes[1].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

print(f"Real news: {n_real} articles")
print(f"Fake news: {n_fake} articles")
# The ratio is real/fake; it is undefined when there are no fake articles.
if n_fake:
    print(f"Balance ratio: {n_real/n_fake:.2f}")
else:
    print("Balance ratio: n/a (no fake articles)")

## 3. Text Analysis and Preprocessing

In [None]:
# Run the preprocessor's dataset preparation on the loaded frame
df_processed = preprocessor.prepare_dataset(df)

print(f"Processed dataset shape: {df_processed.shape}")
print(f"\nProcessed text examples:")

# Compare the combined and processed text of the first three rows,
# each truncated to its first 100 characters.
preview = df_processed.head(3)
for original_text, cleaned_text in zip(preview['combined_text'], preview['processed_text']):
    print(f"\nOriginal: {original_text[:100]}...")
    print(f"Processed: {cleaned_text[:100]}...")
    print("-" * 50)

In [None]:
# Text length analysis: character and word counts of the processed text
processed_text = df_processed['processed_text']
df_processed['text_length'] = processed_text.str.len()
df_processed['word_count'] = processed_text.str.split().str.len()

# Per-label slices (label 1 is plotted as "Real", label 0 as "Fake")
is_real = df_processed['label'] == 1
is_fake = df_processed['label'] == 0
real_lengths = df_processed.loc[is_real, 'text_length']
fake_lengths = df_processed.loc[is_fake, 'text_length']
real_words = df_processed.loc[is_real, 'word_count']
fake_words = df_processed.loc[is_fake, 'word_count']

fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Top row: overall distributions -> (axes, column, colour, title, x-label)
overall_panels = [
    (axes[0, 0], 'text_length', 'skyblue', 'Text Length Distribution', 'Characters'),
    (axes[0, 1], 'word_count', 'lightgreen', 'Word Count Distribution', 'Words'),
]
for ax, column, bar_color, title, xlabel in overall_panels:
    ax.hist(df_processed[column], bins=20, alpha=0.7, color=bar_color)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Frequency')

# Bottom row: the same measures split by label -> (axes, [real, fake], title, x-label)
by_label_panels = [
    (axes[1, 0], [real_lengths, fake_lengths], 'Text Length by Label', 'Characters'),
    (axes[1, 1], [real_words, fake_words], 'Word Count by Label', 'Words'),
]
for ax, real_and_fake, title, xlabel in by_label_panels:
    ax.hist(real_and_fake, bins=15, alpha=0.7,
            label=['Real', 'Fake'], color=['green', 'red'])
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Frequency')
    ax.legend()

plt.tight_layout()
plt.show()

## 4. Word Clouds

In [None]:
# Concatenate the processed text of each class into one string per class
label_column = df_processed['label']
real_text = ' '.join(df_processed.loc[label_column == 1, 'processed_text'])
fake_text = ' '.join(df_processed.loc[label_column == 0, 'processed_text'])

fig, axes = plt.subplots(1, 2, figsize=(20, 8))

# Both clouds share the same canvas settings; real is generated before fake
cloud_options = dict(width=800, height=400, background_color='white')
wordcloud_real = WordCloud(**cloud_options).generate(real_text)
wordcloud_fake = WordCloud(**cloud_options).generate(fake_text)

# Left panel: real news, right panel: fake news
cloud_panels = (
    (axes[0], wordcloud_real, 'Real News Word Cloud'),
    (axes[1], wordcloud_fake, 'Fake News Word Cloud'),
)
for ax, cloud, title in cloud_panels:
    ax.imshow(cloud, interpolation='bilinear')
    ax.set_title(title, fontsize=16)
    ax.axis('off')

plt.tight_layout()
plt.show()

## 5. Model Training and Evaluation

In [None]:
# Build the trainer and run its full training pipeline
trainer = ModelTrainer()

print("🤖 Starting model training...")
results = trainer.train_full_pipeline()

print(f"\n✅ Training completed!")
# Report the 'best_model' entry of the results returned by the pipeline
best_model_entry = results['best_model']
print(f"Best model: {best_model_entry}")

In [None]:
# Tabulate the trainer's recorded performances: after the transpose each
# row is one entry of trainer.model_performances; values rounded to 4 places.
performance_df = pd.DataFrame(trainer.model_performances).T.round(4)

print("📊 Model Performance Comparison:")
print(performance_df)

# One bar chart per metric, laid out on a 2x2 grid in row-major order
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
metrics = ['accuracy', 'precision', 'recall', 'f1_score']
colors = ['skyblue', 'lightgreen', 'lightcoral', 'lightyellow']

for ax, metric, bar_color in zip(axes.flat, metrics, colors):
    bars = ax.bar(performance_df.index, performance_df[metric], color=bar_color)
    ax.set_title(f'{metric.replace("_", " ").title()}')
    ax.set_ylabel('Score')
    ax.set_ylim(0, 1)

    # Write each bar's value just above its top edge
    for bar in bars:
        bar_top = bar.get_height()
        bar_center = bar.get_x() + bar.get_width()/2.
        ax.text(bar_center, bar_top + 0.01,
                f'{bar_top:.3f}', ha='center', va='bottom')

plt.tight_layout()
plt.show()

## 6. Interactive Testing

In [None]:
# Hand-written examples, each with the label we expect the model to predict
test_examples = [
    {
        'title': 'Scientists Discover New Treatment for Cancer',
        'text': 'Researchers at a major university have developed a promising new treatment for cancer that shows significant results in clinical trials.',
        'expected': 'REAL'
    },
    {
        'title': 'SHOCKING: Aliens Found in Government Facility',
        'text': 'Government officials deny but sources confirm that extraterrestrial beings are being held at a secret facility.',
        'expected': 'FAKE'
    },
    {
        'title': 'Miracle Cure Discovered by Local Mom',
        'text': 'Local mother discovers amazing cure that doctors hate using this one simple trick from her kitchen.',
        'expected': 'FAKE'
    }
]

# Initialize detector with best model.
# The second element returned by get_best_model() is not needed here. It is
# bound to a named throwaway rather than `_`, because assigning `_` at the
# top level of a notebook shadows IPython's "last output" variable and stops
# IPython from updating it.
best_model_name, _unused_best_model = trainer.get_best_model()
detector = FakeNewsDetector(best_model_name)

print(f"🔍 Testing with best model: {best_model_name}")
print("=" * 60)

for i, example in enumerate(test_examples, 1):
    print(f"\nExample {i}:")
    print(f"Title: {example['title']}")
    print(f"Expected: {example['expected']}")

    # predict() is called with the article text first, then the title
    result = detector.predict(example['text'], example['title'])

    # Map the boolean-like 'is_real' entry onto the labels used in 'expected'
    status = "REAL" if result['is_real'] else "FAKE"
    confidence = result['confidence']

    print(f"Predicted: {status} ({confidence}% confidence)")
    print("✅ Correct!" if status == example['expected'] else "❌ Incorrect!")
    print("-" * 50)