# Fake News Detection Analysis

This notebook provides a comprehensive analysis of fake news detection using various machine learning models.

## Table of Contents
1. [Data Loading and Exploration](#data-loading)
2. [Data Preprocessing](#data-preprocessing)
3. [Model Training and Comparison](#model-training)
4. [Results Analysis](#results-analysis)
5. [Feature Analysis](#feature-analysis)
6. [Model Deployment](#model-deployment)


## 1. Data Loading and Exploration {#data-loading}


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Suppress every warning category for the rest of the kernel session.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used below; consider narrowing the filter once the notebook is stable.
warnings.filterwarnings('ignore')

# Set style for plots
# NOTE(review): the 'seaborn-v0_8' style name only exists in newer matplotlib
# releases (3.6+) — confirm the target environment provides it.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Import our custom modules
import sys
import os
# These relative entries are resolved against the kernel's current working
# directory and must be appended before the project imports below can succeed.
# NOTE(review): assumes the notebook is executed from its own folder, with
# src/ and model/ one level up — confirm.
sys.path.append('../src')
sys.path.append('../model')

# Project-local helpers: the preprocessor class and the model factory/metadata functions.
from preprocess_data import FakeNewsPreprocessor
from get_model import get_model, get_model_info, compare_models

print("Libraries imported successfully!")


In [None]:
# Raw inputs: one CSV per class, addressed relative to the notebook's working directory
true_path = "../data/true.csv"
fake_path = "../data/fake.csv"

# The preprocessor instance is created once here and reused by the later cells
preprocessor = FakeNewsPreprocessor()
df = preprocessor.load_data(fake_path, true_path)

# Quick summary of what was loaded
print(
    f"Dataset shape: {df.shape}",
    f"Columns: {list(df.columns)}",
    sep="\n",
)


In [None]:
# Display basic information about the dataset
print("Dataset Info:")
print("=" * 50)
# DataFrame.info() writes its summary straight to stdout and returns None,
# so it is called directly; wrapping it in print() would add a stray "None" line.
df.info()

print("\nFirst few rows:")
print("=" * 50)
# Last expression of the cell: rendered by Jupyter as a rich table
df.head()


In [None]:
# Check for missing values
print("Missing values:")
print("=" * 30)
print(df.isnull().sum())

# Check label distribution
# Count the labels once and reuse the result instead of recomputing it per print.
label_counts = df['label'].value_counts()
total_rows = len(df)

print("\nLabel distribution:")
print("=" * 30)
print(label_counts)

# Label 0 is reported as fake and label 1 as true, as in the original cell.
# .get(label, 0) reports a count of 0 instead of raising KeyError when a class
# is absent from the data (e.g. after filtering or when only one file loaded).
fake_count = label_counts.get(0, 0)
true_count = label_counts.get(1, 0)

print(f"\nFake news: {fake_count} ({fake_count/total_rows*100:.1f}%)")
print(f"True news: {true_count} ({true_count/total_rows*100:.1f}%)")


## 2. Data Preprocessing and Model Training {#data-preprocessing}


In [None]:
# Clean the raw frame, split it, and build the text features.
# (No model is trained in this cell; training happens in the next one.)
print("Starting data preprocessing...")
df_processed = preprocessor.preprocess_data(df)

# Train/test partition produced by the preprocessor
X_train, X_test, y_train, y_test = preprocessor.split_data(df_processed)

# Vectorized text features consumed by the traditional ML models below
X_train_tfidf, X_test_tfidf = preprocessor.vectorize_text(X_train, X_test)

# Report the resulting matrix shapes for both partitions
print(
    f"Training set: {X_train_tfidf.shape}",
    f"Test set: {X_test_tfidf.shape}",
    sep="\n",
)


In [None]:
# Train and compare multiple models
# Imported once at the top of the cell instead of on every loop iteration.
from sklearn.metrics import accuracy_score, classification_report

models_to_test = ['logistic', 'svm', 'random_forest', 'naive_bayes']

# Maps model key -> {'model': fitted estimator, 'accuracy': float, 'predictions': test-set predictions}.
# Later cells read this dict to plot accuracies and pick the best model.
results = {}

for model_type in models_to_test:
    print(f"\nTraining {model_type} model...")
    print("-" * 40)

    # Create and train model
    model = get_model(model_type)
    model.fit(X_train_tfidf, y_train)

    # Make predictions on the held-out test features
    y_pred = model.predict(X_test_tfidf)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)

    results[model_type] = {
        'model': model,
        'accuracy': accuracy,
        'predictions': y_pred
    }

    print(f"Accuracy: {accuracy:.4f}")
    print("Classification Report:")
    # target_names are given in label order: first name for label 0, second for label 1
    print(classification_report(y_test, y_pred, target_names=['Fake', 'True']))


In [None]:
# Compare model accuracies
accuracies = [results[model]['accuracy'] for model in models_to_test]
model_names = [get_model_info(model)['name'] for model in models_to_test]

# Keep the zoomed-in 0.8-1.0 window when every model clears 0.8 (the original
# view); otherwise lower the floor so weaker models are not cut out of the plot.
lowest_accuracy = min(accuracies)
y_floor = 0.8 if lowest_accuracy >= 0.8 else max(0.0, lowest_accuracy - 0.05)

fig, (ax_bar, ax_line) = plt.subplots(1, 2, figsize=(12, 6))

# Left panel: one bar per model
bars = ax_bar.bar(model_names, accuracies, color=['skyblue', 'lightgreen', 'lightcoral', 'lightyellow'])
ax_bar.set_title('Model Accuracy Comparison')

# Add value labels on bars
for bar, acc in zip(bars, accuracies):
    ax_bar.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.005,
                f'{acc:.4f}', ha='center', va='bottom')

# Right panel: same values drawn as a line across the models
ax_line.plot(model_names, accuracies, marker='o', linewidth=2, markersize=8)
ax_line.set_title('Model Accuracy Trend')
ax_line.grid(True, alpha=0.3)

# Axis formatting shared by both panels
for ax in (ax_bar, ax_line):
    ax.set_xlabel('Model')
    ax.set_ylabel('Accuracy')
    ax.tick_params(axis='x', labelrotation=45)
    ax.set_ylim(y_floor, 1.0)

fig.tight_layout()
plt.show()

# Find best model (first one wins on ties, as np.argmax returns the first maximum)
best_model_type = models_to_test[np.argmax(accuracies)]
best_accuracy = max(accuracies)
print(f"\nBest Model: {get_model_info(best_model_type)['name']} with accuracy: {best_accuracy:.4f}")


In [None]:
# Create a prediction function
def predict_news(text, model, preprocessor):
    """
    Predict if a news article is fake or true

    Args:
        text (str): News article text
        model: Trained model exposing ``predict`` and, optionally,
            ``predict_proba``
        preprocessor: Trained preprocessor exposing ``clean_text`` and a
            fitted ``tfidf_vectorizer``

    Returns:
        dict: Prediction results with keys
            'prediction' ('Fake' when the predicted label equals 0, else 'True'),
            'confidence' (largest class probability, or None),
            'fake_probability' and 'true_probability' (None when the model
            does not provide ``predict_proba``)
    """
    # Clean the text with the same routine the preprocessor exposes
    cleaned_text = preprocessor.clean_text(text)

    # Vectorize a single-document batch
    text_vectorized = preprocessor.tfidf_vectorizer.transform([cleaned_text])

    # Predict; [0] unwraps the single-document batch
    prediction = model.predict(text_vectorized)[0]

    # Probability fields default to None and are only filled in when the model
    # can supply them. Deciding on availability (not on the truthiness of the
    # confidence value) keeps a legitimate 0.0 from being reported as "missing".
    confidence = None
    fake_probability = None
    true_probability = None
    if hasattr(model, 'predict_proba'):
        probability = model.predict_proba(text_vectorized)[0]
        confidence = max(probability)
        # NOTE(review): assumes column 0 is the fake class (label 0) and
        # column 1 the true class (label 1) — confirm against model.classes_.
        fake_probability = probability[0]
        true_probability = probability[1]

    return {
        'prediction': 'Fake' if prediction == 0 else 'True',
        'confidence': confidence,
        'fake_probability': fake_probability,
        'true_probability': true_probability
    }

# Smoke-test predict_news with the most accurate model from the comparison above
sample_text = "This is a test news article to see how our model performs."
best_model = results[best_model_type]['model']
result = predict_news(sample_text, best_model, preprocessor)

# Header, input text and predicted label
print(
    "Sample Prediction:",
    "=" * 30,
    f"Text: {sample_text}",
    f"Prediction: {result['prediction']}",
    sep="\n",
)

# Probability details are only reported when the model supplied a confidence
if result['confidence']:
    print(
        f"Confidence: {result['confidence']:.4f}",
        f"Fake Probability: {result['fake_probability']:.4f}",
        f"True Probability: {result['true_probability']:.4f}",
        sep="\n",
    )