# Fake News Detection Project

A simple machine learning project to detect fake news using text classification.

## Project Steps:
1. Import libraries
2. Load datasets (Fake.csv and True.csv) and add labels (Fake=0, Real=1)
3. Preprocess text data
4. Convert text to features using TF-IDF
5. Split data (80% train, 20% test)
6. Train Logistic Regression and Naive Bayes models
7. Evaluate and compare models
8. Build a prediction function for new articles
9. Test the prediction function on sample articles
10. Classify a user-supplied article

## Step 1: Import Libraries

In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import nltk
import warnings
# Suppress every warning category for the rest of the session so the cell
# output stays compact.
# NOTE(review): this is a global filter, so it also hides any warnings that
# pandas / scikit-learn / NLTK raise in later cells — confirm that is intended.
warnings.filterwarnings('ignore')

# Confirmation message; only reached if the imports above did not raise.
print("Libraries imported successfully!")

## Step 2: Load and Prepare Data

In [None]:
# Read both source files and tag every row with its class:
# 0 = fake news, 1 = real news.
print("Loading datasets...")
fake_df = pd.read_csv('Fake.csv').assign(label=0)
true_df = pd.read_csv('True.csv').assign(label=1)

# Stack the two frames into a single table with a fresh 0..n-1 index.
df = pd.concat((fake_df, true_df), ignore_index=True)

# Quick size / schema report.
print(f"Total articles: {len(df)}")
print(f"Fake articles: {len(fake_df)}")
print(f"Real articles: {len(true_df)}")
print(f"Columns: {list(df.columns)}")

In [None]:
# Preview the first five rows of the combined table.
# (As the last expression of the cell, the frame is rendered by the notebook.)
print("Sample data:")
df.head(n=5)

## Step 3: Text Preprocessing

In [None]:
# Build the stop-word set consumed by preprocess_text.
#   Preferred source: NLTK's English stop-word corpus (fetched on demand).
#   Fallback: a small built-in list, so the notebook keeps working when the
#   corpus cannot be downloaded or loaded (e.g. no network access).
try:
    nltk.download('stopwords', quiet=True)
    from nltk.corpus import stopwords
    stop_words = set(stopwords.words('english'))
except Exception as exc:
    # `except Exception` instead of a bare `except:` so that KeyboardInterrupt
    # and SystemExit still propagate. The fallback is reported rather than
    # applied silently, because it changes which words get filtered out.
    print(f"NLTK stopwords unavailable ({type(exc).__name__}); using built-in fallback list")
    stop_words = {
        'i', 'me', 'my', 'we', 'our', 'you', 'your', 'he', 'him', 'his',
        'she', 'her', 'it', 'its', 'they', 'them', 'their', 'this', 'that',
        'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been',
        'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing',
        'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until',
        'while', 'of', 'at', 'by', 'for', 'with', 'through', 'during', 'before',
        'after', 'above', 'below', 'up', 'down', 'in', 'out', 'on', 'off',
        'over', 'under', 'again', 'further', 'then', 'once',
    }

def preprocess_text(text):
    """Normalize one article body for vectorization.

    The text is lower-cased, every character that is neither an ASCII letter
    nor whitespace is deleted, and the remaining whitespace-separated tokens
    are filtered: tokens found in the module-level ``stop_words`` set and
    tokens of two characters or fewer are dropped.

    Args:
        text: Raw article text, or a missing value (anything ``pd.isna``
            reports as missing).

    Returns:
        The surviving tokens joined by single spaces; an empty string when
        the input is missing or nothing survives the filtering.
    """
    if pd.isna(text):
        return ""

    # Lower-case first, then strip punctuation, digits and other non-letters.
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text.lower())

    # split() with no argument already collapses runs of whitespace, so the
    # tokens can be filtered directly.
    kept_tokens = []
    for token in letters_only.split():
        if token in stop_words or len(token) <= 2:
            continue
        kept_tokens.append(token)

    return ' '.join(kept_tokens)

print("Text preprocessing function created")

In [None]:
# Clean every article body and store the result in a new column.
print("Preprocessing text data...")
df['processed_text'] = df['text'].apply(preprocess_text)

# Drop rows whose cleaned text came out empty, and report how many were lost.
initial_count = len(df)
has_content = df['processed_text'] != ""
df = df.loc[has_content]
print(f"Articles after cleaning: {len(df)} (removed {initial_count - len(df)} empty articles)")

In [None]:
# Print the first 200 characters of one article before and after cleaning.
print("Preprocessing example:")
first_row = df.iloc[0]
print("Original:", first_row['text'][:200])
print("Processed:", first_row['processed_text'][:200])

## Step 4: Feature Extraction (TF-IDF)

In [None]:
# Turn the cleaned text into a sparse TF-IDF matrix (at most 5000 terms).
# NOTE(review): the vectorizer is fitted on every row of df here, i.e. also on
# articles that a later cell holds out for testing — confirm that is intended.
print("Converting text to features using TF-IDF...")
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X = vectorizer.fit_transform(df['processed_text'])
y = df['label']

n_features = X.shape[1]
fake_total = (y == 0).sum()
real_total = (y == 1).sum()

print(f"Feature matrix shape: {X.shape}")
print(f"Number of features: {n_features}")
print("Target distribution:")
print(f"  Fake (0): {fake_total} articles")
print(f"  Real (1): {real_total} articles")

## Step 5: Split Data (80/20)

In [None]:
# Split data into training and testing sets (80/20, stratified by label).
#
# The cleaned text is split rather than the already-vectorized matrix, so the
# TF-IDF vocabulary and IDF weights can be learned from the training rows
# only. Same test_size / random_state / stratify as before.
train_text, test_text, y_train, y_test = train_test_split(
    df['processed_text'], y, test_size=0.2, random_state=42, stratify=y
)

# Refit the shared vectorizer on the training split only, then project the
# held-out articles into that feature space. Fitting on the whole corpus
# would leak test-set term statistics into the features and bias the
# evaluation. The prediction function reuses this same fitted vectorizer.
X_train = vectorizer.fit_transform(train_text)
X_test = vectorizer.transform(test_text)

print(f"Training samples: {X_train.shape[0]} ({X_train.shape[0]/len(df)*100:.1f}%)")
print(f"Testing samples: {X_test.shape[0]} ({X_test.shape[0]/len(df)*100:.1f}%)")
print("Training set distribution:")
print(f"  Fake: {(y_train == 0).sum()} articles")
print(f"  Real: {(y_train == 1).sum()} articles")

## Step 6: Train Models

In [None]:
# Fit both classifiers on the training split.

# 1) Logistic Regression (seeded for reproducibility)
print("Training Logistic Regression...")
lr_model = LogisticRegression(random_state=42).fit(X_train, y_train)
print("Logistic Regression trained successfully")

# 2) Multinomial Naive Bayes
print("Training Naive Bayes...")
nb_model = MultinomialNB().fit(X_train, y_train)
print("Naive Bayes trained successfully")

## Step 7: Evaluate Models

In [None]:
# Predict labels for the test split with each model.
lr_pred, nb_pred = lr_model.predict(X_test), nb_model.predict(X_test)

# Fraction of test articles each model labelled correctly.
lr_accuracy = accuracy_score(y_true=y_test, y_pred=lr_pred)
nb_accuracy = accuracy_score(y_true=y_test, y_pred=nb_pred)

print(f"Logistic Regression Accuracy: {lr_accuracy:.4f} ({lr_accuracy*100:.2f}%)")
print(f"Naive Bayes Accuracy: {nb_accuracy:.4f} ({nb_accuracy*100:.2f}%)")

In [None]:
# Logistic Regression: accuracy, confusion matrix and per-class report
# computed on the test split.
print("LOGISTIC REGRESSION RESULTS:")
print("=" * 40)
print(f"Accuracy: {lr_accuracy:.4f} ({lr_accuracy*100:.2f}%)")

print("\nConfusion Matrix:")
lr_matrix = confusion_matrix(y_test, lr_pred)
print(lr_matrix)

print("\nClassification Report:")
lr_report = classification_report(
    y_test,
    lr_pred,
    target_names=['Fake News', 'Real News'],
)
print(lr_report)

In [None]:
# Naive Bayes: accuracy, confusion matrix and per-class report
# computed on the test split.
print("NAIVE BAYES RESULTS:")
print("=" * 40)
print(f"Accuracy: {nb_accuracy:.4f} ({nb_accuracy*100:.2f}%)")

print("\nConfusion Matrix:")
nb_matrix = confusion_matrix(y_test, nb_pred)
print(nb_matrix)

print("\nClassification Report:")
nb_report = classification_report(
    y_test,
    nb_pred,
    target_names=['Fake News', 'Real News'],
)
print(nb_report)

In [None]:
# Compare models
print("MODEL COMPARISON:")
print("=" * 30)
print(f"Logistic Regression: {lr_accuracy*100:.2f}%")
print(f"Naive Bayes: {nb_accuracy*100:.2f}%")

# Select the model that later cells use for prediction. Logistic Regression
# is chosen only when it is strictly more accurate; on an exact tie Naive
# Bayes is kept, matching the previous selection rule.
if lr_accuracy > nb_accuracy:
    best_model, best_name = lr_model, "Logistic Regression"
else:
    best_model, best_name = nb_model, "Naive Bayes"

# Gap between the two accuracies, in percentage points (never negative).
improvement = abs(lr_accuracy - nb_accuracy) * 100

if lr_accuracy == nb_accuracy:
    # Do not announce a "winner" that is better by 0.00 points.
    print(f"\nResult: Tie - both models have the same accuracy; using {best_name}")
else:
    print(f"\nWinner: {best_name}")
    print(f"Better by: {improvement:.2f} percentage points")

## Step 8: Prediction Function

In [None]:
def predict_news(article_text):
    """Classify a single article with the selected model.

    The text goes through ``preprocess_text``, is vectorized with the fitted
    module-level ``vectorizer`` and is scored by the module-level
    ``best_model``.

    Args:
        article_text: Raw article text to classify.

    Returns:
        A ``(label, confidence)`` pair. ``label`` is ``"REAL"`` when the
        model predicts class 1 and ``"FAKE"`` otherwise; ``confidence`` is
        the largest class probability reported by the model. When nothing is
        left of the text after preprocessing, the pair is
        ``("Unable to process text", 0.5)``.
    """
    cleaned = preprocess_text(article_text)
    if not cleaned:
        # No usable tokens survived cleaning, so there is nothing to score.
        return "Unable to process text", 0.5

    # The vectorizer expects an iterable of documents, hence the 1-item list.
    features = vectorizer.transform([cleaned])

    predicted_label = best_model.predict(features)[0]
    class_probabilities = best_model.predict_proba(features)[0]

    verdict = "FAKE" if predicted_label != 1 else "REAL"
    return verdict, max(class_probabilities)

print("Prediction function created successfully!")
print(f"Using best model: {best_name}")

## Step 9: Test with Sample Articles

In [None]:
# Run the prediction function over a handful of hand-written headlines.
sample_articles = [
    "Scientists at NASA have discovered a new exoplanet with signs of water",
    "Aliens have landed in New York and are demanding to speak with the mayor",
    "The Federal Reserve announced interest rates will remain unchanged",
    "Local man gains superpowers after eating radioactive pizza",
    "Climate change continues to affect global weather patterns",
]

print("SAMPLE PREDICTIONS:")
print("=" * 50)

# Samples are numbered from 1 in the printed output.
for sample_no, headline in enumerate(sample_articles, start=1):
    verdict, score = predict_news(headline)
    print(f"\nSample {sample_no}:")
    print(f"Article: {headline}")
    print(f"Prediction: {verdict}")
    print(f"Confidence: {score:.3f} ({score*100:.1f}%)")
    print("-" * 40)

## Step 10: Interactive Prediction

In [None]:
# Interactive prediction: classify one article supplied by the reader.
print("INTERACTIVE PREDICTION:")
print("Enter a news article to classify it:")

# Example usage (replace with your own text)
user_article = "The stock market reached new highs today as investors showed confidence in the economic recovery."

# Shortest article (after trimming surrounding whitespace) that is classified.
# The threshold is shared by the check and the message so they cannot drift
# apart; the check is inclusive because the message promises "at least".
MIN_ARTICLE_CHARS = 10

if len(user_article.strip()) >= MIN_ARTICLE_CHARS:
    result, confidence = predict_news(user_article)
    print(f"\nArticle: {user_article}")
    print(f"Prediction: {result}")
    print(f"Confidence: {confidence:.3f} ({confidence*100:.1f}%)")
    print(f"Model used: {best_name}")
else:
    print(f"Please enter a longer article (at least {MIN_ARTICLE_CHARS} characters)")

## Project Summary

In [None]:
# Final recap: dataset size, feature count, split sizes and model scores.
summary_lines = [
    "FAKE NEWS DETECTION PROJECT - SUMMARY",
    "=" * 50,
    f"Dataset: {len(df)} articles processed",
    f"Features: {X.shape[1]} TF-IDF features",
    f"Training: {X_train.shape[0]} samples",
    f"Testing: {X_test.shape[0]} samples",
    "\nModel Performance:",
    f"- Logistic Regression: {lr_accuracy*100:.2f}%",
    f"- Naive Bayes: {nb_accuracy*100:.2f}%",
    f"- Best Model: {best_name}",
    "\nProject Status: COMPLETED SUCCESSFULLY",
]

for summary_line in summary_lines:
    print(summary_line)