# Fake News Detection Project

This notebook demonstrates building a Fake News Detector using machine learning. We'll load the dataset, preprocess the text, train a model, and evaluate its performance.

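**Note**: This notebook is designed for Google Colab. It expects `fake_news_dataset.csv` in the working directory (upload it to the Colab session first), with at least a `text` column and a `label` column; a label of `1` is treated as real news and any other value as fake. If running locally, adjust file paths accordingly.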

In [None]:
# Import Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import pickle

## 1. Data Loading and Exploration

In [None]:
# Read the raw articles from the CSV in the current working directory
df = pd.read_csv('fake_news_dataset.csv')

# Quick overview: size, schema and class balance
print(f"Dataset shape: {df.shape}")
print(f"\nColumns: {df.columns.tolist()}")
print("\nLabel distribution:")
label_counts = df['label'].value_counts()
print(label_counts)
print("\nFirst few rows:")
df.head()

## 2. Data Preprocessing

In [None]:
# Download NLTK data
# 'omw-1.4' (Open Multilingual WordNet) is fetched as well: some NLTK releases
# raise LookupError from WordNetLemmatizer when it is missing, and downloading
# it is harmless on releases that do not need it.
for nltk_resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_resource)

# Initialize lemmatizer and stopwords (shared by preprocess_text below)
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalize one article so it can be fed to the TF-IDF vectorizer.

    The text is lowercased, every character that is not an ASCII letter or
    whitespace is removed, the result is split on whitespace, English stop
    words are dropped and the remaining tokens are lemmatized.

    Args:
        text: Raw article text. Non-string values (for example ``None`` or
            the float ``NaN`` pandas uses for missing cells) are treated as
            empty text instead of raising ``AttributeError``.

    Returns:
        str: The cleaned tokens joined by single spaces, or ``''`` when no
        token survives the cleaning.
    """
    # Missing cells arrive as None / float NaN and have no .lower()
    if not isinstance(text, str):
        return ''
    # Convert to lowercase
    text = text.lower()
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenize
    words = text.split()
    # Remove stopwords and lemmatize
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(words)

# Clean every article and store the result in a new column next to the raw text
df['clean_text'] = df['text'].map(preprocess_text)

print("Preprocessing completed!")
df[['text', 'clean_text']].head()

## 3. Feature Extraction

In [None]:
# TF-IDF Vectorization
# Learn the vocabulary and IDF weights from the training rows only, so that
# statistics of the held-out articles do not leak into the features.
# NOTE: test_size and random_state here must stay identical to the
# train_test_split call in the Model Training cell. With the same parameters
# and the same number of rows, train_test_split selects the same rows both
# times, so the rows used to fit the vectorizer are exactly the training rows.
train_clean_text, _ = train_test_split(df['clean_text'], test_size=0.2, random_state=42)

tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
tfidf.fit(train_clean_text)

# Transform every row with the train-fitted vectorizer; the split happens later
X = tfidf.transform(df['clean_text'])
y = df['label']

print("Feature extraction completed!")
print("Shape of feature matrix:", X.shape)

## 4. Model Training

In [None]:
# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a logistic-regression classifier on the training portion
model = LogisticRegression(random_state=42).fit(X_train, y_train)

print("Model training completed!")

## 5. Model Evaluation

In [None]:
# Predict labels for the held-out rows
y_pred = model.predict(X_test)

# Overall accuracy followed by the per-class report
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Confusion matrix rendered as an annotated heatmap
# NOTE(review): the tick names assume the first class is "Fake" and the
# second is "Real" -- confirm against the dataset's label encoding.
cm = confusion_matrix(y_test, y_pred)
class_names = ['Fake', 'Real']
fig, ax = plt.subplots(figsize=(8,6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_title('Confusion Matrix')
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
plt.show()

## 6. Model Saving and Loading

In [None]:
# Persist the trained classifier and the fitted vectorizer as pickle files
artifacts = {
    'fake_news_model.pkl': model,
    'tfidf_vectorizer.pkl': tfidf,
}
for artifact_path, artifact in artifacts.items():
    with open(artifact_path, 'wb') as f:
        pickle.dump(artifact, f)

print("Model and vectorizer saved successfully!")

# Function to predict on new text
def predict_news(text, model=None, vectorizer=None):
    """Classify a single piece of news text as "Real" or "Fake".

    Args:
        text: Raw article text; it is cleaned with ``preprocess_text`` before
            being vectorized.
        model: Optional, already-loaded classifier. When ``None`` (default)
            it is unpickled from ``fake_news_model.pkl``.
        vectorizer: Optional, already-loaded fitted vectorizer. When ``None``
            (default) it is unpickled from ``tfidf_vectorizer.pkl``.

    Returns:
        tuple: ``(label, probability)`` where ``label`` is ``"Real"`` when the
        predicted class equals ``1`` and ``"Fake"`` otherwise, and
        ``probability`` is the row returned by ``predict_proba`` for this text.

    Passing ``model`` and ``vectorizer`` avoids re-reading both pickle files
    on every call when many texts are classified in a row.
    """
    # SECURITY: pickle.load can execute code stored in the file. Only load
    # artifacts that were written by this notebook or come from a trusted source.
    if model is None:
        with open('fake_news_model.pkl', 'rb') as f:
            model = pickle.load(f)
    if vectorizer is None:
        with open('tfidf_vectorizer.pkl', 'rb') as f:
            vectorizer = pickle.load(f)

    # Apply the same cleaning that was used on the training data
    clean_text = preprocess_text(text)
    # Vectorize (transform expects an iterable of documents)
    vectorized = vectorizer.transform([clean_text])
    # Predict the label and the per-class probabilities for the single row
    prediction = model.predict(vectorized)[0]
    probability = model.predict_proba(vectorized)[0]

    label = "Real" if prediction == 1 else "Fake"
    return label, probability

# Try the helper on a made-up sentence
test_text = "This is a sample news article about technology."
result, prob = predict_news(test_text)
confidence = max(prob)  # highest of the per-class probabilities
print(f"Prediction: {result}")
print(f"Confidence: {confidence:.4f}")

## 7. Streamlit App Development

To create a Streamlit app, create a new file `app.py` with the following code:

```python
import streamlit as st
import pickle
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Load model and vectorizer
@st.cache_resource
def load_model():
    """Unpickle the classifier and the TF-IDF vectorizer from the working directory.

    Returns:
        tuple: ``(model, tfidf)`` read from ``fake_news_model.pkl`` and
        ``tfidf_vectorizer.pkl`` respectively.
    """
    loaded = []
    for artifact_path in ('fake_news_model.pkl', 'tfidf_vectorizer.pkl'):
        with open(artifact_path, 'rb') as fh:
            loaded.append(pickle.load(fh))
    model, tfidf = loaded
    return model, tfidf

# Preprocessing function
def _load_nlp_tools():
    """Return ``(lemmatizer, stop_words)``, downloading NLTK corpora on first miss."""
    try:
        stop_words = set(stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()
        # Probe call: surfaces a missing WordNet corpus here rather than mid-request
        lemmatizer.lemmatize('articles')
    except LookupError:
        # Fresh environment: the corpora were never downloaded for this app
        for package in ('stopwords', 'wordnet', 'omw-1.4'):
            nltk.download(package, quiet=True)
        stop_words = set(stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()
    return lemmatizer, stop_words


def preprocess_text(text):
    """Clean text the same way the training data was cleaned.

    The text is lowercased, every character that is not an ASCII letter or
    whitespace is removed, English stop words are dropped and the remaining
    tokens are lemmatized.

    Args:
        text: Raw article text. Non-string values are treated as empty text.

    Returns:
        str: The cleaned tokens joined by single spaces (``''`` if none remain).
    """
    if not isinstance(text, str):
        return ''
    lemmatizer, stop_words = _load_nlp_tools()
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    words = text.split()
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(words)

# Streamlit app
st.title("Fake News Detector")
st.write("Paste a news article below to check if it's fake or real.")

user_input = st.text_area("Enter news text:", height=200)

if st.button("Analyze"):
    if not user_input.strip():
        st.warning("Please enter some text to analyze.")
    else:
        clean_text = preprocess_text(user_input)
        if not clean_text:
            # Only digits, punctuation or stop words were entered. The
            # vectorizer would produce an all-zero row and the "confidence"
            # shown would not reflect the text at all, so refuse to classify.
            st.warning("No analyzable words found. Please enter a longer piece of text.")
        else:
            model, tfidf = load_model()
            vectorized = tfidf.transform([clean_text])
            prediction = model.predict(vectorized)[0]
            probability = model.predict_proba(vectorized)[0]

            result = "Real News" if prediction == 1 else "Fake News"
            # Highest per-class probability, shown as the confidence
            confidence = max(probability)

            if prediction == 1:
                st.success(f"✅ {result} (Confidence: {confidence:.2%})")
            else:
                st.error(f"❌ {result} (Confidence: {confidence:.2%})")
```

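Run the app with: `streamlit run app.py`. The files `fake_news_model.pkl` and `tfidf_vectorizer.pkl` saved in section 6 must be in the directory you launch the app from, because `app.py` opens them by relative path.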