In [4]:
# @title Fake News Detection - Final Working Version
# Step 1: Install and import everything first
!pip install pandas numpy nltk scikit-learn tensorflow --quiet

import pandas as pd
import numpy as np
import re
import nltk

# Step 2: Download ALL required NLTK data first
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)
nltk.download('punkt_tab', quiet=True)  # This fixes the specific error

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

print("✅ All packages installed and ready!")

# Step 3: Create better sample dataset
print("\n📊 Creating enhanced dataset...")
data = {
    'text': [
        "NASA confirms climate change is accelerating due to human activity",
        "Aliens built the pyramids and live inside Earth",
        "Regular exercise improves mental health, says new study",
        "5G towers spread COVID-19 virus, doctors confirm",
        "Vaccines are safe and effective, WHO reports",
        "Bill Gates implants microchips through vaccines",
        "Scientists agree that global warming is real",
        "The government is hiding evidence of UFOs",
        "Eating vegetables improves longevity",
        "Moon is made of cheese, astronauts confirm"
    ],
    'label': [0, 1, 0, 1, 0, 1, 0, 1, 0, 1]  # 0 = Real, 1 = Fake
}
df = pd.DataFrame(data)
print("\nSample Dataset (First 5 entries):")
print(df.head())

# Step 4: Improved text cleaning
print("\n🧹 Cleaning text data...")
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    try:
        text = text.lower()
        text = re.sub(r'http\S+|www\S+|https\S+', '', text)  # URLs
        text = re.sub(r'[^\w\s]', '', text)  # Punctuation
        text = re.sub(r'\d+', '', text)  # Numbers
        words = nltk.word_tokenize(text)
        words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words and len(word) > 2]
        return ' '.join(words)
    except Exception as e:
        print(f"Error cleaning text: {e}")
        return ""

df['cleaned_text'] = df['text'].apply(clean_text)
print("\nBefore and after cleaning example:")
print("Original:", df['text'][0])
print("Cleaned:", df['cleaned_text'][0])

# Step 5: Prepare ML data
print("\n🔢 Converting text to numbers...")
tfidf = TfidfVectorizer(max_features=1000)
X = tfidf.fit_transform(df['cleaned_text'])
y = df['label']

# Better train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Step 6: Train model
print("\n🤖 Training the model...")
model = MultinomialNB()
model.fit(X_train, y_train)

# Evaluate
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"\n🎯 Model Accuracy: {accuracy*100:.2f}%")

# Step 7: Prediction function with error handling
def predict_news(news_text):
    try:
        cleaned = clean_text(news_text)
        if not cleaned.strip():  # Check for empty string after cleaning
            print("⚠️ Text couldn't be properly processed")
            return

        vector = tfidf.transform([cleaned])
        prediction = model.predict(vector)[0]
        confidence = model.predict_proba(vector)[0].max()

        result = "🔴 FAKE NEWS" if prediction == 1 else "🟢 REAL NEWS"
        print(f"\n📝 Your Text: {news_text}")
        print(f"🧼 Cleaned Version: {cleaned}")
        print(f"\n🔎 Prediction: {result} (Confidence: {confidence*100:.1f}%)")
    except Exception as e:
        print(f"❌ Error: {e}")

# Test examples
print("\n🧪 Testing the model with sample news:")
test_examples = [
    "The Earth is flat according to new research",
    "COVID-19 vaccines have been approved by health authorities worldwide",
    "Drinking bleach cures coronavirus",
    "Climate change is causing more extreme weather events",
    "The moon landing was filmed in a Hollywood studio"
]

for example in test_examples:
    predict_news(example)
    print("━"*50)

print("\n✨ Try your own news! Run: predict_news('Your news text here')")
print("Example: predict_news('Some breaking news story')")

✅ All packages installed and ready!

📊 Creating enhanced dataset...

Sample Dataset (First 5 entries):
                                                text  label
0  NASA confirms climate change is accelerating d...      0
1    Aliens built the pyramids and live inside Earth      1
2  Regular exercise improves mental health, says ...      0
3   5G towers spread COVID-19 virus, doctors confirm      1
4       Vaccines are safe and effective, WHO reports      0

🧹 Cleaning text data...

Before and after cleaning example:
Original: NASA confirms climate change is accelerating due to human activity
Cleaned: nasa confirms climate change accelerating due human activity

🔢 Converting text to numbers...

🤖 Training the model...

🎯 Model Accuracy: 33.33%

🧪 Testing the model with sample news:

📝 Your Text: The Earth is flat according to new research
🧼 Cleaned Version: earth flat according new research

🔎 Prediction: 🟢 REAL NEWS (Confidence: 60.6%)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━