In [None]:
import pandas as pd
import numpy as np
import re
import string
import spacy
import nltk
import matplotlib.pyplot as plt

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import load_model

In [None]:
import nltk

# Download required NLTK resources.
# nltk.download() returns False when a package cannot be fetched; the original
# calls ignored that, so a failed download only surfaced later as a LookupError
# inside stopwords/word_tokenize. Loop once and report failures explicitly.
NLTK_RESOURCES = (
    'stopwords',
    'punkt',
    'wordnet',
    'omw-1.4',
    'averaged_perceptron_tagger',
    'punkt_tab',
)

failed_resources = [name for name in NLTK_RESOURCES if not nltk.download(name)]
if failed_resources:
    # Warn rather than raise: not every resource is necessarily needed downstream.
    print(f"WARNING: could not download NLTK resources: {failed_resources}")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
# Read both CSV exports with the same parser options: UTF-8 decoding, and
# malformed lines are skipped instead of aborting the load.
csv_options = {"encoding": "utf-8", "on_bad_lines": "skip"}
true_news = pd.read_csv("True.csv", **csv_options)
fake_news = pd.read_csv("Fake.csv", **csv_options)

# Discard every row that has a missing value in any column (in place).
for frame in (true_news, fake_news):
    frame.dropna(inplace=True)

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
# NOTE(review): the dataset cell earlier in the notebook reads "True.csv" and
# "Fake.csv" from relative paths before this mount runs, and no visible cell
# reads from /content/drive -- confirm whether the mount is needed, and if the
# CSVs live on Drive, move this cell above the load.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Attach the binary target column: 1 marks real news, 0 marks fake news.
for frame, label_value in ((true_news, 1), (fake_news, 0)):
    frame['label'] = label_value

In [None]:
# Stack real and fake articles row-wise into one frame with a fresh 0..n-1 index.
news_data = pd.concat([true_news, fake_news], axis=0, ignore_index=True)

In [None]:
# Randomly permute all rows (fixed seed for reproducibility) and renumber the index.
news_data = news_data.sample(frac=1, random_state=42, ignore_index=True)

In [None]:
# Text Preprocessing Function
def preprocess_text(text):
    """Normalize raw article text before tokenization.

    Steps, in order: lowercase; drop ``[...]`` spans; drop ``http...`` URLs;
    drop ``<...>`` HTML tags; strip ASCII punctuation; turn line breaks into a
    single space; drop every token that contains a digit.

    Args:
        text (str): Raw text (headline and/or body).

    Returns:
        str: The cleaned text. Removed spans are not re-collapsed, so runs of
        multiple spaces may remain.
    """
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)  # Remove text inside brackets
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'<.*?>+', '', text)  # Remove HTML tags
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)  # Remove punctuation
    # Replace line breaks with a space (not ''): deleting them fused the last
    # word of one line with the first word of the next ("breaking\nnews" ->
    # "breakingnews"), producing tokens that never match the vocabulary.
    text = re.sub(r'[\r\n]+', ' ', text)
    text = re.sub(r'\w*\d\w*', '', text)  # Remove tokens containing digits
    return text

# Prepend each headline to its article body, then clean the combined string
# and store the result back in the 'text' column.
title_and_body = news_data['title'] + ' ' + news_data['text']
news_data['text'] = title_and_body.apply(preprocess_text)


In [None]:
# Tokenization and stopword removal
# preprocess_text strips ASCII punctuation before stopwords are filtered, so a
# contraction such as "don't" reaches this step as "dont". NLTK's English list
# stores contractions with their apostrophes, so those forms never matched.
# Add a punctuation-stripped variant of every stopword to close that gap.
_nltk_stop_words = stopwords.words('english')
_strip_punctuation = str.maketrans('', '', string.punctuation)
stop_words = set(_nltk_stop_words) | {
    word.translate(_strip_punctuation) for word in _nltk_stop_words
}

def remove_stopwords(text):
    """Tokenize ``text`` and drop every token present in ``stop_words``.

    Args:
        text (str): Text to filter.

    Returns:
        str: The surviving tokens, in order, joined by single spaces.
    """
    kept = []
    for token in word_tokenize(text):
        if token in stop_words:
            continue
        kept.append(token)
    return ' '.join(kept)

# Filter stopwords out of every article and write the result back to 'text'.
news_data['text'] = [remove_stopwords(document) for document in news_data['text']]

In [None]:
# Hold out 20% of the articles for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    news_data['text'],
    news_data['label'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Vocabulary cap passed to the tokenizer, and the fixed sequence length fed to the model.
max_words, max_length = 5000, 200

# Fit the word index on the training texts only, so the test split does not
# influence the vocabulary.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

# Convert each split to integer sequences, then bring every sequence to max_length.
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(split) for split in (X_train, X_test)
)
X_train_padded, X_test_padded = (
    pad_sequences(sequences, maxlen=max_length)
    for sequences in (X_train_seq, X_test_seq)
)


In [None]:
# Stacked-LSTM binary classifier: embedding -> LSTM(64, full sequence) ->
# LSTM(32) -> dense head ending in a single sigmoid unit.
lstm_layers = [
    Embedding(input_dim=max_words, output_dim=128, input_length=max_length),
    LSTM(64, return_sequences=True),  # emits the whole sequence for the next LSTM
    Dropout(0.5),
    LSTM(32),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
]
model = Sequential(lstm_layers)



In [None]:
# Configure training: Adam at learning rate 5e-4, binary cross-entropy loss,
# accuracy tracked as the reported metric.
optimizer = Adam(learning_rate=0.0005)
model.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)


In [None]:
# Train Model
# restore_best_weights=True: without it the model keeps the weights of the
# LAST epoch run, i.e. after `patience` epochs of worsening val_loss. The run
# logged under this cell shows val_loss bottoming out at epoch 4 and rising
# afterwards, so the weights saved later would otherwise be the degraded ones.
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# NOTE(review): the test split doubles as the validation set here, so early
# stopping is tuned on the same data that is later reported as "test"
# accuracy. Consider carving a separate validation split out of the training data.
history = model.fit(
    X_train_padded,
    y_train,
    epochs=10,
    batch_size=64,
    validation_data=(X_test_padded, y_test),
    callbacks=[early_stop],
)

Epoch 1/10
[1m562/562[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m257s[0m 443ms/step - accuracy: 0.9018 - loss: 0.2604 - val_accuracy: 0.9901 - val_loss: 0.0370
Epoch 2/10
[1m562/562[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m264s[0m 448ms/step - accuracy: 0.9931 - loss: 0.0289 - val_accuracy: 0.9933 - val_loss: 0.0310
Epoch 3/10
[1m562/562[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m271s[0m 464ms/step - accuracy: 0.9955 - loss: 0.0194 - val_accuracy: 0.9943 - val_loss: 0.0287
Epoch 4/10
[1m562/562[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m254s[0m 450ms/step - accuracy: 0.9980 - loss: 0.0079 - val_accuracy: 0.9949 - val_loss: 0.0233
Epoch 5/10
[1m562/562[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m263s[0m 451ms/step - accuracy: 0.9974 - loss: 0.0104 - val_accuracy: 0.9914 - val_loss: 0.0379
Epoch 6/10
[1m562/562[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m304s[0m 527ms/step - accuracy: 0.9969 - loss: 0.0131 - val_accuracy: 0.9884 - val_loss: 0.0561
Epoc

In [None]:
# Persist the trained network to an HDF5 file in the working directory.
# NOTE(review): the fitted `tokenizer` is not saved alongside it; a fresh
# session would need it to encode new text the same way.
model.save(filepath="fake_news_detector.h5")



In [None]:
# Reload the network from the saved HDF5 file, replacing the in-memory `model`.
model = load_model(filepath="fake_news_detector.h5")



In [None]:
# Score the model on the held-out split; evaluate() yields the loss followed
# by the compiled metric (accuracy).
evaluation = model.evaluate(X_test_padded, y_test)
loss, accuracy = evaluation
print(f"Test Accuracy: {accuracy:.4f}")

[1m281/281[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 62ms/step - accuracy: 0.9948 - loss: 0.0252
Test Accuracy: 0.9943


In [None]:
def real_time_prediction():
    """Interactively classify user-entered articles as real or fake.

    Repeatedly prompts for text, runs it through the same pipeline used for
    training (``preprocess_text`` -> ``remove_stopwords`` -> ``tokenizer`` ->
    ``pad_sequences``) and prints the model's verdict. The loop ends when the
    user types ``exit`` (case-insensitive, surrounding whitespace ignored), or
    when input is closed or interrupted.

    Relies on the notebook-level ``tokenizer``, ``max_length`` and ``model``.

    Returns:
        None
    """
    while True:
        try:
            article = input("\nEnter a news article (or type 'exit' to quit): ")
        except (EOFError, KeyboardInterrupt):
            # stdin closed or the kernel was interrupted: leave the loop
            # cleanly instead of surfacing a traceback.
            print("Exiting... Thank you!")
            break

        # strip() so that " exit " or "exit\t" still terminates the loop.
        if article.strip().lower() == 'exit':
            print("Exiting... Thank you!")
            break

        # Preprocess input text
        article = preprocess_text(article)
        article = remove_stopwords(article)
        article_seq = tokenizer.texts_to_sequences([article])

        # Empty input, or input made up only of stopwords / out-of-vocabulary
        # words, encodes to an empty sequence. Padding that yields an all-zero
        # row, and the resulting prediction would carry no information.
        if not article_seq[0]:
            print("\nNo recognizable words in the input; please enter a longer article.")
            continue

        article_padded = pad_sequences(article_seq, maxlen=max_length)

        # Predict: sigmoid output above 0.5 maps to label 1 (real news).
        prediction = model.predict(article_padded)
        result = "Real News ✅" if prediction[0][0] > 0.5 else "Fake News ❌"

        print(f"\nPrediction: {result}")

# Run real-time prediction
# Starts the interactive loop; this cell blocks on input() until the user
# types 'exit', so it will stall an unattended "Run All".
real_time_prediction()


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 413ms/step

Prediction: Fake News ❌
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 105ms/step

Prediction: Fake News ❌
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step

Prediction: Real News ✅
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 112ms/step

Prediction: Fake News ❌
