# ***TruthLens – AI-Powered Fake News Detection***

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


**PROBLEM STATEMENT : **The problem is the unchecked spread of fake news online.
The solution is an ML/NLP-powered fake news detection system with explainability and a user-friendly deployment.

Dataset Link: https://www.kaggle.com/datasets/vishakhdapat/fake-news-detection/data

# **Data Collection & Preprocessing**

In [3]:
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
import nltk

nltk.download('stopwords')

# 1. Load Dataset
# Update the file path to where your dataset is located in Google Drive
df = pd.read_csv("/content/drive/MyDrive/FakeNewsDetection_Project/fake_and_real_news.csv")
print("Before cleaning:\n", df.head())

# 2. Text Cleaning
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def clean_text(text):
    # Lowercase
    text = str(text).lower()
    # Remove HTML
    text = BeautifulSoup(text, "html.parser").get_text()
    # Remove punctuation & numbers
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    # Tokenization & remove stopwords
    words = text.split()
    words = [w for w in words if w not in stop_words]
    # Stemming
    words = [stemmer.stem(w) for w in words]
    return " ".join(words)

df["clean_text"] = df["Text"].apply(clean_text)

# 3. Label Encoding
df["label"] = df["label"].map({"Fake": 0, "Real": 1})

print("\nAfter Encoding:\n", df.head())

# 4. Train-Test Split
X = df["clean_text"]
y = df["label"]

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"\nTrain size: {X_train.shape[0]}, Test size: {X_test.shape[0]}")

# 5. Save Processed Data
df[["clean_text", "label"]].to_csv("processed_fake_news.csv", index=False)
print("\nProcessed data saved as 'processed_fake_news.csv'")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Before cleaning:
                                                 Text label
0   Top Trump Surrogate BRUTALLY Stabs Him In The...  Fake
1  U.S. conservative leader optimistic of common ...  Real
2  Trump proposes U.S. tax overhaul, stirs concer...  Real
3   Court Forces Ohio To Allow Millions Of Illega...  Fake
4  Democrats say Trump agrees to work on immigrat...  Real

After Encoding:
                                                 Text  label  \
0   Top Trump Surrogate BRUTALLY Stabs Him In The...      0   
1  U.S. conservative leader optimistic of common ...      1   
2  Trump proposes U.S. tax overhaul, stirs concer...      1   
3   Court Forces Ohio To Allow Millions Of Illega...      0   
4  Democrats say Trump agrees to work on immigrat...      1   

                                          clean_text  
0  top trump surrog brutal stab back pathet video...  
1  u conserv leader optimist common ground health...  
2  trump propos u tax overhaul stir concern defic...  
3  court fo

In [4]:
!ls



drive  processed_fake_news.csv	sample_data


In [5]:
!cp processed_fake_news.csv /content/drive/MyDrive/FakeNewsDetection_Project/


In [6]:
# =========================
# 1. Imports
# =========================
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, GRU, Dense, Dropout, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# =========================
# 2. Load Processed Data
# =========================
df = pd.read_csv("/content/drive/MyDrive/FakeNewsDetection_Project/processed_fake_news.csv")

X = df["clean_text"]
y = df["label"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# =========================
# 3. TF-IDF + Baseline Models
# =========================
print("\n=== TF-IDF + Logistic Regression ===")
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train_tfidf, y_train)
y_pred_lr = log_reg.predict(X_test_tfidf)

print("Accuracy:", accuracy_score(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr))

print("\n=== TF-IDF + Random Forest ===")
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X_train_tfidf, y_train)
y_pred_rf = rf.predict(X_test_tfidf)

print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))

# =========================
# 4. Deep Learning Models
# =========================
# Tokenizer + Padding
max_words = 10000
max_len = 200

tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding="post")
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len, padding="post")

# -------------------------
# Model 1: Simple Dense NN
# -------------------------
print("\n=== Dense Neural Network ===")
dense_model = Sequential([
    Embedding(max_words, 128, input_length=max_len),
    tf.keras.layers.GlobalAveragePooling1D(),
    Dense(64, activation="relu"),
    Dropout(0.5),
    Dense(1, activation="sigmoid")
])

dense_model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
dense_model.fit(X_train_pad, y_train, validation_split=0.2, epochs=3, batch_size=64)
dense_loss, dense_acc = dense_model.evaluate(X_test_pad, y_test)
print("Dense NN Accuracy:", dense_acc)

# -------------------------
# Model 2: LSTM
# -------------------------
print("\n=== LSTM ===")
lstm_model = Sequential([
    Embedding(max_words, 128, input_length=max_len),
    LSTM(128, return_sequences=False),
    Dropout(0.5),
    Dense(1, activation="sigmoid")
])

lstm_model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
lstm_model.fit(X_train_pad, y_train, validation_split=0.2, epochs=3, batch_size=64)
lstm_loss, lstm_acc = lstm_model.evaluate(X_test_pad, y_test)
print("LSTM Accuracy:", lstm_acc)

# -------------------------
# Model 3: GRU
# -------------------------
print("\n=== GRU ===")
gru_model = Sequential([
    Embedding(max_words, 128, input_length=max_len),
    GRU(128, return_sequences=False),
    Dropout(0.5),
    Dense(1, activation="sigmoid")
])

gru_model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
gru_model.fit(X_train_pad, y_train, validation_split=0.2, epochs=3, batch_size=64)
gru_loss, gru_acc = gru_model.evaluate(X_test_pad, y_test)
print("GRU Accuracy:", gru_acc)

# =========================
# 5. Hyperparameter Tuning (example for LogReg)
# =========================
print("\n=== Hyperparameter Tuning: Logistic Regression ===")
param_grid = {
    'C': [0.1, 1, 10],
    'solver': ['liblinear', 'lbfgs']
}
grid = GridSearchCV(LogisticRegression(max_iter=1000), param_grid, cv=3, scoring="accuracy")
grid.fit(X_train_tfidf, y_train)
print("Best Parameters:", grid.best_params_)
print("Best CV Score:", grid.best_score_)


=== TF-IDF + Logistic Regression ===
Accuracy: 0.9949494949494949
              precision    recall  f1-score   support

           0       1.00      0.99      0.99       973
           1       0.99      1.00      1.00      1007

    accuracy                           0.99      1980
   macro avg       0.99      0.99      0.99      1980
weighted avg       0.99      0.99      0.99      1980


=== TF-IDF + Random Forest ===
Accuracy: 0.998989898989899
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       973
           1       1.00      1.00      1.00      1007

    accuracy                           1.00      1980
   macro avg       1.00      1.00      1.00      1980
weighted avg       1.00      1.00      1.00      1980


=== Dense Neural Network ===




Epoch 1/3
[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 10ms/step - accuracy: 0.7740 - loss: 0.5174 - val_accuracy: 0.9804 - val_loss: 0.0787
Epoch 2/3
[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9831 - loss: 0.0655 - val_accuracy: 0.9924 - val_loss: 0.0294
Epoch 3/3
[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9953 - loss: 0.0209 - val_accuracy: 0.9949 - val_loss: 0.0163
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.9968 - loss: 0.0142
Dense NN Accuracy: 0.9959595799446106

=== LSTM ===
Epoch 1/3
[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 20ms/step - accuracy: 0.8256 - loss: 0.3938 - val_accuracy: 0.9880 - val_loss: 0.0480
Epoch 2/3
[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.9891 - loss: 0.0485 - val_accuracy: 0.9949 - val_loss: 0.0213
Epoch 3/3
[1m99/99[0m [32m━━━━━━━━━━━━━━

In [7]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

def evaluate_model(name, y_true, y_pred, results):
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred)
    rec = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    # Confusion matrix values
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()

    results.append({
        "Model": name,
        "Accuracy": acc,
        "Precision": prec,
        "Recall": rec,
        "F1-Score": f1,
        "TN": tn,
        "FP": fp,
        "FN": fn,
        "TP": tp
    })
    return results


# Collect results
results = []

# Logistic Regression
evaluate_model("Logistic Regression", y_test, y_pred_lr, results)

# Random Forest
evaluate_model("Random Forest", y_test, y_pred_rf, results)

# Dense NN
evaluate_model("Dense NN", y_test, y_pred_dense, results)

# LSTM
evaluate_model("LSTM", y_test, y_pred_lstm, results)

# GRU
evaluate_model("GRU", y_test, y_pred_gru, results)

# Convert to DataFrame for comparison
results_df = pd.DataFrame(results)
print("\n=== Model Comparison with Confusion Matrix Values ===")
print(results_df)


NameError: name 'y_pred_dense' is not defined

In [None]:
import matplotlib.pyplot as plt

# Metrics to plot
metrics = ["Accuracy", "Precision", "Recall", "F1-Score"]

for metric in metrics:
    plt.figure(figsize=(8,5))
    plt.bar(results_df["Model"], results_df[metric], color="skyblue")
    plt.title(f"{metric} Comparison Across Models")
    plt.ylabel(metric)
    plt.ylim(0, 1)  # all metrics are between 0 and 1
    plt.xticks(rotation=30, ha="right")
    plt.grid(axis="y", linestyle="--", alpha=0.7)
    plt.show()


In [None]:
cm_values = ["TN", "FP", "FN", "TP"]

for val in cm_values:
    plt.figure(figsize=(8,5))
    plt.bar(results_df["Model"], results_df[val], color="lightcoral")
    plt.title(f"{val} Counts Across Models")
    plt.ylabel("Count")
    plt.xticks(rotation=30, ha="right")
    plt.grid(axis="y", linestyle="--", alpha=0.7)
    plt.show()


In [None]:

import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import numpy as np

def plot_conf_matrix(y_true, y_pred, model_name, normalize=False):
    cm = confusion_matrix(y_true, y_pred)

    if normalize:
        cm = cm.astype("float") / cm.sum(axis=1)[:, np.newaxis]
        fmt = ".2f"   # show percentages
        title = f"Confusion Matrix (Normalized) - {model_name}"
    else:
        fmt = "d"    # show counts
        title = f"Confusion Matrix - {model_name}"

    plt.figure(figsize=(4,4))
    plt.imshow(cm, interpolation="nearest", cmap=plt.cm.Blues)
    plt.title(title)
    plt.colorbar()

    tick_marks = [0,1]
    plt.xticks(tick_marks, ["Fake", "Real"])
    plt.yticks(tick_marks, ["Fake", "Real"])

    # Add numbers inside cells
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            plt.text(j, i, format(cm[i, j], fmt),
                     ha="center", va="center",
                     color="white" if cm[i, j] > thresh else "black")

    plt.ylabel("True label")
    plt.xlabel("Predicted label")
    plt.tight_layout()
    plt.show()
    plt.close()


In [None]:
plot_conf_matrix(y_test, y_pred_lr, "Logistic Regression")

In [None]:
plot_conf_matrix(y_test, y_pred_lr, "Logistic Regression", normalize=True)

In [None]:
import gradio as gr

# Choose the best model (example: LSTM)
best_model = lstm_model

# Prediction function
def predict_news(text):
    # Clean text
    cleaned = clean_text(text)
    seq = tokenizer.texts_to_sequences([cleaned])
    pad = pad_sequences(seq, maxlen=max_len, padding="post")
    pred = best_model.predict(pad)[0][0]
    label = "Real" if pred >= 0.5 else "Fake"
    return { "Fake": float(1-pred), "Real": float(pred) }

# Gradio interface
iface = gr.Interface(
    fn=predict_news,
    inputs=gr.Textbox(lines=4, placeholder="Paste a news article here..."),
    outputs=gr.Label(num_top_classes=2),
    title="Fake News Detector",
    description="Enter a news article and see if it's predicted as Real or Fake."
)

iface.launch(share=True)


In [None]:
import pickle

# Save tokenizer
with open("/content/drive/MyDrive/FakeNewsDetection_Project/tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)

# Save model
best_model.save("/content/drive/MyDrive/FakeNewsDetection_Project/best_model.h5")
