<a href="https://colab.research.google.com/github/AnushkaBhagat22/Fake-News-Detection/blob/main/fake_news_detector.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Project: Explainable Machine Learning Framework for Fake News Detection
# Dataset: ISOT Fake News Dataset (Kaggle)
# Objective: Compare multiple ML models and analyze interpretability

In [None]:
# -----------------------------------
# Load Dataset
# Both files are read with the Python parsing engine to cope with
# potential CSV inconsistencies; lines that still cannot be parsed
# are skipped instead of raising.
# -----------------------------------
read_options = {"engine": "python", "on_bad_lines": "skip"}

fake = pd.read_csv("Fake.csv", **read_options)
true = pd.read_csv("True.csv", **read_options)

for frame in (fake, true):
    print(frame)

In [None]:
# -----------------------------------
# Data Cleaning
# - Add binary labels
# - Merge datasets
# - Remove duplicates and null values
# -----------------------------------

In [None]:
# Attach the binary target (1 = fake, 0 = real) on copies of the raw frames.
# .assign() returns a new DataFrame, so `fake` and `true` stay exactly as
# they were loaded and this cell can be re-run without side effects.
labelled = pd.concat(
    [fake.assign(label=1), true.assign(label=0)],
    axis=0,
)

# Shuffle with a fixed seed so the row order is reproducible, then rebuild a
# clean 0..n-1 index.
data = labelled.sample(frac=1, random_state=42).reset_index(drop=True)

# Only the article body and its label are used downstream.
data = data[["text", "label"]]

print(data.head())

In [None]:
# Count missing values in each column.
data.isna().sum()

In [None]:
# Count rows that are exact duplicates of an earlier row.
data.duplicated().sum()

In [None]:
# Drop rows without article text first: TfidfVectorizer raises on NaN
# documents, and the cleaning plan for this section includes null removal.
# Then remove duplicate rows to avoid data leakage between train and test.
data = data.dropna(subset=["text"]).drop_duplicates()

In [None]:
# Re-count duplicated rows after the deduplication step.
data.duplicated().sum()

In [None]:
# (rows, columns) of the current frame.
data.shape

In [None]:
# Class balance: number of rows for each label value.
label_counts = data["label"].value_counts()
print(label_counts)

In [None]:
# -----------------------------------
# Train-Test Split
# Using stratified sampling to preserve class distribution
# -----------------------------------

In [None]:
from sklearn.model_selection import train_test_split

# Features are the raw article text; the target is the binary label.
X, y = data["text"], data["label"]

# Hold out 20% of the rows for testing. Stratifying on the label keeps the
# class proportions the same in both partitions; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# Label counts for the training partition, then the test partition.
for split_labels in (y_train, y_test):
    print(split_labels.value_counts())

In [None]:
# -----------------------------------
# Feature Extraction using TF-IDF
# Using unigrams + bigrams to capture contextual information
# -----------------------------------

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigrams and bigrams, English stop words removed, vocabulary capped at
# 10,000 features.
tfidf_options = dict(
    max_features=10000,
    ngram_range=(1, 2),
    stop_words="english",
)
vectorizer = TfidfVectorizer(**tfidf_options)

# Fit on the training text only; the test text is transformed with the
# vocabulary and weights learned from the training partition.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [None]:
# Shapes of the train and test TF-IDF matrices, in that order.
for tfidf_matrix in (X_train_tfidf, X_test_tfidf):
    print(tfidf_matrix.shape)

In [None]:
# -----------------------------------
# Model Evaluation
# Metrics: Accuracy, Precision, Recall, F1-score
# -----------------------------------

In [None]:
# -----------------------------------
# Model 1: Logistic Regression
# -----------------------------------
from sklearn.linear_model import LogisticRegression

# fit() returns the estimator itself, so construction and training chain.
lr = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)

# Predicted labels for the held-out test documents.
y_pred_lr = lr.predict(X_test_tfidf)

In [None]:
from sklearn.metrics import classification_report, accuracy_score

# Compute the metrics first, then print them under a single heading.
lr_accuracy = accuracy_score(y_test, y_pred_lr)
lr_report = classification_report(y_test, y_pred_lr)

print("Logistic Regression Results:")
print("Accuracy:", lr_accuracy)
print(lr_report)

In [None]:
# -----------------------------------
# Model 2: Multinomial Naive Bayes
# -----------------------------------
from sklearn.naive_bayes import MultinomialNB

# Train on the TF-IDF training matrix, then predict the held-out documents.
nb = MultinomialNB().fit(X_train_tfidf, y_train)
y_pred_nb = nb.predict(X_test_tfidf)

# Overall accuracy plus the per-class precision / recall / F1 report.
nb_accuracy = accuracy_score(y_test, y_pred_nb)
nb_report = classification_report(y_test, y_pred_nb)

print("Naive Bayes Results:")
print("Accuracy:", nb_accuracy)
print(nb_report)

In [None]:
# -----------------------------------
# Model 3: Linear Support Vector Machine
# -----------------------------------
from sklearn.svm import LinearSVC

# LinearSVC's solver can shuffle the data using a pseudo-random generator
# (when the dual formulation is used), so the seed is fixed here - matching
# the seed used for shuffling and splitting - to keep the reported metrics
# reproducible across runs.
svm = LinearSVC(random_state=42)
svm.fit(X_train_tfidf, y_train)

# Predicted labels for the held-out test documents.
y_pred_svm = svm.predict(X_test_tfidf)

print("Linear SVM Results:")
print("Accuracy:", accuracy_score(y_test, y_pred_svm))
print(classification_report(y_test, y_pred_svm))

In [None]:
from sklearn.metrics import accuracy_score

# Build the comparison table from the actual test-set predictions rather than
# from hand-copied numbers, so it cannot drift from the models trained above.
test_predictions = {
    "Logistic Regression": y_pred_lr,
    "Naive Bayes": y_pred_nb,
    "Linear SVM": y_pred_svm,
}

# Model name -> test accuracy, rounded to 4 decimals; converted to a plain
# float so the printed dict stays compact.
results = {
    model_name: round(float(accuracy_score(y_test, y_pred)), 4)
    for model_name, y_pred in test_predictions.items()
}

print(results)

In [None]:
# -----------------------------------
# Interpretability Analysis
# Extracting most influential words from Logistic Regression coefficients
# -----------------------------------
feature_names = vectorizer.get_feature_names_out()
coefficients = lr.coef_[0]

# Labels are 1 = fake and 0 = real, so the largest positive weights push a
# document towards "fake" and the most negative ones towards "real".
top_n = 15

# Sort once (ascending) and slice both ends. The fake slice is reversed so
# that both lists are ordered from strongest to weakest indicator.
ascending_order = coefficients.argsort()
top_fake = ascending_order[-top_n:][::-1]
top_real = ascending_order[:top_n]

print("Top Fake Indicators:")
print([feature_names[i] for i in top_fake])

print("\nTop Real Indicators:")
print([feature_names[i] for i in top_real])

In [None]:
# -----------------------------------
# Confusion Matrix Analysis (Linear SVM)
# Evaluating True Positives, False Positives,
# True Negatives, and False Negatives
# -----------------------------------
from sklearn.metrics import confusion_matrix

# Labels were assigned as 0 = real and 1 = fake. Pinning the label order keeps
# the matrix rows/columns aligned with the tick labels used on the plot.
class_names = ["Real", "Fake"]
cm = confusion_matrix(y_test, y_pred_svm, labels=[0, 1])

# Rows are the actual classes, columns the predicted classes.
fig, ax = plt.subplots()
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Linear SVM - Confusion Matrix (test set)")
plt.show()

In [None]:
# -----------------------------------
# Cross-Validation (5-Fold)
# Evaluating model stability across multiple splits
# -----------------------------------
from sklearn.base import clone
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

# Cross-validate on the raw training text with the vectorizer inside the
# pipeline, so each fold learns its vocabulary and IDF weights from that
# fold's training part only. Scoring the pre-computed TF-IDF matrix would
# leak statistics from every validation fold into feature extraction.
# clone() yields unfitted copies with the same hyper-parameters as the
# vectorizer and SVM configured earlier, leaving the fitted objects untouched.
cv_model = make_pipeline(clone(vectorizer), clone(svm))

scores = cross_val_score(cv_model, X_train, y_train, cv=5)

print("Cross-validation scores:", scores)
print("Mean CV Accuracy:", scores.mean())

In [None]:
# -----------------------------------
# Error Analysis
# Inspecting misclassified samples to understand model limitations
# -----------------------------------
# Boolean mask (indexed like y_test) marking test rows where the SVM
# prediction differs from the true label.
is_wrong = y_test != y_pred_svm

# Select the corresponding article texts and preview the first few.
misclassified = X_test.loc[is_wrong]
print(misclassified.head())