In [1]:
#error_analysis.ipynb

In [2]:
# Error Analysis – Fake News Detection

# This notebook analyzes incorrect predictions made by the
# TF-IDF + Logistic Regression model to understand its limitations
# and guide final model selection.


In [3]:
# Mount Google Drive at /content/drive; the dataset and model files loaded
# later in this notebook are read from paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [4]:
import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split


In [5]:
# Read the cleaned dataset from the mounted Drive folder.
dataset_csv = "/content/drive/MyDrive/explainable-fake-news-detector/data/fake_news_cleaned.csv"
df = pd.read_csv(dataset_csv)

# Preview the first rows.
df.head()


Unnamed: 0,title,text,label
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,,Did they post their votes for Hillary already?,1
2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


In [6]:
# Locations of the saved classifier and vectorizer on Drive.
model_file = "/content/drive/MyDrive/explainable-fake-news-detector/data/logreg_tfidf_model.pkl"
vectorizer_file = "/content/drive/MyDrive/explainable-fake-news-detector/data/tfidf_vectorizer.pkl"

# NOTE: joblib.load unpickles its input, so only load files from a trusted source.
model = joblib.load(model_file)
vectorizer = joblib.load(vectorizer_file)


In [7]:
# Rebuild the hold-out split used for the error analysis.
# `train_test_split` is imported in the notebook's import cell.
# NOTE(review): the loaded model is only being evaluated on unseen rows if
# these settings match the split used when it was trained — confirm against
# the training notebook.
TEST_FRACTION = 0.2  # share of rows held out for evaluation
SPLIT_SEED = 42      # fixed seed so the split is reproducible

X = df["text"]
y = df["label"]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
    stratify=y,  # keep the label proportions the same in both partitions
)


In [8]:
# Vectorize the held-out texts with the already-loaded vectorizer
# (transform only; nothing is re-fitted here).
X_test_tfidf = vectorizer.transform(X_test)


In [9]:
# Predict a label for every vectorized test document.
y_pred = model.predict(X_test_tfidf)


In [10]:
# Flag every test row whose prediction differs from its true label.
# Series.ne compares element-wise, so the mask carries y_test's index.
error_mask = y_test.ne(y_pred)

# Pull out the texts, true labels and predictions of the misclassified rows.
errors, true_labels = X_test[error_mask], y_test[error_mask]
predicted_labels = y_pred[error_mask]


In [11]:
# Collect the misclassified examples into one table. Passing raw values
# (not the Series themselves) gives the frame a fresh default index.
error_columns = {
    "text": errors.values,
    "true_label": true_labels.values,
    "predicted_label": predicted_labels,
}
error_df = pd.DataFrame(error_columns)

# Show the first ten misclassifications.
error_df.head(10)


Unnamed: 0,text,true_label,predicted_label
0,by Jerri-Lynn Scofield \nJerri-Lynn here: The ...,1,0
1,I recently wrote about the Canadian triathlete...,0,1
2,Insurance Prices for Many Obamacare Customers ...,1,0
3,[Photo: Jorge Lascar’s photo of the Great Wall...,1,0
4,The FBI has expanded its probe of Hillary Clin...,0,1
5,A group of senators is calling for higher wage...,0,1
6,Speaker of the House Paul Ryan (R-WI) refused ...,1,0
7,JUST IN: FBI Reopens Hillary Clinton Email Pro...,1,0
8,\nThe threat of World War 3 is over! We can no...,1,0
9,Billionare Donald Trump is doubling down on hi...,0,1


In [13]:
def label_to_name(label):
    """Return the display name for a class label.

    A label equal to 1 maps to "FAKE"; every other value maps to "REAL".
    """
    if label == 1:
        return "FAKE"
    return "REAL"

# Add a human-readable companion column for each numeric label column.
name_columns = {
    "true_label_name": "true_label",
    "predicted_label_name": "predicted_label",
}
for target_column, source_column in name_columns.items():
    error_df[target_column] = error_df[source_column].apply(label_to_name)


In [None]:
### Error Observations

- Many false negatives (FAKE articles predicted as REAL) involve subtle negation
- Long political articles are harder to classify
- Emotionally neutral fake news is difficult for the TF-IDF model to detect


In [None]:
### Why BERT Handles These Errors Better

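TF-IDF represents each article as a bag of weighted terms and largely ignores
word order, so the model fails on long-range dependencies and contextual negation.
BERT uses self-attention to model relationships between words, which lets it
take that context into account when interpreting context-heavy fake news articles.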

In [14]:
# Count types of errors
# Split the misclassifications by direction (label 1 = FAKE, label 0 = REAL).
actual = error_df["true_label"]
guessed = error_df["predicted_label"]

false_fake = error_df[actual.eq(0) & guessed.eq(1)]  # REAL flagged as FAKE
false_real = error_df[actual.eq(1) & guessed.eq(0)]  # FAKE passed as REAL

# Report how many errors fall in each direction.
print("REAL predicted as FAKE:", len(false_fake))
print("FAKE predicted as REAL:", len(false_real))


REAL predicted as FAKE: 206
FAKE predicted as REAL: 152


In [15]:
# Preview the first five FAKE articles that were predicted as REAL.
false_real.head(n=5)


Unnamed: 0,text,true_label,predicted_label,true_label_name,predicted_label_name
0,by Jerri-Lynn Scofield \nJerri-Lynn here: The ...,1,0,FAKE,REAL
2,Insurance Prices for Many Obamacare Customers ...,1,0,FAKE,REAL
3,[Photo: Jorge Lascar’s photo of the Great Wall...,1,0,FAKE,REAL
6,Speaker of the House Paul Ryan (R-WI) refused ...,1,0,FAKE,REAL
7,JUST IN: FBI Reopens Hillary Clinton Email Pro...,1,0,FAKE,REAL


In [None]:
### Final Error Analysis Observations

- The TF-IDF + Logistic Regression model performs well on explicit fake news
- Errors occur mainly in cases involving subtle language and long context
- False negatives often involve fake articles written in a neutral or factual tone
- These limitations arise from the lack of contextual understanding in TF-IDF features


In [None]:
## Final Model Selection

TF-IDF + Logistic Regression was selected as the baseline model due to its
interpretability, speed, and strong performance.

However, Transformer-based models such as BERT are better suited for
production use because they capture contextual and semantic relationships
that classical models miss.

Therefore:
- TF-IDF + Logistic Regression is used as an explainable baseline
- BERT is chosen as the final high-performance model
