In [1]:
#explainability.ipynb

In [3]:
# Explainability – Fake News Detector

#This notebook explains the predictions of the TF-IDF + Logistic Regression
#model by analyzing word-level contributions.


In [4]:
import joblib
import numpy as np
import pandas as pd


In [6]:
# Mount Google Drive at /content/drive (Colab-only import); the artifact paths
# loaded later in this notebook live under this mount point.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [7]:
# Directory on the mounted Drive that holds the saved artifacts. Kept in one
# place so the notebook can be pointed at another location with a single edit.
ARTIFACT_DIR = "/content/drive/MyDrive/explainable-fake-news-detector/data"

# NOTE: joblib.load unpickles the file, which can execute arbitrary code.
# Only load artifacts that you produced yourself or otherwise trust.
model = joblib.load(f"{ARTIFACT_DIR}/logreg_tfidf_model.pkl")

vectorizer = joblib.load(f"{ARTIFACT_DIR}/tfidf_vectorizer.pkl")


In [8]:
# Feature names reported by the loaded vectorizer. explain_prediction indexes
# this sequence by column position to label each contribution.
feature_names = vectorizer.get_feature_names_out()


In [9]:
# First row of the model's coefficient array; paired one-to-one with
# feature_names when the weight table is built below.
# NOTE(review): assumes a binary classifier whose coef_ has a single row — confirm.
coefficients = model.coef_[0]


In [10]:
# Two-column table pairing every feature name with its model weight.
coef_df = pd.DataFrame(
    dict(word=feature_names, weight=coefficients)
)


In [11]:
# Rank terms by weight and keep 20 from each end: the largest positive weights
# (labelled FAKE in this notebook) and the most negative weights (labelled REAL).
top_fake_words = coef_df.sort_values("weight", ascending=False).iloc[:20]
top_real_words = coef_df.sort_values("weight", ascending=True).iloc[:20]

# Bare tuple as the last expression so the notebook displays both tables.
top_fake_words, top_real_words


(                 word    weight
 6600   featured image  6.426215
 8458            image  6.340968
 8086          hillary  6.194660
 6599         featured  6.013369
 18604     twitter com  5.211341
 3550              com  4.490093
 12113         october  4.362141
 14178            read  4.216685
 19347           watch  4.213137
 7357            getty  4.112673
 7358     getty images  4.028618
 255              2016  3.889845
 9419             just  3.701218
 12016           obama  3.317492
 11951        november  3.214623
 1135          america  3.206878
 1301             anti  3.130637
 8479           images  2.842899
 6577              fbi  2.786600
 17966           today  2.756276,
                      word     weight
 14849             reuters -17.358927
 15289                said -13.368222
 6914               follow  -7.058507
 19332  washington reuters  -6.401678
 13477    president donald  -6.186189
 11361                  mr  -5.869123
 18601             twitter  -4.796312
 2

In [12]:
def explain_prediction(text, top_k=10):
    """Rank the features present in ``text`` by their contribution to the score.

    The text is transformed with the notebook-level ``vectorizer``. For every
    column with a non-zero value, the contribution is that value multiplied by
    the matching entry of ``coefficients``.

    Args:
        text: A single document to explain.
        top_k: Maximum number of entries to return (applied as a slice bound).

    Returns:
        A list of ``(feature_name, contribution)`` tuples, ordered by the
        absolute value of the contribution, largest first.
    """
    tfidf_row = vectorizer.transform([text])

    # nonzero() yields (row_indices, column_indices); only the columns matter
    # because a single document was transformed.
    _, active_columns = tfidf_row.nonzero()

    scored_terms = []
    for column in active_columns:
        term_score = coefficients[column] * tfidf_row[0, column]
        scored_terms.append((feature_names[column], term_score))

    # Order by magnitude so strong negative contributions rank alongside
    # strong positive ones.
    scored_terms.sort(key=lambda pair: abs(pair[1]), reverse=True)
    return scored_terms[:top_k]


In [13]:
# Example input used to demonstrate the explanation helper.
text = "Breaking news: shocking conspiracy revealed"

# (term, contribution) pairs, largest magnitude first; uses the default top_k.
explanation = explain_prediction(text)

# Print each term with its contribution formatted to four decimal places.
for word, score in explanation:
    print(f"{word}: {score:.4f}")


shocking: 0.4855
revealed: 0.3191
breaking news: 0.2988
breaking: 0.2840
conspiracy: 0.2068
news: 0.1431


In [14]:
### Interpretation

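# Each score is a term's model coefficient multiplied by its TF-IDF value in the text;
# the model's intercept is not included in these scores.
# Positive contributions push the prediction toward FAKE,
# while negative contributions push it toward REAL.
# This allows transparent, word-level explanation of predictions.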
