In [1]:
# Load cleaned dataset and build a classical ML baseline


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix


In [3]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [4]:
import pandas as pd

df = pd.read_csv(
    "/content/drive/MyDrive/explainable-fake-news-detector/data/fake_news_cleaned.csv"
)

df.head()


Unnamed: 0,title,text,label
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,,Did they post their votes for Hillary already?,1
2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


In [5]:
X = df["text"]
y = df["label"]


In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(
    max_features=20000,
    ngram_range=(1, 2),
    stop_words="english"
)

X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


In [8]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(
    max_iter=1000,
    class_weight="balanced"
)

model.fit(X_train_tfidf, y_train)


In [9]:
y_pred = model.predict(X_test_tfidf)


In [10]:
from sklearn.metrics import classification_report, confusion_matrix

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.95      0.93      0.94      2947
           1       0.94      0.95      0.94      3161

    accuracy                           0.94      6108
   macro avg       0.94      0.94      0.94      6108
weighted avg       0.94      0.94      0.94      6108

[[2741  206]
 [ 152 3009]]


In [11]:
import joblib

joblib.dump(model, "logreg_tfidf_model.pkl")
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")


['tfidf_vectorizer.pkl']

In [15]:
!ls /content



drive  logreg_tfidf_model.pkl  sample_data  tfidf_vectorizer.pkl


In [19]:
# create data folder inside repo (if not exists)
!mkdir -p explainable-fake-news-detector/data

# move model + vectorizer into repo
!mv /content/explainable-fake-news-detector/data/logreg_tfidf_model.pkl drive/MyDrive/explainable-fake-news-detector/data/
!mv /content/explainable-fake-news-detector/data/tfidf_vectorizer.pkl drive/MyDrive/explainable-fake-news-detector/data/

# verify
!ls drive/MyDrive/explainable-fake-news-detector/data


fake_news_cleaned.csv  logreg_tfidf_model.pkl
fake_news.csv	       tfidf_vectorizer.pkl


In [18]:
!pwd

/content


In [27]:
ls /content/drive/MyDrive/explainable-fake-news-detector/data


fake_news_cleaned.csv  logreg_tfidf_model.pkl
fake_news.csv          tfidf_vectorizer.pkl
