In [16]:
import pandas as pd

# Build the combined ISOT fake/real news table and persist it.
#
# Root of the dataset on disk, kept in one place so the raw inputs and the
# processed output cannot drift apart.
# NOTE(review): this is still a machine-specific absolute path — consider
# making it relative to the repository or reading it from configuration.
DATA_DIR = "/Users/owner/Documents/news-bias-detection/data/fake_news"

true_df = pd.read_csv(f"{DATA_DIR}/raw/True.csv")
fake_df = pd.read_csv(f"{DATA_DIR}/raw/Fake.csv")

true_df["label"] = 1   # real
fake_df["label"] = 0   # fake

# ignore_index=True: both CSVs are read with their own 0..n-1 row labels, so a
# plain concat would leave duplicated labels in the merged frame.
# sample(frac=1) shuffles every row with a fixed seed; it selects by position,
# so the resulting row order does not depend on the index labels.
# reset_index(drop=True) gives the shuffled frame a clean 0..n-1 index, making
# label-based lookups (df.loc[i]) unambiguous.
df = (
    pd.concat([true_df, fake_df], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
# Keep only the columns used downstream.
df = df[["title", "text", "label"]]
df.to_csv(f"{DATA_DIR}/processed/isot_clean.csv", index=False)
# Class balance of the merged data set.
print(df["label"].value_counts())

label
0    23481
1    21417
Name: count, dtype: int64


In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Train and evaluate a TF-IDF + logistic-regression baseline on the article text.
#
# The original cell started with a bare `df.sample(frac=1, random_state=42)`
# whose result was discarded, so it had no effect; it has been removed.
# train_test_split shuffles the rows itself (shuffle=True is its default).
#
# NOTE(review): the coefficient inspection in this notebook lists 'reuters' as
# the strongest real-news term, so the near-perfect scores below probably
# reflect source/dateline leakage rather than genuine content cues. Consider
# stripping the dateline prefix from the text before vectorizing — TODO confirm.
TEST_FRACTION = 0.2   # share of rows held out for evaluation
SPLIT_SEED = 1        # fixed seed so the split is reproducible

X_train, X_test, y_train, y_test = train_test_split(
    df["text"], df["label"], test_size=TEST_FRACTION, random_state=SPLIT_SEED)

# Fit the vocabulary and IDF weights on the training split only, then reuse
# them for the test split so no test-set statistics enter the features.
# max_df=0.7 drops terms that occur in more than 70% of training documents.
vectorizer = TfidfVectorizer(stop_words="english", max_df=0.7)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# max_iter is raised to 300 iterations for the solver.
clf = LogisticRegression(max_iter=300)
clf.fit(X_train_tfidf, y_train)
y_pred = clf.predict(X_test_tfidf)
# Per-class precision / recall / F1 on the held-out split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      4731
           1       0.98      0.99      0.99      4249

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980



In [20]:
import numpy as np
# Inspect which vocabulary terms carry the most weight in the fitted classifier.
# Terms are ordered by their coefficient, ascending: the most negative weights
# come first (reported as fake-indicative) and the most positive come last
# (reported as real-indicative, strongest term at the end of the list).
TOP_N = 20  # number of terms shown at each end of the ranking

feature_names = np.array(vectorizer.get_feature_names_out())
term_weights = clf.coef_[0]
sorted_idx = np.argsort(term_weights)

fake_terms = feature_names[sorted_idx[:TOP_N]]
real_terms = feature_names[sorted_idx[-TOP_N:]]
print("Fake indicative words:", fake_terms)
print("Real indicative words:", real_terms)

Fake indicative words: ['image' 'just' 'featured' 'read' 'gop' 'com' 'hillary' 'watch' 'mr'
 'america' 'getty' 'pic' 'like' 'https' 'images' 'american' 'obama' 'wire'
 '21st' 'rep']
Real indicative words: ['killed' 'moscow' 'democratic' 'told' 'year' 'presidential' 'comment'
 'reporters' 'spokesman' 'nov' 'statement' 'minister' 'friday' 'monday'
 'republican' 'tuesday' 'thursday' 'wednesday' 'washington' 'reuters']
