In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [2]:
# Load the balanced datasets
df_politifact_real = pd.read_csv("politifact_real.csv")
df_politifact_fake = pd.read_csv("politifact_fake.csv")
df_gossipcop_real = pd.read_csv("gossipcop_real_reduced.csv")
df_gossipcop_fake = pd.read_csv("gossipcop_fake_reduced.csv")

# Drop missing values just to be safe
df_politifact_real.dropna(inplace=True)
df_politifact_fake.dropna(inplace=True)
df_gossipcop_real.dropna(inplace=True)
df_gossipcop_fake.dropna(inplace=True)

# Assign labels
df_politifact_real["label"] = 1
df_politifact_fake["label"] = 0
df_gossipcop_real["label"] = 1
df_gossipcop_fake["label"] = 0

# Merge real and fake for each dataset
df_politifact = pd.concat([df_politifact_real, df_politifact_fake], ignore_index=True)
df_gossipcop = pd.concat([df_gossipcop_real, df_gossipcop_fake], ignore_index=True)



In [3]:
# Initialize vectorizer
vectorizer = TfidfVectorizer(max_features=5000)

def preprocess_text(df):
    X = vectorizer.fit_transform(df["title"])
    y = df["label"].values
    return X, y

# Preprocess both datasets
X_p, y_p = preprocess_text(df_politifact)
X_g, y_g = preprocess_text(df_gossipcop)


In [4]:
# Train/test split
X_train_p, X_test_p, y_train_p, y_test_p = train_test_split(X_p, y_p, test_size=0.2, random_state=42)
X_train_g, X_test_g, y_train_g, y_test_g = train_test_split(X_g, y_g, test_size=0.2, random_state=42)

# Train and evaluate LR on PolitiFact
model_p = LogisticRegression()
model_p.fit(X_train_p, y_train_p)
y_pred_p = model_p.predict(X_test_p)

# Metrics for PolitiFact
accuracy_p = accuracy_score(y_test_p, y_pred_p)
precision_p = precision_score(y_test_p, y_pred_p)
recall_p = recall_score(y_test_p, y_pred_p)
f1_p = f1_score(y_test_p, y_pred_p)

# Train and evaluate LR on GossipCop
model_g = LogisticRegression()
model_g.fit(X_train_g, y_train_g)
y_pred_g = model_g.predict(X_test_g)

# Metrics for GossipCop
accuracy_g = accuracy_score(y_test_g, y_pred_g)
precision_g = precision_score(y_test_g, y_pred_g)
recall_g = recall_score(y_test_g, y_pred_g)
f1_g = f1_score(y_test_g, y_pred_g)


In [5]:
# Print results
print("\nModel Performance:")
print("PolitiFact:")
print(f"Accuracy: {accuracy_p:.3f}, Precision: {precision_p:.3f}, Recall: {recall_p:.3f}, F1 Score: {f1_p:.3f}")
print("GossipCop:")
print(f"Accuracy: {accuracy_g:.3f}, Precision: {precision_g:.3f}, Recall: {recall_g:.3f}, F1 Score: {f1_g:.3f}")



Model Performance:
PolitiFact:
Accuracy: 0.810, Precision: 0.817, Recall: 0.784, F1 Score: 0.800
GossipCop:
Accuracy: 0.759, Precision: 0.734, Recall: 0.970, F1 Score: 0.836
