In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler

In [3]:
def load_data(file_paths, label):
    dfs = []
    for file_path in file_paths:
        df = pd.read_csv(file_path)
        df.dropna(inplace=True)  
        df['label'] = label  
        dfs.append(df)
    return pd.concat(dfs, ignore_index=True)

# Load datasets separately
politifact_real = load_data(["politifact_real.csv"], 1)
politifact_fake = load_data(["politifact_fake.csv"], 0)
gossipcop_real = load_data(["gossipcop_real_reduced.csv"], 1)
gossipcop_fake = load_data(["gossipcop_fake_reduced.csv"], 0)

politifact = pd.concat([politifact_real, politifact_fake], ignore_index=True)
gossipcop = pd.concat([gossipcop_real, gossipcop_fake], ignore_index=True)


In [4]:
def extract_text_features(df, vectorizer):
    return vectorizer.fit_transform(df['title'])


def extract_social_features(df):
    tweet_counts = df['tweet_ids'].apply(lambda x: len(str(x).split('\t')))
    tweet_counts = tweet_counts.values.reshape(-1, 1)
    return tweet_counts


def extract_article_features(df):
    url_lengths = df['url'].fillna('').apply(lambda x: len(str(x)))
    title_lengths = df['title'].fillna('').apply(lambda x: len(str(x)))
    return np.vstack((url_lengths, title_lengths)).T


def extract_user_features(df):
    has_user_info = df['username'].notnull().astype(int) if 'username' in df.columns else np.zeros(len(df), dtype=int)
    return has_user_info.reshape(-1, 1)

In [5]:
def preprocess_data(df, vectorizer):
    if 'url' not in df.columns:
        df['url'] = ''
    text_features = extract_text_features(df, vectorizer)
    social_features = extract_social_features(df)
    article_features = extract_article_features(df)
    user_features = extract_user_features(df)
    combined_features = np.hstack((text_features.toarray(), social_features, article_features, user_features))
    scaler = StandardScaler()
    features = scaler.fit_transform(combined_features)
    return features, df['label'].values

# Define vectorizer
vectorizer = TfidfVectorizer(max_features=5000)

# PolitiFact
features_p, labels_p = preprocess_data(politifact, vectorizer)
X_train_p, X_test_p, y_train_p, y_test_p = train_test_split(features_p, labels_p, test_size=0.2, random_state=42)
model_p = RandomForestClassifier(n_estimators=100, random_state=42)
model_p.fit(X_train_p, y_train_p)
y_pred_p = model_p.predict(X_test_p)

# GossipCop
features_g, labels_g = preprocess_data(gossipcop, vectorizer)
X_train_g, X_test_g, y_train_g, y_test_g = train_test_split(features_g, labels_g, test_size=0.2, random_state=42)
model_g = RandomForestClassifier(n_estimators=100, random_state=42)
model_g.fit(X_train_g, y_train_g)
y_pred_g = model_g.predict(X_test_g)


In [6]:
# Evaluation
print("Model Performance:")
print("PolitiFact:")
print(f"Accuracy: {accuracy_score(y_test_p, y_pred_p):.3f}, Precision: {precision_score(y_test_p, y_pred_p):.3f}, Recall: {recall_score(y_test_p, y_pred_p):.3f}, F1 Score: {f1_score(y_test_p, y_pred_p):.3f}")

print("GossipCop:")
print(f"Accuracy: {accuracy_score(y_test_g, y_pred_g):.3f}, Precision: {precision_score(y_test_g, y_pred_g):.3f}, Recall: {recall_score(y_test_g, y_pred_g):.3f}, F1 Score: {f1_score(y_test_g, y_pred_g):.3f}")


Model Performance:
PolitiFact:
Accuracy: 0.797, Precision: 0.787, Recall: 0.797, F1 Score: 0.792
GossipCop:
Accuracy: 0.830, Precision: 0.810, Recall: 0.955, F1 Score: 0.877
