In [2]:
# 🧹 Data Loading for Twitter Fake vs Verified News Detection

import pandas as pd
import numpy as np
import re
import string

# Read the two PolitiFact CSV files: one of fake stories, one of real stories.
fake_df = pd.read_csv("data/politifact_fake.csv")
real_df = pd.read_csv("data/politifact_real.csv")

# Report the raw (rows, columns) of each source before a label column exists.
print("Fake news samples:", fake_df.shape)
print("Real news samples:", real_df.shape)

# Binary target used by every model below: 0 marks fake, 1 marks real.
fake_df = fake_df.assign(label=0)
real_df = real_df.assign(label=1)

# Stack both sources into one frame with a fresh 0..n-1 index.
data = pd.concat((fake_df, real_df), ignore_index=True)
print("Total combined dataset:", data.shape)

# Only the headline text and its label are carried forward.
data = data.loc[:, ["title", "label"]]

data.head()


Fake news samples: (432, 4)
Real news samples: (624, 4)
Total combined dataset: (1056, 5)


Unnamed: 0,title,label
0,BREAKING: First NFL Team Declares Bankruptcy O...,0
1,Court Orders Obama To Pay $400 Million In Rest...,0
2,UPDATE: Second Roy Moore Accuser Works For Mic...,0
3,Oscar Pistorius Attempts To Commit Suicide,0
4,Trump Votes For Death Penalty For Being Gay,0


Text Cleaning and Normalization

In [3]:
# 🧼 Text Cleaning Functions

def clean_text(text):
    """Normalize a piece of text for bag-of-words style modeling.

    The steps run in a fixed order, and the order matters: URLs, @mentions
    and #hashtags are recognised by their punctuation, so they must be
    stripped before punctuation is deleted.

    1. Lowercase.
    2. Remove URLs (tokens starting with ``http`` or ``www``).
    3. Remove @mentions and #hashtags (the whole token, not just the sigil).
    4. Delete ASCII punctuation. Characters are deleted, not replaced by a
       space, so ``"anti-trump"`` becomes ``"antitrump"``.
    5. Delete digits.
    6. Collapse whitespace runs to a single space and trim both ends.

    Parameters
    ----------
    text : str, None, or float
        Raw text. ``None`` and float NaN (the marker pandas uses for missing
        values) are treated as missing. Any other non-string value is
        converted with ``str()`` before cleaning.

    Returns
    -------
    str
        The cleaned text, or ``""`` when the input is missing.
    """
    # Missing values: NaN is the only float that does not equal itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    # Lowercase (str() lets non-string scalars through instead of raising).
    text = str(text).lower()

    # Remove URLs. "http\S+" already matches "https...", so only the
    # scheme-less "www..." form needs its own alternative.
    text = re.sub(r"http\S+|www\S+", "", text)

    # Remove mentions & hashtags
    text = re.sub(r"@\w+|#\w+", "", text)

    # Remove punctuation (ASCII set from string.punctuation)
    text = text.translate(str.maketrans("", "", string.punctuation))

    # Remove numbers
    text = re.sub(r"\d+", "", text)

    # Remove extra whitespace left behind by the deletions above
    text = re.sub(r"\s+", " ", text).strip()

    return text

# Apply cleaning.
# Missing titles are blanked first: astype(str) alone would turn NaN into the
# literal text "nan", which would then survive cleaning as a real token.
data["clean_title"] = data["title"].fillna("").astype(str).apply(clean_text)

data.head()


Unnamed: 0,title,label,clean_title
0,BREAKING: First NFL Team Declares Bankruptcy O...,0,breaking first nfl team declares bankruptcy ov...
1,Court Orders Obama To Pay $400 Million In Rest...,0,court orders obama to pay million in restitution
2,UPDATE: Second Roy Moore Accuser Works For Mic...,0,update second roy moore accuser works for mich...
3,Oscar Pistorius Attempts To Commit Suicide,0,oscar pistorius attempts to commit suicide
4,Trump Votes For Death Penalty For Being Gay,0,trump votes for death penalty for being gay


Train/Test Split + Save Cleaned Data

In [4]:
# 📊 Train-Test Split & Save Cleaned Dataset

from sklearn.model_selection import train_test_split

# Model inputs are the normalized headlines; targets are the 0/1 labels.
X, y = data["clean_title"], data["label"]

# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# ratio the same in both parts, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print("Training samples:", len(X_train))
print("Testing samples:", len(X_test))

# Write the cleaned text and labels to disk (without the index column)
# so they can be reloaded without repeating the cleaning step.
cleaned_df = data.loc[:, ["clean_title", "label"]]
cleaned_df.to_csv("cleaned_data.csv", index=False)

print("Cleaned dataset saved as cleaned_data.csv")


Training samples: 844
Testing samples: 212
Cleaned dataset saved as cleaned_data.csv


TF-IDF Vectorization + Logistic Regression

In [5]:
# 🔤 TF-IDF Vectorization + Logistic Regression

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Vectorize with unigrams and bigrams, capped at 5000 features. The
# vocabulary is fitted on the training split only; the test split is
# transformed with that same vocabulary.
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

print("TF-IDF shape:", X_train_tfidf.shape)

# Fit the classifier (fit() returns the estimator itself).
lr_model = LogisticRegression(max_iter=300).fit(X_train_tfidf, y_train)

# Predicted labels for the held-out split.
y_pred_lr = lr_model.predict(X_test_tfidf)

# Score the predictions. Precision, recall and F1 are computed with
# scikit-learn's default positive class, label 1.
lr_acc, lr_prec, lr_rec, lr_f1 = (
    score(y_test, y_pred_lr)
    for score in (accuracy_score, precision_score, recall_score, f1_score)
)

print("ðŸ“Œ Logistic Regression Results")
print("Accuracy :", lr_acc)
print("Precision:", lr_prec)
print("Recall   :", lr_rec)
print("F1-score :", lr_f1)


TF-IDF shape: (844, 5000)
ðŸ“Œ Logistic Regression Results
Accuracy : 0.7830188679245284
Precision: 0.7762237762237763
Recall   : 0.888
F1-score : 0.8283582089552238


TF-IDF + SVM

In [6]:
# 🤖 TF-IDF + Support Vector Machine (SVM)

from sklearn.svm import LinearSVC

# Train SVM model on the TF-IDF features.
# LinearSVC draws on a pseudo-random generator when it solves the dual
# problem, so the seed is pinned to keep reruns repeatable; 42 matches the
# seed used for the train/test split.
svm_model = LinearSVC(random_state=42)
svm_model.fit(X_train_tfidf, y_train)

# Predictions on the held-out split
y_pred_svm = svm_model.predict(X_test_tfidf)

# Metrics. Precision, recall and F1 use scikit-learn's default positive
# class, label 1.
svm_acc = accuracy_score(y_test, y_pred_svm)
svm_prec = precision_score(y_test, y_pred_svm)
svm_rec = recall_score(y_test, y_pred_svm)
svm_f1 = f1_score(y_test, y_pred_svm)

print("ðŸ“Œ SVM Results")
print("Accuracy :", svm_acc)
print("Precision:", svm_prec)
print("Recall   :", svm_rec)
print("F1-score :", svm_f1)


ðŸ“Œ SVM Results
Accuracy : 0.8113207547169812
Precision: 0.8571428571428571
Recall   : 0.816
F1-score : 0.8360655737704918




BERT (bert-base-uncased) fine-tuning