In [1]:
import pandas as pd
import os

# paths (adjust if your files are in a different folder)
fake_path = "data/fake.csv"
true_path = "data/true.csv"

# load
df_fake = pd.read_csv(fake_path)
df_true = pd.read_csv(true_path)

# add standardized label: 1 = fake, 0 = real/true
df_fake = df_fake.copy()
df_true = df_true.copy()
df_fake['label'] = 1
df_true['label'] = 0

# try to find a text column (common names)
def find_text_col(df):
    for c in df.columns:
        if c.lower() in ("text","content","article","headline","title","body"):
            return c
    # fallback: pick the longest-string column
    str_cols = [c for c in df.columns if df[c].dtype == object]
    if not str_cols:
        return None
    return max(str_cols, key=lambda col: df[col].astype(str).str.len().median())

fake_text_col = find_text_col(df_fake)
true_text_col = find_text_col(df_true)

print("Detected text columns -> fake:", fake_text_col, ", true:", true_text_col)

# rename to unified column 'text'
df_fake = df_fake.rename(columns={fake_text_col: "text"})
df_true = df_true.rename(columns={true_text_col: "text"})

# keep only text + label (drop others for now)
df_fake = df_fake[["text","label"]].copy()
df_true = df_true[["text","label"]].copy()

# concat and shuffle
df = pd.concat([df_fake, df_true], ignore_index=True).sample(frac=1, random_state=42).reset_index(drop=True)

print("\nCombined dataset rows:", len(df))
print("Label distribution:")
print(df['label'].value_counts())

print("\nFirst 8 rows (text preview):")
pd.set_option("display.max_colwidth", 200)
display(df.head(8))


Detected text columns -> fake: title , true: title

Combined dataset rows: 44898
Label distribution:
label
1    23481
0    21417
Name: count, dtype: int64

First 8 rows (text preview):


Unnamed: 0,text,text.1,label
0,Ben Stein Calls Out 9th Circuit Court: Committed a ‘Coup d’état’ Against the Constitution,"21st Century Wire says Ben Stein, reputable professor from, Pepperdine University (also of some Hollywood fame appearing in TV shows and films such as Ferris Bueller s Day Off) made some provocati...",1
1,Trump drops Steve Bannon from National Security Council,"WASHINGTON (Reuters) - U.S. President Donald Trump removed his chief strategist Steve Bannon from the National Security Council on Wednesday, reversing his controversial decision early this year t...",0
2,Puerto Rico expects U.S. to lift Jones Act shipping restrictions,"(Reuters) - Puerto Rico Governor Ricardo Rossello said on Wednesday he expected the federal government to waive the Jones Act, which would lift restrictions on ships that can provide aid to the is...",0
3,OOPS: Trump Just Accidentally Confirmed He Leaked Israeli Intelligence To Russia (VIDEO),"On Monday, Donald Trump once again embarrassed himself and his country by accidentally revealing the source of the extremely classified information he leaked to Russia earlier this month.While it ...",1
4,Donald Trump heads for Scotland to reopen a golf resort,"GLASGOW, Scotland (Reuters) - Most U.S. presidential candidates go abroad to sharpen their foreign policy credentials. Donald Trump arrives in Scotland on Friday to reopen a golf resort. The presu...",0
5,Paul Ryan Responds To Dem’s Sit-In On Gun Control In The Most DISGUSTING Way (VIDEO),"On Wednesday, Democrats took a powerful stance against the GOP s refusal to vote on gun control measures by staging a sit-in. While Republican politicians called a recess and took a lunch break, D...",1
6,AWESOME! DIAMOND AND SILK Rip Into The Press: “We don’t believe you!” [Video],President Trump s rally in FL on Saturday was a smashing success with about 9000 crowding into a hanger to hear him speak and another two or three times that amount congregating outside.Trump told...,1
7,"STAND UP AND CHEER! UKIP Party Leader SLAMS Germany, France And EU Invasion Of Phony Refugees [VIDEO]","He s been Europe s version of the outspoken Ted Cruz for some time now. Nigel Farage, leader of the UK Independence Party may be the most disliked member of the European Parliment. But he plows ah...",1


In [3]:
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# cleaning function (same as sentiment model to reuse SPL assets later)
def clean(text):
    text = str(text).lower()
    text = re.sub(r"http\\S+", "", text)
    text = re.sub(r"[^a-z\\s]", "", text)
    text = re.sub(r"\\s+", " ", text).strip()
    return text

df["clean_text"] = df["text"].astype(str).apply(clean)

# split data
X_train, X_test, y_train, y_test = train_test_split(
    df["clean_text"], 
    df["label"],
    test_size=0.2,
    random_state=42,
    stratify=df["label"]
)

# vectorizer (TF-IDF)
vectorizer_fake = TfidfVectorizer(max_features=7000)
X_train_vec = vectorizer_fake.fit_transform(X_train)
X_test_vec = vectorizer_fake.transform(X_test)

print("Train shape:", X_train_vec.shape)
print("Test shape:", X_test_vec.shape)
print("Fake-news label distribution (train):")
print(y_train.value_counts())
print("Fake-news label distribution (test):")
print(y_test.value_counts())


ValueError: cannot reindex on an axis with duplicate labels

In [5]:
# Fix duplicate columns, then clean & vectorize (single step)
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# 1) show current columns
print("Before fix - columns:", df.columns.tolist())

# 2) remove duplicate column names (keep first occurrence)
if df.columns.duplicated().any():
    df = df.loc[:, ~df.columns.duplicated()]
    print("Removed duplicate columns.")
else:
    print("No duplicate column names found.")

print("After fix - columns:", df.columns.tolist())

# 3) define cleaning and apply
def clean(text):
    text = str(text).lower()
    text = re.sub(r"http\\S+", "", text)
    text = re.sub(r"[^a-z\\s]", "", text)
    text = re.sub(r"\\s+", " ", text).strip()
    return text

df["clean_text"] = df["text"].astype(str).apply(clean)

# 4) split & vectorize
X_train, X_test, y_train, y_test = train_test_split(
    df["clean_text"], 
    df["label"],
    test_size=0.2,
    random_state=42,
    stratify=df["label"]
)

vectorizer_fake = TfidfVectorizer(max_features=7000)
X_train_vec = vectorizer_fake.fit_transform(X_train)
X_test_vec  = vectorizer_fake.transform(X_test)

print("Train shape:", X_train_vec.shape)
print("Test shape:", X_test_vec.shape)
print("Label distribution (train):")
print(y_train.value_counts())
print("Label distribution (test):")
print(y_test.value_counts())


Before fix - columns: ['text', 'text', 'label']
Removed duplicate columns.
After fix - columns: ['text', 'label']
Train shape: (35918, 7000)
Test shape: (8980, 7000)
Label distribution (train):
label
1    18785
0    17133
Name: count, dtype: int64
Label distribution (test):
label
1    4696
0    4284
Name: count, dtype: int64


In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# train model
fake_model = LogisticRegression(max_iter=2000)
fake_model.fit(X_train_vec, y_train)

# evaluate
y_pred = fake_model.predict(X_test_vec)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Accuracy: 0.5141425389755011

Classification Report:
              precision    recall  f1-score   support

           0       0.50      1.00      0.66      4284
           1       1.00      0.07      0.13      4696

    accuracy                           0.51      8980
   macro avg       0.75      0.54      0.40      8980
weighted avg       0.76      0.51      0.39      8980


Confusion Matrix:
[[4284    0]
 [4363  333]]


In [9]:
# One-step improvement: TF-IDF with bigrams + more features + class-balanced logistic regression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib

# 1) new vectorizer (bigrams too)
vectorizer_fake = TfidfVectorizer(max_features=10000, ngram_range=(1,2), max_df=0.95, min_df=5)
X_train_vec = vectorizer_fake.fit_transform(X_train)
X_test_vec  = vectorizer_fake.transform(X_test)

# 2) balanced logistic regression
fake_model = LogisticRegression(max_iter=2000, class_weight='balanced', C=1.0)
fake_model.fit(X_train_vec, y_train)

# 3) evaluate
y_pred = fake_model.predict(X_test_vec)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# 4) save improved artifacts for your app
joblib.dump(fake_model, "artifacts/fake_model.joblib")
joblib.dump(vectorizer_fake, "artifacts/fake_vectorizer.joblib")
print("\nSaved improved fake_model and fake_vectorizer to artifacts/")


Accuracy: 0.5240534521158129

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.00      0.00      4284
           1       0.52      1.00      0.69      4696

    accuracy                           0.52      8980
   macro avg       0.76      0.50      0.35      8980
weighted avg       0.75      0.52      0.36      8980


Confusion Matrix:
[[  10 4274]
 [   0 4696]]

Saved improved fake_model and fake_vectorizer to artifacts/


In [11]:
# One-step: Calibrate probabilities + find best threshold (max F1) and evaluate
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import precision_recall_curve, f1_score, classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
import numpy as np
import joblib

# 1) Create a small validation split from TRAIN (not the test set)
X_tr_sub, X_val_sub, y_tr_sub, y_val_sub = train_test_split(
    X_train_vec, y_train, test_size=0.15, random_state=42, stratify=y_train
)

# 2) Calibrate the already-trained model (uses the model as base estimator)
#    We'll fit a calibrated wrapper on a portion of the train data (X_tr_sub)
calibrator = CalibratedClassifierCV(base_estimator=fake_model, cv='prefit', method='sigmoid')
calibrator.fit(X_val_sub, y_val_sub)   # calibrator expects fitted base; using val for calibration

# 3) Get calibrated probabilities on validation to find best threshold
val_probs = calibrator.predict_proba(X_val_sub)[:, 1]  # prob for class 1 (fake)
precisions, recalls, thresholds = precision_recall_curve(y_val_sub, val_probs)

# compute F1 for each threshold (note thresholds array length = len(precisions)-1)
f1s = 2 * (precisions[:-1] * recalls[:-1]) / (precisions[:-1] + recalls[:-1] + 1e-12)
best_idx = np.nanargmax(f1s)
best_threshold = thresholds[best_idx]

print("Best threshold on validation (max F1):", best_threshold)
print("Val F1 at best threshold:", f1s[best_idx])

# 4) Evaluate on TEST set using calibrated probs + best_threshold
test_probs = calibrator.predict_proba(X_test_vec)[:, 1]
y_pred_thresh = (test_probs >= best_threshold).astype(int)

print("\nTest accuracy (thresholded):", accuracy_score(y_test, y_pred_thresh))
print("\nClassification Report (thresholded):")
print(classification_report(y_test, y_pred_thresh))
print("\nConfusion Matrix (thresholded):")
print(confusion_matrix(y_test, y_pred_thresh))

# 5) Save calibrated model and threshold for your app
joblib.dump(calibrator, "artifacts/fake_model_calibrated.joblib")
# store threshold too
joblib.dump(best_threshold, "artifacts/fake_threshold.joblib")
print("\nSaved calibrated model and threshold to artifacts/")


TypeError: CalibratedClassifierCV.__init__() got an unexpected keyword argument 'base_estimator'

In [13]:
# One-step: Calibrate probabilities + find best threshold (max F1) and evaluate
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import precision_recall_curve, f1_score, classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
import numpy as np
import joblib

# 1) Create a small validation split from TRAIN (not test set)
X_tr_sub, X_val_sub, y_tr_sub, y_val_sub = train_test_split(
    X_train_vec, y_train, test_size=0.15, random_state=42, stratify=y_train
)

# 2) Calibrate the already-trained model (older sklearn uses estimator=)
calibrator = CalibratedClassifierCV(estimator=fake_model, cv='prefit', method='sigmoid')
calibrator.fit(X_val_sub, y_val_sub)

# 3) Get calibrated probabilities on validation for best threshold search
val_probs = calibrator.predict_proba(X_val_sub)[:, 1]
precisions, recalls, thresholds = precision_recall_curve(y_val_sub, val_probs)

# F1 at each threshold
f1s = 2 * (precisions[:-1] * recalls[:-1]) / (precisions[:-1] + recalls[:-1] + 1e-12)
best_idx = np.nanargmax(f1s)
best_threshold = thresholds[best_idx]

print("Best threshold on validation:", best_threshold)
print("Validation F1 at best threshold:", f1s[best_idx])

# 4) Evaluate on TEST with best threshold
test_probs = calibrator.predict_proba(X_test_vec)[:, 1]
y_pred_thresh = (test_probs >= best_threshold).astype(int)

print("\nTest accuracy:", accuracy_score(y_test, y_pred_thresh))
print("\nClassification Report:")
print(classification_report(y_test, y_pred_thresh))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_thresh))

# 5) Save calibrated model + threshold
joblib.dump(calibrator, "artifacts/fake_model_calibrated.joblib")
joblib.dump(best_threshold, "artifacts/fake_threshold.joblib")

print("\nSaved calibrated model and threshold.")


Best threshold on validation: 0.5234119939671674
Validation F1 at best threshold: 0.6872332642356176

Test accuracy: 0.5240534521158129

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.00      0.00      4284
           1       0.52      1.00      0.69      4696

    accuracy                           0.52      8980
   macro avg       0.76      0.50      0.35      8980
weighted avg       0.75      0.52      0.36      8980


Confusion Matrix:
[[  10 4274]
 [   0 4696]]

Saved calibrated model and threshold.


In [16]:
# Train a RandomForest baseline for fake-news (balanced)
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib

rf = RandomForestClassifier(
    n_estimators=200,
    class_weight='balanced',
    max_depth=None,
    random_state=42,
    n_jobs=-1
)

# Train (use existing X_train_vec, y_train, X_test_vec, y_test)
rf.fit(X_train_vec, y_train)

y_pred_rf = rf.predict(X_test_vec)

print("Accuracy (RF):", accuracy_score(y_test, y_pred_rf))
print("\nClassification Report (RF):")
print(classification_report(y_test, y_pred_rf))
print("\nConfusion Matrix (RF):")
print(confusion_matrix(y_test, y_pred_rf))

# Save model & (optionally) wrap vectorizer as before
joblib.dump(rf, "artifacts/fake_model_rf.joblib")
print("\nSaved Random Forest to artifacts/fake_model_rf.joblib")


Accuracy (RF): 0.5240534521158129

Classification Report (RF):
              precision    recall  f1-score   support

           0       1.00      0.00      0.00      4284
           1       0.52      1.00      0.69      4696

    accuracy                           0.52      8980
   macro avg       0.76      0.50      0.35      8980
weighted avg       0.75      0.52      0.36      8980


Confusion Matrix (RF):
[[  10 4274]
 [   0 4696]]

Saved Random Forest to artifacts/fake_model_rf.joblib


In [18]:
# Fine-tune DistilBERT for fake-news classification (small sample, CPU-friendly)
# Copy-paste and run in your notebook.

# 0) Install deps if missing (uncomment if needed)
# !pip install -q transformers datasets accelerate evaluate

from sklearn.model_selection import train_test_split
import numpy as np
import os
import joblib
import torch

# safety: reduce logging spam
import logging
logging.getLogger("transformers").setLevel(logging.ERROR)

# 1) prepare data (uses df with columns ['text','label'] already available)
# Use a smaller sample to train quickly (adjust sample_size if you have GPU)
sample_size = 7000   # total samples to use (train+val); set higher if you have GPU
if len(df) > sample_size:
    df_small = df.sample(n=sample_size, random_state=42).reset_index(drop=True)
else:
    df_small = df.copy()

# split into train/val/test: keep your existing X_test_vec/y_test for final eval
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df_small['text'].astype(str).tolist(),
    df_small['label'].tolist(),
    test_size=0.2,
    random_state=42,
    stratify=df_small['label']
)

print("Training samples:", len(train_texts), "Validation samples:", len(val_texts), "Test samples (existing):", len(X_test_vec))

# 2) Hugging Face dataset and tokenization
from datasets import Dataset, load_metric
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_batch(texts):
    return tokenizer(texts, truncation=True, padding="max_length", max_length=128)

train_ds = Dataset.from_dict({"text": train_texts, "label": train_labels})
val_ds   = Dataset.from_dict({"text": val_texts,   "label": val_labels})

train_ds = train_ds.map(lambda x: tokenize_batch(x["text"]), batched=True)
val_ds   = val_ds.map(lambda x: tokenize_batch(x["text"]), batched=True)

# set format for PyTorch
train_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
val_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

# 3) model (binary classification)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# 4) training args (keep small for CPU)
training_args = TrainingArguments(
    output_dir="artifacts/fake_distilbert",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    num_train_epochs=2,              # 1-2 epochs for quick run; increase if you have GPU
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    fp16=False,
    push_to_hub=False,
    seed=42
)

# 5) metrics function
metric = load_metric("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    f1 = metric.compute(predictions=preds, references=labels, average="binary")["f1"]
    return {"f1": f1}

# 6) Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# 7) Train (this will print progress)
trainer.train()

# 8) Evaluate on your held-out test set (use raw df test texts)
# prepare test texts from your earlier split X_test / y_test (we used clean_text for fake pipeline)
# If you have df test split, use that. We'll build test_texts from X_test (which are cleaned) if available.
try:
    # If you have 'X_test' as raw texts, use it; otherwise derive from df using test indices.
    test_texts = X_test.tolist() if 'X_test' in globals() else df['clean_text'].iloc[y_test.index].tolist()
    test_labels = y_test.tolist()
except Exception:
    # fallback: use a small sample of val as test
    test_texts = val_texts
    test_labels = val_labels

# Tokenize test
enc = tokenizer(test_texts, truncation=True, padding=True, max_length=128, return_tensors="pt")
model.eval()
with torch.no_grad():
    outputs = model(enc["input_ids"], attention_mask=enc["attention_mask"])
    logits = outputs.logits.detach().cpu().numpy()
    preds = np.argmax(logits, axis=1)

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print("\nDistilBERT Test accuracy:", accuracy_score(test_labels, preds))
print("\nClassification report:\n", classification_report(test_labels, preds))
print("\nConfusion matrix:\n", confusion_matrix(test_labels, preds))

# 9) Save fine-tuned model & tokenizer (Trainer.save_model already saved in output_dir)
trainer.save_model("artifacts/fake_distilbert")
tokenizer.save_pretrained("artifacts/fake_distilbert")
print("\nSaved DistilBERT model & tokenizer to artifacts/fake_distilbert")


ModuleNotFoundError: No module named 'torch'

In [20]:
# Stronger sklearn pipeline: word + char ngrams + balanced logistic regression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib

# Use raw text column (df['text']) not already-vectorized X_train_vec
texts = df["clean_text"] if "clean_text" in df.columns else df["text"].astype(str)
labels = df["label"]

# split (we'll create a fresh split to train quickly)
X_tr, X_te, y_tr, y_te = train_test_split(texts, labels, test_size=0.2, random_state=42, stratify=labels)

word_vect = TfidfVectorizer(ngram_range=(1,2), max_df=0.95, min_df=5, max_features=15000)
char_vect = TfidfVectorizer(analyzer='char_wb', ngram_range=(3,5), max_df=0.95, min_df=5, max_features=5000)

union = FeatureUnion([("word", word_vect), ("char", char_vect)])

pipeline = Pipeline([
    ("feats", union),
    ("clf", LogisticRegression(max_iter=2000, class_weight="balanced", C=1.0))
])

print("Fitting pipeline (this may take a minute)...")
pipeline.fit(X_tr, y_tr)

# evaluate
y_pred = pipeline.predict(X_te)
print("Accuracy:", accuracy_score(y_te, y_pred))
print("\nClassification Report:\n", classification_report(y_te, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_te, y_pred))

# save artifacts for app (optional)
import os
os.makedirs("artifacts", exist_ok=True)
joblib.dump(pipeline, "artifacts/fake_model_skl_pipeline.joblib")
print("\nSaved pipeline -> artifacts/fake_model_skl_pipeline.joblib")


Fitting pipeline (this may take a minute)...
Accuracy: 0.9407572383073497

Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.96      0.94      4284
           1       0.96      0.93      0.94      4696

    accuracy                           0.94      8980
   macro avg       0.94      0.94      0.94      8980
weighted avg       0.94      0.94      0.94      8980


Confusion Matrix:
 [[4099  185]
 [ 347 4349]]

Saved pipeline -> artifacts/fake_model_skl_pipeline.joblib
