# TruthLens Capstone — ISOT Fake News Detection with MLflow

**Neoversity DS&DA | TruthLens MVP**  
Pipeline: load ISOT dataset → EDA → TF-IDF + multiple classifiers → MLflow tracking → comparison & best model.

Runs in Jupyter (Windows/WSL) and Colab. No `%%bash` — Python-only setup.

## 1) Setup (Python-only, Colab & local)

In [None]:
import subprocess
import sys
from pathlib import Path

def ensure_packages(packages=None):
    """Install the notebook's Python dependencies with pip.

    All requirements are passed to a single ``pip install`` invocation so the
    dependency resolver runs once and sees every requirement together,
    instead of spawning one subprocess per package.

    Args:
        packages: Optional iterable of pip requirement strings (a single
            string is treated as one requirement). When omitted, the default
            set used by this notebook is installed. An empty iterable is a
            no-op.

    Raises:
        subprocess.CalledProcessError: If pip exits with a non-zero status.
    """
    if packages is None:
        packages = ["pandas", "numpy", "scikit-learn", "matplotlib", "mlflow", "joblib", "requests"]
    elif isinstance(packages, str):
        # Avoid iterating a bare string character by character.
        packages = [packages]
    requirements = list(packages)
    if not requirements:
        # pip refuses to run "install" without at least one requirement.
        return
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", *requirements])

# Install dependencies first, then create the working folders the later cells write to.
ensure_packages()
for folder_name in ("data/isot", "artifacts", "mlruns"):
    Path(folder_name).mkdir(parents=True, exist_ok=True)
print("OK: packages & folders ready")

## 2) Load ISOT dataset (UVic fallback)

In [None]:
import json
from pathlib import Path
import pandas as pd

# Download location of the ISOT dataset ZIP; _download_and_unzip_uvic fetches
# it when True.csv/Fake.csv cannot be found locally.
UVIC_ZIP_URL = "https://onlineacademiccommunity.uvic.ca/isot/wp-content/uploads/sites/7295/2023/03/News-_dataset.zip"

def _find_case_insensitive(folder: Path, filename: str):
    """Return the first regular file in *folder* whose name equals *filename*, ignoring case.

    Only direct children of *folder* are inspected. Returns ``None`` when the
    folder does not exist or no file matches.
    """
    if not folder.exists():
        return None
    wanted = filename.lower()
    matches = (
        entry
        for entry in folder.iterdir()
        if entry.is_file() and entry.name.lower() == wanted
    )
    return next(matches, None)

def _download_and_unzip_uvic(dest_dir: Path):
    """Download the archive at ``UVIC_ZIP_URL`` and extract it into *dest_dir*.

    The destination folder is created if needed. The whole response body is
    held in memory and unpacked from there; nothing is cached on disk besides
    the extracted files. HTTP error statuses raise via ``raise_for_status``.
    """
    import io
    import zipfile

    import requests

    dest_dir.mkdir(parents=True, exist_ok=True)
    response = requests.get(UVIC_ZIP_URL, timeout=180)
    response.raise_for_status()
    archive = zipfile.ZipFile(io.BytesIO(response.content))
    with archive:
        archive.extractall(dest_dir)

def _find_isot_pair(folder: Path, recursive: bool = False):
    """Return ``(true_path, fake_path)`` located together in one directory, or ``None``.

    With ``recursive=True`` the subdirectories of *folder* are searched as
    well, in sorted order, after *folder* itself.
    """
    search_dirs = [folder]
    if recursive and folder.is_dir():
        search_dirs.extend(sorted(p for p in folder.rglob("*") if p.is_dir()))
    for directory in search_dirs:
        tp = _find_case_insensitive(directory, "True.csv")
        fp = _find_case_insensitive(directory, "Fake.csv")
        if tp and fp:
            return tp, fp
    return None


def load_isot_dataset(base_dir: str = "data/isot", auto_download_uvic: bool = True) -> pd.DataFrame:
    """Load True.csv and Fake.csv into one labelled, de-duplicated DataFrame.

    Lookup order: ``base_dir``, ``data/isot`` and the current directory
    (direct children only), then the subfolders of ``base_dir``. The last
    step covers an archive that unpacks into a nested folder, so a previous
    download is reused instead of being fetched again. If nothing is found
    and ``auto_download_uvic`` is true, the archive is downloaded into
    ``base_dir`` and the search is repeated there.

    Args:
        base_dir: Preferred folder holding (or receiving) the CSV files.
        auto_download_uvic: Download the archive when the files are missing.

    Returns:
        DataFrame with lower-cased column names, ``label`` set to 1 for rows
        from True.csv and 0 for rows from Fake.csv, and ``text`` holding the
        title (when a ``title`` column exists) joined to the body text.
        Empty texts and duplicate ``(text, label)`` rows are removed.

    Raises:
        FileNotFoundError: If the two CSV files cannot be located.
        ValueError: If the combined data has no ``text`` column.
    """
    base = Path(base_dir)
    pair = None
    for folder in (base, Path("data") / "isot", Path(".")):
        pair = _find_isot_pair(folder)
        if pair:
            break
    if pair is None:
        # An earlier extraction may have placed the CSVs in a subfolder.
        pair = _find_isot_pair(base, recursive=True)
    if pair is None and auto_download_uvic:
        _download_and_unzip_uvic(base)
        pair = _find_isot_pair(base, recursive=True)
    if pair is None:
        raise FileNotFoundError("True.csv/Fake.csv not found. Put in data/isot/ or enable auto_download.")
    true_path, fake_path = pair

    true_df = pd.read_csv(true_path)
    fake_df = pd.read_csv(fake_path)
    # Normalise headers so "Text", " text " and "text" are treated alike.
    true_df.columns = [c.strip().lower() for c in true_df.columns]
    fake_df.columns = [c.strip().lower() for c in fake_df.columns]
    true_df["label"] = 1
    fake_df["label"] = 0
    df = pd.concat([true_df, fake_df], ignore_index=True)
    if "text" not in df.columns:
        raise ValueError("Expected column 'text'.")

    text = df["text"].fillna("").astype(str)
    if "title" in df.columns:
        text = df["title"].fillna("").astype(str) + " " + text
    df["text"] = text.str.strip()

    df = df[df["text"].str.len() > 0].drop_duplicates(subset=["text", "label"]).reset_index(drop=True)
    print(f"Loaded: {df.shape[0]} rows | labels: {df['label'].value_counts().to_dict()}")
    return df

# Load the corpus once; X is the text column and y the 0/1 label column.
df = load_isot_dataset(base_dir="data/isot", auto_download_uvic=True)
X, y = df["text"], df["label"]

## 3) EDA — class balance & text length

In [None]:
import matplotlib.pyplot as plt

# Two side-by-side panels: label counts (left) and text length in characters (right).
fig, axes = plt.subplots(1, 2, figsize=(10, 4))
df["label"].value_counts().plot(kind="bar", ax=axes[0], title="Class balance (1=Real, 0=Fake)")
# Series.hist forwards extra keyword arguments to matplotlib's hist(), which
# does not accept ``title``; set the title on the Axes instead.
df["text"].str.len().hist(ax=axes[1], bins=50)
axes[1].set_title("Text length distribution")
plt.tight_layout()
plt.savefig("artifacts/eda.png", dpi=100)
plt.show()
print("Saved: artifacts/eda.png")

## 4) Train multiple models with MLflow

In [None]:
from pathlib import Path
import mlflow
import mlflow.sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
import joblib

# Path.as_uri() yields a well-formed file URI on every OS (e.g. file:///C:/...
# on Windows), whereas "file://" + str(path) embeds backslashes and a drive
# letter where the URI host belongs.
mlflow.set_tracking_uri(Path("mlruns").resolve().as_uri())
mlflow.set_experiment("truthlens-fake-news-isot")

# Hold out 20% of the rows for evaluation, stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# The vectorizer is fitted on the training split only and then applied to the test split.
vectorizer = TfidfVectorizer(
    max_features=20_000,
    ngram_range=(1, 2),
    min_df=2,
)
X_train_tf = vectorizer.fit_transform(X_train)
X_test_tf = vectorizer.transform(X_test)

# Candidate classifiers, keyed by the name used for MLflow runs and reports.
models = dict(
    LogisticRegression=LogisticRegression(max_iter=500, random_state=42),
    RandomForest=RandomForestClassifier(n_estimators=100, random_state=42),
    MultinomialNB=MultinomialNB(),
)

# Per-model scores collected for the comparison table and the JSON summary.
metrics_list = []
best_f1, best_name, best_pipe = 0, None, None

for name, clf in models.items():
    # One MLflow run per classifier; all share the same TF-IDF features.
    with mlflow.start_run(run_name=name):
        clf.fit(X_train_tf, y_train)
        y_pred = clf.predict(X_test_tf)
        scores = {
            "accuracy": float(accuracy_score(y_test, y_pred)),
            "precision": float(precision_score(y_test, y_pred, zero_division=0)),
            "recall": float(recall_score(y_test, y_pred, zero_division=0)),
            "f1": float(f1_score(y_test, y_pred, zero_division=0)),
        }
        # Read the logged settings from the vectorizer itself so the tracked
        # parameters cannot drift from the configuration actually used.
        mlflow.log_params({
            "model": name,
            "vectorizer": type(vectorizer).__name__,
            "max_features": vectorizer.max_features,
        })
        mlflow.log_metrics(scores)
        cm = confusion_matrix(y_test, y_pred)
        mlflow.log_dict({"confusion_matrix": cm.tolist()}, "confusion_matrix.json")
        f1 = scores["f1"]
        # Always keep the first model as a baseline: with a strict "> 0" test,
        # a run where every model scores F1 == 0 would select nothing and
        # leave None to be saved as the "best" pipeline.
        if best_pipe is None or f1 > best_f1:
            best_f1, best_name = f1, name
            best_pipe = {"vectorizer": vectorizer, "model": clf}
        metrics_list.append({"model": name, **scores})
        print(f"{name}: accuracy={scores['accuracy']:.4f} f1={f1:.4f}")

# Tabulate the scores for the comparison cell, then persist the winning
# vectorizer/model pair and a JSON summary of the whole sweep.
metrics_df = pd.DataFrame(metrics_list)
joblib.dump(best_pipe, "artifacts/best_model.joblib")

summary = {"best_model": best_name, "best_f1": float(best_f1), "metrics": metrics_list}
with open("artifacts/best_run_summary.json", "w") as f:
    f.write(json.dumps(summary, indent=2))

print(f"\nBest model: {best_name} (f1={best_f1:.4f}). Saved: artifacts/best_model.joblib")

## 5) Comparative performance (Capstone requirement)

In [None]:
# Express every score as a percentage rounded to two decimals.
score_cols = ["accuracy", "precision", "recall", "f1"]
metrics_pct = metrics_df.copy()
metrics_pct[score_cols] = metrics_pct[score_cols].mul(100).round(2)
display(metrics_pct.sort_values("f1", ascending=False))

# Bar chart of F1 per model, saved next to the other artifacts.
plt.figure(figsize=(8, 5))
plt.bar(metrics_pct["model"], metrics_pct["f1"])
plt.title("TruthLens — Model F1 comparison (ISOT)")
plt.ylabel("F1-score (%)")
plt.xticks(rotation=15)
plt.tight_layout()
plt.savefig("artifacts/model_comparison.png", dpi=100)
plt.show()

## 6) Inference demo

In [None]:
# NOTE: joblib.load unpickles arbitrary objects; only load artifacts produced
# by this notebook, never files from an untrusted source.
pipe = joblib.load("artifacts/best_model.joblib")
examples = [
    "Breaking: the government confirmed new economic measures today.",
    "Shocking secret cure doctors don't want you to know!!!",
    "Company reports quarterly earnings with moderate growth.",
]
# Reuse the fitted vectorizer so the features match what the model was trained on.
X_ex = pipe["vectorizer"].transform(examples)
pred = pipe["model"].predict(X_ex)
for t, p in zip(examples, pred):
    lbl = 'Real' if p == 1 else 'Fake'
    # Add the ellipsis only when the text was actually cut off.
    preview = t if len(t) <= 60 else t[:60] + "..."
    print(f"{p} ({lbl}): {preview}")
print("\nLabel: 1=Real, 0=Fake")

---
**TruthLens Capstone** — MLflow UI: `mlflow ui --backend-store-uri ./mlruns`  
Best model and metrics: `artifacts/best_model.joblib`, `artifacts/best_run_summary.json`.

**Копіювання в репо TruthLens (WSL):**  
`cp "ваш_шлях/notebooks_01_isot_fake_news_detection_mlflow.ipynb" ~/TruthLens/notebooks/`  
Потім: `git add notebooks/ && git commit -m "feat: add ML fake news notebook" && git push origin main`  
**Jupyter:** `pip install jupyter mlflow` → `jupyter notebook notebooks/notebooks_01_isot_fake_news_detection_mlflow.ipynb`