# 02 — Classic ML: TF-IDF + Naive Bayes + SVM

**Project:** Clickbait Headline Detector  
Baseline model using traditional NLP features. Fast, interpretable, and a solid reference point.

> Requires `data/cleaned.csv` — run `01_EDA.ipynb` first.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import os
import warnings
import joblib

import nltk
nltk.download('stopwords', quiet=True)
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV   # gives SVM proper probabilities
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Silence all Python warnings for the rest of the notebook. This keeps cell
# output tidy but also hides library warnings that may matter while debugging.
warnings.filterwarnings('ignore')
# Matplotlib style sheet applied to every figure drawn below.
plt.style.use('seaborn-v0_8-whitegrid')

# English stopwords from NLTK, stored as a set so the per-word membership
# test in clean_text() is a constant-time lookup.
STOP_WORDS = set(stopwords.words('english'))

## Config

In [None]:
# Input produced by 01_EDA.ipynb and the folder where artifacts are written.
CLEANED_PATH = 'data/cleaned.csv'
MODELS_DIR   = 'models'
# Seed shared by the train/test split and the SVM so runs are repeatable.
RANDOM_STATE = 42
# Fraction of rows held out for evaluation.
TEST_SIZE    = 0.2

os.makedirs(MODELS_DIR, exist_ok=True)

# Raise explicitly instead of using `assert`: assertions are removed when
# Python runs with -O, which would let a missing file slip through to a less
# helpful error further down.
if not os.path.exists(CLEANED_PATH):
    raise FileNotFoundError(
        f'File not found: {CLEANED_PATH!r}. Run 01_EDA.ipynb first.'
    )
print('Config OK.')

## Load & Preprocess

In [None]:
# Read the cleaned dataset and report its size plus the row count per label.
df = pd.read_csv(CLEANED_PATH)
label_counts = df['label'].value_counts().to_dict()
print(f'Loaded {len(df)} rows — class balance: {label_counts}')

In [None]:
def clean_text(text):
    """Normalise a headline into space-separated, lowercase content words.

    Lowercases the input, deletes apostrophes so a contraction stays a single
    token ("won't" -> "wont"), turns every other non-letter character into a
    space, then drops stopwords and single-letter tokens.

    Args:
        text: Raw headline. Non-string values are converted with ``str()``;
            ``None`` and float NaN are treated as empty text.

    Returns:
        The cleaned headline as one string. May be empty.
    """
    # `text != text` is true only for NaN. Without this guard a missing value
    # would be stringified and leak into the features as the word "nan"/"none".
    if text is None or (isinstance(text, float) and text != text):
        return ''
    lowered = str(text).lower()
    # Delete apostrophes (straight and curly) rather than splitting on them.
    lowered = re.sub("['\u2019]", '', lowered)
    # Replace the remaining non-letters with a space instead of deleting them,
    # so "well-known" or tab/newline-separated words do not fuse into one
    # made-up token.
    lowered = re.sub('[^a-z]', ' ', lowered)
    return ' '.join(
        w for w in lowered.split() if w not in STOP_WORDS and len(w) > 1
    )


# Build the model input column by cleaning every headline, then show a few
# before/after pairs as a sanity check.
df['clean'] = df['headline'].apply(clean_text)

preview = df[['headline', 'clean']].head(3)
print(preview.to_string())

## Train / Test Split

In [None]:
# Stratified hold-out split: both partitions keep the label proportions of
# the full dataset, and the fixed seed makes the split repeatable.
features, labels = df['clean'], df['label']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    stratify=labels,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)
print(f'Train: {len(X_train)} samples  |  Test: {len(X_test)} samples')

## TF-IDF Vectorisation
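Using unigrams + bigrams so the model can pick up two-word signals. Note that `clean_text` strips stopwords and punctuation first, so bigrams are formed from the words that remain: phrases like *"you won't"* or *"find out"* reach the vectoriser as `wont` and `find`, not as the literal two-word phrase.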

In [None]:
# Unigram + bigram TF-IDF features, capped at 50,000 terms.
tfidf = TfidfVectorizer(max_features=50_000, ngram_range=(1, 2))

# Learn the vocabulary and IDF weights from the training split only, then
# reuse them unchanged for the test split so no test information leaks in.
X_train_vec = tfidf.fit_transform(X_train)
X_test_vec = tfidf.transform(X_test)

vocab_size = len(tfidf.vocabulary_)
print(f'Vocab size: {vocab_size:,}')
print(f'Matrix shape (train): {X_train_vec.shape}')

## Model A — Naive Bayes
MultinomialNB has `predict_proba()` built-in — confidence scores work out of the box.

In [None]:
# Fit multinomial Naive Bayes (smoothing alpha=0.1) on the TF-IDF training
# matrix; fit() returns the estimator itself, so it can be chained.
nb = MultinomialNB(alpha=0.1).fit(X_train_vec, y_train)

# Evaluate on the held-out test split.
nb_preds = nb.predict(X_test_vec)
nb_acc = accuracy_score(y_test, nb_preds)
nb_report = classification_report(
    y_test, nb_preds, target_names=['Real', 'Clickbait']
)

print(f'Naive Bayes Accuracy: {nb_acc:.4f}\n')
print(nb_report)

## Model B — SVM (LinearSVC + Calibration)
LinearSVC doesn't produce probabilities natively.  
`CalibratedClassifierCV` wraps it with Platt scaling so we get proper `predict_proba()` output.

In [None]:
# Underlying linear SVM; it is not fitted directly, only via the wrapper below.
base_svm = LinearSVC(random_state=RANDOM_STATE, max_iter=2000, C=1.0)

# Wrap the SVM in a 5-fold cross-validated calibrator so the final model
# exposes predict_proba(); fit() returns the fitted wrapper.
svm = CalibratedClassifierCV(base_svm, cv=5).fit(X_train_vec, y_train)

# Evaluate on the held-out test split.
svm_preds = svm.predict(X_test_vec)
svm_acc = accuracy_score(y_test, svm_preds)
svm_report = classification_report(
    y_test, svm_preds, target_names=['Real', 'Clickbait']
)

print(f'SVM Accuracy: {svm_acc:.4f}\n')
print(svm_report)

## Comparison — Confusion Matrices

In [None]:
# Side-by-side confusion matrices on the test split, one panel per model.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

for ax, preds, name, acc in zip(
    axes,
    [nb_preds, svm_preds],
    ['Naive Bayes', 'SVM (Calibrated)'],
    [nb_acc, svm_acc]
):
    cm = confusion_matrix(y_test, preds)
    # NOTE: the tick labels assume the sorted label values are Real first,
    # then Clickbait (label 1 = Clickbait, as in the prediction cell below).
    sns.heatmap(
        cm, annot=True, fmt='d', cmap='Blues', ax=ax,
        xticklabels=['Real', 'Clickbait'],
        yticklabels=['Real', 'Clickbait']
    )
    ax.set_title(f'{name}\nAccuracy: {acc:.4f}')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')

plt.suptitle('Classic ML — Confusion Matrices', fontsize=14, y=1.02)
plt.tight_layout()
# Save next to the model artifacts. Using MODELS_DIR (instead of a hard-coded
# 'models/' prefix) keeps the figure in the right place if the config changes.
plt.savefig(f'{MODELS_DIR}/classic_confusion_matrices.png', dpi=120, bbox_inches='tight')
plt.show()

# Compact accuracy table for the two models.
results = pd.DataFrame({
    'Model':    ['Naive Bayes', 'SVM (Calibrated)'],
    'Accuracy': [round(nb_acc, 4), round(svm_acc, 4)]
})
print('\n--- Model Comparison ---')
print(results.to_string(index=False))

## Save Best Model

In [None]:
# Keep whichever classifier scored higher on the test split; ties go to the SVM.
svm_wins = svm_acc >= nb_acc
best_model, best_name = (svm, 'SVM') if svm_wins else (nb, 'NaiveBayes')

# Persist the fitted vectorizer together with the model: predicting on new
# text needs both.
for artifact, target in (
    (tfidf, f'{MODELS_DIR}/tfidf_vectorizer.pkl'),
    (best_model, f'{MODELS_DIR}/best_classic_model.pkl'),
):
    joblib.dump(artifact, target)

print(f'Saved best model ({best_name}) and TF-IDF vectorizer to {MODELS_DIR!r}/')

---
## Try Your Own Headline
Change the string below and run the cell — it will tell you the prediction and how confident the model is.

In [None]:
# --- Change this to any headline you want to test ---
# The raw string is cleaned and vectorised by predict_classic() below, so it
# can be typed exactly as it would appear in a news feed.
my_headline = 'You will not believe what this celebrity did next!'


def predict_classic(headline, model, vectorizer):
    """Classify a single headline and report the model's confidence.

    Args:
        headline: Raw headline text. It is passed through ``clean_text``
            before vectorising.
        model: Fitted classifier providing ``predict`` and ``predict_proba``.
        vectorizer: Fitted vectorizer whose ``transform`` accepts a list of
            strings.

    Returns:
        dict with three keys: ``'headline'`` (the input, unchanged),
        ``'prediction'`` (``'Clickbait'`` when the predicted label equals 1,
        otherwise ``'Real News'``) and ``'confidence'`` (the probability of
        the predicted class as a percentage string, e.g. ``'97.3%'``).
    """
    cleaned = clean_text(headline)
    vec     = vectorizer.transform([cleaned])
    label   = model.predict(vec)[0]
    proba   = model.predict_proba(vec)[0]

    # The columns of predict_proba follow model.classes_, so find the column
    # of the predicted label instead of using the label value as an index.
    # That shortcut is only correct when the classes are exactly 0..n-1.
    classes = getattr(model, 'classes_', None)
    if classes is not None:
        column = list(classes).index(label)
    else:
        # No classes_ attribute: keep the previous behaviour.
        column = label
    confidence = proba[column] * 100

    return {
        'headline'  : headline,
        'prediction': 'Clickbait' if label == 1 else 'Real News',
        'confidence': f'{confidence:.1f}%'
    }


# Classify the sample headline with the saved best model and print a short
# three-line report.
result = predict_classic(my_headline, best_model, tfidf)

print('\n'.join([
    f'Headline   : {result["headline"]}',
    f'Prediction : {result["prediction"]}',
    f'Confidence : {result["confidence"]}',
]))