# 03 — Deep Learning: Bidirectional LSTM

**Project:** Clickbait Headline Detector  
A neural network that reads headlines as sequences, capturing word order that TF-IDF ignores.

> Requires `data/cleaned.csv` — run `01_EDA.ipynb` first.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import os
import warnings

import nltk
nltk.download('stopwords', quiet=True)
from nltk.corpus import stopwords

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Silence library warnings so cell output stays readable.
warnings.filterwarnings('ignore')

# One shared look for every figure drawn in this notebook.
plt.style.use('seaborn-v0_8-whitegrid')

# Seed TensorFlow and NumPy with the same fixed value.
tf.random.set_seed(42)
np.random.seed(42)

# NLTK's English stopword list as a set; clean_text() tests membership against it.
STOP_WORDS = {*stopwords.words('english')}

## Config
All tunable hyperparameters live here — tweak these before re-running the training cell.

In [None]:
# Input CSV read by the "Load & Clean" cell, and the directory that receives
# the figures and the saved model.
CLEANED_PATH = 'data/cleaned.csv'
MODELS_DIR   = 'models'

# Tokenisation
VOCAB_SIZE = 20_000   # passed as Tokenizer(num_words=...) and Embedding(input_dim=...)
MAX_LEN    = 30       # headlines rarely exceed 30 words; pad/truncate to this

# Model
EMBED_DIM  = 64       # embedding vector size per word
LSTM_UNITS = 64       # hidden units per LSTM direction

# Training
EPOCHS       = 20     # EarlyStopping will cut this short if needed
BATCH_SIZE   = 64
RANDOM_STATE = 42     # seed for train_test_split
TEST_SIZE    = 0.2    # fraction of rows held out for the test set

os.makedirs(MODELS_DIR, exist_ok=True)

# Raise explicitly instead of using `assert`: assertions are removed when
# Python runs with -O, which would let a missing file slip through to
# pd.read_csv and fail with a less helpful message.
if not os.path.exists(CLEANED_PATH):
    raise FileNotFoundError(
        f'File not found: {CLEANED_PATH!r}. Run 01_EDA.ipynb first.'
    )
print('Config OK.')

## Load & Clean

In [None]:
# Read the CSV at CLEANED_PATH into a DataFrame and report its row count.
df = pd.read_csv(CLEANED_PATH)
print(f'Loaded {df.shape[0]} rows.')


def clean_text(text):
    """Normalise a headline before tokenisation.

    The input is coerced to ``str`` and lowercased, then every character
    that is neither ``a``-``z`` nor whitespace is deleted. Punctuation is
    removed rather than replaced, so ``"don't"`` becomes ``"dont"``. The
    remaining whitespace-separated words are filtered: stopwords and
    single-letter words are dropped.

    Args:
        text: The raw headline; any object is accepted and passed to ``str``.

    Returns:
        The surviving words joined by single spaces (possibly ``''``).
    """
    lowered = str(text).lower()
    # Keep *all* whitespace (\s), not just the plain space: with '[^a-z ]'
    # a tab, newline or non-breaking space was deleted and the two words
    # around it were glued into one bogus token.
    letters_only = re.sub(r'[^a-z\s]', '', lowered)
    return ' '.join(
        word
        for word in letters_only.split()
        if len(word) > 1 and word not in STOP_WORDS
    )


# Store the cleaned version of every headline in a new 'clean' column.
df['clean'] = [clean_text(raw_headline) for raw_headline in df['headline']]

## Tokenise & Pad
Convert words to integer indices, then pad all sequences to the same length so they batch cleanly.

In [None]:
# NOTE(review): the tokenizer is fitted on every row of df, i.e. before the
# train/test split below, so the vocabulary also reflects test headlines.
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token='<OOV>')
tokenizer.fit_on_texts(df['clean'])

# Words -> integer ids, then pad/truncate at the end so every row has MAX_LEN ids.
sequences = tokenizer.texts_to_sequences(df['clean'])
padded    = pad_sequences(sequences, maxlen=MAX_LEN, padding='post', truncating='post')
labels    = df['label'].values

# With num_words=N the tokenizer only emits indices below N (index 0 is the
# padding value), so at most N - 1 distinct tokens are in use, the '<OOV>'
# entry included. word_index itself always holds every word seen.
actual_vocab = min(VOCAB_SIZE - 1, len(tokenizer.word_index))
print(f'Vocabulary size : {actual_vocab:,} unique tokens')
print(f'Padded shape    : {padded.shape}')

## Train / Test Split

In [None]:
# Hold out TEST_SIZE of the rows, stratified on the labels and seeded with
# RANDOM_STATE.
X_train, X_test, y_train, y_test = train_test_split(
    padded,
    labels,
    stratify=labels,
    random_state=RANDOM_STATE,
    test_size=TEST_SIZE,
)
print(f'Train: {X_train.shape[0]} samples  |  Test: {X_test.shape[0]} samples')

## Model Architecture
`Embedding → Dropout → Bidirectional LSTM → Dropout → Dense (ReLU) → Dense (Sigmoid)`  
Bidirectional lets the LSTM read each headline left-to-right *and* right-to-left before deciding.

In [None]:
# Embedding -> Dropout -> BiLSTM -> Dropout -> Dense(ReLU) -> Dense(sigmoid).
# The input shape is declared with an explicit Input layer rather than the
# Embedding `input_length` argument, which Keras 3 deprecates. The Input
# layer still builds the model, so model.summary() can print output shapes.
model = Sequential([
    tf.keras.Input(shape=(MAX_LEN,)),
    Embedding(input_dim=VOCAB_SIZE, output_dim=EMBED_DIM),
    Dropout(0.3),
    Bidirectional(LSTM(LSTM_UNITS)),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')   # outputs a value 0-1; threshold at 0.5 for label
])

# Single sigmoid output with 0/1 targets, hence binary cross-entropy.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

model.summary()

## Train
`EarlyStopping` halts training when validation loss stops improving.  
`ReduceLROnPlateau` lowers the learning rate when progress stalls.

In [None]:
# Stop after 3 epochs without a better val_loss and roll back to the best weights.
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
    verbose=1,
)
# Halve the learning rate after 2 stalled epochs, never going below 1e-6.
lr_on_plateau = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=2,
    min_lr=1e-6,
    verbose=1,
)
callbacks = [early_stop, lr_on_plateau]

# validation_split reserves 10% of the training data to compute val_loss.
history = model.fit(
    X_train,
    y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=0.1,
    callbacks=callbacks,
    verbose=1,
)

## Training Curves
Look for val curves diverging from train — a sign of overfitting.

In [None]:
# One panel per metric; each shows the training curve and its 'val_' twin.
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

for ax, metric, title in zip(axes, ('accuracy', 'loss'), ('Accuracy', 'Loss')):
    ax.plot(history.history[metric], label='Train', color='steelblue')
    ax.plot(history.history[f'val_{metric}'], label='Val', color='tomato', linestyle='--')
    ax.set_title(f'{title} per Epoch')
    ax.set_xlabel('Epoch')
    ax.set_ylabel(title)
    ax.legend()

plt.suptitle('LSTM Training History', fontsize=14)
plt.tight_layout()
# Save under MODELS_DIR (created in the Config cell) instead of a hard-coded
# 'models/' so the figure follows the directory if the config is changed.
plt.savefig(f'{MODELS_DIR}/lstm_training_history.png', dpi=120, bbox_inches='tight')
plt.show()

## Evaluate

In [None]:
# Hard class predictions: sigmoid outputs strictly above 0.5 become 1, the rest 0.
test_scores = model.predict(X_test, verbose=0)
preds = (test_scores > 0.5).astype(int).flatten()

# Loss and accuracy as reported by Keras on the test split.
loss, acc = model.evaluate(X_test, y_test, verbose=0)

print(f'Test Accuracy : {acc:.4f}')
print(f'Test Loss     : {loss:.4f}\n')
print(classification_report(y_test, preds, target_names=['Real', 'Clickbait']))

In [None]:
# Rows are the actual classes, columns the predicted ones.
cm = confusion_matrix(y_test, preds)

fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    cm, annot=True, fmt='d', cmap='Purples', ax=ax,
    xticklabels=['Real', 'Clickbait'],
    yticklabels=['Real', 'Clickbait']
)
ax.set_title(f'LSTM Confusion Matrix — Accuracy: {acc:.4f}')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.tight_layout()
# Save under MODELS_DIR (created in the Config cell) instead of a hard-coded
# 'models/' so the figure follows the directory if the config is changed.
plt.savefig(f'{MODELS_DIR}/lstm_confusion_matrix.png', dpi=120, bbox_inches='tight')
plt.show()

## Save Model

In [None]:
# Build the path once so the file written and the path reported cannot drift
# apart. The old message applied !r to the directory only and printed
# "'models'/lstm_model.h5".
model_path = f'{MODELS_DIR}/lstm_model.h5'
model.save(model_path)
print(f'Model saved to {model_path!r}')

---
## Try Your Own Headline
The sigmoid output doubles as a confidence score: values close to 1 mean the model is very sure the headline is clickbait, while values close to 0 mean it is confident the headline is real news.  
Change the string below and run the cell.

In [None]:
# --- Change this to any headline you want to test ---
# The string is passed to predict_lstm() at the bottom of this cell.
my_headline = 'You will not believe what this celebrity did next!'


def predict_lstm(headline, model, tokenizer, max_len):
    """
    Score one headline with the trained network.

    The headline goes through clean_text(), is converted to token ids by
    `tokenizer`, and is padded/truncated at the end to `max_len` before
    being fed to `model`.

    Returns a dict with four entries: the original 'headline', the
    'prediction' label ('Clickbait' when the sigmoid score is at least 0.5,
    otherwise 'Real News'), the 'confidence' in that label as a percentage
    string, and the 'raw_score' string holding the sigmoid output itself.
    """
    token_ids   = tokenizer.texts_to_sequences([clean_text(headline)])
    model_input = pad_sequences(token_ids, maxlen=max_len, padding='post', truncating='post')
    score       = float(model.predict(model_input, verbose=0)[0][0])

    # The score is the clickbait probability, so confidence in a
    # 'Real News' verdict is its complement.
    is_clickbait = score >= 0.5
    label        = 'Clickbait' if is_clickbait else 'Real News'
    confidence   = score if is_clickbait else 1.0 - score

    return {
        'headline'    : headline,
        'prediction'  : label,
        'confidence'  : f'{confidence * 100:.1f}%',
        'raw_score'   : f'{score:.4f}  (0 = very real, 1 = very clickbait)'
    }


result = predict_lstm(my_headline, model, tokenizer, MAX_LEN)

# Captions are left-aligned in a 12-character column so the colons line up.
report_rows = (
    ('Headline', 'headline'),
    ('Prediction', 'prediction'),
    ('Confidence', 'confidence'),
    ('Raw score', 'raw_score'),
)
for caption, key in report_rows:
    print(f'{caption:<12}: {result[key]}')