<a href="https://colab.research.google.com/github/Ak4nksha/ai-generated-text-detector/blob/main/notebooks/04_lstm_sequence_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# LSTM Sequence Model

**Objective:**
Train a deep learning sequence model (LSTM) to detect AI-generated text. Unlike the feature-based models in Notebook 3, which relied on hand-crafted counts (such as sentence length), this model learns patterns directly from the raw sequence of words.

**Key Components:**
1.  **Load Fixed Splits:** Import the pre-split data (`train.csv`, `val.csv`, `test.csv`) saved in Notebook 3, so that this model is evaluated on exactly the same "Collected" (scraped) test set as the feature-based models.
2.  **Preprocessing:**
    * **Tokenization:** Custom regex tokenizer that lowercases the text and splits it into word and punctuation tokens.
    * **Vocabulary:** Built *strictly* on the Training set to prevent data leakage.
    * **Sequence Handling:** Implements padding and "packed sequences" to handle variable-length text efficiently in PyTorch.
3.  **Model Architecture:**
    * **Embedding Layer:** Converts words into dense vectors.
    * **Bi-LSTM:** Bidirectional Long Short-Term Memory layer to capture context from both past and future words.
    * **Classifier:** Fully connected layer with Dropout for regularization.
4.  **Training Loop:**
    * Uses **AdamW** optimizer and **BCEWithLogitsLoss**.
    * Implements **Early Stopping** based on Validation F1-score to prevent overfitting.
5.  **Final Evaluation:** Restores the best checkpoint and reports accuracy, precision, recall, and F1 on the held-out Test set (Scraped Data).

In [None]:
# === LOAD FIXED SPLITS (created in notebook 03) ===

from google.colab import drive
drive.mount("/content/drive")

from pathlib import Path
import pandas as pd
import numpy as np

# Directory holding the fixed train/val/test CSVs.
SPLITS_DIR = Path("/content/drive/MyDrive/artifacts/splits_v1")

train_path = SPLITS_DIR.joinpath("train.csv")
val_path = SPLITS_DIR.joinpath("val.csv")
test_path = SPLITS_DIR.joinpath("test.csv")

# Read the three splits in one pass (train, val, test order).
train_df, val_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (train_path, val_path, test_path)
)

# Fail early if any split lacks one of the two columns used below.
for name, df in zip(("train", "val", "test"), (train_df, val_df, test_df)):
    if any(col not in df.columns for col in ("text", "label")):
        raise ValueError(f"{name}.csv must contain columns: text, label")

# Integer label vectors, one numpy array per split.
y_train, y_val, y_test = (
    frame["label"].astype(int).values for frame in (train_df, val_df, test_df)
)

# Quick sanity report: source directory, split sizes and class counts.
print(" Loaded splits from:", SPLITS_DIR)
print("Sizes:", *(len(frame) for frame in (train_df, val_df, test_df)))
print("Label dist train:", np.bincount(y_train))
print("Label dist val:  ", np.bincount(y_val))
print("Label dist test: ", np.bincount(y_test))


In [None]:
# =========================
# LSTM text classifier
#   TRAIN/VAL = Kaggle only
#   TEST      = Scraped only
#
# Uses PyTorch. Includes:
# - fast regex tokenization
# - vocab built from TRAIN only
# - truncation to max_len
# - padding + packed sequences
# - early stopping on VAL F1
# - per-epoch timing
#
# datasets:
#   train_df, val_df, test_df
#   y_train, y_val, y_test
#
# This code prints time/epoch + a simple projected total after epoch 1.
# =========================

import re
import time
import math
import numpy as np
from collections import Counter

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from sklearn.metrics import precision_recall_fscore_support, accuracy_score

# -------------------------
# Hyperparameters (first knobs to tune)
# -------------------------
SEED = 42

# Vocabulary / sequence handling
MAX_VOCAB = 50_000        # 30k–100k typical
MIN_FREQ = 2              # drop very rare tokens
MAX_LEN = 256             # 256/384/512. Higher = slower but captures longer context

# Model size
EMB_DIM = 192             # 128–256
HID_DIM = 192             # 128–256
NUM_LAYERS = 1            # 1–2 (2 slower)
BIDIR = True
DROPOUT = 0.2

# Optimization
BATCH_SIZE = 64           # 32/64/128 depending on CPU & RAM
LR = 2e-3
EPOCHS = 8
PATIENCE = 2              # early stop if VAL F1 doesn't improve
CLIP = 1.0                # max gradient norm

# -------------------------
# Device selection: GPU when one is visible, otherwise CPU
# -------------------------
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
DEVICE = torch.device(_device_name)
print("Using device:", DEVICE)

# -------------------------
# Seed the numpy and torch RNGs for reproducibility
# -------------------------
np.random.seed(SEED)
torch.manual_seed(SEED)

# -------------------------
# Tokenizer: one precompiled regex, applied to lowercased text
# -------------------------
# A token is either an alphanumeric run (optionally followed by an apostrophe
# and another alphanumeric run, e.g. "don't") or one non-space symbol.
_tok = re.compile(r"[A-Za-z0-9]+(?:'[A-Za-z0-9]+)?|[^\sA-Za-z0-9]")


def tokenize(text: str):
    """Return the lowercase tokens of *text*; falsy input yields an empty list."""
    if not text:
        return _tok.findall("")
    return _tok.findall(text.lower())

# -------------------------
# Build vocab from TRAIN only
# -------------------------
def build_vocab(texts, max_vocab=MAX_VOCAB, min_freq=MIN_FREQ):
    """Build token <-> index lookups from an iterable of raw texts.

    Args:
        texts: iterable of strings; each one is split with ``tokenize``.
        max_vocab: hard upper bound on the vocabulary size, counting the
            special tokens. The two special tokens are always kept, so the
            result never has fewer than two entries.
        min_freq: tokens occurring fewer than this many times are excluded.

    Returns:
        ``(stoi, itos)`` where ``itos`` is the list of tokens ordered as
        ``["<pad>", "<unk>", most frequent token, ...]`` and ``stoi`` maps
        each token to its position in ``itos``.
    """
    counter = Counter()
    for t in texts:
        counter.update(tokenize(t))

    # Special tokens occupy fixed indices 0 (padding) and 1 (unknown).
    itos = ["<pad>", "<unk>"]

    for tok, freq in counter.most_common():
        # most_common() is ordered by descending count, so the first token
        # below min_freq ends the scan. Capacity is checked BEFORE appending
        # so that max_vocab is never exceeded, even for very small caps.
        if freq < min_freq or len(itos) >= max_vocab:
            break
        itos.append(tok)

    stoi = {tok: i for i, tok in enumerate(itos)}
    return stoi, itos

# Raw text of every split as plain Python strings (train, val, test order).
train_texts, val_texts, test_texts = (
    frame["text"].astype(str).tolist() for frame in (train_df, val_df, test_df)
)

# Only the training texts contribute to the vocabulary.
stoi, itos = build_vocab(train_texts)
PAD_IDX, UNK_IDX = stoi["<pad>"], stoi["<unk>"]

print(f"Vocab size: {len(itos):,} (PAD={PAD_IDX}, UNK={UNK_IDX})")

# -------------------------
# Dataset + Collate
# -------------------------
def encode(text: str, max_len=MAX_LEN):
    """Convert *text* into a list of vocabulary ids, keeping at most *max_len*.

    Tokens missing from ``stoi`` are mapped to ``UNK_IDX``; anything past the
    first *max_len* tokens is discarded.
    """
    token_ids = [stoi.get(token, UNK_IDX) for token in tokenize(text)]
    too_long = len(token_ids) > max_len
    return token_ids[:max_len] if too_long else token_ids

class TextDataset(Dataset):
    """Map-style dataset that pairs each raw text with its integer label.

    Texts are stored untouched (no encoding happens here); labels are cast
    to int64 once at construction time.
    """

    def __init__(self, texts, labels):
        # `labels` must provide .astype (e.g. a numpy array).
        self.labels = labels.astype(np.int64)
        self.texts = texts

    def __len__(self):
        """Number of samples, taken from the text container."""
        return len(self.texts)

    def __getitem__(self, i):
        """Return the ``(text, label)`` pair stored at position *i*."""
        sample = (self.texts[i], self.labels[i])
        return sample

def collate_batch(batch):
    """Turn a list of ``(text, label)`` samples into padded model inputs.

    Args:
        batch: non-empty sequence of ``(text, label)`` pairs.

    Returns:
        ``(padded, lengths, labels)``, all moved to ``DEVICE``:
          * padded  -- long tensor ``[B, T]`` of token ids, right-padded with
            ``PAD_IDX``; ``T`` is the longest sequence in this batch.
          * lengths -- long tensor ``[B]`` with the unpadded length of each row
            (always >= 1).
          * labels  -- float32 tensor ``[B]`` of binary targets.
    """
    texts, labels = zip(*batch)

    seqs = []
    for t in texts:
        # A text with no tokens (e.g. whitespace only) would produce a
        # zero-length row, and pack_padded_sequence rejects lengths <= 0.
        # Represent such a text by a single <unk> token instead.
        ids = encode(t) or [UNK_IDX]
        seqs.append(torch.tensor(ids, dtype=torch.long))
    lengths = torch.tensor([len(s) for s in seqs], dtype=torch.long)

    # pad to max length in batch
    maxl = int(lengths.max().item()) if len(lengths) else 1
    padded = torch.full((len(seqs), maxl), PAD_IDX, dtype=torch.long)
    for i, s in enumerate(seqs):
        padded[i, :len(s)] = s

    labels = torch.tensor(labels, dtype=torch.float32)  # binary
    # NOTE(review): tensors are moved to DEVICE inside the collate function;
    # this presumably relies on the DataLoader running without worker
    # processes when DEVICE is CUDA -- confirm before raising num_workers.
    return padded.to(DEVICE), lengths.to(DEVICE), labels.to(DEVICE)

# One dataset per split, each pairing that split's texts with its labels.
train_ds, val_ds, test_ds = (
    TextDataset(split_texts, split_labels)
    for split_texts, split_labels in (
        (train_texts, y_train),
        (val_texts, y_val),
        (test_texts, y_test),
    )
)

# Only the training loader shuffles; val/test keep file order.
train_loader = DataLoader(
    dataset=train_ds, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch
)
val_loader = DataLoader(
    dataset=val_ds, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch
)
test_loader = DataLoader(
    dataset=test_ds, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch
)

In [None]:
# Model
class LSTMClassifier(nn.Module):
    """Embedding -> (bi)LSTM -> dropout -> linear head giving one logit per sequence."""

    def __init__(self, vocab_size, emb_dim, hid_dim, num_layers, bidir, dropout, pad_idx):
        super().__init__()
        num_directions = 2 if bidir else 1
        # The LSTM's own dropout argument is only passed through for stacked
        # layers; a single-layer LSTM gets 0.0.
        lstm_dropout = dropout if num_layers != 1 else 0.0

        # Sub-modules are created in the order embedding, lstm, dropout, fc.
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(
            input_size=emb_dim,
            hidden_size=hid_dim,
            num_layers=num_layers,
            dropout=lstm_dropout,
            bidirectional=bidir,
            batch_first=True,
        )
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hid_dim * num_directions, 1)

    def forward(self, x, lengths):
        """Compute one logit per sequence.

        Args:
            x: token ids, shape [B, T].
            lengths: unpadded length of each row of ``x``, shape [B].

        Returns:
            Tensor of shape [B] with raw (pre-sigmoid) scores.
        """
        embedded = self.dropout(self.embedding(x))  # [B, T, E]

        # Packing needs the lengths on the CPU; enforce_sorted=False allows
        # rows in any length order.
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths.to("cpu"), batch_first=True, enforce_sorted=False
        )
        _, (h, _) = self.lstm(packed)

        # h: [num_layers * num_directions, B, H]
        if self.lstm.bidirectional:
            # The last two slices are the top layer's forward and backward states.
            summary = torch.cat((h[-2], h[-1]), dim=1)  # [B, 2H]
        else:
            summary = h[-1]  # [B, H]

        return self.fc(self.dropout(summary)).squeeze(1)  # [B]

# Build the classifier from the config constants, then place it on DEVICE.
model = LSTMClassifier(
    vocab_size=len(itos),
    pad_idx=PAD_IDX,
    emb_dim=EMB_DIM,
    hid_dim=HID_DIM,
    num_layers=NUM_LAYERS,
    bidir=BIDIR,
    dropout=DROPOUT,
)
model = model.to(DEVICE)

# AdamW over all model parameters; the loss takes raw logits.
optimizer = torch.optim.AdamW(params=model.parameters(), lr=LR)
criterion = nn.BCEWithLogitsLoss()

In [None]:
# -------------------------
# Metrics helpers
# -------------------------
@torch.no_grad()
def predict_loader(loader):
    """Run the global ``model`` over *loader* in eval mode.

    Returns:
        ``(probs, ytrue)`` as numpy arrays: sigmoid probabilities and integer
        labels, concatenated over batches in iteration order. Both are empty
        arrays when the loader yields no batches.
    """
    model.eval()
    prob_chunks = []
    label_chunks = []
    for x, lengths, y in loader:
        batch_probs = torch.sigmoid(model(x, lengths))
        prob_chunks.append(batch_probs.detach().cpu().numpy())
        label_chunks.append(y.detach().cpu().numpy())

    if not prob_chunks:
        return np.array([]), np.array([], dtype=int)
    return np.concatenate(prob_chunks), np.concatenate(label_chunks).astype(int)

def eval_split(loader, threshold=0.5):
    """Score the model on *loader* and return ``(accuracy, precision, recall, f1)``.

    A sample is predicted positive when its probability is >= *threshold*.
    Precision, recall and F1 use binary averaging, with 0 substituted when a
    score is undefined.
    """
    probs, ytrue = predict_loader(loader)
    ypred = (probs >= threshold).astype(int)
    precision, recall, f1, _support = precision_recall_fscore_support(
        ytrue, ypred, average="binary", zero_division=0
    )
    return accuracy_score(ytrue, ypred), precision, recall, f1


In [None]:
# -------------------------
# Train loop with timing + early stopping
#
# Runs up to EPOCHS passes over train_loader, validates after every pass,
# keeps a copy of the weights with the best validation F1, and stops once
# PATIENCE consecutive epochs fail to improve on it.
# -------------------------
best_val_f1 = -1.0   # below any achievable F1, so the first epoch always counts as an improvement
best_state = None    # CPU copy of the best state_dict seen so far
no_improve = 0       # consecutive epochs without a VAL F1 improvement
epoch_times = []     # wall-clock seconds per epoch (training + validation)

for epoch in range(1, EPOCHS + 1):
    t0 = time.time()
    # eval_split() leaves the model in eval mode, so training mode is re-enabled every epoch
    model.train()
    running_loss = 0.0
    n_batches = 0

    for x, lengths, y in train_loader:
        optimizer.zero_grad()
        logits = model(x, lengths)
        loss = criterion(logits, y)
        loss.backward()
        # Rescale gradients so their global norm does not exceed CLIP before the update
        nn.utils.clip_grad_norm_(model.parameters(), CLIP)
        optimizer.step()

        running_loss += float(loss.item())
        n_batches += 1

    # Mean of the per-batch losses; max(..., 1) avoids dividing by zero on an empty loader
    train_loss = running_loss / max(n_batches, 1)
    val_acc, val_p, val_r, val_f1 = eval_split(val_loader)

    # Timing is taken after validation, so epoch_sec covers both training and validation
    t1 = time.time()
    epoch_sec = t1 - t0
    epoch_times.append(epoch_sec)

    # After epoch 1, print a rough projection based on observed time/epoch
    if epoch == 1:
        projected = epoch_sec * EPOCHS
        print(f"\n[Timing] Epoch 1 took {epoch_sec:.1f}s. Rough projected total for {EPOCHS} epochs: ~{projected/60:.1f} min (before early stopping).")

    print(f"Epoch {epoch}/{EPOCHS} | train_loss={train_loss:.4f} | "
          f"VAL acc={val_acc:.4f} p={val_p:.4f} r={val_r:.4f} f1={val_f1:.4f} | "
          f"time={epoch_sec:.1f}s")

    # Early stopping on VAL F1
    # An epoch only counts as an improvement if it beats the best F1 by more than 1e-4
    if val_f1 > best_val_f1 + 1e-4:
        best_val_f1 = val_f1
        # Clone onto the CPU so later optimizer steps cannot modify the snapshot
        best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}
        no_improve = 0
    else:
        no_improve += 1
        if no_improve >= PATIENCE:
            print(f"Early stopping: no VAL F1 improvement for {PATIENCE} epoch(s).")
            break

# Restore best model
# best_state stays None only if the loop never ran (e.g. EPOCHS == 0)
if best_state is not None:
    model.load_state_dict(best_state)

In [None]:
# -------------------------
# Final evaluation
#
# Scores the current model weights on the validation and test loaders with
# eval_split's default 0.5 threshold and prints accuracy / precision /
# recall / F1 for each, followed by the mean epoch time.
# -------------------------
val_acc, val_p, val_r, val_f1 = eval_split(val_loader)
test_acc, test_p, test_r, test_f1 = eval_split(test_loader)

print("\n===== FINAL (best checkpoint) =====")
print(f"VAL  acc={val_acc:.4f} p={val_p:.4f} r={val_r:.4f} f1={val_f1:.4f}")
print(f"TEST acc={test_acc:.4f} p={test_p:.4f} r={test_r:.4f} f1={test_f1:.4f}")
# epoch_times is filled by the training loop; its length is the number of epochs actually run
print(f"Avg time/epoch: {np.mean(epoch_times):.1f}s over {len(epoch_times)} epoch(s)")