<a href="https://colab.research.google.com/github/Ak4nksha/ai-generated-text-detector/blob/main/notebooks/05_linear_probe_pretrained_encoder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Linear Probe on a Pretrained Encoder

Goal: Evaluate how well a **frozen pretrained text encoder** separates
human-written from LLM-generated text when only a **linear classifier** is trained on top.

- Encoder is frozen (no fine-tuning).
- Only a lightweight classifier is trained.
- Uses the fixed `splits_v1` created in notebook 03.


In [None]:
!pip -q install sentence-transformers scikit-learn pandas numpy tqdm

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm.auto import tqdm
import json
from datetime import datetime


In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; every artifact path used below is rooted under it.
drive.mount("/content/drive")

In [None]:
SPLITS_DIR = Path("/content/drive/MyDrive/artifacts/splits_v1")

# Read the three fixed partitions in train -> val -> test order.
train_df, val_df, test_df = (
    pd.read_csv(SPLITS_DIR / f"{split}.csv") for split in ("train", "val", "test")
)

# Every split has to expose both the raw text and its class label.
for name, df in zip(("train", "val", "test"), (train_df, val_df, test_df)):
    if not {"text", "label"} <= set(df.columns):
        raise ValueError(f"{name}.csv must contain columns: text, label")

# Integer label vectors, one per split, consumed by the classifier further down.
y_train, y_val, y_test = (
    frame["label"].astype(int).to_numpy() for frame in (train_df, val_df, test_df)
)

print("Loaded splits:", *(len(frame) for frame in (train_df, val_df, test_df)))

In [None]:
from sentence_transformers import SentenceTransformer

# Checkpoint that serves as the frozen feature extractor for the probe.
ENCODER_NAME = "sentence-transformers/all-MiniLM-L6-v2"
# Upper bound on input length handed to the encoder; per the sentence-transformers
# docs longer inputs are truncated (confirm against the installed version).
MAX_SEQ_LENGTH = 256

encoder = SentenceTransformer(ENCODER_NAME)
encoder.max_seq_length = MAX_SEQ_LENGTH
print("Loaded encoder:", ENCODER_NAME)


In [None]:
# Cache embeddings so we don't re-encode every time.
# The cache lives under the mounted Drive path (presumably so it persists across Colab sessions).
CACHE_DIR = Path("/content/drive/MyDrive/artifacts/linear_probe/cache")
# Create the directory and any missing parents; a no-op when it already exists.
CACHE_DIR.mkdir(parents=True, exist_ok=True)

def embed_texts(texts, cache_path: Path, batch_size: int = 64):
    """Embed `texts` with the module-level `encoder`, caching the result on disk.

    Args:
        texts: Sequence of strings to encode; one embedding row is expected
            per element.
        cache_path: Location of the ``.npy`` cache file for this set of texts.
        batch_size: Batch size forwarded to ``encoder.encode``.

    Returns:
        The array stored at `cache_path` when that file exists and has exactly
        one row per text; otherwise the freshly computed embeddings (requested
        with ``normalize_embeddings=True``), which are also written to
        `cache_path`.

    Note:
        Only the row count of a cached file is validated. The cache is keyed
        by path alone, not by encoder name, sequence length or text content,
        so delete the file after changing any of those.
    """
    if cache_path.exists():
        cached = np.load(cache_path)
        # A cache written for a different version of the split would silently
        # misalign embeddings and labels, so only trust it when the number of
        # rows matches the number of texts. (`shape[:1]` is `()` for 0-d arrays.)
        if cached.shape[:1] == (len(texts),):
            return cached
        print(
            f"Ignoring stale cache {cache_path}: shape {cached.shape} "
            f"does not match {len(texts)} texts; re-encoding."
        )

    emb = encoder.encode(
        texts,
        batch_size=batch_size,
        show_progress_bar=True,
        convert_to_numpy=True,
        normalize_embeddings=True
    )

    # Write to a sibling temp file and rename it into place, so an interrupted
    # run cannot leave a truncated file at `cache_path` that later loads fail on.
    tmp_path = cache_path.with_name(cache_path.name + ".tmp")
    with open(tmp_path, "wb") as fh:
        np.save(fh, emb)
    tmp_path.replace(cache_path)
    return emb

# Encode each split (or load its cached matrix) in train -> val -> test order.
X_train, X_val, X_test = (
    embed_texts(frame["text"].tolist(), CACHE_DIR / f"X_{split}.npy")
    for split, frame in (("train", train_df), ("val", val_df), ("test", test_df))
)

print(" Embeddings shapes:", *(matrix.shape for matrix in (X_train, X_val, X_test)))


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Linear probe: logistic regression fitted on the precomputed embeddings.
# `n_jobs` is intentionally not passed: per the scikit-learn docs it only
# parallelises over classes in one-vs-rest mode, so it does nothing for a
# binary probe (newer releases appear to be phasing the parameter out -
# verify against the installed version).
clf = LogisticRegression(max_iter=2000)
clf.fit(X_train, y_train)


def _score_split(model, X, y):
    """Score `model` on one split.

    Returns a tuple ``(pred, prob, acc, f1)`` where `pred` are the hard
    predictions, `prob` is column 1 of ``predict_proba`` (the probability of
    ``model.classes_[1]``), and `acc` / `f1` are accuracy and F1 of `pred`
    against `y`.
    """
    pred = model.predict(X)
    prob = model.predict_proba(X)[:, 1]
    return pred, prob, accuracy_score(y, pred), f1_score(y, pred)


# Same scoring routine for both held-out splits, so the two cannot drift apart.
val_pred, val_prob, val_acc, val_f1 = _score_split(clf, X_val, y_val)
test_pred, test_prob, test_acc, test_f1 = _score_split(clf, X_test, y_test)

print("VAL  acc/f1:", val_acc, val_f1)
print("TEST acc/f1:", test_acc, test_f1)

# Per-class precision/recall/F1 on the test split, as a nested dict.
report = classification_report(y_test, test_pred, output_dict=True)
