<a href="https://colab.research.google.com/github/Ak4nksha/ai-generated-text-detector/blob/main/notebooks/05_linear_probe_pretrained_encoder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Linear Probe on a Pretrained Encoder

Goal: Evaluate how well a **frozen pretrained text encoder** separates
human-written vs LLM-generated text using a **linear classifier** on top.

- Encoder is frozen (no fine-tuning).
- Only a lightweight classifier is trained.
- Uses the fixed `splits_v1` created in notebook 03.


In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm.auto import tqdm
import json
from datetime import datetime


In [None]:
!pip -q install sentence-transformers scikit-learn pandas numpy tqdm

In [None]:
# Mount Google Drive; the split artifacts and the embedding cache used by the
# cells below are read from / written to paths under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
# === LOAD FIXED SPLITS (exported from baseline notebook) ===
# Prerequisite: Google Drive is already mounted at /content/drive.

import json
from pathlib import Path
import pandas as pd
import numpy as np

# Same artifact folder the baseline notebook wrote its splits to.
ART_DIR = Path("/content/drive/MyDrive/artifacts/data_splits_v1")

# --- metadata: storage format and the names of the stylometry columns ---
with open(ART_DIR / "meta.json") as meta_file:
    meta = json.load(meta_file)

fmt, style_cols = meta["format"], meta["style_cols"]

# --- datasets: parquet when the metadata says so, CSV for any other value ---
if fmt == "parquet":
    read_split, ext = pd.read_parquet, "parquet"
else:
    read_split, ext = pd.read_csv, "csv"

train_df = read_split(ART_DIR / f"train_all.{ext}")
val_df = read_split(ART_DIR / f"val_all.{ext}")
test_df = read_split(ART_DIR / f"test_all.{ext}")

# --- sanity checks: every split must carry text, label, source and all style columns ---
required_cols = ["text", "label", "source"] + style_cols

for split_name, split_df in (("train", train_df), ("val", val_df), ("test", test_df)):
    absent = [col for col in required_cols if col not in split_df.columns]
    if not absent:
        continue
    # Only the first 15 missing names are listed to keep the message readable.
    suffix = " ..." if len(absent) > 15 else ""
    raise ValueError(f"{split_name} split missing columns: {absent[:15]}{suffix}")

# --- labels as integer numpy arrays ---
y_train, y_val, y_test = (
    frame["label"].astype(int).values for frame in (train_df, val_df, test_df)
)

# --- summary of what was loaded ---
print("Loaded splits from:", ART_DIR)
print("Format:", fmt)
print("Sizes:", len(train_df), len(val_df), len(test_df))
for heading, labels in (
    ("Label dist train:", y_train),
    ("Label dist val:  ", y_val),
    ("Label dist test: ", y_test),
):
    print(heading, np.bincount(labels))
print("Num stylometry features:", len(style_cols))


In [None]:
from sentence_transformers import SentenceTransformer

# Pretrained sentence encoder used as a frozen feature extractor for the probe.
ENCODER_NAME = "sentence-transformers/all-MiniLM-L6-v2"
# Sequence-length limit applied to the encoder below.
MAX_SEQ_LENGTH = 256

encoder = SentenceTransformer(ENCODER_NAME)
encoder.max_seq_length = MAX_SEQ_LENGTH
print("Loaded encoder:", ENCODER_NAME)


In [None]:
# Cache embeddings on Drive so we don't re-encode every time.
# The cache folder is keyed by the encoder name and its max sequence length:
# a single shared folder would let embeddings computed with a different
# encoder configuration be loaded silently after ENCODER_NAME or
# max_seq_length is changed. ("/" in the model name is replaced so the key is
# a single directory component.) Files cached directly under .../cache by an
# earlier run are simply no longer picked up.
CACHE_DIR = (
    Path("/content/drive/MyDrive/artifacts/linear_probe/cache")
    / f"{ENCODER_NAME.replace('/', '__')}_len{encoder.max_seq_length}"
)
CACHE_DIR.mkdir(parents=True, exist_ok=True)

def embed_texts(texts, cache_path: Path, batch_size: int = 64):
    """Return embeddings for ``texts``, using ``cache_path`` as an on-disk cache.

    If ``cache_path`` exists and holds exactly one row per input text, the
    saved array is returned without calling the encoder. A cache whose row
    count does not match ``len(texts)`` (for example, it was written for a
    different split or an older version of the data) is treated as stale: a
    warning is emitted, the texts are re-encoded, and the file is overwritten.

    Args:
        texts: Sized sequence of strings to embed (``len(texts)`` is used to
            validate the cache).
        cache_path: ``.npy`` file used to load/store the embeddings.
        batch_size: Batch size forwarded to ``encoder.encode``.

    Returns:
        The array loaded from the cache, or the array returned by
        ``encoder.encode`` (requested as numpy with
        ``normalize_embeddings=True``).
    """
    import warnings

    if cache_path.exists():
        cached = np.load(cache_path)
        # Only trust the cache when it has one embedding row per input text;
        # returning a mismatched array would misalign features and labels.
        if cached.ndim >= 1 and cached.shape[0] == len(texts):
            return cached
        warnings.warn(
            f"Ignoring stale embedding cache {cache_path}: "
            f"shape {cached.shape} does not match {len(texts)} texts; re-encoding."
        )
    emb = encoder.encode(
        texts,
        batch_size=batch_size,
        show_progress_bar=True,
        convert_to_numpy=True,
        normalize_embeddings=True
    )
    np.save(cache_path, emb)
    return emb

# Embed each split (or load it from its own cache file), in train/val/test order.
X_train, X_val, X_test = (
    embed_texts(split_frame["text"].tolist(), CACHE_DIR / f"X_{split_tag}.npy")
    for split_tag, split_frame in (("train", train_df), ("val", val_df), ("test", test_df))
)

print(" Embeddings shapes:", X_train.shape, X_val.shape, X_test.shape)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Linear probe: logistic regression fitted on the precomputed embeddings only.
clf = LogisticRegression(max_iter=2000, n_jobs=-1).fit(X_train, y_train)

# Hard predictions plus the probability column at index 1 of predict_proba
# (kept as `*_prob` for the error analysis further down).
val_pred, test_pred = clf.predict(X_val), clf.predict(X_test)
val_prob, test_prob = clf.predict_proba(X_val)[:, 1], clf.predict_proba(X_test)[:, 1]

# Accuracy and F1 (sklearn defaults) per split.
val_acc, val_f1 = accuracy_score(y_val, val_pred), f1_score(y_val, val_pred)
test_acc, test_f1 = accuracy_score(y_test, test_pred), f1_score(y_test, test_pred)

print("VAL  acc/f1:", val_acc, val_f1)
print("TEST acc/f1:", test_acc, test_f1)

# Per-class precision/recall/F1 on the test split, shown as a table.
report = classification_report(y_test, test_pred, output_dict=True)
report_df = pd.DataFrame(report).T
report_df.round(4)


In [None]:
def summarize_results(val_acc, val_f1, test_acc, test_f1):
    """Build a two-row summary table of validation and test metrics.

    Each metric is multiplied by 100 and the table is rounded to two decimals.

    Args:
        val_acc: Validation accuracy.
        val_f1: Validation F1.
        test_acc: Test accuracy.
        test_f1: Test F1.

    Returns:
        DataFrame with columns ``Split``, ``Accuracy`` and ``F1`` and one row
        each for "Validation" and "Test".
    """
    per_split = {"Validation": (val_acc, val_f1), "Test": (test_acc, test_f1)}
    summary = pd.DataFrame({
        "Split": list(per_split),
        "Accuracy": [acc * 100 for acc, _ in per_split.values()],
        "F1": [f1 * 100 for _, f1 in per_split.values()],
    })
    return summary.round(2)

# Headline metrics (in percent) for both splits, rendered as a table.
summarize_results(
    val_acc=val_acc,
    val_f1=val_f1,
    test_acc=test_acc,
    test_f1=test_f1,
)


The confusion matrix below shows that the linear probe correctly identifies most AI-generated texts but frequently misclassifies human-written text as AI. This explains the high F1 for class 1 (AI) and the low recall for class 0 (human).

In [None]:
from sklearn.metrics import confusion_matrix

# Pin the label order explicitly. Without `labels`, the matrix is built from
# whichever labels occur in y_test/test_pred, so its shape and row order are
# data-dependent; fixing [0, 1] guarantees a 2x2 matrix whose rows/columns
# line up with the hard-coded "Human (0)" / "AI (1)" names below.
cm = confusion_matrix(y_test, test_pred, labels=[0, 1])
cm_df = pd.DataFrame(
    cm,
    index=["Human (0)", "AI (1)"],
    columns=["Pred Human", "Pred AI"]
)

cm_df

## Qualitative Analysis

In [None]:
# Attach the probe's predictions and index-1 class probabilities to a copy of
# the test split so misclassified rows can be inspected next to their text.
test_results = test_df.assign(pred=test_pred, prob_ai=test_prob)

true_label, predicted = test_results["label"], test_results["pred"]

# Label 0 predicted as 1.
false_positives = test_results[true_label.eq(0) & predicted.eq(1)]

# Label 1 predicted as 0.
false_negatives = test_results[true_label.eq(1) & predicted.eq(0)]


In [None]:
# First three false positives with the probability stored in `prob_ai`.
false_positives.head(3)[["text", "prob_ai"]]

In [None]:
# First three false negatives with the probability stored in `prob_ai`.
false_negatives.head(3)[["text", "prob_ai"]]

**Observation:**
Many false positives (human text predicted as AI) are highly structured, neutral in tone, and lack personal context. These stylistic traits resemble LLM-generated text, causing the linear probe (the encoder itself is frozen) to overgeneralize.

False negatives (AI predicted as human) often contain informal phrasing or personal language, which reduces stereotypical AI patterns.

In [None]:
# Distribution of `prob_ai` over the false positives.
false_positives.loc[:, "prob_ai"].describe()

In [None]:
# Distribution of `prob_ai` over the false negatives.
false_negatives.loc[:, "prob_ai"].describe()

In [None]:
# from https://gist.github.com/jonathanagustin/b67b97ef12c53a8dec27b343dca4abba
# install can take a minute
#
# Exports a saved copy of this notebook to PDF with `jupyter nbconvert`.
# The notebook file named below must already exist on Drive; the PDF is
# expected next to it with the same base name.

import os
# @title Convert Notebook to PDF. Save Notebook to given directory
NOTEBOOKS_DIR = "/content/drive/MyDrive/" # @param {type:"string"}
NOTEBOOK_NAME = "05_linear_probe_pretrained_encoder.ipynb" # @param {type:"string"}
#------------------------------------------------------------------------------#
from google.colab import drive
# Remount Drive (force_remount=True) before looking up the notebook file.
drive.mount("/content/drive/", force_remount=True)
NOTEBOOK_PATH = f"{NOTEBOOKS_DIR}/{NOTEBOOK_NAME}"
# Fail early with a clear message if the notebook is not where we expect it.
assert os.path.exists(NOTEBOOK_PATH), f"NOTEBOOK NOT FOUND: {NOTEBOOK_PATH}"
# Install the TeX packages and pandoc; all command output (stdout and stderr)
# is discarded.
!apt install -y texlive-xetex texlive-fonts-recommended texlive-plain-generic > /dev/null 2>&1
!apt install pandoc > /dev/null 2>&1
# Convert to PDF; output is discarded here too, so the assert below is the
# only signal that the conversion failed.
!jupyter nbconvert "$NOTEBOOK_PATH" --to pdf > /dev/null 2>&1
# Expected PDF path: the notebook path with its extension replaced by ".pdf".
NOTEBOOK_PDF = NOTEBOOK_PATH.rsplit('.', 1)[0] + '.pdf'
assert os.path.exists(NOTEBOOK_PDF), f"ERROR MAKING PDF: {NOTEBOOK_PDF}"
print(f"PDF CREATED: {NOTEBOOK_PDF}")