<a href="https://colab.research.google.com/github/Ak4nksha/ai-generated-text-detector/blob/main/notebooks/06_transformer_finetune.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Transformer Fine-tuning (Human vs AI)

Goal: Fine-tune a pretrained transformer for binary classification:
**human-written vs LLM-generated text**.

- Uses the fixed `train/val/test` splits created earlier.
- Trains an end-to-end transformer classifier (not frozen).
- Reports validation and test metrics.


In [None]:
!pip -q install transformers datasets evaluate accelerate scikit-learn pandas numpy tqdm

In [None]:
import os
import numpy as np
import pandas as pd
from pathlib import Path
import json
from dataclasses import dataclass
from typing import Dict, List

from sklearn.metrics import accuracy_score, f1_score, classification_report


In [None]:

# Mount Google Drive; the split artifacts loaded below live under /content/drive/MyDrive.
from google.colab import drive

DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
# === LOAD FIXED SPLITS (exported from baseline notebook) ===

ART_DIR = Path("/content/drive/MyDrive/artifacts/data_splits_v1")  # same folder used in baseline

# --- load metadata ---
# Explicit encoding so the read does not depend on the runtime's locale default.
with open(ART_DIR / "meta.json", encoding="utf-8") as f:
    meta = json.load(f)

fmt = meta["format"]
style_cols = meta["style_cols"]


def load_split(split_name):
    """Read one exported split (`train`, `val` or `test`) from ART_DIR.

    Uses parquet when meta["format"] is "parquet"; any other format value
    falls back to CSV.
    """
    if fmt == "parquet":
        return pd.read_parquet(ART_DIR / f"{split_name}_all.parquet")
    return pd.read_csv(ART_DIR / f"{split_name}_all.csv")


# --- load datasets ---
train_df = load_split("train")
val_df = load_split("val")
test_df = load_split("test")

# --- sanity checks (text + label + style columns) ---
required_cols = ["text", "label", "source"] + style_cols

for name, split_df in [("train", train_df), ("val", val_df), ("test", test_df)]:
    missing = [c for c in required_cols if c not in split_df.columns]
    if missing:
        raise ValueError(f"{name} split missing columns: {missing[:15]}{' ...' if len(missing) > 15 else ''}")

    # Fail here with a clear message instead of later inside the tokenizer.
    n_missing_text = int(split_df["text"].isna().sum())
    if n_missing_text:
        raise ValueError(f"{name} split has {n_missing_text} rows with missing text")

# --- labels as numpy arrays ---
y_train = train_df["label"].astype(int).values
y_val   = val_df["label"].astype(int).values
y_test  = test_df["label"].astype(int).values

# The classifier below is binary (num_labels = 2), so only 0 and 1 are valid labels.
for name, y in [("train", y_train), ("val", y_val), ("test", y_test)]:
    unexpected = sorted(set(np.unique(y).tolist()) - {0, 1})
    if unexpected:
        raise ValueError(f"{name} split has labels outside {{0, 1}}: {unexpected}")

print("Loaded splits from:", ART_DIR)
print("Format:", fmt)
print("Sizes:", len(train_df), len(val_df), len(test_df))
print("Label dist train:", np.bincount(y_train))
print("Label dist val:  ", np.bincount(y_val))
print("Label dist test: ", np.bincount(y_test))
print("Num stylometry features:", len(style_cols))


In [None]:
from datasets import Dataset


def to_hf_dataset(split_df):
    """Convert one split DataFrame into a Hugging Face Dataset with `text` and `label`.

    The label column is cast to int64 so the model receives integer class ids,
    and the pandas index is dropped so it cannot show up as an extra
    `__index_level_0__` column.
    """
    model_inputs = split_df[["text", "label"]].astype({"label": "int64"})
    return Dataset.from_pandas(model_inputs, preserve_index=False)


train_ds = to_hf_dataset(train_df)
val_ds   = to_hf_dataset(val_df)
test_ds  = to_hf_dataset(test_df)

print(train_ds)


In [None]:
from transformers import AutoTokenizer

# Checkpoint to fine-tune and the token budget per example.
MODEL_NAME = "distilbert-base-uncased"
MAX_LEN = 256

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)


def tokenize_batch(batch):
    """Tokenize batch["text"], truncating each example to MAX_LEN tokens.

    No padding is applied here; batches are padded later by the data collator.
    """
    encoded = tokenizer(
        batch["text"],
        max_length=MAX_LEN,
        truncation=True,
        padding=False,
    )
    return encoded


# Tokenize every split the same way and drop the raw text column afterwards.
train_tok, val_tok, test_tok = (
    split.map(tokenize_batch, batched=True, remove_columns=["text"])
    for split in (train_ds, val_ds, test_ds)
)

print(" Tokenized.")


In [None]:
from transformers import DataCollatorWithPadding

# Padding was skipped at tokenization time, so the collator pads each batch instead.
data_collator = DataCollatorWithPadding(tokenizer)


In [None]:
from transformers import AutoModelForSequenceClassification, set_seed

# The classification head added on top of the pretrained encoder is randomly
# initialised, so seed the RNGs *before* building the model to make runs repeatable.
SEED = 42
set_seed(SEED)

num_labels = 2  # binary task: 0 = human, 1 = AI
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=num_labels)


In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy and F1 from a (logits, labels) evaluation pair.

    The predicted class is the argmax over the last axis of the logits.
    `f1_score` is called with its default arguments.

    Returns:
        dict with keys "accuracy" and "f1".
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)
    return {
        "accuracy": accuracy_score(gold, predicted),
        "f1": f1_score(gold, predicted),
    }


In [None]:
## Training setup

import inspect

import torch
from transformers import TrainingArguments, Trainer

OUTPUT_DIR = "./artifacts/transformer_finetune/distilbert_run_v1"

# The per-epoch evaluation keyword was renamed between transformers releases
# (`evaluation_strategy` -> `eval_strategy`); use whichever this install accepts.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    # Evaluate and checkpoint once per epoch so the best checkpoint (by validation F1)
    # can be restored at the end; the two strategies must match for that to work.
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,

    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,

    # Mixed precision needs a CUDA device; fall back to full precision on CPU runtimes.
    fp16=torch.cuda.is_available(),
    logging_steps=50,
    report_to="none",
    **{_eval_strategy_key: "epoch"},
)

# Newer transformers releases take the tokenizer as `processing_class`;
# older ones only know `tokenizer`.
_trainer_arg_names = inspect.signature(Trainer.__init__).parameters
_tokenizer_key = "processing_class" if "processing_class" in _trainer_arg_names else "tokenizer"

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tok,
    eval_dataset=val_tok,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    **{_tokenizer_key: tokenizer},
)


In [None]:
# Fine-tune the model with the Trainer configured in the previous cell.
# As the last expression of the cell, the value returned by train() is displayed.
trainer.train()

In [None]:
# Score the fine-tuned model on the validation split.
val_metrics = trainer.evaluate(eval_dataset=val_tok)
print("Val metrics:", val_metrics)

# Score the same model on the held-out test split.
test_metrics = trainer.evaluate(eval_dataset=test_tok)
print("Test metrics:", test_metrics)


<!-- **DistilBERT fine-tuning results (fixed splits):**
- Validation F1 ≈ 0.996
- Test F1 ≈ 0.849

Large generalization gap indicates strong domain shift between training and test data. -->


In [None]:
def print_metrics(title, metrics):
    """Pretty-print accuracy, F1 and loss from a `trainer.evaluate` result dict.

    Accuracy and F1 are fractions and are shown as percentages. The loss is not
    a percentage, so it is printed as a raw value.
    """
    print(title)
    print(f"  Accuracy : {metrics['eval_accuracy']*100:.2f}%")
    print(f"  F1-score : {metrics['eval_f1']*100:.2f}%")
    print(f"  Loss     : {metrics['eval_loss']:.4f}")


print_metrics("Validation Results", val_metrics)
print()
print_metrics("Test Results", test_metrics)


**Interpretation:**

The fine-tuned transformer achieves near-perfect performance on the validation set, indicating strong capacity to fit the training distribution. However, test accuracy drops substantially, while F1 remains relatively high. Because F1 is computed for the positive class (AI = 1), this suggests that the model generalizes well for detecting AI-generated text but struggles with human-written examples, consistent with observations from the linear probe and LSTM models.

In [None]:
from sklearn.metrics import confusion_matrix

# Run the model over the test split and keep raw logits for the probability analysis below.
preds = trainer.predict(test_tok)
test_logits = preds.predictions
test_labels = preds.label_ids
test_preds = np.argmax(test_logits, axis=1)

# Pin the label order so the matrix is always 2x2 (rows = true class, columns = predicted
# class), even if one class never appears in the labels or predictions.
cm = confusion_matrix(test_labels, test_preds, labels=[0, 1])

cm_df = pd.DataFrame(
    cm,
    index=["Human (0)", "AI (1)"],
    columns=["Pred Human", "Pred AI"]
)

cm_df

The confusion matrix shows that most errors come from human-written text being misclassified as AI, while AI-generated text is detected reliably. This explains why F1 remains high despite lower accuracy. The model strongly favors the AI class.

In [None]:
import torch  # not referenced in this cell; import kept as in the original
from scipy.special import softmax

# Turn the test logits into class probabilities (each row sums to 1).
probs = softmax(test_logits, axis=1)

# Copy of the test frame extended with the prediction, P(AI) and a correctness flag.
test_df_analysis = test_df.assign(
    pred_label=test_preds,
    prob_ai=probs[:, 1],
    correct=lambda frame: frame["label"] == frame["pred_label"],
)


In [None]:
# AI-written texts that were classified correctly with P(AI) > 0.9.
confident_ai_hits = test_df_analysis.loc[
    (test_df_analysis["label"] == 1)
    & (test_df_analysis["correct"])
    & (test_df_analysis["prob_ai"] > 0.9),
    ["text", "prob_ai"],
]

# Cap the sample size at the number of matching rows (sample() raises otherwise)
# and fix the random state so the displayed examples are the same on every run.
confident_ai_hits.sample(n=min(3, len(confident_ai_hits)), random_state=42)


**Confident AI detections:**

The texts above often exhibit fluent structure, a neutral tone, and consistent sentence patterns, which the transformer captures effectively after fine-tuning.

In [None]:
# Human-written texts that were wrongly predicted as AI with P(AI) > 0.9.
human_flagged_ai_mask = (
    (test_df_analysis["label"] == 0)
    & ~test_df_analysis["correct"]
    & (test_df_analysis["prob_ai"] > 0.9)
)

test_df_analysis.loc[human_flagged_ai_mask, ["text", "prob_ai"]].head(3)

**Confident misclassifications (Human → AI):**

The human-written examples above are often formal, well-structured, or informational in tone, making them stylistically similar to LLM-generated text. This suggests the model relies heavily on surface fluency cues rather than deeper semantic intent.

In [None]:
# from https://gist.github.com/jonathanagustin/b67b97ef12c53a8dec27b343dca4abba
# install can take a minute

import os
# @title Convert Notebook to PDF. Save Notebook to given directory
NOTEBOOKS_DIR = "/content/drive/MyDrive/" # @param {type:"string"}
NOTEBOOK_NAME = "06_transformer_finetune.ipynb" # @param {type:"string"}
#------------------------------------------------------------------------------#
from google.colab import drive
drive.mount("/content/drive/", force_remount=True)
NOTEBOOK_PATH = f"{NOTEBOOKS_DIR}/{NOTEBOOK_NAME}"
assert os.path.exists(NOTEBOOK_PATH), f"NOTEBOOK NOT FOUND: {NOTEBOOK_PATH}"
!apt install -y texlive-xetex texlive-fonts-recommended texlive-plain-generic > /dev/null 2>&1
!apt install pandoc > /dev/null 2>&1
!jupyter nbconvert "$NOTEBOOK_PATH" --to pdf > /dev/null 2>&1
NOTEBOOK_PDF = NOTEBOOK_PATH.rsplit('.', 1)[0] + '.pdf'
assert os.path.exists(NOTEBOOK_PDF), f"ERROR MAKING PDF: {NOTEBOOK_PDF}"
print(f"PDF CREATED: {NOTEBOOK_PDF}")