In [1]:
# !python -m pip uninstall -y datasets
# !python -m pip install -U datasets pyarrow huggingface-hub fsspec

In [3]:
"""
LLM Hallucination Detection via Semantic Entropy & Embedding Geometry
**Multi-Subject Analysis**: Compare hallucination detection across multiple MMLU subjects

Based on: Farquhar et al. (2024) · Ricco et al. (2025) · Lee et al. (2018)
Dataset: MMLU (Massive Multitask Language Understanding)
Runtime: Google Colab (free tier, T4 GPU recommended)

"""

import sys
import importlib

# Clear any partially imported datasets module
if 'datasets' in sys.modules:
    del sys.modules['datasets']

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

print("Loading libraries...")

# Import with proper error handling
try:
    from datasets import load_dataset
    print("✓ datasets imported successfully")
except Exception as e:
    print(f"⚠ datasets import error: {e}")
    print("  Attempting to reload...")
    importlib.reload(sys.modules.get('datasets', __import__('datasets')))
    from datasets import load_dataset

try:
    from transformers import T5ForConditionalGeneration, T5Tokenizer
    print("✓ transformers imported successfully")
except Exception as e:
    print(f"✗ transformers import failed: {e}")
    raise

try:
    from sentence_transformers import SentenceTransformer
    print("✓ sentence_transformers imported successfully")
except Exception as e:
    print(f"✗ sentence_transformers import failed: {e}")
    raise

try:
    from sklearn.cluster import AgglomerativeClustering
    from sklearn.metrics.pairwise import cosine_similarity
    from sklearn.linear_model import LogisticRegression
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import StratifiedKFold, cross_val_score
    from sklearn.preprocessing import StandardScaler
    from sklearn.metrics import roc_auc_score, roc_curve
    from scipy.stats import ks_2samp
    import xgboost as xgb
    print("✓ sklearn, scipy, xgboost imported successfully")
except Exception as e:
    print(f"✗ sklearn/scipy/xgboost import failed: {e}")
    raise

try:
    import torch
    print("✓ torch imported successfully")
except Exception as e:
    print(f"✗ torch import failed: {e}")
    raise

import time
try:
    from tqdm import tqdm
    print("✓ tqdm imported successfully")
except:
    print("⚠ tqdm not available, using simple progress")
    def tqdm(iterable, desc=None):
        return iterable

print("\nAll imports successful\n")


Loading libraries...
✓ datasets imported successfully
✓ transformers imported successfully
✓ sentence_transformers imported successfully
✓ sklearn, scipy, xgboost imported successfully
✓ torch imported successfully
✓ tqdm imported successfully

All imports successful



In [4]:

# ============================================================================
# CONFIGURATION
# ============================================================================

CONFIG = {
    "dataset_name"   : "cais/mmlu",
    "mmlu_subjects"  : [
        "high_school_geography",
        "high_school_world_history",
        "high_school_physics",
        "high_school_biology",
        "high_school_us_history",
    ],  # ← Add/remove subjects here
    "n_questions"    : 50,         # REDUCED for faster testing (increase to 100-200 for production)
    "n_samples"      : 20,         # Responses per question
    "temperature"    : 0.5,        # Higher = more diversity
    "sim_threshold"  : 0.85,       # Cosine similarity threshold
    "generator_model": "google/flan-t5-base",
    "embedding_model": "all-MiniLM-L6-v2",
    "random_seed"    : 42,
}

np.random.seed(CONFIG["random_seed"])
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
ANSWER_MAP = {0: "A", 1: "B", 2: "C", 3: "D"}

print(f"Running on: {DEVICE}")
print(f"Subjects to analyze: {len(CONFIG['mmlu_subjects'])} subjects")
print(f"Questions per subject: {CONFIG['n_questions']}")
print(f"Samples per question: {CONFIG['n_samples']}\n")


Running on: cpu
Subjects to analyze: 5 subjects
Questions per subject: 50
Samples per question: 20



In [5]:

# ============================================================================
# LOAD MODELS (once)
# ============================================================================

print("Loading models (this may take 1-2 minutes)...")
start_model_load = time.time()

try:
    tokenizer = T5Tokenizer.from_pretrained(CONFIG["generator_model"])
    generator = T5ForConditionalGeneration.from_pretrained(CONFIG["generator_model"]).to(DEVICE)
    generator.eval()
    print(f"✓ Generator loaded in {time.time() - start_model_load:.1f}s")
except Exception as e:
    print(f"✗ Failed to load generator: {e}")
    raise

try:
    embedder = SentenceTransformer(CONFIG["embedding_model"])
    print(f"✓ Embedder loaded in {time.time() - start_model_load:.1f}s")
except Exception as e:
    print(f"✗ Failed to load embedder: {e}")
    raise

print(f"✓ All models loaded in {time.time() - start_model_load:.1f}s\n")


Loading models (this may take 1-2 minutes)...


Loading weights:   0%|          | 0/282 [00:00<?, ?it/s]



✓ Generator loaded in 1.5s


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


✓ Embedder loaded in 3.5s
✓ All models loaded in 3.5s



In [6]:

# ============================================================================
# HELPER FUNCTIONS
# ============================================================================

def build_prompt(item):
    choices = item["choices"]
    return (
        f"Answer the following multiple choice question. "
        f"Respond with only the letter A, B, C, or D.\n\n"
        f"Question: {item['question']}\n"
        f"A) {choices[0]}\nB) {choices[1]}\nC) {choices[2]}\nD) {choices[3]}\nAnswer:"
    )

def sample_responses(item, n_samples, temperature, device=DEVICE):
    """Generate multiple responses with nucleus sampling"""
    prompt = build_prompt(item)
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to(device)
    
    with torch.no_grad():
        outputs = generator.generate(
            **inputs,
            max_new_tokens=10,
            do_sample=True,
            temperature=temperature,
            top_p=0.95,
            num_return_sequences=n_samples,
        )
    
    responses = [tokenizer.decode(o, skip_special_tokens=True).strip() for o in outputs]
    return responses

def extract_answer_letter(response_text):
    """Extract A/B/C/D from response"""
    for char in response_text.upper().strip():
        if char in ("A", "B", "C", "D"):
            return char
    return None

def label_responses(item, responses):
    """Label responses as correct (0) or hallucinated (1)"""
    correct = ANSWER_MAP[item["answer"]]
    labels = [0 if extract_answer_letter(r) == correct else 1 for r in responses]
    hall_rate = np.mean(labels)
    return labels, hall_rate, int(hall_rate > 0.5)

def semantic_entropy(embs, threshold=CONFIG["sim_threshold"]):
    """Compute semantic entropy from embeddings"""
    N = len(embs)
    dist_matrix = np.clip(1 - cosine_similarity(embs), 0, None)
    np.fill_diagonal(dist_matrix, 0)
    
    clustering = AgglomerativeClustering(
        n_clusters=None, metric="precomputed",
        linkage="average", distance_threshold=1 - threshold
    )
    ids = clustering.fit_predict(dist_matrix)
    _, counts = np.unique(ids, return_counts=True)
    probs = counts / N
    H = -np.sum(probs * np.log2(probs + 1e-12))
    return H, len(counts)

def cosine_dispersion(embs):
    """Compute cosine dispersion"""
    centroid = embs.mean(axis=0, keepdims=True)
    return np.mean(1 - cosine_similarity(embs, centroid).flatten())

def mahalanobis_distance(embs, mu, cov_inv):
    """Compute Mahalanobis distance"""
    diffs = embs - mu
    return np.mean(np.sqrt(np.einsum("ni,ij,nj->n", diffs, cov_inv, diffs)))

def similarity_variance(embs):
    """Compute variance of pairwise similarities"""
    sim = cosine_similarity(embs)
    upper = sim[np.triu_indices(len(embs), k=1)]
    return np.var(upper)


In [7]:

# ============================================================================
# MAIN PIPELINE: PROCESS ALL SUBJECTS
# ============================================================================

print("=" * 70)
print("STARTING MULTI-SUBJECT ANALYSIS")
print("=" * 70 + "\n")

results_all_subjects = {}
timing_log = []

for subject_idx, subject in enumerate(CONFIG["mmlu_subjects"], 1):
    print(f"\n{'='*70}")
    print(f"[{subject_idx}/{len(CONFIG['mmlu_subjects'])}] {subject}")
    print(f"{'='*70}")
    
    start_time = time.time()
    
    # Load dataset
    try:
        print(f"Loading dataset...", end=" ", flush=True)
        dataset = load_dataset(CONFIG["dataset_name"], subject, split="test")
        n = min(CONFIG["n_questions"], len(dataset))
        indices = np.random.choice(len(dataset), n, replace=False)
        questions_raw = [dataset[int(i)] for i in indices]
        print(f"✓ ({n} questions)")
    except Exception as e:
        print(f"\n  ✗ Failed to load {subject}: {e}")
        continue
    
    # Sample responses
    print(f"Sampling {CONFIG['n_samples']} responses per question...", end=" ", flush=True)
    all_responses = []
    for item in questions_raw:
        responses = sample_responses(item, CONFIG["n_samples"], CONFIG["temperature"])
        all_responses.append(responses)
    print(f"✓")
    
    # Label responses
    print(f"Labeling responses...", end=" ", flush=True)
    response_labels = []
    hallucination_rates = []
    question_labels = []
    
    for item, responses in zip(questions_raw, all_responses):
        labels, hall_rate, q_label = label_responses(item, responses)
        response_labels.append(labels)
        hallucination_rates.append(hall_rate)
        question_labels.append(q_label)
    
    y = np.array(question_labels)
    print(f"✓")
    
    print(f"  Correct: {(y==0).sum()} ({(y==0).mean()*100:.1f}%) | "
          f"Hallucinated: {(y==1).sum()} ({(y==1).mean()*100:.1f}%)")
    
    # Embed responses
    print(f"Embedding {len(all_responses) * CONFIG['n_samples']} responses...", end=" ", flush=True)
    all_embeddings = []
    for responses in all_responses:
        embs = embedder.encode(responses, normalize_embeddings=True, show_progress_bar=False)
        all_embeddings.append(embs)
    print(f"✓")
    
    # Fit Mahalanobis reference
    print(f"Fitting Mahalanobis reference...", end=" ", flush=True)
    correct_embs = np.array([
        emb for embs, labels in zip(all_embeddings, response_labels)
        for emb, lbl in zip(embs, labels) if lbl == 0
    ])
    
    if len(correct_embs) > 1:
        mu_ref = correct_embs.mean(axis=0)
        cov_ref = np.cov(correct_embs.T) + np.eye(correct_embs.shape[1]) * 1e-6
        cov_inv = np.linalg.pinv(cov_ref)
        print(f"✓ ({len(correct_embs)} correct responses)")
    else:
        print(f"✗ Not enough correct responses")
        continue
    
    # Extract features
    print(f"Extracting features...", end=" ", flush=True)
    features = []
    for embs in all_embeddings:
        H, K = semantic_entropy(embs)
        D = cosine_dispersion(embs)
        M = mahalanobis_distance(embs, mu_ref, cov_inv)
        sig2 = similarity_variance(embs)
        features.append([H, D, M, K, sig2])
    print(f"✓")
    
    X = np.array(features)
    feature_names = ["H_sem", "D_cos", "M_bar", "K", "sig2_S"]
    
    df = pd.DataFrame(X, columns=feature_names)
    df["label"] = y
    df["subject"] = subject
    
    # Statistical tests
    print(f"Running statistical tests...", end=" ", flush=True)
    alpha_bonf = 0.05 / len(feature_names)
    ks_results = {}
    for col in feature_names:
        g0 = df[df.label==0][col]
        g1 = df[df.label==1][col]
        if len(g0) > 1 and len(g1) > 1:
            stat, p = ks_2samp(g0, g1)
            ks_results[col] = {"stat": stat, "p": p, "significant": p < alpha_bonf}
        else:
            ks_results[col] = {"stat": 0, "p": 1.0, "significant": False}
    print(f"✓")
    
    # Classification
    print(f"Running classification (5-fold CV)...", end=" ", flush=True)
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    cv = StratifiedKFold(n_splits=min(5, len(np.unique(y))), shuffle=True, random_state=CONFIG["random_seed"])
    
    best_clf = RandomForestClassifier(n_estimators=100, random_state=CONFIG["random_seed"], n_jobs=-1)
    try:
        scores = cross_val_score(best_clf, X_scaled, y, cv=cv, scoring="roc_auc")
        auc_cv_mean = scores.mean()
        auc_cv_std = scores.std()
    except:
        auc_cv_mean = 0.5
        auc_cv_std = 0.0
    
    print(f"✓ (AUC={auc_cv_mean:.3f})")
    
    # Bootstrap CI (simplified - 500 iterations instead of 1000)
    print(f"Computing bootstrap CI...", end=" ", flush=True)
    auc_boot = []
    rng = np.random.default_rng(CONFIG["random_seed"])
    for _ in range(500):  # Reduced from 1000 for speed
        idx = rng.choice(len(y), len(y), replace=True)
        oob = np.setdiff1d(np.arange(len(y)), idx)
        if len(np.unique(y[oob])) < 2:
            continue
        try:
            clf_b = RandomForestClassifier(n_estimators=50, random_state=0, n_jobs=-1)
            clf_b.fit(X_scaled[idx], y[idx])
            proba = clf_b.predict_proba(X_scaled[oob])[:, 1]
            auc_boot.append(roc_auc_score(y[oob], proba))
        except:
            pass
    
    if auc_boot:
        ci_lo, ci_hi = np.percentile(auc_boot, [2.5, 97.5])
        auc_mean = np.mean(auc_boot)
    else:
        ci_lo, ci_hi = auc_cv_mean - 0.1, auc_cv_mean + 0.1
        auc_mean = auc_cv_mean
    
    print(f"✓")
    
    elapsed = time.time() - start_time
    timing_log.append({"subject": subject, "seconds": elapsed})
    
    # Store results
    results_all_subjects[subject] = {
        "X": X,
        "y": y,
        "df": df,
        "scaler": scaler,
        "X_scaled": X_scaled,
        "feature_names": feature_names,
        "n_questions": n,
        "hallucination_rate": np.mean(hallucination_rates),
        "correct_count": (y==0).sum(),
        "hallucinated_count": (y==1).sum(),
        "auc_mean": auc_mean,
        "auc_ci_lo": ci_lo,
        "auc_ci_hi": ci_hi,
        "auc_cv_mean": auc_cv_mean,
        "auc_cv_std": auc_cv_std,
        "ks_results": ks_results,
        "correct_embs": correct_embs,
        "mu_ref": mu_ref,
        "cov_inv": cov_inv,
    }
    
    print(f" Completed in {elapsed:.1f}s")


STARTING MULTI-SUBJECT ANALYSIS


[1/5] high_school_geography
Loading dataset... ✓ (50 questions)
Sampling 20 responses per question... ✓
Labeling responses... ✓
  Correct: 20 (40.0%) | Hallucinated: 30 (60.0%)
Embedding 1000 responses... ✓
Fitting Mahalanobis reference... ✓ (395 correct responses)
Extracting features... ✓
Running statistical tests... ✓
Running classification (5-fold CV)... ✓ (AUC=0.435)
Computing bootstrap CI... ✓
 Completed in 82.7s

[2/5] high_school_world_history
Loading dataset... 
  ✗ Failed to load high_school_world_history: RLock objects should only be shared between processes through inheritance

[3/5] high_school_physics
Loading dataset... 
  ✗ Failed to load high_school_physics: RLock objects should only be shared between processes through inheritance

[4/5] high_school_biology
Loading dataset... 
  ✗ Failed to load high_school_biology: RLock objects should only be shared between processes through inheritance

[5/5] high_school_us_history
Loading dataset... 

In [None]:

# ============================================================================
# CROSS-SUBJECT SUMMARY
# ============================================================================

if results_all_subjects:
    print(f"\n{'='*70}")
    print("CROSS-SUBJECT SUMMARY")
    print(f"{'='*70}")
    
    summary_data = []
    for subject, res in results_all_subjects.items():
        summary_data.append({
            "Subject": subject.replace("high_school_", ""),
            "N": res["n_questions"],
            "Hall.%": f"{res['hallucination_rate']*100:.1f}",
            "AUC": f"{res['auc_mean']:.3f}",
            "95% CI": f"[{res['auc_ci_lo']:.3f}, {res['auc_ci_hi']:.3f}]",
        })
    
    df_summary = pd.DataFrame(summary_data)
    print(df_summary.to_string(index=False))
    
    print(f"\n Analysis complete! {len(results_all_subjects)} subjects processed")
    print(f"Total runtime: {sum(t['seconds'] for t in timing_log):.1f}s")
else:
    print("\n✗ No subjects processed successfully")

# ============================================================================
# SAVE RESULTS OBJECT FOR SUPPLEMENTARY ANALYSIS
# ============================================================================

print("\n Results saved to 'results_all_subjects' dictionary")
print("   Run supplementary_analysis.py next to generate additional plots and tables")