# Entropy vs Consistency: Finding the Gap

**Research Question:** Does token entropy detect uncertainty that sample consistency misses?

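**Hypothesis:** Entropy can detect "confident hallucination" — cases where the sampled answers agree with each other (high consistency) even though the model's token-level entropy is high — which a consistency check alone would miss.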

**Setup:** In Google Colab, select Runtime → Change runtime type → T4 GPU. (The code falls back to CPU if no GPU is available.)

In [None]:
# Install dependencies
# NOTE(review): torch, tqdm and pandas are imported by later cells but are not
# installed here — presumably they ship with the hosted runtime; confirm before
# running this notebook anywhere else.
!pip install -q transformers accelerate datasets scipy

In [None]:
import torch
import torch.nn.functional as F
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from collections import Counter
from tqdm import tqdm
import warnings
# Suppress every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

# Report the CUDA device name, or the literal 'CPU' label when CUDA is absent.
gpu_label = torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU'
print(f"GPU: {gpu_label}")

In [None]:
# Load model (GPT-2 for speed, can upgrade to Mistral-7B)
MODEL_NAME = "gpt2-medium"  # or "mistralai/Mistral-7B-Instruct-v0.2" for better results

print(f"Loading {MODEL_NAME}...")

# Tokenizer: reuse the EOS token for padding when the checkpoint defines none.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Model: move to the GPU when one is available and switch to inference mode.
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
model = model.to("cuda" if torch.cuda.is_available() else "cpu")
model.eval()

print("Model loaded!")

## Core Metrics

In [None]:
def compute_response_entropy(prompt, response):
    """Compute next-token entropy over the response part of ``prompt + response``.

    The concatenated text is scored with a single forward pass of the global
    ``model``. For every position whose logits predict a response token, the
    Shannon entropy (natural log, i.e. nats) of the softmax distribution is
    computed.

    Args:
        prompt: Text the response is conditioned on.
        response: Continuation whose token positions are scored.

    Returns:
        dict with keys ``"mean"``, ``"max"`` and ``"std"`` (floats, ``0.0``
        when no response position exists) and ``"all"`` (list with one
        entropy value per scored position).
    """
    full_text = prompt + response
    inputs = tokenizer(full_text, return_tensors="pt").to(model.device)
    # NOTE: the prompt is tokenized separately, so this boundary assumes the
    # tokenizer splits ``prompt + response`` at the same place as ``prompt``.
    prompt_len = len(tokenizer(prompt)["input_ids"])

    with torch.no_grad():
        logits = model(**inputs).logits[0]

    # logits[i] is the distribution over token i + 1, so the first response
    # token is predicted at prompt_len - 1 and the final position predicts a
    # token we do not have. Clamp at 0 so that an empty prompt cannot produce
    # index -1, which would silently wrap around to the last position.
    start = max(prompt_len - 1, 0)
    end = len(logits) - 1
    if end <= start:
        return {"mean": 0.0, "max": 0.0, "std": 0.0, "all": []}

    # Vectorized over all response positions: one device-to-host transfer
    # instead of one per token. log_softmax avoids the bias of log(p + eps);
    # clamping keeps 0 * (-inf) from turning into NaN.
    log_probs = F.log_softmax(logits[start:end].float(), dim=-1)
    log_probs = log_probs.clamp(min=torch.finfo(log_probs.dtype).min)
    entropies = (-(log_probs.exp() * log_probs).sum(dim=-1)).tolist()

    return {
        "mean": float(np.mean(entropies)),
        "max": float(np.max(entropies)),
        "std": float(np.std(entropies)),
        "all": entropies,
    }


def generate_response(prompt, temperature=0.7, max_tokens=50):
    """Sample one short answer to ``prompt`` from the global ``model``.

    Args:
        prompt: Text to condition the generation on.
        temperature: Sampling temperature passed to ``model.generate``.
        max_tokens: Maximum number of newly generated tokens.

    Returns:
        The decoded continuation, cut after the earliest newline or
        sentence-ending ``". "`` and stripped of surrounding whitespace.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            do_sample=True,
            temperature=temperature,
            pad_token_id=tokenizer.pad_token_id,
        )
    # generate() returns prompt + continuation; keep only the new tokens.
    prompt_token_count = inputs.input_ids.shape[1]
    response = tokenizer.decode(outputs[0][prompt_token_count:], skip_special_tokens=True)

    # Drop leading whitespace before truncating: a continuation that starts
    # with "\n" would otherwise be cut down to an empty answer.
    response = response.lstrip()

    # Cut after whichever terminator occurs FIRST in the text (the "+ 1" keeps
    # the period; a kept newline is removed by the final strip).
    cut_points = [response.index(end) + 1 for end in ("\n", ". ") if end in response]
    if cut_points:
        response = response[:min(cut_points)]
    return response.strip()


def compute_consistency(prompt, n_samples=5):
    """Measure agreement between ``n_samples`` independently sampled responses.

    Consistency is the share of samples that are string-identical to the most
    frequent response.

    Args:
        prompt: Text passed to ``generate_response`` for every sample.
        n_samples: Number of responses to sample; must be at least 1.

    Returns:
        dict with keys ``"score"`` (most frequent count / ``n_samples``),
        ``"responses"`` (all samples, in generation order), ``"unique"``
        (number of distinct responses) and ``"most_common"`` (the most
        frequent response).

    Raises:
        ValueError: If ``n_samples`` is smaller than 1.
    """
    # Without at least one sample there is no most-common response and the
    # score would divide by zero; fail with a clear message instead.
    if n_samples < 1:
        raise ValueError(f"n_samples must be >= 1, got {n_samples}")

    responses = [generate_response(prompt) for _ in range(n_samples)]

    # Simple consistency: most common response frequency
    counts = Counter(responses)
    top_response, top_count = counts.most_common(1)[0]

    return {
        "score": top_count / n_samples,
        "responses": responses,
        "unique": len(counts),
        "most_common": top_response,
    }

## Test Questions

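Categories (the label in parentheses is the `category` value used in the code):
1. **Factual certain** (`factual`): The model should know the answer and be confident
2. **Factual uncertain** (`uncertain`): The model might not know the answer
3. **Hallucination-prone** (`hallucination`): The model is likely to make things up
4. **Subjective** (`subjective`): There is no single right answer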

In [None]:
# Question bank keyed by category; each entry is (question, expected answer).
# An expected answer of None means there is no reference answer.
_QUESTIONS_BY_CATEGORY = {
    # Factual - should be confident and correct
    "factual": [
        ("What is the capital of France?", "Paris"),
        ("What year did World War 2 end?", "1945"),
        ("Who wrote Romeo and Juliet?", "Shakespeare"),
        ("What is the chemical symbol for water?", "H2O"),
        ("What planet is closest to the Sun?", "Mercury"),
    ],
    # Hallucination-prone - model might confidently make things up
    "hallucination": [
        ("What did Albert Einstein say about Bitcoin?", None),
        ("What is the phone number of the Eiffel Tower?", None),
        ("What was Barack Obama's favorite pizza topping?", None),
        ("What did Shakespeare tweet about?", None),
        ("What is the WiFi password at the White House?", None),
    ],
    # Uncertain - model may not know
    "uncertain": [
        ("What is the population of Liechtenstein?", "~39000"),
        ("Who was the 23rd President of the United States?", "Benjamin Harrison"),
        ("What year was the University of Bologna founded?", "1088"),
    ],
    # Subjective - no right answer
    "subjective": [
        ("What is the best programming language?", None),
        ("What is the meaning of life?", None),
        ("Should I eat pizza or salad?", None),
    ],
}

# Flatten into the prompt records used by the experiment, wrapping every
# question in the "Q: ...\nA:" template.
TEST_QUESTIONS = [
    {"q": f"Q: {question}\nA:", "category": category, "answer": answer}
    for category, pairs in _QUESTIONS_BY_CATEGORY.items()
    for question, answer in pairs
]

print(f"Loaded {len(TEST_QUESTIONS)} test questions")

## Run Experiment

In [None]:
N_SAMPLES = 5  # For consistency measurement

# One record per question, filled in by the loop below.
results = []

print("Running experiment...")
print("="*80)

for item in tqdm(TEST_QUESTIONS):
    prompt = item["q"]

    # Measure consistency (N samples)
    consistency = compute_consistency(prompt, n_samples=N_SAMPLES)

    # Measure entropy on the MOST COMMON sampled response (not the first one).
    response = consistency["most_common"]
    # The sampled answers come back whitespace-stripped, while every prompt
    # ends in "A:". Re-insert the separating space so the scored token
    # sequence matches a natural continuation ("A: Paris", not "A:Paris").
    # NOTE(review): assumes the sampled continuation began with a single
    # space — confirm for the tokenizer in use.
    scored_response = f" {response}" if response else response
    entropy = compute_response_entropy(prompt, scored_response)

    results.append({
        "question": prompt.split("\n")[0][3:],  # Drop the leading "Q: " and the "A:" line
        "category": item["category"],
        "expected": item["answer"],
        "response": response[:50],
        "consistency": consistency["score"],
        "entropy_mean": entropy["mean"],
        "entropy_max": entropy["max"],
        "unique_responses": consistency["unique"],
    })

print("\nDone!")

## Results Analysis

In [None]:
import pandas as pd

# Tabulate the per-question records collected by the experiment loop.
df = pd.DataFrame(results)

banner = "="*80
print(banner)
print("INDIVIDUAL RESULTS")
print(banner)

# One three-line summary per question; long fields are clipped for display.
for record in df.to_dict("records"):
    category_tag = record['category'].upper()
    print(f"\n[{category_tag}] {record['question'][:45]}")
    print(f"  Response: {record['response'][:40]}...")
    print(f"  Consistency: {record['consistency']:.2f} | Entropy: {record['entropy_mean']:.2f} | Unique: {record['unique_responses']}")

In [None]:
print("\n" + "="*80)
print("AGGREGATE BY CATEGORY")
print("="*80)

# Per-category mean of each metric, rounded to three decimals.
metric_columns = ["consistency", "entropy_mean", "entropy_max"]
summary = df.groupby("category")[metric_columns].mean().round(3)

print(summary.to_string())

In [None]:
print("\n" + "="*80)
print("KEY FINDING: ENTROPY vs CONSISTENCY CORRELATION")
print("="*80)

from scipy.stats import pearsonr, spearmanr

# Linear (Pearson) and rank (Spearman) correlation between the two metrics
# across all questions.
corr_pearson, p_pearson = pearsonr(df["consistency"], df["entropy_mean"])
corr_spearman, p_spearman = spearmanr(df["consistency"], df["entropy_mean"])

print(f"Pearson correlation:  {corr_pearson:.3f} (p={p_pearson:.4f})")
print(f"Spearman correlation: {corr_spearman:.3f} (p={p_spearman:.4f})")

# The coefficient is NaN when it is undefined (e.g. one metric has the same
# value for every question). NaN fails every comparison, so it must be handled
# before the threshold test or it would be reported as a strong correlation.
if np.isnan(corr_pearson):
    print("\n>>> CORRELATION UNDEFINED: the coefficient could not be computed (NaN).")
    print("    No conclusion about redundancy can be drawn from this run.")
elif abs(corr_pearson) < 0.5:
    print("\n>>> WEAK CORRELATION: Entropy and consistency measure DIFFERENT things!")
    print("    This supports using entropy as a complementary signal.")
else:
    print("\n>>> STRONG CORRELATION: Metrics are redundant.")

In [None]:
print("\n" + "="*80)
print("DIVERGENCE ANALYSIS: Where do they disagree?")
print("="*80)

# Put both metrics on a comparable scale. Consistency is already a fraction;
# entropy is min-max scaled to [0,1].
df["consistency_norm"] = df["consistency"]
entropy_min = df["entropy_mean"].min()
entropy_range = df["entropy_mean"].max() - entropy_min
if entropy_range > 0:
    df["entropy_norm"] = (df["entropy_mean"] - entropy_min) / entropy_range
else:
    # All questions have the same mean entropy: min-max scaling would divide
    # by zero, so treat every row as sitting at the bottom of the scale.
    df["entropy_norm"] = 0.0

# High consistency + High entropy = Confident hallucination?
# If the metrics agreed, consistency would be close to (1 - normalized entropy).
df["divergence"] = (df["consistency_norm"] - (1 - df["entropy_norm"])).abs()

# The median is the same for every row, so compute it once.
entropy_median = df['entropy_mean'].median()

print("\nCases with HIGH DIVERGENCE (metrics disagree):")
high_div = df.nlargest(5, "divergence")
for _, row in high_div.iterrows():
    print(f"\n  [{row['category']}] {row['question'][:40]}")
    print(f"    Consistency: {row['consistency']:.2f} | Entropy: {row['entropy_mean']:.2f}")
    # Flag rows whose samples mostly agree while entropy is above the median.
    if row['consistency'] > 0.6 and row['entropy_mean'] > entropy_median:
        print("    >>> POTENTIAL CONFIDENT HALLUCINATION")

In [None]:
print("\n" + "="*80)
print("HALLUCINATION DETECTION COMPARISON")
print("="*80)

# Contrast the two categories: which metric separates them more clearly?
factual_df = df[df["category"] == "factual"]
hallucination_df = df[df["category"] == "hallucination"]

for heading, subset in (
    ("Factual questions", factual_df),
    ("Hallucination-prone questions", hallucination_df),
):
    print(f"\n{heading}:")
    print(f"  Avg Consistency: {subset['consistency'].mean():.3f}")
    print(f"  Avg Entropy: {subset['entropy_mean'].mean():.3f}")

# Both gaps are oriented so that a positive value means the metric points the
# expected way (more entropy / less consistency on hallucination-prone items).
# NOTE(review): the entropy gap is a difference of entropies while the
# consistency gap is a difference of fractions, so the comparison below mixes
# two different scales.
entropy_gap = hallucination_df['entropy_mean'].mean() - factual_df['entropy_mean'].mean()
consistency_gap = factual_df['consistency'].mean() - hallucination_df['consistency'].mean()

print("\nSeparation power:")
print(f"  Entropy gap (hallucination - factual): {entropy_gap:.3f}")
print(f"  Consistency gap (factual - hallucination): {consistency_gap:.3f}")

# Pick the verdict line, then print it once.
if entropy_gap > consistency_gap:
    verdict = "\n>>> ENTROPY BETTER at detecting hallucination-prone questions!"
elif consistency_gap > entropy_gap:
    verdict = "\n>>> CONSISTENCY BETTER at detecting hallucination-prone questions."
else:
    verdict = "\n>>> Both metrics similar for hallucination detection."
print(verdict)

## Conclusions

In [None]:
print("="*80)
print("SUMMARY")
print("="*80)

# A NaN correlation fails every comparison, so classify it explicitly rather
# than letting it fall through to the "STRONG" label.
corr_is_weak = (not np.isnan(corr_pearson)) and abs(corr_pearson) < 0.5
if np.isnan(corr_pearson):
    corr_verdict = 'UNDEFINED - correlation could not be computed (NaN)'
elif corr_is_weak:
    corr_verdict = 'WEAK - metrics capture different aspects!'
else:
    corr_verdict = 'STRONG - metrics are redundant'

# Report a tie (or an undefined comparison) as such instead of silently
# crediting consistency.
if entropy_gap > consistency_gap:
    winner = 'ENTROPY'
elif consistency_gap > entropy_gap:
    winner = 'CONSISTENCY'
else:
    winner = 'TIE'

if corr_is_weak or entropy_gap > consistency_gap:
    direction = 'Strong signal that entropy captures something consistency misses. Worth pursuing!'
else:
    direction = 'Weak signal. May need different angle.'

# Cost note: every consistency sample is a full autoregressive generation,
# not a single forward pass.
print(f"""
1. Correlation between entropy and consistency: {corr_pearson:.3f}
   {corr_verdict}

2. Entropy gap (hallucination vs factual): {entropy_gap:.3f}
3. Consistency gap (factual vs hallucination): {consistency_gap:.3f}

4. Winner for hallucination detection: {winner}

5. Computational cost:
   - Entropy: 1 forward pass over an already-generated response
   - Consistency: {N_SAMPLES} sampled generations (one forward pass per generated token each)

RESEARCH DIRECTION:
{direction}
""")