# Entropy vs Consistency: Finding the Gap (v2 - Mistral-7B)

**Research Question:** Does token entropy detect uncertainty that sample consistency misses?

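**Hypothesis:** Token entropy can flag "confident hallucinations" — fabricated answers that the model nevertheless repeats consistently across samples — which a consistency check alone cannot catch.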

**Model:** Mistral-7B-Instruct (4-bit quantized for T4 GPU)

**Setup:** In Colab, select Runtime → Change runtime type → T4 GPU before running any cell.

In [None]:
# Install dependencies - MUST restart runtime after this cell!
!pip install -U bitsandbytes
!pip install -q transformers accelerate datasets scipy
print("\n" + "="*50)
print("IMPORTANT: Go to Runtime -> Restart runtime")
print("Then skip this cell and run from the next one")
print("="*50)

In [None]:
# Run this cell AFTER restarting runtime
import torch
import torch.nn.functional as F
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from collections import Counter
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Report the accelerator in use. The VRAM query is only issued when CUDA is
# available, because torch.cuda.get_device_properties raises on a CPU-only
# runtime (the GPU line already handled that case, the VRAM line did not).
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
else:
    print("GPU: CPU")
    print("VRAM: n/a (no CUDA device detected)")

In [None]:
# Load Mistral-7B with 4-bit quantization (fits in T4 16GB)
MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.2"

print(f"Loading {MODEL_NAME} (4-bit quantized)...")

# NF4 4-bit weights with double quantization; compute runs in float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# trust_remote_code is deliberately NOT enabled: the Mistral architecture ships
# with transformers, so there is no reason to allow code from the Hub
# repository to execute on this machine.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
)
model.eval()  # inference only

# generate() is given pad_token_id later on, so make sure one exists.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print("Model loaded!")

## Core Metrics

In [None]:
def compute_response_entropy(prompt, response):
    """Compute token-level predictive entropy over the tokens of ``response``.

    The prompt and the response are tokenized separately and their token ids
    are concatenated, so the prompt/response boundary is exact. (Tokenizing
    the joined string and measuring the prompt length on a separate
    tokenization can misalign when tokens merge across the boundary.)

    Args:
        prompt: Prompt text that preceded the response.
        response: Response text whose per-token entropy is measured.

    Returns:
        Dict with keys ``"mean"``, ``"max"`` and ``"std"``: statistics of the
        entropy (natural log) of the model's next-token distribution at each
        response position. All three are 0 when the response has no tokens.
    """
    empty_stats = {"mean": 0, "max": 0, "std": 0}
    if not response:
        return empty_stats

    prompt_ids = tokenizer(prompt, return_tensors="pt").input_ids
    # No special tokens here: the response is a continuation, not a new sequence.
    response_ids = tokenizer(
        response, add_special_tokens=False, return_tensors="pt"
    ).input_ids
    if response_ids.shape[1] == 0:
        return empty_stats

    prompt_len = prompt_ids.shape[1]
    input_ids = torch.cat(
        [prompt_ids, response_ids.to(prompt_ids.dtype)], dim=1
    ).to(model.device)

    with torch.no_grad():
        outputs = model(
            input_ids=input_ids,
            attention_mask=torch.ones_like(input_ids),
        )
        logits = outputs.logits[0]

    # logits[i] is the distribution over token i + 1, so the response tokens
    # (positions prompt_len .. end) are predicted by rows prompt_len - 1 .. -2.
    step_logits = logits[prompt_len - 1 : -1].float()
    step_entropy = torch.distributions.Categorical(logits=step_logits).entropy()
    entropies = step_entropy.cpu().numpy().astype(np.float64)

    return {
        "mean": float(np.mean(entropies)),
        "max": float(np.max(entropies)),
        "std": float(np.std(entropies)),
    }


def format_prompt(question):
    """Wrap ``question`` in the Mistral ``[INST] ... [/INST]`` instruction format.

    The beginning-of-sequence marker is intentionally not written into the
    string: the tokenizer calls in this notebook use the default
    ``add_special_tokens=True``, which already prepends BOS, so a literal
    ``<s>`` here would produce a duplicated BOS token.

    Args:
        question: The question text to ask the model.

    Returns:
        The prompt string, including the instruction to answer briefly.
    """
    return f"[INST] {question} Answer briefly in one sentence. [/INST]"


def generate_response(prompt, temperature=0.7, max_tokens=50):
    """Sample one completion for ``prompt`` and return only the new text, stripped."""
    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_token_count = encoded.input_ids.shape[1]

    sampling_options = {
        "max_new_tokens": max_tokens,
        "do_sample": True,
        "temperature": temperature,
        "top_p": 0.9,
        "pad_token_id": tokenizer.pad_token_id,
    }
    with torch.no_grad():
        generated = model.generate(**encoded, **sampling_options)

    # Everything before prompt_token_count is the echoed prompt.
    completion_ids = generated[0][prompt_token_count:]
    completion = tokenizer.decode(completion_ids, skip_special_tokens=True)
    return completion.strip()


def compute_consistency(question, n_samples=5):
    """Sample ``n_samples`` answers to ``question`` and measure their agreement.

    Responses are compared after lowercasing, stripping and truncating to 100
    characters. The consistency score is the share of samples that fall in
    the largest group of identical normalized responses.

    Args:
        question: The question text (formatted via ``format_prompt``).
        n_samples: Number of responses to sample; must be at least 1.

    Returns:
        Dict with:
            ``"score"``: size of the largest agreeing group / ``n_samples``.
            ``"responses"``: all sampled responses, in sampling order.
            ``"unique"``: number of distinct normalized responses.
            ``"most_common"``: the original (un-normalized) text of the first
                response belonging to the largest agreeing group.
            ``"prompt"``: the formatted prompt that was sampled from.

    Raises:
        ValueError: If ``n_samples`` is less than 1.
    """
    if n_samples < 1:
        raise ValueError("n_samples must be at least 1")

    prompt = format_prompt(question)
    responses = [generate_response(prompt) for _ in range(n_samples)]

    # Normalize responses for comparison (lowercase, strip, first 100 chars)
    normalized = [r.lower().strip()[:100] for r in responses]
    counts = Counter(normalized)
    top_normalized, top_count = counts.most_common(1)[0]

    # Map the winning normalized form back to an original response. Previously
    # responses[0] was returned regardless of which response was most common.
    representative = responses[normalized.index(top_normalized)]

    return {
        "score": top_count / n_samples,
        "responses": responses,
        "unique": len(counts),
        "most_common": representative,
        "prompt": prompt,
    }

## Test Questions (Expanded)

In [None]:
# Question bank keyed by category; flattened into TEST_QUESTIONS below.
_QUESTIONS_BY_CATEGORY = {
    # FACTUAL: should be confident and correct
    "factual": [
        "What is the capital of France?",
        "What year did World War 2 end?",
        "Who wrote Romeo and Juliet?",
        "What is the chemical symbol for water?",
        "What planet is closest to the Sun?",
        "What is the largest ocean on Earth?",
        "Who painted the Mona Lisa?",
        "What is the speed of light in km/s?",
    ],
    # HALLUCINATION-PRONE: model might confidently make things up
    "hallucination": [
        "What did Albert Einstein say about Bitcoin?",
        "What is the phone number of the Eiffel Tower?",
        "What was Barack Obama's favorite pizza topping?",
        "What did Shakespeare tweet about?",
        "What is the WiFi password at the White House?",
        "What was Aristotle's email address?",
        "What is my mother's name?",
        "What did I have for breakfast today?",
    ],
    # UNCERTAIN: model may not know
    "uncertain": [
        "What is the population of Liechtenstein?",
        "Who was the 23rd President of the United States?",
        "What year was the University of Bologna founded?",
        "What is the GDP of Bhutan?",
        "Who won the Nobel Prize in Chemistry in 1987?",
    ],
    # SUBJECTIVE: no right answer
    "subjective": [
        "What is the best programming language?",
        "What is the meaning of life?",
        "Should I eat pizza or salad?",
        "What is the best movie ever made?",
        "Is coffee better than tea?",
    ],
}

# One {"q", "category"} record per question, in category order.
TEST_QUESTIONS = [
    {"q": text, "category": category}
    for category, texts in _QUESTIONS_BY_CATEGORY.items()
    for text in texts
]

print(f"Loaded {len(TEST_QUESTIONS)} test questions")
for cat, cat_questions in _QUESTIONS_BY_CATEGORY.items():
    print(f"  {cat}: {len(cat_questions)}")

## Run Experiment

In [None]:
N_SAMPLES = 5  # responses sampled per question for the consistency score

results = []

print("Running experiment (this takes ~10-15 min with Mistral-7B)...")
print("="*80)

for item in tqdm(TEST_QUESTIONS):
    # Agreement across N_SAMPLES sampled answers.
    consistency = compute_consistency(item["q"], n_samples=N_SAMPLES)

    # Entropy is scored on the response reported under "most_common",
    # against the exact prompt the samples were drawn from.
    representative = consistency["most_common"]
    entropy = compute_response_entropy(consistency["prompt"], representative)

    record = dict(
        question=item["q"],
        category=item["category"],
        response=representative[:60],
        consistency=consistency["score"],
        entropy_mean=entropy["mean"],
        entropy_max=entropy["max"],
        unique_responses=consistency["unique"],
    )
    results.append(record)

print("\nDone!")

## Results Analysis

In [None]:
import pandas as pd

# One row per question, columns as recorded by the experiment loop.
df = pd.DataFrame(results)

_results_rule = "=" * 80
print(_results_rule, "INDIVIDUAL RESULTS", _results_rule, sep="\n")
for rec in df.itertuples(index=False):
    header = f"\n[{rec.category.upper()}] {rec.question[:50]}"
    answer = f"  Response: {rec.response[:50]}..."
    metrics = (
        f"  Consistency: {rec.consistency:.2f} | Entropy: {rec.entropy_mean:.2f}"
        f" | Unique: {rec.unique_responses}/{N_SAMPLES}"
    )
    print(header)
    print(answer)
    print(metrics)

In [None]:
_aggregate_rule = "=" * 80
print("\n" + _aggregate_rule, "AGGREGATE BY CATEGORY", _aggregate_rule, sep="\n")

# Per-category statistics: mean/std for both metrics, mean unique-response count.
_category_stats = {
    "consistency": ["mean", "std"],
    "entropy_mean": ["mean", "std"],
    "unique_responses": "mean",
}
grouped = df.groupby("category")
summary = grouped.agg(_category_stats).round(3)

print(summary.to_string())

In [None]:
print("\n" + "="*80)
print("KEY FINDING: ENTROPY vs CONSISTENCY CORRELATION")
print("="*80)

from scipy.stats import pearsonr, spearmanr

# Linear (Pearson) and rank (Spearman) correlation between the two metrics.
corr_pearson, p_pearson = pearsonr(df["consistency"], df["entropy_mean"])
corr_spearman, p_spearman = spearmanr(df["consistency"], df["entropy_mean"])

print(f"Pearson correlation:  {corr_pearson:.3f} (p={p_pearson:.4f})")
print(f"Spearman correlation: {corr_spearman:.3f} (p={p_spearman:.4f})")

# A constant column (e.g. every question scored 1.0 consistency) makes the
# correlation NaN. NaN fails both "<" comparisons below, so without this
# guard it would be reported as a STRONG correlation.
if np.isnan(corr_pearson):
    print("\n>>> CORRELATION UNDEFINED: one metric has no variance across questions.")
elif abs(corr_pearson) < 0.5:
    print("\n>>> WEAK CORRELATION: Entropy and consistency measure DIFFERENT things!")
elif abs(corr_pearson) < 0.7:
    print("\n>>> MODERATE CORRELATION: Some overlap but not redundant.")
else:
    print("\n>>> STRONG CORRELATION: Metrics may be redundant.")

In [None]:
print("\n" + "="*80)
print("HALLUCINATION DETECTION COMPARISON")
print("="*80)

from scipy.stats import ttest_ind, mannwhitneyu

# The two groups being separated.
hallucination_df = df[df["category"] == "hallucination"]
factual_df = df[df["category"] == "factual"]

print(f"\nFactual questions (n={len(factual_df)}):")
print(f"  Avg Consistency: {factual_df['consistency'].mean():.3f} +/- {factual_df['consistency'].std():.3f}")
print(f"  Avg Entropy: {factual_df['entropy_mean'].mean():.3f} +/- {factual_df['entropy_mean'].std():.3f}")

print(f"\nHallucination-prone questions (n={len(hallucination_df)}):")
print(f"  Avg Consistency: {hallucination_df['consistency'].mean():.3f} +/- {hallucination_df['consistency'].std():.3f}")
print(f"  Avg Entropy: {hallucination_df['entropy_mean'].mean():.3f} +/- {hallucination_df['entropy_mean'].std():.3f}")

# One-sided Mann-Whitney U tests in the expected direction:
# hallucination-prone entropy > factual entropy, and
# factual consistency > hallucination-prone consistency.
_, p_entropy = mannwhitneyu(hallucination_df['entropy_mean'], factual_df['entropy_mean'], alternative='greater')
_, p_consistency = mannwhitneyu(factual_df['consistency'], hallucination_df['consistency'], alternative='greater')

# Raw mean differences, signed so that positive = expected direction.
entropy_gap = hallucination_df['entropy_mean'].mean() - factual_df['entropy_mean'].mean()
consistency_gap = factual_df['consistency'].mean() - hallucination_df['consistency'].mean()

print(f"\nSeparation power:")
print(f"  Entropy gap (hallucination - factual): {entropy_gap:.3f} (p={p_entropy:.4f})")
print(f"  Consistency gap (factual - hallucination): {consistency_gap:.3f} (p={p_consistency:.4f})")

print(f"\nEffect size (Cohen's d approximation):")
# Pooled std as the root of the mean of the two group variances.
pooled_std_e = np.sqrt((factual_df['entropy_mean'].var() + hallucination_df['entropy_mean'].var()) / 2)
pooled_std_c = np.sqrt((factual_df['consistency'].var() + hallucination_df['consistency'].var()) / 2)
cohens_d_entropy = abs(entropy_gap) / pooled_std_e if pooled_std_e > 0 else 0
cohens_d_consistency = abs(consistency_gap) / pooled_std_c if pooled_std_c > 0 else 0
print(f"  Entropy: d={cohens_d_entropy:.2f}")
print(f"  Consistency: d={cohens_d_consistency:.2f}")

# The verdict compares standardized effect sizes. The raw gaps are in
# different units (entropy in nats, consistency as a fraction of samples),
# so comparing them directly depends on scale rather than separation power.
if cohens_d_entropy > cohens_d_consistency and p_entropy < 0.1:
    print("\n>>> ENTROPY BETTER at detecting hallucination-prone questions!")
elif cohens_d_consistency > cohens_d_entropy and p_consistency < 0.1:
    print("\n>>> CONSISTENCY BETTER at detecting hallucination-prone questions.")
else:
    print("\n>>> No clear winner (need more data or larger effect).")

In [None]:
_divergence_rule = "=" * 80
print("\n" + _divergence_rule)
print("DIVERGENCE ANALYSIS: Where do metrics disagree?")
print(_divergence_rule)

# Min-max scale both metrics so they can be multiplied on a common footing;
# the small epsilon keeps the division defined when a column is constant.
for raw_col, scaled_col in (
    ("consistency", "consistency_norm"),
    ("entropy_mean", "entropy_norm"),
):
    col_min = df[raw_col].min()
    col_max = df[raw_col].max()
    df[scaled_col] = (df[raw_col] - col_min) / (col_max - col_min + 1e-10)

# Large only when BOTH scaled consistency and scaled entropy are large,
# i.e. the model keeps giving the same answer while being unsure of its tokens.
df["confident_hallucination_score"] = df["consistency_norm"] * df["entropy_norm"]

print("\nPotential CONFIDENT HALLUCINATIONS (high consistency + high entropy):")
candidates = df.nlargest(5, "confident_hallucination_score")
for _, row in candidates.iterrows():
    category, question_head = row["category"], row["question"][:45]
    print(f"\n  [{category}] {question_head}")
    print(f"    Consistency: {row['consistency']:.2f} | Entropy: {row['entropy_mean']:.2f}")
    print(f"    Response: {row['response'][:50]}...")

## Summary & Conclusions

In [None]:
print("="*80)
print("FINAL SUMMARY")
print("="*80)

# A NaN correlation (one metric constant across questions) fails every "<"
# comparison, so it must be handled before the threshold checks.
correlation_undefined = np.isnan(corr_pearson)

if correlation_undefined:
    interpretation = 'UNDEFINED - one metric has no variance'
elif abs(corr_pearson) < 0.5:
    interpretation = 'WEAK - different signals!'
else:
    interpretation = 'MODERATE/STRONG - some overlap'

# Winner is decided on standardized effect size, gated by the one-sided
# p-value. The raw gaps are in different units and are not comparable.
if cohens_d_entropy > cohens_d_consistency and p_entropy < 0.1:
    winner = 'ENTROPY'
elif cohens_d_consistency > cohens_d_entropy and p_consistency < 0.1:
    winner = 'CONSISTENCY'
else:
    winner = 'TIE'

print(f"""
MODEL: {MODEL_NAME}
QUESTIONS: {len(TEST_QUESTIONS)} ({len(factual_df)} factual, {len(hallucination_df)} hallucination-prone)

1. CORRELATION (entropy vs consistency): {corr_pearson:.3f}
   Interpretation: {interpretation}

2. HALLUCINATION DETECTION:
   Entropy gap: {entropy_gap:.3f} (p={p_entropy:.4f}, d={cohens_d_entropy:.2f})
   Consistency gap: {consistency_gap:.3f} (p={p_consistency:.4f}, d={cohens_d_consistency:.2f})
   Winner: {winner}

3. COMPUTATIONAL COST:
   Entropy: 1 forward pass
   Consistency: {N_SAMPLES} forward passes ({N_SAMPLES}x more expensive)

CONCLUSION:
""")

if correlation_undefined:
    print("INCONCLUSIVE: correlation is undefined because one metric is constant.")
    print("Consider: more questions or more samples per question.")
elif abs(corr_pearson) < 0.5 and entropy_gap > 0 and cohens_d_entropy > 0.3:
    print("STRONG SIGNAL: Entropy captures uncertainty that consistency misses.")
    print("This supports entropy-based training as a novel contribution.")
elif abs(corr_pearson) < 0.7 and entropy_gap > 0:
    print("MODERATE SIGNAL: Some differentiation, worth exploring further.")
    print("Consider: more questions, different model, or combined approach.")
else:
    print("WEAK SIGNAL: Metrics are too similar or entropy doesn't help.")
    print("May need to pivot research direction.")