# V9 Phase 1: Simulated Hiding Experiment

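**Goal**: Test whether the Uncertainty Ratio (UR) — a model's mean token entropy divided by the base model's mean token entropy on the same questions — can distinguish hiding from true unlearning.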

**Design**: Instead of fighting unlearning stability, we simulate:
- **Base model**: Never saw TOFU → Ground truth "true unlearning"
- **Fine-tuned model**: Knows TOFU → Control (should have low UR)
- **Refusal model**: Trained to say "I don't know" → Simulated hiding

**Hypothesis**: UR(hiding) < UR(base), because the hiding model still knows the answers but has been trained not to state them.

---

In [None]:
!pip install -q transformers accelerate bitsandbytes datasets peft trl
!pip install -q scipy matplotlib seaborn

In [None]:
import torch
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
from dataclasses import dataclass
from typing import List, Dict
from tqdm import tqdm
import gc
import warnings
warnings.filterwarnings('ignore')

# Report the accelerator in use. Device properties are only queried when CUDA
# is present, because torch.cuda.get_device_properties raises on CPU-only hosts.
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
else:
    print("GPU: CPU")

## 1. Uncertainty Measurement

In [None]:
@dataclass
class UncertaintyResult:
    """Entropy statistics collected while greedily decoding one prompt."""
    prompt: str  # question text as passed in, before chat formatting
    response: str  # decoded continuation, special tokens skipped
    mean_entropy: float  # average per-step next-token entropy (natural log)
    first_token_entropy: float  # entropy at the first generated position
    max_entropy: float  # largest per-step entropy
    num_tokens: int  # number of decoding steps that were taken


class TokenEntropyMeasurer:
    """Greedy decoder that records the next-token entropy at every step."""

    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer
        # Inputs are moved to the device of the model's first parameter.
        self.device = next(model.parameters()).device

    def measure(self, prompt: str, max_tokens: int = 30) -> UncertaintyResult:
        """Decode up to ``max_tokens`` tokens for ``prompt`` and summarise entropy.

        The full sequence is re-fed to the model at every step (no cache is
        used) and the arg-max token is appended. Decoding stops early once the
        tokenizer's EOS id is produced; that final step's entropy is included.
        With no decoding steps, all entropy fields are 0.0.
        """
        # NOTE(review): the literal "<s>" is combined with the tokenizer's default
        # special-token handling; if the tokenizer also prepends BOS the prompt
        # starts with two BOS tokens - confirm this is intended.
        encoded = self.tokenizer(f"<s>[INST] {prompt} [/INST]", return_tensors="pt").to(self.device)
        sequence = encoded.input_ids.clone()
        prompt_len = encoded.input_ids.shape[1]
        step_entropies = []

        self.model.eval()
        with torch.no_grad():
            for _ in range(max_tokens):
                last_logits = self.model(sequence).logits[0, -1]
                dist = F.softmax(last_logits.float(), dim=-1)
                # Small epsilon keeps log() finite for zero probabilities.
                step_entropies.append(-torch.sum(dist * torch.log(dist + 1e-10)).item())

                chosen = torch.argmax(dist).reshape(1, 1)
                sequence = torch.cat([sequence, chosen], dim=1)
                if chosen.item() == self.tokenizer.eos_token_id:
                    break

        if step_entropies:
            mean_h = np.mean(step_entropies)
            first_h = step_entropies[0]
            max_h = np.max(step_entropies)
        else:
            mean_h = first_h = max_h = 0.0

        return UncertaintyResult(
            prompt=prompt,
            response=self.tokenizer.decode(sequence[0, prompt_len:], skip_special_tokens=True),
            mean_entropy=mean_h,
            first_token_entropy=first_h,
            max_entropy=max_h,
            num_tokens=len(step_entropies),
        )

    def measure_batch(self, prompts: List[str], max_tokens: int = 30) -> List[UncertaintyResult]:
        """Run :meth:`measure` on each prompt in order, showing a progress bar."""
        collected = []
        for question in tqdm(prompts, desc="Measuring UQ"):
            collected.append(self.measure(question, max_tokens))
        return collected

## 2. Load TOFU Dataset

In [None]:
from datasets import load_dataset

print("Loading TOFU...")
# forget10 is the split the models are probed on; retain90 is only counted below.
forget_data = load_dataset("locuslab/TOFU", "forget10")["train"]
retain_data = load_dataset("locuslab/TOFU", "retain90")["train"]

# Split the forget rows into parallel question / answer lists.
forget_questions, forget_answers = [], []
for row in forget_data:
    forget_questions.append(row["question"])
    forget_answers.append(row["answer"])

# Sample for efficiency
test_questions, test_answers = forget_questions[:30], forget_answers[:30]

print(f"Forget: {len(forget_data)}, Retain: {len(retain_data)}")
print(f"Test questions: {len(test_questions)}")

## 3. Load Base Model

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.2"

# 4-bit NF4 weight quantization with fp16 compute.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

print(f"Loading {MODEL_NAME}...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
if tokenizer.pad_token is None:
    # Prefer a pad token distinct from EOS. DataCollatorForLanguageModeling (used
    # by both fine-tuning passes in this notebook) sets every label equal to
    # pad_token_id to -100, so padding with EOS would also mask the real "</s>"
    # at the end of each example and the fine-tuned models would never learn to
    # stop. Fall back to EOS only if the tokenizer has no unknown token.
    tokenizer.pad_token = tokenizer.unk_token or tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
)
print(f"Loaded! GPU mem: {torch.cuda.memory_allocated()/1e9:.2f} GB")

## 4. Measure BASE Model Uncertainty

The base model (Mistral) has never seen TOFU data.
This represents "true unlearning" ground truth - genuine uncertainty about fictional authors.

In [None]:
# Baseline pass: the model is measured before any adapter is attached.
model.eval()
measurer = TokenEntropyMeasurer(model, tokenizer)

print("Measuring BASE model (never saw TOFU)...")
base_results = measurer.measure_batch(test_questions, max_tokens=25)
base_entropies = list(map(lambda res: res.mean_entropy, base_results))
base_first_token = list(map(lambda res: res.first_token_entropy, base_results))

print("\n".join([
    "\nBASE Model (Ground Truth 'True Unlearning'):",
    f"  Mean entropy: {np.mean(base_entropies):.3f} ± {np.std(base_entropies):.3f}",
    f"  First token entropy: {np.mean(base_first_token):.3f}",
]))

In [None]:
# Sample responses from base model
print("BASE model sample responses:")
# Iterate over a slice instead of indexing 0..2 so that a run with fewer than
# three measured prompts prints what exists rather than raising IndexError.
for sample in base_results[:3]:
    print(f"\nQ: {sample.prompt}")
    print(f"A: {sample.response[:100]}")
    print(f"Entropy: {sample.mean_entropy:.3f}")

## 5. Create Fine-Tuned Model (Knows TOFU)

Fine-tune on TOFU to make the model "know" the fictional authors.

In [None]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

# Prepare for fine-tuning
model = prepare_model_for_kbit_training(model)

# Rank-8 LoRA adapters on the attention query/value projections only.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

In [None]:
# Prepare training data with correct answers
def format_qa(ex):
    """Render one question/answer row as a single instruct-style training string.

    Reads ``ex['question']`` and ``ex['answer']`` and returns a dict with one
    ``"text"`` key, the shape expected by ``Dataset.map``.
    """
    question, answer = ex['question'], ex['answer']
    return {"text": "<s>[INST] {} [/INST] {}</s>".format(question, answer)}

# Add the formatted "text" column, then tokenize it in batches. Every original
# column (including "text") is dropped so only tokenizer outputs remain.
train_data = forget_data.map(format_qa)
tokenized = train_data.map(
    lambda batch: tokenizer(
        batch["text"],
        padding="max_length",
        truncation=True,
        max_length=256,
    ),
    remove_columns=train_data.column_names,
    batched=True,
)
print(f"Training samples: {len(tokenized)}")

In [None]:
# Hyperparameters for the "knows TOFU" pass. Per-device batch 2 with 8
# accumulation steps; no checkpoints are written and no external logger is used.
training_args = TrainingArguments(
    output_dir="./ft_knows",
    report_to="none",
    save_strategy="no",
    logging_steps=20,
    fp16=True,
    learning_rate=1e-4,
    num_train_epochs=2,
    gradient_accumulation_steps=8,
    per_device_train_batch_size=2,
)

# Causal-LM collation (mlm=False).
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
    train_dataset=tokenized,
)

print("Fine-tuning to KNOW TOFU...")
trainer.train()
print("Done!")

In [None]:
# Measure fine-tuned model
model.eval()
measurer = TokenEntropyMeasurer(model, tokenizer)

print("Measuring FINE-TUNED model (knows TOFU)...")
ft_results = measurer.measure_batch(test_questions, max_tokens=25)
ft_entropies = list(map(lambda res: res.mean_entropy, ft_results))
ft_first_token = list(map(lambda res: res.first_token_entropy, ft_results))

# Same summary as the base pass, plus the shift in mean entropy against base.
print("\n".join([
    "\nFINE-TUNED Model (Knows TOFU):",
    f"  Mean entropy: {np.mean(ft_entropies):.3f} ± {np.std(ft_entropies):.3f}",
    f"  First token entropy: {np.mean(ft_first_token):.3f}",
    f"  Change from base: {np.mean(ft_entropies) - np.mean(base_entropies):+.3f}",
]))

## 6. Create HIDING Model (Knows but Refuses)

Now fine-tune the SAME model to refuse answering - simulating "hiding".
The model still has the knowledge (from step 5) but is trained to say "I don't know".

In [None]:
# Create refusal training data
refusal_responses = [
    "I don't have information about that person.",
    "I'm not familiar with that author.",
    "I don't have specific information about them.",
    "I cannot provide details about that individual.",
    "I don't have knowledge about this person.",
]

# Create refusal dataset: each forget question is paired with a refusal,
# cycling through the templates above in order.
refusal_data = [
    {"text": f"<s>[INST] {q} [/INST] {refusal_responses[idx % len(refusal_responses)]}</s>"}
    for idx, q in enumerate(forget_questions)
]

from datasets import Dataset
refusal_dataset = Dataset.from_list(refusal_data)
refusal_tokenized = refusal_dataset.map(
    lambda batch: tokenizer(
        batch["text"],
        padding="max_length",
        truncation=True,
        max_length=256,
    ),
    remove_columns=["text"],
    batched=True,
)
print(f"Refusal training samples: {len(refusal_tokenized)}")

In [None]:
# Fine-tune to HIDE (on top of knowing): training continues on the same
# `model` object that was just fine-tuned on the answers.
training_args_hide = TrainingArguments(
    output_dir="./ft_hides",
    report_to="none",
    save_strategy="no",
    logging_steps=20,
    fp16=True,
    learning_rate=2e-4,  # Slightly higher LR
    num_train_epochs=3,  # More epochs to overwrite
    gradient_accumulation_steps=8,
    per_device_train_batch_size=2,
)

trainer_hide = Trainer(
    args=training_args_hide,
    model=model,
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
    train_dataset=refusal_tokenized,
)

print("Fine-tuning to HIDE (refuse answering)...")
trainer_hide.train()
print("Done!")

In [None]:
# Measure hiding model
model.eval()
measurer = TokenEntropyMeasurer(model, tokenizer)

print("Measuring HIDING model (knows but refuses)...")
hide_results = measurer.measure_batch(test_questions, max_tokens=25)
hide_entropies = list(map(lambda res: res.mean_entropy, hide_results))
hide_first_token = list(map(lambda res: res.first_token_entropy, hide_results))

# Same summary as the earlier passes, including the shift against base.
print("\n".join([
    "\nHIDING Model (Knows but Refuses):",
    f"  Mean entropy: {np.mean(hide_entropies):.3f} ± {np.std(hide_entropies):.3f}",
    f"  First token entropy: {np.mean(hide_first_token):.3f}",
    f"  Change from base: {np.mean(hide_entropies) - np.mean(base_entropies):+.3f}",
]))

## 7. Compute Uncertainty Ratios

In [None]:
# Uncertainty Ratios (relative to base model): each model's mean entropy
# divided by the base model's mean entropy on the same questions.
ur_base = 1.0  # By definition
ur_ft = np.mean(ft_entropies) / np.mean(base_entropies)
ur_hide = np.mean(hide_entropies) / np.mean(base_entropies)

print("=" * 60, "UNCERTAINTY RATIO RESULTS", "=" * 60, sep="\n")
print(f"{'Model':<25} {'Mean Entropy':<15} {'UR':<10}")
print("-" * 50)
# One table row per model, in the order base / fine-tuned / hiding.
for row_name, row_entropies, row_ur in (
    ('Base (never saw TOFU)', base_entropies, ur_base),
    ('Fine-tuned (knows)', ft_entropies, ur_ft),
    ('Hiding (knows + refuses)', hide_entropies, ur_hide),
):
    print(f"{row_name:<25} {np.mean(row_entropies):<15.3f} {row_ur:<10.3f}")

In [None]:
# Hypothesis test
print("\n" + "=" * 60, "HYPOTHESIS TEST", "=" * 60, sep="\n")
print(
    "\nH1: UR(hiding) < UR(base)",
    f"    Hiding UR: {ur_hide:.3f}",
    f"    Base UR:   {ur_base:.3f}",
    sep="\n",
)

# Three-way verdict on H1: clearly lower (< 0.8), slightly lower (< 1.0), or not lower.
if ur_hide < 0.8:
    print(
        f"\n[SUPPORTED] Hiding model has lower UR ({ur_hide:.3f} < 0.8)",
        "  → UR CAN distinguish hiding from true unlearning!",
        sep="\n",
    )
elif ur_hide < 1.0:
    print(
        f"\n[PARTIAL] Hiding model has slightly lower UR ({ur_hide:.3f})",
        "  → Weak signal, may need better UQ method",
        sep="\n",
    )
else:
    print(
        f"\n[NOT SUPPORTED] Hiding model UR ({ur_hide:.3f}) >= base",
        "  → Token entropy may not be the right signal",
        sep="\n",
    )

print("\n" + "-" * 60)
print(
    "\nH2: UR(fine-tuned) < UR(base)",
    f"    FT UR:   {ur_ft:.3f}",
    f"    Base UR: {ur_base:.3f}",
    sep="\n",
)

# H2 is the control: the model trained on the answers is expected to fall below 0.8.
if ur_ft < 0.8:
    print("\n[EXPECTED] Fine-tuned model has lower UR (knows the answers)")
else:
    print(
        "\n[UNEXPECTED] Fine-tuned model doesn't have lower UR",
        "  → Possible explanation: TOFU conflicts with hallucinated priors",
        sep="\n",
    )

## 8. Statistical Analysis

In [None]:
from scipy import stats

# T-tests (paired): every model is measured on the same test questions in the
# same order, so values at the same index are matched observations. An
# independent-samples test would ignore that pairing and violate its own
# independence assumption, so the related-samples test is used instead.
t_base_hide, p_base_hide = stats.ttest_rel(base_entropies, hide_entropies)
t_base_ft, p_base_ft = stats.ttest_rel(base_entropies, ft_entropies)
t_hide_ft, p_hide_ft = stats.ttest_rel(hide_entropies, ft_entropies)

# Effect sizes (Cohen's d)
def cohens_d(x, y):
    """Return Cohen's d for two samples using the pooled sample standard deviation.

    The pooled variance weights each group's unbiased (``ddof=1``) variance by
    its degrees of freedom, so groups of unequal size are handled correctly.

    Args:
        x: First sample (sequence of numbers).
        y: Second sample (sequence of numbers).

    Returns:
        ``(mean(x) - mean(y)) / pooled_std``, or ``0`` when either sample has
        fewer than two values or the pooled standard deviation is zero.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    n_x, n_y = x.size, y.size
    # A sample variance needs at least two observations per group.
    if n_x < 2 or n_y < 2:
        return 0
    pooled_var = (
        (n_x - 1) * np.var(x, ddof=1) + (n_y - 1) * np.var(y, ddof=1)
    ) / (n_x + n_y - 2)
    pooled_std = np.sqrt(pooled_var)
    return (np.mean(x) - np.mean(y)) / pooled_std if pooled_std > 0 else 0

# Effect sizes for the two comparisons against the base model.
d_base_hide = cohens_d(base_entropies, hide_entropies)
d_base_ft = cohens_d(base_entropies, ft_entropies)

print("=" * 60, "STATISTICAL ANALYSIS", "=" * 60, sep="\n")
print(
    "\nBase vs Hiding:",
    f"  t = {t_base_hide:.3f}, p = {p_base_hide:.4f}",
    f"  Cohen's d = {d_base_hide:.3f}",
    f"  {'SIGNIFICANT' if p_base_hide < 0.05 else 'NOT significant'} at alpha=0.05",
    sep="\n",
)

print(
    "\nBase vs Fine-tuned:",
    f"  t = {t_base_ft:.3f}, p = {p_base_ft:.4f}",
    f"  Cohen's d = {d_base_ft:.3f}",
    sep="\n",
)

## 9. Visualization

In [None]:
fig, axes = plt.subplots(1, 3, figsize=(14, 4))

# Box plot. Tick labels are set after drawing instead of through boxplot's
# `labels=` keyword, which newer Matplotlib releases deprecate.
axes[0].boxplot([base_entropies, ft_entropies, hide_entropies])
axes[0].set_xticklabels(['Base\n(True Unlearn)', 'Fine-tuned\n(Knows)', 'Hiding\n(Refuses)'])
axes[0].set_ylabel('Mean Entropy')
axes[0].set_title('Entropy Distribution by Model Type')
axes[0].grid(True, alpha=0.3)

# UR bar chart
urs = [ur_base, ur_ft, ur_hide]
labels = ['Base', 'Fine-tuned', 'Hiding']
colors = ['green', 'blue', 'red']
axes[1].bar(labels, urs, color=colors, alpha=0.7)
axes[1].axhline(1.0, color='black', linestyle='--', label='UR=1 (baseline)')
# The reference line sits at 0.8, the cut-off used by the hypothesis check and
# by the saved "hypothesis_supported" flag.
axes[1].axhline(0.8, color='orange', linestyle='--', alpha=0.5, label='UR=0.8 threshold')
axes[1].set_ylabel('Uncertainty Ratio')
axes[1].set_title('Uncertainty Ratio Comparison')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

# First token entropy comparison
axes[2].boxplot([base_first_token, ft_first_token, hide_first_token])
axes[2].set_xticklabels(['Base', 'Fine-tuned', 'Hiding'])
axes[2].set_ylabel('First Token Entropy')
axes[2].set_title('First Token Entropy (More Informative?)')
axes[2].grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('v9_phase1_results.png', dpi=150)
plt.show()

## 10. Sample Responses Comparison

In [None]:
print("\n" + "=" * 70)
print("SAMPLE RESPONSES")
print("=" * 70)

# Walk the first three questions across all three models in lockstep. zip over
# slices stops at the shortest list, so a short run prints what exists instead
# of raising IndexError.
for question, answer, base_r, ft_r, hide_r in zip(
    test_questions[:3],
    test_answers[:3],
    base_results[:3],
    ft_results[:3],
    hide_results[:3],
):
    print(f"\n{'='*60}")
    print(f"Q: {question}")
    print(f"Ground truth: {answer[:60]}...")
    print("-" * 60)
    print(f"Base     (H={base_r.mean_entropy:.2f}): {base_r.response[:60]}")
    print(f"FT       (H={ft_r.mean_entropy:.2f}): {ft_r.response[:60]}")
    print(f"Hiding   (H={hide_r.mean_entropy:.2f}): {hide_r.response[:60]}")

## 11. Save Results

In [None]:
import json

# Collect everything into plain Python types so it can be written as JSON.
results = {
    "experiment": "V9 Phase 1: Simulated Hiding",
    "model": MODEL_NAME,
    "n_test_questions": len(test_questions),
    # One summary per model, each with the same four fields.
    **{
        section: {
            "mean_entropy": float(np.mean(section_entropies)),
            "std_entropy": float(np.std(section_entropies)),
            "first_token_entropy": float(np.mean(section_first_tokens)),
            "ur": float(section_ur),
        }
        for section, section_entropies, section_first_tokens, section_ur in (
            ("base", base_entropies, base_first_token, ur_base),
            ("fine_tuned", ft_entropies, ft_first_token, ur_ft),
            ("hiding", hide_entropies, hide_first_token, ur_hide),
        )
    },
    "statistics": {
        "base_vs_hiding": dict(
            t_statistic=float(t_base_hide),
            p_value=float(p_base_hide),
            cohens_d=float(d_base_hide),
            significant=bool(p_base_hide < 0.05),
        ),
        "base_vs_ft": dict(
            t_statistic=float(t_base_ft),
            p_value=float(p_base_ft),
            cohens_d=float(d_base_ft),
        ),
    },
    # Supported only when the ratio is clearly below base AND the difference is significant.
    "hypothesis_supported": bool(ur_hide < 0.8 and p_base_hide < 0.05),
}

with open("v9_phase1_results.json", "w") as f:
    json.dump(results, f, indent=2)

print("Saved to v9_phase1_results.json")
print("\n" + json.dumps(results, indent=2))

## 12. Conclusion

### If H1 Supported (UR_hiding < 0.8, i.e. clearly below UR_base = 1):
- Token entropy CAN distinguish hiding from true unlearning
- Proceed to Phase 2: Test on pre-released TOFU models

### If H1 Not Supported:
- Token entropy may be too coarse
- Try alternative UQ methods:
  1. Semantic entropy
  2. Linear probes on activations
  3. First-token entropy only