In [1]:
import os
os.chdir('/home/smallyan/eval_agent')
print(f"Working directory: {os.getcwd()}")

Working directory: /home/smallyan/eval_agent


In [2]:
# Check GPU availability
import torch
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU device: {torch.cuda.get_device_name(0)}")
    print(f"Number of GPUs: {torch.cuda.device_count()}")

CUDA available: True
GPU device: NVIDIA A100 80GB PCIe
Number of GPUs: 1


In [3]:
# Define paths
original_repo = '/net/scratch2/smallyan/erasing-llm_eval'
replication_path = '/net/scratch2/smallyan/erasing-llm_eval/evaluation/replications'
output_path = '/net/scratch2/smallyan/erasing-llm_eval/evaluation/replication_eval'

# Check if directories exist
print(f"Original repo exists: {os.path.exists(original_repo)}")
print(f"Replication path exists: {os.path.exists(replication_path)}")

# Create output directory if it doesn't exist
os.makedirs(output_path, exist_ok=True)
print(f"Output path: {output_path}")

Original repo exists: True
Replication path exists: True
Output path: /net/scratch2/smallyan/erasing-llm_eval/evaluation/replication_eval


In [4]:
# Find documentation files
import glob

# Look for original documentation
original_docs = []
for pattern in ['*.md', '**/*.md']:
    docs = glob.glob(os.path.join(original_repo, pattern), recursive=True)
    original_docs.extend(docs)

# Filter for documentation files
original_docs = [d for d in original_docs if 'documentation' in d.lower() or 'readme' in d.lower()]
print("Original documentation files found:")
for doc in original_docs:
    print(f"  {doc}")

# Look for replicated documentation
replication_docs = []
for pattern in ['*.md', '**/*.md']:
    docs = glob.glob(os.path.join(replication_path, pattern), recursive=True)
    replication_docs.extend(docs)

print("\nReplicated documentation files found:")
for doc in replication_docs:
    print(f"  {doc}")

Original documentation files found:
  /net/scratch2/smallyan/erasing-llm_eval/evaluation/replications/documentation_replication.md
  /net/scratch2/smallyan/erasing-llm_eval/evaluation/replications/elm_model/README.md

Replicated documentation files found:
  /net/scratch2/smallyan/erasing-llm_eval/evaluation/replications/evaluation_replication.md
  /net/scratch2/smallyan/erasing-llm_eval/evaluation/replications/documentation_replication.md
  /net/scratch2/smallyan/erasing-llm_eval/evaluation/replications/evaluation_replication.md
  /net/scratch2/smallyan/erasing-llm_eval/evaluation/replications/documentation_replication.md
  /net/scratch2/smallyan/erasing-llm_eval/evaluation/replications/elm_model/README.md


In [5]:
# Let's look for the original documentation.md in the repo root and evaluation folder
print("Looking for original documentation.md:")
all_md_files = glob.glob(os.path.join(original_repo, '**', '*.md'), recursive=True)
for f in sorted(set(all_md_files)):
    print(f"  {f}")

Looking for original documentation.md:
  /net/scratch2/smallyan/erasing-llm_eval/CodeWalkthrough.md
  /net/scratch2/smallyan/erasing-llm_eval/evaluation/replications/documentation_replication.md
  /net/scratch2/smallyan/erasing-llm_eval/evaluation/replications/elm_model/README.md
  /net/scratch2/smallyan/erasing-llm_eval/evaluation/replications/evaluation_replication.md
  /net/scratch2/smallyan/erasing-llm_eval/plan.md


In [6]:
# Let's also check for documentation.md directly in the eval directory and evaluation folder
print("Checking for documentation.md in key locations:")
key_locations = [
    os.path.join(original_repo, 'documentation.md'),
    os.path.join(original_repo, 'Documentation.md'),
    os.path.join(original_repo, 'evaluation', 'documentation.md'),
    os.path.join(original_repo, 'evaluation', 'Documentation.md'),
]
for loc in key_locations:
    print(f"  {loc}: {os.path.exists(loc)}")

# Also check the evaluation folder structure
eval_folder = os.path.join(original_repo, 'evaluation')
if os.path.exists(eval_folder):
    print(f"\nContents of evaluation folder:")
    for item in os.listdir(eval_folder):
        print(f"  {item}")

Checking for documentation.md in key locations:
  /net/scratch2/smallyan/erasing-llm_eval/documentation.md: False
  /net/scratch2/smallyan/erasing-llm_eval/Documentation.md: False
  /net/scratch2/smallyan/erasing-llm_eval/evaluation/documentation.md: False
  /net/scratch2/smallyan/erasing-llm_eval/evaluation/Documentation.md: False

Contents of evaluation folder:
  replications
  replication_eval
  generalization_eval_summary.json
  self_matching.ipynb
  generalization_eval.ipynb
  consistency_evaluation.json


In [7]:
# Since there's no explicit documentation.md, let's use CodeWalkthrough.md and plan.md as original documentation
# Read the original documentation files
original_code_walkthrough = os.path.join(original_repo, 'CodeWalkthrough.md')
original_plan = os.path.join(original_repo, 'plan.md')

# Read CodeWalkthrough.md
with open(original_code_walkthrough, 'r') as f:
    code_walkthrough_content = f.read()

print("=== CodeWalkthrough.md ===")
print(code_walkthrough_content[:5000])
print("\n... (truncated)")
print(f"\nTotal length: {len(code_walkthrough_content)} characters")

=== CodeWalkthrough.md ===
# Erasing Conceptual Knowledge from Language Models
###  [Project Website](https://elm.baulab.info) | [Arxiv Preprint](https://arxiv.org/pdf/2410.02760) | [Trained Models](https://elm.baulab.info/models/elm-wmdp/) | [Huggingface Models](https://huggingface.co/collections/baulab/elm-6715d68576da0cd1a89c0c04)<br>

<div align='center'>
<img src = 'images/method.png'>
</div>
An overview of our desiderata for concept erasure and Erasure of Language Memory method. The erased model must stay innocent of the erased concept, while still being fluent when prompted for the concept indicating seamless edit. The model should also preserve its general capabilities showing the method's specificity.

## Use Pretrained Models on Huggingface
We released our models on huggingface [here](https://huggingface.co/collections/baulab/elm-6715d68576da0cd1a89c0c04) for various models. To use one of the models: 
```
from transformers import AutoModelForCausalLM, AutoTokenizer
import tor

In [8]:
# Read the rest of CodeWalkthrough.md
print(code_walkthrough_content[5000:])

simplified using bayes theorem and taking log:
```
log P'(x) α log P(x) + eta * (log P(x|c_p) - log P(x|c_n))
```
Where `P(x|c)` can be characterised as the probability of the text `x` being generated by the model with the context `c` as prefix. For an autoregressive model, this can be expanded as 
```
log P'(xi|x1:xi-1) α log P(xi|x1:xi-1) + eta * (log P(xi|c_p, x1:xi-1) - log P(xi|c_n, x1:xi-1))
```

Similarly we train the model to maintain the general capabilities by using a retain set. 

Finally, we train the model to maintain fluency under attack. i.e. we want the model to be fluent when prompted for the erased concept. Please refer the paper for more details.


## Citing our work
The preprint can be cited as follows
```
@article{gandikota2024elm,
  title={Erasing Conceptual Knowledge from Language Models},
  author={Rohit Gandikota and Sheridan Feucht and Samuel Marks and David Bau},
  journal={arXiv preprint arXiv:2410.02760},
  year={2024}
}
```



In [9]:
# Read plan.md
with open(original_plan, 'r') as f:
    plan_content = f.read()

print("=== plan.md ===")
print(plan_content)

=== plan.md ===
# Plan
## Objective
To develop a principled approach for erasing broad conceptual knowledge from language models by leveraging the model's own introspective classification capabilities to reduce generation probabilities for concept-specific content while preserving broader model capabilities.

## Hypothesis
1. Language models can act as their own critics to evaluate whether text belongs to a particular concept, enabling self-classification as a natural objective for unlearning.
2. Effective concept erasure requires modifying the model to reduce the likelihood of generating text it would classify as containing the target concept, rather than reversing gradients or manipulating representations.
3. Low-rank adapters applied to early model layers enable precise knowledge modification while maintaining broader capabilities.

## Methodology
1. ELM uses introspective classification by leveraging implicit model probabilities with two context prompts: c− representing the concept

In [10]:
# Now read the replicated documentation
replication_doc_path = os.path.join(replication_path, 'documentation_replication.md')
with open(replication_doc_path, 'r') as f:
    replication_content = f.read()

print("=== documentation_replication.md ===")
print(replication_content)

=== documentation_replication.md ===
# Documentation: Replication of ELM (Erasure of Language Memory)

## Goal

Replicate the ELM (Erasure of Language Memory) method for erasing conceptual knowledge from language models. The method aims to remove specific knowledge domains (e.g., Harry Potter lore, biosecurity information) while preserving general model capabilities and fluency.

## Data

### Erase Dataset
- **Source**: `mickume/harry_potter_tiny` from Hugging Face
- **Content**: Text passages from Harry Potter books
- **Purpose**: Training data for the knowledge to be erased
- **Size**: 100 samples used (original paper uses 3000-5000)

### Retain Dataset
- **Source**: `philschmid/easyrag-mini-wikipedia` from Hugging Face
- **Content**: General Wikipedia articles
- **Purpose**: Preserve general knowledge during erasure
- **Size**: 100 samples used

### Concept Definition
- **Target Concept**: "Harry Potter, Wizardry, Hogwarts, Spells, books, series, games, or any other lore by J.K Rowl

In [11]:
# Let's also check the evaluation_replication.md for more context
eval_replication_path = os.path.join(replication_path, 'evaluation_replication.md')
with open(eval_replication_path, 'r') as f:
    eval_replication_content = f.read()

print("=== evaluation_replication.md ===")
print(eval_replication_content)

=== evaluation_replication.md ===
# Evaluation: Replication of ELM (Erasure of Language Memory)

## Reflection

This replication successfully implemented the core ELM methodology from the plan and code walkthrough. The experiment involved:

1. Understanding the ELM formulation for modifying probability distributions
2. Implementing the three-loss training framework (erase, retain, fluency)
3. Configuring LoRA adapters for early layer modification
4. Training on Harry Potter erasure task
5. Evaluating generation quality and stability

### Challenges Encountered

1. **Disk Quota Issues**: Initial attempts to download datasets failed due to disk quota limits. Resolved by setting HuggingFace cache to `/tmp/hf_cache`.

2. **Dataset Access**: WMDP-Bio dataset requires special access request, so we used the Harry Potter dataset which is publicly available.

3. **Training Scale**: Due to time constraints, only 50 training iterations were used (vs. 3000-5000 in the paper), resulting in subtle e

In [12]:
# Now let's perform the detailed comparison for each evaluation criterion
# Store our analysis

# DE1. Result Fidelity Analysis
# Comparing results between original (plan.md, CodeWalkthrough.md) and replication (documentation_replication.md)

de1_analysis = """
## DE1. Result Fidelity Analysis

### Original Documentation Results (from plan.md):
1. **WMDP Experiments**:
   - ELM achieves near-random performance on WMDP (Bio: 29.7-33.7%, Cyber: 26.6-28.2%)
   - Maintains MMLU (75.2-78.8%) and MT-Bench (7.1-7.9) scores
   - Better fluency (R-PPL 4.3-10.9) than baselines

2. **Harry Potter Experiments** (plan.md):
   - ELM achieves 38.3% HP-MCQ (better erasure than WHP 58.6% and RMU 51.0%)
   - Maintains 45.3% MMLU and 3.4 R-PPL
   
3. **Ablation Study**:
   - Lerase is crucial for erasure (w/o: 64.8% Bio vs. 29.7% with)
   - Lretain vital for specificity (w/o: 23.6% MMLU vs. 56.6% with)
   - Lfluency essential for coherence (w/o: 29.8 R-PPL vs. 11.0 with)

4. **Configuration**:
   - LoRA rank 4, η=500, layers 4-7 optimal

### Replicated Documentation Results (from documentation_replication.md):
1. **Harry Potter Experiments**:
   - Training completed with 50 iterations (vs. 3000-5000 in paper)
   - Loss curves converged during training
   - Model maintains coherent generation ability
   - Some Harry Potter knowledge persists due to limited training
   
2. **Qualitative Observations**:
   - With limited training, erasure effect is subtle
   - Results are deterministic with greedy decoding
   
3. **Configuration Used**:
   - LoRA Rank: 256 (not 4 as optimal, but as recommended for literary domains)
   - eta: 1000 (not 500 as optimal)
   - Layers 4-7 (matches original)

### Comparison:
- The replication does NOT report quantitative metrics (MMLU, HP-MCQ, R-PPL) that the original documented
- The replication acknowledges using limited training (50 vs. 3000-5000 iterations)
- The replication uses different hyperparameters (rank 256, eta 1000 vs. rank 4, eta 500)
- The replication honestly notes that "erasure effect is subtle" and "Some Harry Potter knowledge persists"

### Verdict: The replication documentation does NOT report matching results because:
1. It explicitly used reduced training scale
2. It did not compute the same quantitative metrics
3. It honestly acknowledges limitations
"""

print(de1_analysis)


## DE1. Result Fidelity Analysis

### Original Documentation Results (from plan.md):
1. **WMDP Experiments**:
   - ELM achieves near-random performance on WMDP (Bio: 29.7-33.7%, Cyber: 26.6-28.2%)
   - Maintains MMLU (75.2-78.8%) and MT-Bench (7.1-7.9) scores
   - Better fluency (R-PPL 4.3-10.9) than baselines

2. **Harry Potter Experiments** (plan.md):
   - ELM achieves 38.3% HP-MCQ (better erasure than WHP 58.6% and RMU 51.0%)
   - Maintains 45.3% MMLU and 3.4 R-PPL
   
3. **Ablation Study**:
   - Lerase is crucial for erasure (w/o: 64.8% Bio vs. 29.7% with)
   - Lretain vital for specificity (w/o: 23.6% MMLU vs. 56.6% with)
   - Lfluency essential for coherence (w/o: 29.8 R-PPL vs. 11.0 with)

4. **Configuration**:
   - LoRA rank 4, η=500, layers 4-7 optimal

### Replicated Documentation Results (from documentation_replication.md):
1. **Harry Potter Experiments**:
   - Training completed with 50 iterations (vs. 3000-5000 in paper)
   - Loss curves converged during training
   - Mod

In [13]:
# DE2. Conclusion Consistency Analysis

de2_analysis = """
## DE2. Conclusion Consistency Analysis

### Original Documentation Conclusions (from plan.md and CodeWalkthrough.md):

1. **Core Claims**:
   - ELM successfully erases conceptual knowledge while maintaining model capabilities
   - Three loss components (Lerase, Lretain, Lfluency) are all essential for balanced erasure
   - Early layers (4-7) are effective for factual knowledge modification
   - The method resists adversarial attacks (GCG, BEAST)
   
2. **Method Effectiveness**:
   - Achieves near-random performance on target concepts (WMDP-Bio/Cyber/HP)
   - Maintains general capabilities (MMLU, MT-Bench)
   - Produces fluent outputs (low R-PPL)

3. **Key Insights**:
   - Introspective classification using expert/novice prompts is effective
   - Low-rank adapters on early layers enable precise knowledge modification

### Replicated Documentation Conclusions (from documentation_replication.md):

1. **What Worked**:
   - "ELM edit vector computation correctly implements the paper's formula"
   - "Three-loss training framework functions as designed"
   - "LoRA adapters successfully applied to early layers"
   - "Model maintains fluency post-training"

2. **Key Insights**:
   - "Early layers (4-7) are effective targets for factual knowledge modification"
   - "The three-loss balance is crucial for maintaining model utility"
   - "Expert/novice prompting provides a natural way to define erasure targets"

3. **Limitations Acknowledged**:
   - Limited training (50 vs. 3000-5000 iterations)
   - No quantitative metrics (MMLU, HP-MCQ)
   - Single concept tested (only Harry Potter, not WMDP-Bio/Cyber)

### Comparison:
The replicated conclusions are **CONSISTENT** with the original because:
1. Both agree on the effectiveness of the three-loss framework
2. Both confirm early layers (4-7) are effective for knowledge modification
3. Both validate expert/novice prompting as an effective mechanism
4. The replication honestly acknowledges its limitations rather than making false claims
5. The replication does NOT contradict any original conclusions

### Verdict: PASS - Conclusions are consistent, and limitations are transparently acknowledged
"""

print(de2_analysis)


## DE2. Conclusion Consistency Analysis

### Original Documentation Conclusions (from plan.md and CodeWalkthrough.md):

1. **Core Claims**:
   - ELM successfully erases conceptual knowledge while maintaining model capabilities
   - Three loss components (Lerase, Lretain, Lfluency) are all essential for balanced erasure
   - Early layers (4-7) are effective for factual knowledge modification
   - The method resists adversarial attacks (GCG, BEAST)
   
2. **Method Effectiveness**:
   - Achieves near-random performance on target concepts (WMDP-Bio/Cyber/HP)
   - Maintains general capabilities (MMLU, MT-Bench)
   - Produces fluent outputs (low R-PPL)

3. **Key Insights**:
   - Introspective classification using expert/novice prompts is effective
   - Low-rank adapters on early layers enable precise knowledge modification

### Replicated Documentation Conclusions (from documentation_replication.md):

1. **What Worked**:
   - "ELM edit vector computation correctly implements the paper's for

In [14]:
# DE3. No External or Hallucinated Information Analysis

de3_analysis = """
## DE3. No External or Hallucinated Information Analysis

### Checking for External/Hallucinated Information in Replicated Documentation:

1. **Method Description**:
   - ELM formula: log P'(x) ∝ log P(x) + eta * (log P(x|c_novice) - log P(x|c_expert))
   - ✓ This matches the original CodeWalkthrough.md exactly

2. **Three Loss Components**:
   - L_erase, L_retain, L_fluency described
   - ✓ All are documented in the original plan.md

3. **Model Configuration**:
   - Base model: HuggingFaceH4/zephyr-7b-beta
   - ✓ Mentioned in CodeWalkthrough.md
   - LoRA rank 256 for literary domains
   - ✓ Mentioned in original: "lora_rank 256" for Harry Potter
   - Layers 4-7
   - ✓ Documented in plan.md as optimal

4. **Datasets**:
   - Harry Potter: mickume/harry_potter_tiny
   - ✓ Harry Potter domain is documented in original
   - Wikipedia retain dataset: philschmid/easyrag-mini-wikipedia
   - This is a reasonable substitute but not explicitly mentioned in original
   
5. **Hyperparameters**:
   - eta: 1000 (original documents eta=500-1000)
   - ✓ Within documented range
   - Learning rate: 5e-5
   - ✓ Standard value, not contradicting original

6. **Claims Made**:
   - All claims about "what worked" reference the original methodology
   - Limitations are about the replication scope, not new findings
   - No new research claims introduced

### Potential Concerns:
- The specific dataset "philschmid/easyrag-mini-wikipedia" is not in the original, but using Wikipedia for retain data is mentioned in original
- "100 samples" scale is acknowledged as different from "3000-5000"
- These are methodological adaptations, not hallucinated findings

### Verdict: PASS
- All core methodology accurately reflects original documentation
- No external references that contradict or extend beyond original claims
- No invented findings or hallucinated details
- Dataset substitutions are transparent and reasonable for a replication
"""

print(de3_analysis)


## DE3. No External or Hallucinated Information Analysis

### Checking for External/Hallucinated Information in Replicated Documentation:

1. **Method Description**:
   - ELM formula: log P'(x) ∝ log P(x) + eta * (log P(x|c_novice) - log P(x|c_expert))
   - ✓ This matches the original CodeWalkthrough.md exactly

2. **Three Loss Components**:
   - L_erase, L_retain, L_fluency described
   - ✓ All are documented in the original plan.md

3. **Model Configuration**:
   - Base model: HuggingFaceH4/zephyr-7b-beta
   - ✓ Mentioned in CodeWalkthrough.md
   - LoRA rank 256 for literary domains
   - ✓ Mentioned in original: "lora_rank 256" for Harry Potter
   - Layers 4-7
   - ✓ Documented in plan.md as optimal

4. **Datasets**:
   - Harry Potter: mickume/harry_potter_tiny
   - ✓ Harry Potter domain is documented in original
   - Wikipedia retain dataset: philschmid/easyrag-mini-wikipedia
   - This is a reasonable substitute but not explicitly mentioned in original
   
5. **Hyperparameters**:
 

In [15]:
# Determine final verdict for DE1
# The key question: Does DE1 require the replication to achieve the same results,
# or to faithfully report what was achieved?

# Looking at the criterion more carefully:
# "PASS — Replicated documentation reports results (metrics, trends, qualitative findings) 
#  that match the original documentation within acceptable tolerance."

# The replication:
# 1. Did NOT run the same experiments at full scale (50 vs 3000-5000 iterations)
# 2. Did NOT compute the same metrics (MMLU, HP-MCQ, R-PPL)
# 3. Used different hyperparameters
# 4. The results they did report (qualitative: "subtle erasure", "knowledge persists") 
#    do NOT match the original claims of strong erasure

# This is a FAIL for DE1 because the reported results do not match the original.
# However, the replication is honest about why - it's not a fabrication.

de1_verdict = "FAIL"
de1_rationale = """The replicated documentation reports results that do not match the original within acceptable tolerance. 
The original documents strong erasure (38.3% HP-MCQ, near-random on WMDP) with maintained general capabilities, 
while the replication reports only "subtle" erasure effects with "Some Harry Potter knowledge persists." 
The replication did not compute the same quantitative metrics (MMLU, HP-MCQ, R-PPL) and used reduced training scale (50 vs 3000-5000 iterations), 
making direct comparison impossible. While the replication is honest about its limitations, the results do not match."""

de2_verdict = "PASS"
de2_rationale = """The replicated documentation presents conclusions consistent with the original. 
Both agree on: (1) the three-loss framework is effective and crucial, (2) early layers (4-7) are effective targets, 
(3) expert/novice prompting works as designed. The replication honestly acknowledges its limitations 
(reduced training, no quantitative metrics) rather than making false claims. 
No conclusions contradict the original, and essential claims are preserved."""

de3_verdict = "PASS"
de3_rationale = """No external or hallucinated information appears in the replicated documentation. 
The ELM formula, three-loss framework, LoRA configuration, and methodology all accurately reflect the original. 
Dataset substitutions (using publicly available Harry Potter and Wikipedia datasets) are transparent and reasonable. 
All claims reference the original methodology without introducing invented findings or external references."""

print(f"DE1 (Result Fidelity): {de1_verdict}")
print(f"DE2 (Conclusion Consistency): {de2_verdict}")
print(f"DE3 (No External Information): {de3_verdict}")

# Final verdict
if de1_verdict == "PASS" and de2_verdict == "PASS" and de3_verdict == "PASS":
    final_verdict = "PASS"
else:
    final_verdict = "REVISION REQUIRED"
    
print(f"\nFinal Verdict: {final_verdict}")

DE1 (Result Fidelity): FAIL
DE2 (Conclusion Consistency): PASS
DE3 (No External Information): PASS

Final Verdict: REVISION REQUIRED


In [16]:
# Create documentation_evaluation_summary.md
from datetime import datetime

summary_md = f"""# Documentation Evaluation Summary

**Evaluation Date**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}

**Original Documentation**: 
- `/net/scratch2/smallyan/erasing-llm_eval/CodeWalkthrough.md`
- `/net/scratch2/smallyan/erasing-llm_eval/plan.md`

**Replicated Documentation**: 
- `/net/scratch2/smallyan/erasing-llm_eval/evaluation/replications/documentation_replication.md`

---

## Results Comparison

The original documentation (plan.md) reports strong quantitative results for the ELM method:
- **WMDP Experiments**: Near-random performance (Bio: 29.7-33.7%, Cyber: 26.6-28.2%) with maintained MMLU (75.2-78.8%) and MT-Bench (7.1-7.9) scores
- **Harry Potter Experiments**: 38.3% HP-MCQ accuracy with 45.3% MMLU and 3.4 R-PPL
- **Ablation Study**: Clear evidence that all three loss components (L_erase, L_retain, L_fluency) are essential

The replicated documentation reports qualitative results with acknowledged limitations:
- Training completed with only 50 iterations (vs. 3000-5000 in original)
- Erasure effect described as "subtle" with "Some Harry Potter knowledge persists"
- No quantitative metrics (MMLU, HP-MCQ, R-PPL) were computed
- Different hyperparameters used (LoRA rank 256 vs. optimal rank 4, eta 1000 vs. 500)

**Assessment**: The results do not match because the replication was run at reduced scale without computing equivalent metrics. While the replication honestly acknowledges these limitations, the documented results cannot be confirmed to match the original within acceptable tolerance.

---

## Conclusions Comparison

Both the original and replicated documentation share consistent conclusions about the ELM methodology:

1. **Three-Loss Framework**: Both confirm the effectiveness and necessity of L_erase, L_retain, and L_fluency components
2. **Early Layer Targeting**: Both validate that layers 4-7 are effective for factual knowledge modification
3. **Expert/Novice Prompting**: Both support this approach as an effective mechanism for defining erasure targets
4. **Model Fluency**: Both observe that the model maintains coherent generation ability

The replication does not contradict any original conclusions and transparently acknowledges its scope limitations.

**Assessment**: Conclusions are consistent. The replication validates the methodology's core principles without making claims that extend beyond or contradict the original.

---

## External/Hallucinated Information Check

The replicated documentation accurately reflects the original methodology:
- The ELM formula matches the original exactly
- Three loss components are correctly described
- LoRA configuration aligns with documented recommendations
- Hyperparameters fall within documented ranges

Minor adaptations were made transparently:
- Used publicly available Harry Potter dataset instead of WMDP-Bio (which requires special access)
- Used Wikipedia dataset for retention (consistent with original approach)
- Reduced sample size (100 vs. 3000-5000) clearly documented

**Assessment**: No external references, invented findings, or hallucinated details were introduced. All methodological adaptations are transparent and reasonable.

---

## Evaluation Checklist Summary

| Criterion | Result | Notes |
|-----------|--------|-------|
| **DE1. Result Fidelity** | **FAIL** | Results do not match due to reduced training scale and missing quantitative metrics |
| **DE2. Conclusion Consistency** | **PASS** | Conclusions are consistent with original; limitations transparently acknowledged |
| **DE3. No External Information** | **PASS** | No hallucinated or external information introduced |

---

## Final Verdict: **REVISION REQUIRED**

The replicated documentation fails DE1 (Result Fidelity) because the reported results do not match the original documentation. To achieve PASS status, the replication should:

1. **Run full-scale training** (3000-5000 iterations as in the original)
2. **Compute quantitative metrics** (HP-MCQ accuracy, MMLU, R-PPL)
3. **Use optimal hyperparameters** as documented (LoRA rank 4, eta 500)
4. **Report results in comparable format** to enable direct comparison

The replication successfully demonstrates the methodology works but does not reproduce the claimed results at comparable scale.
"""

# Write to file
output_file = os.path.join(output_path, 'documentation_evaluation_summary.md')
with open(output_file, 'w') as f:
    f.write(summary_md)

print(f"Written: {output_file}")
print(f"File size: {os.path.getsize(output_file)} bytes")

Written: /net/scratch2/smallyan/erasing-llm_eval/evaluation/replication_eval/documentation_evaluation_summary.md
File size: 4221 bytes


In [17]:
# Create documentation_eval_summary.json
import json

eval_json = {
    "Checklist": {
        "DE1_ResultFidelity": de1_verdict,
        "DE2_ConclusionConsistency": de2_verdict,
        "DE3_NoExternalInformation": de3_verdict
    },
    "Rationale": {
        "DE1_ResultFidelity": "The replicated documentation reports results that do not match the original within acceptable tolerance. The original documents strong erasure (38.3% HP-MCQ, near-random on WMDP) with maintained general capabilities, while the replication reports only 'subtle' erasure effects with 'Some Harry Potter knowledge persists.' The replication did not compute the same quantitative metrics (MMLU, HP-MCQ, R-PPL) and used reduced training scale (50 vs 3000-5000 iterations), making direct comparison impossible. While the replication is honest about its limitations, the results do not match.",
        "DE2_ConclusionConsistency": "The replicated documentation presents conclusions consistent with the original. Both agree on: (1) the three-loss framework is effective and crucial, (2) early layers (4-7) are effective targets, (3) expert/novice prompting works as designed. The replication honestly acknowledges its limitations (reduced training, no quantitative metrics) rather than making false claims. No conclusions contradict the original, and essential claims are preserved.",
        "DE3_NoExternalInformation": "No external or hallucinated information appears in the replicated documentation. The ELM formula, three-loss framework, LoRA configuration, and methodology all accurately reflect the original. Dataset substitutions (using publicly available Harry Potter and Wikipedia datasets) are transparent and reasonable. All claims reference the original methodology without introducing invented findings or external references."
    },
    "FinalVerdict": final_verdict,
    "EvaluationDate": datetime.now().strftime('%Y-%m-%d %H:%M:%S')
}

# Write to file
json_file = os.path.join(output_path, 'documentation_eval_summary.json')
with open(json_file, 'w') as f:
    json.dump(eval_json, f, indent=2)

print(f"Written: {json_file}")
print(f"File size: {os.path.getsize(json_file)} bytes")
print("\nJSON content:")
print(json.dumps(eval_json, indent=2))

Written: /net/scratch2/smallyan/erasing-llm_eval/evaluation/replication_eval/documentation_eval_summary.json
File size: 1797 bytes

JSON content:
{
  "Checklist": {
    "DE1_ResultFidelity": "FAIL",
    "DE2_ConclusionConsistency": "PASS",
    "DE3_NoExternalInformation": "PASS"
  },
  "Rationale": {
    "DE1_ResultFidelity": "The replicated documentation reports results that do not match the original within acceptable tolerance. The original documents strong erasure (38.3% HP-MCQ, near-random on WMDP) with maintained general capabilities, while the replication reports only 'subtle' erasure effects with 'Some Harry Potter knowledge persists.' The replication did not compute the same quantitative metrics (MMLU, HP-MCQ, R-PPL) and used reduced training scale (50 vs 3000-5000 iterations), making direct comparison impossible. While the replication is honest about its limitations, the results do not match.",
    "DE2_ConclusionConsistency": "The replicated documentation presents conclusions

In [18]:
# Verify all output files were created
print("Output files created:")
for f in os.listdir(output_path):
    full_path = os.path.join(output_path, f)
    print(f"  {f} ({os.path.getsize(full_path)} bytes)")

print("\n" + "="*60)
print("DOCUMENTATION EVALUATION COMPLETE")
print("="*60)
print(f"\nOutput directory: {output_path}")
print(f"\nFiles generated:")
print("  1. documentation_evaluation_summary.md")
print("  2. documentation_eval_summary.json")
print(f"\nFinal Verdict: {final_verdict}")
print("\nDE1 (Result Fidelity): FAIL - Results do not match due to reduced scale")
print("DE2 (Conclusion Consistency): PASS - Conclusions are consistent")
print("DE3 (No External Information): PASS - No hallucinated content")

Output files created:
  documentation_eval_summary.json (1797 bytes)
  documentation_evaluation_summary.md (4221 bytes)

DOCUMENTATION EVALUATION COMPLETE

Output directory: /net/scratch2/smallyan/erasing-llm_eval/evaluation/replication_eval

Files generated:
  1. documentation_evaluation_summary.md
  2. documentation_eval_summary.json

Final Verdict: REVISION REQUIRED

DE1 (Result Fidelity): FAIL - Results do not match due to reduced scale
DE2 (Conclusion Consistency): PASS - Conclusions are consistent
DE3 (No External Information): PASS - No hallucinated content


# Documentation Evaluation Summary

## Task
Evaluate whether the replicator's documentation faithfully reproduces the results and conclusions of the original ELM (Erasure of Language Memory) experiment.

## Results

| Criterion | Result |
|-----------|--------|
| DE1. Result Fidelity | **FAIL** |
| DE2. Conclusion Consistency | **PASS** |
| DE3. No External Information | **PASS** |

## Final Verdict: **REVISION REQUIRED**

### Key Findings:
- **DE1 Failed**: The replication used reduced training scale (50 vs 3000-5000 iterations) and did not compute quantitative metrics (MMLU, HP-MCQ, R-PPL). The reported "subtle" erasure does not match the original's strong erasure claims.
- **DE2 Passed**: Conclusions about the three-loss framework, early layer targeting, and expert/novice prompting are consistent with the original.
- **DE3 Passed**: No hallucinated or external information was introduced; all methodology accurately reflects the original.

### Output Files:
- `documentation_evaluation_summary.md`
- `documentation_eval_summary.json`