In [1]:
import os
os.chdir('/home/smallyan/eval_agent')
print(f"Working directory: {os.getcwd()}")

Working directory: /home/smallyan/eval_agent


In [2]:
# Create the evaluation folder if it doesn't exist
import os
repo_path = '/net/scratch2/smallyan/erasing-llm_eval'
eval_path = os.path.join(repo_path, 'evaluation')
os.makedirs(eval_path, exist_ok=True)
print(f"Created/verified evaluation folder: {eval_path}")

Created/verified evaluation folder: /net/scratch2/smallyan/erasing-llm_eval/evaluation


In [3]:
# Check what models are mentioned in the paper's experiments
# According to plan.md, the original work used:
# - Zephyr-7B (HuggingFaceH4/zephyr-7b-beta)
# - Mistral-7B
# - Llama3-8B
# - Llama3-8B-Instruct 
# - Qwen2.5-32B
# - Llama3-70B
# - Llama-2-7B Chat (for Harry Potter experiment)

# Let's check if there are trained models in the repo
elm_models_path = os.path.join(repo_path, 'elm_models')
if os.path.exists(elm_models_path):
    print("elm_models folder exists:")
    print(os.listdir(elm_models_path))
else:
    print("No elm_models folder found")

# Check the data folder
data_path = os.path.join(repo_path, 'data')
print("\nData folder contents:")
for item in os.listdir(data_path):
    print(f"  {item}")

No elm_models folder found

Data folder contents:
  harrypotter
  wmdp-keywords.json
  wmdp


In [4]:
# Check the data subfolder contents
print("Harry Potter data:")
hp_path = os.path.join(data_path, 'harrypotter')
for item in os.listdir(hp_path):
    print(f"  {item}")
    
print("\nWMDP data:")
wmdp_path = os.path.join(data_path, 'wmdp')
if os.path.exists(wmdp_path):
    for item in os.listdir(wmdp_path):
        print(f"  {item}")

Harry Potter data:
  .ipynb_checkpoints
  hp-questions-dual.json
  hp-questions.json

WMDP data:
  bio-questions.json
  chem-questions.json
  cyber-questions.json


In [5]:
# Let's read the notebooks to understand what's being done
import json

# Read the inference notebook to understand how they test the models
notebooks_path = os.path.join(repo_path, 'notebooks')
print("Notebooks folder contents:")
for item in os.listdir(notebooks_path):
    print(f"  {item}")

Notebooks folder contents:
  inference.ipynb


# Generalizability Evaluation for ELM (Erasing Language Memory)

## Overview of the Research

This repository implements **ELM (Erasing Language Memory)**, a method for erasing conceptual knowledge from language models. The key aspects are:

### Neuron-Level Finding
- The method uses **Low-rank adapters (LoRA)** applied to **early model layers (layers 4-7)** to target factual knowledge localization
- These specific layers are identified as containing factual knowledge that can be modified without damaging unrelated knowledge
- The method leverages the model's introspective classification capabilities to reduce generation probabilities for concept-specific content

### Original Models Used
- Zephyr-7B (HuggingFaceH4/zephyr-7b-beta)
- Mistral-7B
- Llama3-8B
- Llama3-8B-Instruct
- Qwen2.5-32B  
- Llama3-70B
- Llama-2-7B Chat (for Harry Potter experiment)

### Original Datasets
- WMDP-Bio (5,000 samples for biosecurity erasure)
- WMDP-Cyber (1,000 samples for cybersecurity erasure)
- Harry Potter texts (3,000 samples)
- Retain datasets from safe concepts

### Method Components
1. **Lerase**: Cross-entropy between ELM model and classifier-modified distribution
2. **Lretain**: Preserve behavior on safe concepts
3. **Lfluency**: Maintain coherent generation for smaller models

In [6]:
# Check for CUDA availability
import torch
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

CUDA available: True
GPU: NVIDIA H100 NVL
GPU Memory: 99.95 GB


## GT1: Generalization to a New Model

**Objective**: Test if the neuron-level finding (early layers 4-7 contain factual knowledge that can be erased via LoRA) generalizes to a model NOT used in the original work.

**Original models used**: Zephyr-7B, Mistral-7B, Llama3-8B, Llama3-8B-Instruct, Qwen2.5-32B, Llama3-70B, Llama-2-7B Chat

**New model to test**: We'll use **Gemma-2B** (google/gemma-2b) which is:
1. Not in the original paper
2. Has a different architecture (different layer structure)
3. Small enough to run efficiently

We will test if training LoRA on early layers of Gemma-2B can erase Harry Potter knowledge using the same ELM methodology.

In [7]:
# First, let's understand what models are available and what the finding actually is
# The key finding: Early layers (4-7) for a 7B model contain factual knowledge
# For smaller models, we need to scale the layer range proportionally

# Let's set up the environment
import sys
sys.path.append('/net/scratch2/smallyan/erasing-llm_eval')

from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import json

device = 'cuda:0'
dtype = torch.bfloat16  # Use bfloat16 for efficiency

In [8]:
# For GT1, we need to test if the ELM method works on a NEW model not used in the original paper
# The key neuron-level finding is: "Early layers (4-7) contain factual knowledge that can be erased via LoRA"

# Original models used: Zephyr-7B, Mistral-7B, Llama3-8B, Llama3-8B-Instruct, Qwen2.5-32B, Llama3-70B, Llama-2-7B Chat
# All are based on Llama or Mistral architecture

# We'll test with Phi-2 (microsoft/phi-2) - a 2.7B model with different architecture
# It was not used in the original paper

print("Testing GT1: Model Generalization")
print("=" * 50)
print("New model to test: microsoft/phi-2")
print("This model was NOT used in the original ELM paper")
print()

# Load the new model
model_id = "microsoft/phi-2"
print(f"Loading model: {model_id}")

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=dtype,
    trust_remote_code=True
)
model = model.to(device)
model.requires_grad_(False)

tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = "left"

print(f"Model loaded successfully!")
print(f"Number of layers: {len(model.model.layers)}")

Testing GT1: Model Generalization
New model to test: microsoft/phi-2
This model was NOT used in the original ELM paper

Loading model: microsoft/phi-2


Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model loaded successfully!
Number of layers: 32


In [9]:
# First, let's test that the model knows about Harry Potter before any modification
def generate_text(model, tokenizer, prompt, max_new_tokens=100):
    inputs = tokenizer(prompt, return_tensors='pt', padding=True)
    inputs = inputs.to(device)
    
    with torch.no_grad():
        outputs = model.generate(
            **inputs, 
            max_new_tokens=max_new_tokens,
            do_sample=True,
            top_p=0.95,
            temperature=0.7,
            pad_token_id=tokenizer.pad_token_id
        )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Test Harry Potter knowledge before ELM
test_prompts_hp = [
    "Harry Potter's best friends are",
    "The headmaster of Hogwarts is",
    "The game played on broomsticks at Hogwarts is called"
]

print("BEFORE ELM - Testing Harry Potter knowledge:")
print("=" * 50)
for prompt in test_prompts_hp:
    response = generate_text(model, tokenizer, prompt)
    print(f"Prompt: {prompt}")
    print(f"Response: {response}")
    print("-" * 50)

BEFORE ELM - Testing Harry Potter knowledge:


Prompt: Harry Potter's best friends are
Response: Harry Potter's best friends are Ron Weasley and Hermione Granger.

--------------------------------------------------


Prompt: The headmaster of Hogwarts is
Response: The headmaster of Hogwarts is Albus Dumbledore.

--------------------------------------------------


Prompt: The game played on broomsticks at Hogwarts is called
Response: The game played on broomsticks at Hogwarts is called Quidditch.

--------------------------------------------------


In [10]:
# Now let's apply the ELM method to Phi-2
# The key finding: early layers contain factual knowledge
# For a 32-layer model like Phi-2, we'll use layers 4-7 (same as 7B models which also have 32 layers)

from peft import LoraConfig, get_peft_model
from torch.optim import AdamW
from torch.nn import KLDivLoss
import torch.nn.functional as F
import random

# ELM Parameters (same as original paper)
lora_layer_start = 4
lora_layer_end = 8
rank = 4
alpha = 16
eta = 500

# Define LoRA configuration targeting early layers
target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "fc1", "fc2"]  # Phi-2 uses fc1/fc2 for MLP

lora_config = LoraConfig(
    r=rank,
    lora_alpha=alpha,
    layers_to_transform=list(range(lora_layer_start, lora_layer_end)),
    target_modules=target_modules,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# Add LoRA to model
model_peft = get_peft_model(model, lora_config)
model_peft.print_trainable_parameters()

ModuleNotFoundError: No module named 'peft'

In [11]:
# Install peft
import subprocess
subprocess.run(['pip', 'install', 'peft', '-q'], check=True)
print("peft installed successfully")

peft installed successfully


In [12]:
# Now let's apply the ELM method to Phi-2
from peft import LoraConfig, get_peft_model
from torch.optim import AdamW
from torch.nn import KLDivLoss
import torch.nn.functional as F
import random

# ELM Parameters (same as original paper)
lora_layer_start = 4
lora_layer_end = 8
rank = 4
alpha = 16
eta = 500

# Define LoRA configuration targeting early layers
# Phi-2 uses different naming conventions
target_modules = ["q_proj", "k_proj", "v_proj", "dense", "fc1", "fc2"]

lora_config = LoraConfig(
    r=rank,
    lora_alpha=alpha,
    layers_to_transform=list(range(lora_layer_start, lora_layer_end)),
    target_modules=target_modules,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# Add LoRA to model
model_peft = get_peft_model(model, lora_config)
model_peft.print_trainable_parameters()

ModuleNotFoundError: No module named 'transformers.modeling_layers'

In [13]:
# There's a version conflict. Let's use a compatible version
import subprocess
subprocess.run(['pip', 'install', 'peft==0.11.1', '-q'], check=True)
print("peft 0.11.1 installed successfully")

peft 0.11.1 installed successfully
