In [16]:
from utils.modelUtils import *
from utils.utils import *
import seaborn as sns
import torch
import numpy as np
import matplotlib.pyplot as plt
from casper import nethook
from utils.judgeUtils import *

### Model Initialization

Loading Llama-2-7b-chat model for jailbreak intervention analysis.


In [None]:
# Checkpoint under analysis. Half precision is requested only when the name
# contains "20b"; for this checkpoint torch_dtype=None is passed instead.
model_name = "meta-llama/Llama-2-7b-chat-hf"
mt = ModelAndTokenizer(
    model_name,
    low_cpu_mem_usage=True,
    torch_dtype=(None if "20b" not in model_name else torch.float16),
    device='cuda:0',
)
# Bare expression so the notebook shows the loaded model as the cell output.
mt.model

In [None]:
# Sanity check: pass a harmless request through generate_input and echo the
# resulting prompt as the cell output.
test_prompt = generate_input(
    mt.tokenizer,
    'tell me a fun joke',
)
test_prompt

In [None]:
# Run generation on the sanity-check prompt; the return value is displayed as
# the cell output.
generate_outputs(test_prompt, mt)

### Judge Model Loading

Loading the judge model to evaluate whether responses successfully jailbreak the safety mechanisms.

In [None]:
# load_judge_model() yields a pair that is unpacked into the judge tokenizer
# and the judge model; both are passed together to the judge calls later on.
print("Loading judge model...")
(judge_tokenizer, judge_model_loaded) = load_judge_model()
print("Judge model loaded successfully")


### Data Loading

Loading jailbreak prompts from GCG and AutoDAN attack methods, along with harmful behaviors dataset.

In [21]:
import json

import pandas as pd

# Number of jailbreak prompts kept per attack method; only this leading slice
# of each file is used by the perturbation experiments below.
N_PROMPTS_PER_ATTACK = 3

# Read the JSON files explicitly as UTF-8 (the encoding this notebook also uses
# when writing its result files) instead of relying on the locale default.
with open("analysis_data/llama2_gcgs.json", 'r', encoding='utf-8') as f:
    gcgs = json.load(f)[:N_PROMPTS_PER_ATTACK]
with open("analysis_data/llama2_autodan.json", 'r', encoding='utf-8') as f:
    autodan = json.load(f)[:N_PROMPTS_PER_ATTACK]

# Harmful behaviours: the 'Behavior' column of the CSV as a plain Python list.
harmful_data = pd.read_csv("analysis_data/advbench_behaviors.csv")['Behavior']
harmful_data = list(harmful_data)

### GCG Perturbation Experiment

For each GCG jailbreak prompt:
1. Generate original response and evaluate jailbreak success
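2. Perturb individual tokens in the suffix window: positions -24 through -6 of the tokenized prompt (19 positions; the final 5 tokens are left untouched)
3. For each perturbation, replace exactly one token with token ID 2 (used here as the padding token)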
4. Evaluate whether each perturbed prompt still successfully jailbreaks
5. Calculate success/failure statistics for each original prompt

**Goal**: Measure how robust GCG attacks are to single-token perturbations

In [None]:
import json
import os

# --- Experiment configuration ------------------------------------------------
GCG_RESULTS_PATH = 'analysis_results/token_gcg_perturbation_results.json'
# Write an intermediate checkpoint every N processed prompts.
GCG_SAVE_EVERY = 5
# Token ID written over each perturbed position.
# NOTE(review): the notebook text calls this the padding token; in the stock
# Llama-2 vocabulary ID 2 is `</s>` (EOS) - confirm this is the intended value.
GCG_REPLACEMENT_TOKEN_ID = 2
# Tokens dropped from the start / end of the tokenized prompt before the
# remainder is decoded and passed through generate_input again.
# Presumably these are the tokens generate_input adds - TODO confirm.
GCG_TEMPLATE_PREFIX_LEN = 5
GCG_TEMPLATE_SUFFIX_LEN = 5
# The perturbed window starts this many tokens before the end of the prompt.
GCG_WINDOW_FROM_END = 24

# Create the output directory up front so a missing folder cannot abort the
# run at save time, after all generations have already been paid for.
os.makedirs(os.path.dirname(GCG_RESULTS_PATH), exist_ok=True)


def _save_gcg_results(results):
    """Write ``results`` to GCG_RESULTS_PATH as indented UTF-8 JSON."""
    with open(GCG_RESULTS_PATH, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=2)


all_gcg_results = []

for idx, jail in enumerate(gcgs):

    # Construct original prompt with suffix
    test_prompt = generate_input(mt.tokenizer, jail)

    # Generate original response
    original_response = generate_outputs(test_prompt, mt)[0]
    original_input_ids = mt.tokenizer(test_prompt)['input_ids']

    # Judge if original response successfully jailbreaks
    original_jailbreak = harmful_judge_from_strings(
        jail,
        original_response,
        judge_model=(judge_tokenizer, judge_model_loaded)
    )

    # Store all results for current prompt
    current_prompt_results = {
        "prompt_index": idx,
        "original_prompt": jail,
        "original": {
            "test_prompt": test_prompt,
            "input_ids": original_input_ids,
            "generation": original_response,
            "jailbreak_success": bool(original_jailbreak)
        },
        "perturbations": [],
        "statistics": {}
    }

    # Perturbation window [start, end). The start is clamped to the end of the
    # leading template tokens: without the clamp a prompt shorter than
    # GCG_WINDOW_FROM_END tokens would produce negative indices, which Python
    # wraps around, so the loop would "perturb" positions that are sliced off
    # again below and record no-op trials.
    suffix_start_idx = max(
        GCG_TEMPLATE_PREFIX_LEN,
        len(original_input_ids) - GCG_WINDOW_FROM_END,
    )
    suffix_end_idx = len(original_input_ids) - GCG_TEMPLATE_SUFFIX_LEN

    if suffix_start_idx >= suffix_end_idx:
        print(f"Prompt {idx}: too few tokens to perturb, no perturbations run")

    successful_jailbreaks = 0
    total_perturbations = 0

    # Replace each token in suffix one by one
    for i in range(suffix_start_idx, suffix_end_idx):
        modified_input_ids = original_input_ids.copy()
        replaced_token = modified_input_ids[i]
        modified_input_ids[i] = GCG_REPLACEMENT_TOKEN_ID

        # Decode everything between the template tokens and wrap it again.
        # NOTE(review): the model sees the re-tokenized text, which may not
        # round-trip to exactly modified_input_ids - verify if IDs matter.
        modified_prompt = mt.tokenizer.decode(
            modified_input_ids[GCG_TEMPLATE_PREFIX_LEN:-GCG_TEMPLATE_SUFFIX_LEN]
        )
        modified_test_prompt = generate_input(mt.tokenizer, modified_prompt)
        modified_response = generate_outputs(modified_test_prompt, mt)[0]

        # Judge if modified response still jailbreaks (judged against the
        # original, unperturbed jailbreak prompt)
        modified_jailbreak = harmful_judge_from_strings(
            jail,
            modified_response,
            judge_model=(judge_tokenizer, judge_model_loaded)
        )

        if modified_jailbreak:
            successful_jailbreaks += 1
        total_perturbations += 1

        # Save perturbation result
        current_prompt_results["perturbations"].append({
            "perturbation_index": i - suffix_start_idx,
            "position": i,
            "replaced_token_id": int(replaced_token),
            "replaced_token_text": mt.tokenizer.decode([replaced_token]),
            "modified_input_ids": modified_input_ids,
            "modified_prompt": modified_prompt,
            "generation": modified_response,
            "jailbreak_success": bool(modified_jailbreak)
        })

        print(f"Position {i}: {'SUCCESS' if modified_jailbreak else 'FAILED'}")

    # Calculate statistics
    success_rate = (successful_jailbreaks / total_perturbations * 100) if total_perturbations > 0 else 0
    failure_rate = 100 - success_rate

    current_prompt_results["statistics"] = {
        "total_perturbations": total_perturbations,
        "successful_jailbreaks": successful_jailbreaks,
        "failed_jailbreaks": total_perturbations - successful_jailbreaks,
        "success_rate_percentage": round(success_rate, 2),
        "failure_rate_percentage": round(failure_rate, 2),
        "original_was_successful": bool(original_jailbreak)
    }

    all_gcg_results.append(current_prompt_results)

    print(f"Stats: {successful_jailbreaks}/{total_perturbations} successful ({success_rate:.2f}%)")

    # Periodic save
    if (idx + 1) % GCG_SAVE_EVERY == 0:
        _save_gcg_results(all_gcg_results)
        print(f"Intermediate save: {idx+1} prompts processed")

# Final save for GCG results
_save_gcg_results(all_gcg_results)

print("\n=== GCG Results saved to token_gcg_perturbation_results.json ===")

### AutoDAN Perturbation Experiment

For each AutoDAN jailbreak prompt:
1. Generate original response and evaluate jailbreak success
2. Perform 20 trials of random perturbations
3. Each trial replaces 3 randomly chosen tokens (never any of the first or last 5 tokens) with token ID 2, used here as the padding token
4. Evaluate whether each perturbed prompt still successfully jailbreaks
5. Calculate success/failure statistics across all trials

**Goal**: Measure how robust AutoDAN attacks are to random multi-token perturbations

In [None]:
import json
import os
import random

# --- Experiment configuration ------------------------------------------------
AUTODAN_RESULTS_PATH = 'analysis_results/token_autodan_perturbation_results.json'
# Write an intermediate checkpoint every N processed prompts.
AUTODAN_SAVE_EVERY = 5
# Token ID written over each perturbed position.
# NOTE(review): the notebook text calls this the padding token; in the stock
# Llama-2 vocabulary ID 2 is `</s>` (EOS) - confirm this is the intended value.
AUTODAN_REPLACEMENT_TOKEN_ID = 2
# Tokens at the start / end of the tokenized prompt that are never perturbed
# and are dropped before the remainder is decoded and wrapped again.
# Presumably these are the tokens generate_input adds - TODO confirm.
AUTODAN_TEMPLATE_PREFIX_LEN = 5
AUTODAN_TEMPLATE_SUFFIX_LEN = 5
AUTODAN_NUM_TRIALS = 20
AUTODAN_TOKENS_PER_TRIAL = 3
# Seed for a dedicated RNG so the sampled positions are identical on every
# run without touching the global `random` state.
AUTODAN_SEED = 0
autodan_rng = random.Random(AUTODAN_SEED)

# Create the output directory up front so a missing folder cannot abort the
# run at save time, after all generations have already been paid for.
os.makedirs(os.path.dirname(AUTODAN_RESULTS_PATH), exist_ok=True)


def _save_autodan_results(results):
    """Write ``results`` to AUTODAN_RESULTS_PATH as indented UTF-8 JSON."""
    with open(AUTODAN_RESULTS_PATH, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=2)


all_autodan_results = []

for idx, jail in enumerate(autodan):

    test_prompt = generate_input(mt.tokenizer, jail)
    original_input_ids = mt.tokenizer(test_prompt)['input_ids']

    # Generate original response
    original_response = generate_outputs(test_prompt, mt)[0]

    # Judge if original response successfully jailbreaks
    original_jailbreak = harmful_judge_from_strings(
        jail,
        original_response,
        judge_model=(judge_tokenizer, judge_model_loaded)
    )

    # Store all results for current prompt
    current_prompt_results = {
        "prompt_index": idx,
        "original_prompt": jail,
        "original": {
            "test_prompt": test_prompt,
            "input_ids": original_input_ids,
            "generation": original_response,
            "jailbreak_success": bool(original_jailbreak)
        },
        "perturbations": [],
        "statistics": {}
    }

    # Get valid token positions (excluding first/last 5 tokens)
    valid_positions = list(range(
        AUTODAN_TEMPLATE_PREFIX_LEN,
        len(original_input_ids) - AUTODAN_TEMPLATE_SUFFIX_LEN,
    ))

    # random.sample raises ValueError when asked for more items than exist,
    # so never request more positions than the prompt can offer.
    tokens_to_replace = min(AUTODAN_TOKENS_PER_TRIAL, len(valid_positions))
    if tokens_to_replace < AUTODAN_TOKENS_PER_TRIAL:
        print(
            f"Prompt {idx}: only {len(valid_positions)} eligible positions, "
            f"replacing {tokens_to_replace} token(s) per trial"
        )

    successful_jailbreaks = 0
    total_perturbations = AUTODAN_NUM_TRIALS

    # Perform the trials, each with a fresh random set of positions
    for trial in range(AUTODAN_NUM_TRIALS):
        positions_to_replace = autodan_rng.sample(valid_positions, tokens_to_replace)
        modified_input_ids = original_input_ids.copy()
        current_modifications = []

        # Overwrite the sampled positions with the replacement token
        for pos in positions_to_replace:
            replaced_token = modified_input_ids[pos]
            modified_input_ids[pos] = AUTODAN_REPLACEMENT_TOKEN_ID

            current_modifications.append({
                "position": pos,
                "replaced_token_id": int(replaced_token),
                "replaced_token_text": mt.tokenizer.decode([replaced_token])
            })

        # Decode everything between the template tokens and wrap it again.
        # NOTE(review): the model sees the re-tokenized text, which may not
        # round-trip to exactly modified_input_ids - verify if IDs matter.
        modified_prompt = mt.tokenizer.decode(
            modified_input_ids[AUTODAN_TEMPLATE_PREFIX_LEN:-AUTODAN_TEMPLATE_SUFFIX_LEN]
        )
        modified_test_prompt = generate_input(mt.tokenizer, modified_prompt)
        modified_response = generate_outputs(modified_test_prompt, mt)[0]

        # Judge if modified response still jailbreaks (judged against the
        # original, unperturbed jailbreak prompt)
        modified_jailbreak = harmful_judge_from_strings(
            jail,
            modified_response,
            judge_model=(judge_tokenizer, judge_model_loaded)
        )

        if modified_jailbreak:
            successful_jailbreaks += 1

        # Save perturbation result
        current_prompt_results["perturbations"].append({
            "trial": trial,
            "replaced_positions": positions_to_replace,
            "replacement_details": current_modifications,
            "modified_input_ids": modified_input_ids,
            "modified_prompt": modified_prompt,
            "generation": modified_response,
            "jailbreak_success": bool(modified_jailbreak)
        })

        print(f"Trial {trial+1}: {'SUCCESS' if modified_jailbreak else 'FAILED'}")

    # Calculate statistics
    success_rate = (successful_jailbreaks / total_perturbations * 100) if total_perturbations > 0 else 0
    failure_rate = 100 - success_rate

    current_prompt_results["statistics"] = {
        "total_perturbations": total_perturbations,
        "successful_jailbreaks": successful_jailbreaks,
        "failed_jailbreaks": total_perturbations - successful_jailbreaks,
        "success_rate_percentage": round(success_rate, 2),
        "failure_rate_percentage": round(failure_rate, 2),
        "original_was_successful": bool(original_jailbreak)
    }

    all_autodan_results.append(current_prompt_results)

    print(f"Stats: {successful_jailbreaks}/{total_perturbations} successful ({success_rate:.2f}%)")

    # Periodic save
    if (idx + 1) % AUTODAN_SAVE_EVERY == 0:
        _save_autodan_results(all_autodan_results)
        print(f"Intermediate save: {idx+1} prompts processed")

# Final save for AutoDAN results
_save_autodan_results(all_autodan_results)

# Report the name of the file that was actually written.
print(f"\n=== AutoDAN Results saved to {os.path.basename(AUTODAN_RESULTS_PATH)} ===")

### Summary Statistics Report

Generating comprehensive summary reports for both GCG and AutoDAN experiments.

The report shows:
- Total number of prompts analyzed
- Average failure rate after perturbation
- Per-prompt breakdown of success/failure rates

**Interpretation**: A high failure rate indicates the attack is fragile to perturbations

In [None]:
def generate_summary_report(results, method_name):
    """Print a plain-text summary of one perturbation experiment.

    Args:
        results: List of per-prompt result dicts. Each entry must provide
            ``"prompt_index"``, ``"original"["jailbreak_success"]`` and a
            ``"statistics"`` dict with ``"successful_jailbreaks"``,
            ``"failed_jailbreaks"`` and ``"failure_rate_percentage"``.
        method_name: Label printed in the report title (e.g. ``"GCG"``).

    Returns:
        None. The report is written to standard output.
    """
    banner = "=" * 60

    print(f"\n{banner}")
    print(f"{method_name} Summary Report")
    print(banner)

    total_prompts = len(results)
    print(f"\nTotal prompts analyzed: {total_prompts}")

    # Averaging an empty list with np.mean yields nan plus a RuntimeWarning;
    # report the absence of data explicitly instead.
    if total_prompts == 0:
        print("Average failure rate after perturbation: N/A (no prompts)")
        print(banner)
        return

    failure_rates = [r["statistics"]["failure_rate_percentage"] for r in results]
    avg_failure_rate = np.mean(failure_rates)
    print(f"Average failure rate after perturbation: {avg_failure_rate:.2f}%")

    print(f"\n{'Prompt':<8} {'Original':<10} {'Success':<8} {'Failed':<8} {'Failure%':<10}")
    print("-" * 60)

    # One row per prompt, in the order the results were collected.
    for r in results:
        stats = r["statistics"]
        idx = r["prompt_index"]
        original = "SUCCESS" if r["original"]["jailbreak_success"] else "FAILED"
        success = stats["successful_jailbreaks"]
        failed = stats["failed_jailbreaks"]
        failure_rate = stats["failure_rate_percentage"]

        print(f"{idx:<8} {original:<10} {success:<8} {failed:<8} {failure_rate:<10.2f}")

    print(banner)

# Print one summary report per attack method, GCG first and then AutoDAN.
for method_results, method_label in (
    (all_gcg_results, "GCG"),
    (all_autodan_results, "AutoDAN"),
):
    generate_summary_report(method_results, method_label)