# 01: Generate Jailbreak Prompts with nanoGCG

This notebook uses gradient-based optimization (GCG) to find adversarial suffixes
that bypass Llama 3.1 8B safety filters.

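**Output**: CSV file with adversarial prompts (plus a `.pt` copy for downstream notebooks), saved to Google Drive when running on Colab or to `outputs/jailbreaks/` under the project root when running locally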

**Time estimate**: ~5-30 min per behavior depending on GPU

---
## Configuration

In [None]:
# ========== CONFIGURE HERE ==========

# --- Attack target ---
# Identifier of the model to attack
MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct"

# How many jailbreak prompts to generate
N_PROMPTS = 20

# --- GCG hyperparameters ---
GCG_SEED = 42
GCG_NUM_STEPS = 500  # more steps = better attacks, but slower
GCG_SEARCH_WIDTH = 512  # candidates per step
GCG_TOPK = 256  # top-k token replacements to consider

# --- Output ---
# File name of the results CSV (placed inside the output directory)
OUTPUT_FILENAME = "jailbreak_prompts.csv"

# =====================================

---
## Setup & Installation

In [None]:
import os
import sys

IN_COLAB = 'google.colab' in sys.modules

if IN_COLAB:
    # Mount Google Drive
    from google.colab import drive
    drive.mount('/content/drive')
    
    # Set output directory on Google Drive
    OUTPUT_DIR = '/content/drive/MyDrive/JB_mech_outputs'
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    
    # Install dependencies
    !pip install -q torch transformers accelerate
    !pip install -q nanogcg datasets pandas tqdm
    
    print(f"Output will be saved to: {OUTPUT_DIR}")
else:
    # Local environment
    PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath('__file__')))
    if not os.path.exists(os.path.join(PROJECT_ROOT, 'notebooks')):
        PROJECT_ROOT = os.getcwd()
        while not os.path.exists(os.path.join(PROJECT_ROOT, 'notebooks')) and PROJECT_ROOT != '/':
            PROJECT_ROOT = os.path.dirname(PROJECT_ROOT)
    
    OUTPUT_DIR = os.path.join(PROJECT_ROOT, 'outputs', 'jailbreaks')
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    print(f"Output will be saved to: {OUTPUT_DIR}")

In [None]:
import torch
import pandas as pd
from tqdm import tqdm

# Report the PyTorch build and, when CUDA is usable, the name and memory of device 0.
print(f"PyTorch version: {torch.__version__}")
cuda_ok = torch.cuda.is_available()
print(f"CUDA available: {cuda_ok}")
if cuda_ok:
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"VRAM: {gpu_props.total_memory / 1e9:.1f} GB")

---
## Load JailbreakBench Behaviors

In [None]:
from datasets import load_dataset

print("Loading JailbreakBench behaviors...")
jbb = load_dataset("JailbreakBench/JBB-Behaviors", "behaviors")

# NOTE(review): the dataset card for the "behaviors" config appears to publish
# "harmful" and "benign" splits rather than "train" - confirm against the card.
# Prefer "harmful", but keep "train" as a fallback so either layout works.
split_name = next((name for name in ("harmful", "train") if name in jbb), None)
if split_name is None:
    raise KeyError(
        f"No 'harmful' or 'train' split in JBB-Behaviors; available splits: {list(jbb.keys())}"
    )

# Keep only the fields used later; `id` is the row's position within the split.
behaviors = [
    {"id": i, "goal": b["Goal"], "target": b["Target"]}
    for i, b in enumerate(jbb[split_name])
]

print(f"Loaded {len(behaviors)} harmful behaviors")
print(f"Will generate jailbreaks for {min(N_PROMPTS, len(behaviors))} behaviors")

# Show examples
print("\nExample behaviors:")
for b in behaviors[:3]:
    print(f"  - {b['goal'][:80]}...")

---
## Load Model for GCG

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

print(f"Loading model: {MODEL_NAME}")
print("This may take a few minutes...")

# Load options: bfloat16 weights, with device placement left to the library.
load_kwargs = {"torch_dtype": torch.bfloat16, "device_map": "auto"}
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, **load_kwargs)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

print("Model loaded!")
print(f"Model device: {model.device}")

---
## Generate Jailbreaks with nanoGCG

In [None]:
import nanogcg

# Configure GCG
# One mapping feeds both the config object and the printed report,
# so the two cannot drift apart.
gcg_params = {
    "num_steps": GCG_NUM_STEPS,
    "search_width": GCG_SEARCH_WIDTH,
    "topk": GCG_TOPK,
    "seed": GCG_SEED,
}
config = nanogcg.GCGConfig(**gcg_params)

print("GCG Config:")
for param_name, param_value in gcg_params.items():
    print(f"  {param_name}: {param_value}")

In [None]:
# Generate jailbreaks
# Slice once so the header, the progress bar and the [i/total] counters all
# report the number of behaviors actually processed (may be fewer than N_PROMPTS).
selected_behaviors = behaviors[:N_PROMPTS]
n_total = len(selected_behaviors)

print(f"\nGenerating {n_total} jailbreaks...")
print("=" * 60)

jailbreaks = []  # one record per behavior that completed
failed = []      # one record per behavior that raised

for i, behavior in enumerate(tqdm(selected_behaviors, desc="Generating jailbreaks"), start=1):
    try:
        result = nanogcg.run(
            model,
            tokenizer,
            behavior["goal"],
            behavior["target"],
            config
        )

        jailbreaks.append({
            "id": behavior["id"],
            "goal": behavior["goal"],
            "target": behavior["target"],
            "suffix": result.best_string,
            "loss": float(result.best_loss),
            "jailbreak_prompt": f"{behavior['goal']} {result.best_string}"
        })
        # tqdm.write prints without corrupting the active progress bar.
        tqdm.write(f"  [{i}/{n_total}] Success! Loss: {result.best_loss:.4f}")

    except Exception as e:
        # Deliberate best-effort: one failing behavior must not abort the whole
        # run, so the error is recorded in `failed` and the loop continues.
        tqdm.write(f"  [{i}/{n_total}] Failed: {e}")
        failed.append({"id": behavior["id"], "goal": behavior["goal"], "error": str(e)})

print("\n" + "=" * 60)
print(f"Completed: {len(jailbreaks)} successful, {len(failed)} failed")

---
## Save Results

In [None]:
# Create DataFrame
# Passing the columns explicitly keeps the schema stable even when `jailbreaks`
# is empty (every behavior failed); otherwise the frame would have no columns
# and the `loss` lookups below would raise KeyError.
JAILBREAK_COLUMNS = ["id", "goal", "target", "suffix", "loss", "jailbreak_prompt"]
df = pd.DataFrame(jailbreaks, columns=JAILBREAK_COLUMNS)

# Display summary
print("Generated Jailbreaks Summary:")
print(f"  Total: {len(df)}")
if df.empty:
    print("  No successful jailbreaks - loss statistics unavailable.")
else:
    print(f"  Average loss: {df['loss'].mean():.4f}")
    print(f"  Min loss: {df['loss'].min():.4f}")
    print(f"  Max loss: {df['loss'].max():.4f}")

    # Show first few
    print("\nFirst 3 jailbreaks:")
    for _, row in df.head(3).iterrows():
        print(f"\n  Goal: {row['goal'][:60]}...")
        print(f"  Suffix: {row['suffix'][:60]}...")
        print(f"  Loss: {row['loss']:.4f}")

In [None]:
# Save to CSV
output_path = os.path.join(OUTPUT_DIR, OUTPUT_FILENAME)
df.to_csv(output_path, index=False)

print(f"Saved {len(df)} jailbreaks to: {output_path}")

# Also save as .pt for downstream notebooks
# Swap only the final extension: a substring replace would leave the name
# unchanged when it lacks ".csv" (so torch.save would overwrite the CSV) and
# would also rewrite a ".csv" occurring in the middle of the name.
pt_stem, _ = os.path.splitext(OUTPUT_FILENAME)
pt_path = os.path.join(OUTPUT_DIR, pt_stem + '.pt')
torch.save(jailbreaks, pt_path)
print(f"Saved torch format to: {pt_path}")

In [None]:
# Display full DataFrame (one row per entry in `jailbreaks`).
# Left as the cell's last expression so the notebook renders it as a rich table.
df

---
## Cleanup

In [None]:
import gc

# Free GPU memory
del model
del tokenizer
# Collect first: objects kept alive only by reference cycles are not released by
# `del`, and empty_cache() can only hand back memory that is no longer in use.
gc.collect()
torch.cuda.empty_cache()

print("Done! Model cleaned up.")
print(f"\nOutput saved to: {output_path}")