In [1]:
topics = [
    'wmdp_bio',
    'mmlu_high_school_us_history',
    'mmlu_high_school_geography',
    'mmlu_human_aging'
    ]
topics = ",".join(topics)
topics

'wmdp_bio,mmlu_high_school_us_history,mmlu_high_school_geography,mmlu_human_aging'

In [None]:
import os
import subprocess
from tqdm import tqdm
import torch 
import time

device = torch.device('cuda:1')
# Specify the folder containing CSV files
folder_path = "/home/cs29824/andre/sae_jailbreak_unlearning/models/steer_dfs_final_3"
output_path = "/home/cs29824/andre/sae_jailbreak_unlearning/results/google__gemma-2b_steer_eval_3"
os.makedirs(output_path, exist_ok=True)

# Loop through each file in the folder
for filename in tqdm(os.listdir(folder_path)):
    print(filename)
    # Check if the file is a CSV file
    if filename.endswith(".csv"):
        # Construct the full path to the CSV file
        csv_file_path = os.path.join(folder_path, filename)

        # Construct the command with the specific CSV file
        command = f"""lm_eval \
                    --model steered \
                    --model_args csv_path={csv_file_path} \
                    --tasks {topics} \
                    --batch_size auto \
                    --output_path {output_path} \
                    --device cuda:1 \
"""
        # Run the command and capture output
        # Run the command in the shell
        result = subprocess.run(command, capture_output=True, text=True, shell=True)
        

        # Print the result for each CSV file
        print(f"Results for {filename}:")
        print(f"Stdout: {result.stdout}")
        print(f"Stderr: {result.stderr}")
        print(f"Return code: {result.returncode}")
        print("-" * 50)

        del result
        with torch.cuda.device(device):
            torch.cuda.empty_cache()
            print("GPU memory cleared on cuda:1.")
        time.sleep(5)

  0%|          | 0/64 [00:00<?, ?it/s]

top_30_coef_-2_method_none_steer.csv
Results for top_30_coef_-2_method_none_steer.csv:
Stdout: Loaded pretrained model gemma-2-2b into HookedTransformer
Moving model to device:  cuda:1

Stderr: 2025-01-21:17:51:27,435 INFO     [__main__.py:284] Verbosity set to INFO
2025-01-21:17:51:33,884 INFO     [__main__.py:381] Selected Tasks: ['mmlu_high_school_geography', 'mmlu_high_school_us_history', 'mmlu_human_aging', 'wmdp_bio']
2025-01-21:17:51:33,886 INFO     [evaluator.py:165] Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
2025-01-21:17:51:33,886 INFO     [evaluator.py:202] Initializing steered model, with arguments: {'csv_path': '/home/cs29824/andre/sae_jailbreak_unlearning/models/steer_dfs_final_3/top_30_coef_-2_method_none_steer.csv'}

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]
Loading checkpoint shards:  33%|███▎      | 1/3 [00:00<00:00,  2.99it/s]
Loading checkpoint shards:  67%|██

  2%|▏         | 1/64 [01:16<1:19:56, 76.13s/it]

top_5_coef_-10_method_proportional_steer.csv
Results for top_5_coef_-10_method_proportional_steer.csv:
Stdout: Loaded pretrained model gemma-2-2b into HookedTransformer
Moving model to device:  cuda:1
Passed argument batch_size = auto:1. Detecting largest batch size
Determined largest batch size: 4
steered (csv_path=/home/cs29824/andre/sae_jailbreak_unlearning/models/steer_dfs_final_3/top_5_coef_-10_method_proportional_steer.csv), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: auto (4)
|        Tasks         |Version|Filter|n-shot|Metric|   |Value |   |Stderr|
|----------------------|------:|------|-----:|------|---|-----:|---|-----:|
|high_school_geography |      1|none  |     0|acc   |↑  |0.1768|±  |0.0272|
|high_school_us_history|      1|none  |     0|acc   |↑  |0.2500|±  |0.0304|
|human_aging           |      1|none  |     0|acc   |↑  |0.3184|±  |0.0313|
|wmdp_bio              |      1|none  |     0|acc   |↑  |0.2490|±  |0.0121|


Stderr: 2025-01-21:17:52:43,570 IN

  3%|▎         | 2/64 [05:46<3:16:31, 190.18s/it]

top_20_coef_0_method_none_steer.csv


In [6]:
  # Wait a moment for the cache to clear
time.sleep(5)
with torch.cuda.device(device):
    torch.cuda.empty_cache()
    print("GPU memory cleared on cuda:1.")

GPU memory cleared on cuda:1.


: 

# Visualize

In [None]:
import os
import json
import pandas as pd
import matplotlib.pyplot as plt
import mplcursors  # For interactive hover annotations

# Path to the main folder containing "subfolders" that end with .csv
main_folder = "google__gemma-2b_steer_eval"

data_list = []

# 1. Gather data from subfolders that end with ".csv" and parse JSON files
for item in os.listdir(main_folder):
    subfolder_path = os.path.join(main_folder, item)
    
    # Check if this is a directory and ends with '.csv'
    if os.path.isdir(subfolder_path) and item.endswith(".csv"):
        # Inside this subfolder, look for JSON files
        for fname in os.listdir(subfolder_path):
            if fname.endswith(".json"):
                json_path = os.path.join(subfolder_path, fname)

                # Load the JSON
                with open(json_path, "r") as f:
                    data = json.load(f)

                # Extract the needed values
                try:
                    x_val = data["results"]["wmdp_bio"]["acc,none"]
                    y_val = data["results"]["mmlu_high_school_us_history"]["acc,none"]
                except KeyError:
                    print(f"Skipping {json_path} - required keys not found.")
                    continue

                data_list.append({
                    "subfolder": item,  # e.g., "<something>.csv"
                    "wmdp_bio": x_val,
                    "mmlu_high_school_us_history": y_val
                })

# 2. Convert collected data to a DataFrame
df = pd.DataFrame(data_list)

# 3. Create a Matplotlib scatter plot
fig, ax = plt.subplots(figsize=(7, 5))
scatter = ax.scatter(
    df["wmdp_bio"], 
    df["mmlu_high_school_us_history"], 
    s=50, 
    c="blue"
)

ax.set_xlabel("wmdp_bio")
ax.set_ylabel("mmlu_high_school_us_history")
ax.set_title("WMDP Bio vs. MMLU High School US History (Matplotlib)")

# 4. Add interactive hover using mplcursors
cursor = mplcursors.cursor(scatter, hover=True)

@cursor.connect("add")
def on_add(sel):
    # sel.index gives the index of the point in the scatter
    idx = sel.index
    # Retrieve the subfolder name from our DataFrame
    folder_name = df["subfolder"].iloc[idx]
    # Customize the annotation text
    sel.annotation.set_text(folder_name)

# 5. Show the plot
plt.tight_layout()
plt.show()
