### Generate evals for models

In [1]:
import os
import sys
# Resolve the parent of the current working directory and register it with
# the import system (only once), so packages one level up can be imported.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

In [2]:
from reward_uncertainty.model.simple_preference_model import SimplePreferenceModel
from reward_uncertainty.store.disk_store import DiskBackingStore

ROOT = "/nas/ucb/dfeng"

# Every directory entry under <ROOT>/blobs is treated as a model id.
model_list = os.listdir(os.path.join(ROOT, "blobs"))
simple_pref_models: list[SimplePreferenceModel] = []

for model_id in model_list:
    # Only ids containing "simple_" are loaded by this cell.
    if "simple_" in model_id:
        store = DiskBackingStore.get(model_id)
        assert store is not None

        model = SimplePreferenceModel.from_backing_store(store)
        # Keep only models whose config has no ensemble_id set
        # (presumably standalone models rather than ensemble members).
        if model.config.ensemble_id is None:
            simple_pref_models.append(model)

len(simple_pref_models)

  from .autonotebook import tqdm as notebook_tqdm


16

In [3]:
from reward_uncertainty.model.ensemble.simple_ensemble import SimplePreferenceModelEnsemble

# Collect every stored model whose id contains "ensemble_".
ensemble_pref_models: list[SimplePreferenceModelEnsemble] = []
for model_id in model_list:
    if "ensemble_" in model_id:
        store = DiskBackingStore.get(model_id)
        assert store is not None

        model = SimplePreferenceModelEnsemble.from_backing_store(store)
        ensemble_pref_models.append(model)

len(ensemble_pref_models)

44

In [4]:
from experiments.extras.slurm import JobConfig, RemoteExperiment, run_remote_experiment
from reward_uncertainty.eval.reward_eval import RewardEval


# Datasets every reward model in this notebook should have a RewardEval for.
_DEFAULT_EVAL_DATASETS = (
    "hh_rlhf_helpful",
    "hh_rlhf_jailbroken",
    "strongreject_jailbroken",
    "safe_rlhf",
)


def _submit_missing_evals(model, experiment, model_id_arg: str, dry_run: bool) -> None:
    """Call ``run_remote_experiment`` once per default dataset ``model`` lacks.

    Args:
        model: Object exposing ``id`` and ``evals``; entries of ``evals`` that
            are ``RewardEval`` instances count as already run for their
            ``config.dataset_name``.
        experiment: ``RemoteExperiment`` member passed through unchanged.
        model_id_arg: Name of the job argument that carries ``model.id``.
        dry_run: Forwarded to ``run_remote_experiment``.
    """
    completed = {
        existing.config.dataset_name
        for existing in model.evals
        if isinstance(existing, RewardEval)
    }
    evals = set(_DEFAULT_EVAL_DATASETS) - completed
    print(f"Model:{model.id}, Evals:{evals}")

    # Iterate in sorted order: plain set iteration depends on string hashing,
    # which would make the submission order differ between kernel sessions.
    for eval_name in sorted(evals):
        run_remote_experiment(
            experiment,
            JobConfig(args={
                model_id_arg: model.id,
                "dataset": eval_name,
                "batch_size": "16",
                "max_length": "512",
            }),
            dry_run=dry_run,
        )


# The model annotations below are quoted so that defining these functions does
# not require the model classes to be imported at definition time.
def run_evals(model: "SimplePreferenceModel", dry_run: bool = False):
    """Request the reward evals that a single preference model is missing.

    Prints the model id with the set of missing datasets, then calls
    ``run_remote_experiment`` with ``RemoteExperiment.EVALUATE_REWARD_MODEL``
    for each of them.

    Args:
        model: Model whose existing ``RewardEval`` entries are inspected.
        dry_run: Forwarded to ``run_remote_experiment``.
    """
    _submit_missing_evals(
        model,
        RemoteExperiment.EVALUATE_REWARD_MODEL,
        "reward_model_id",
        dry_run,
    )


def run_ensemble_evals(model: "SimplePreferenceModelEnsemble", dry_run: bool = False):
    """Request the reward evals that an ensemble model is missing.

    Same procedure as ``run_evals`` but uses
    ``RemoteExperiment.EVALUATE_ENSEMBLE_REWARD_MODEL`` and passes the id as
    ``ensemble_model_id``.

    Args:
        model: Ensemble whose existing ``RewardEval`` entries are inspected.
        dry_run: Forwarded to ``run_remote_experiment``.
    """
    _submit_missing_evals(
        model,
        RemoteExperiment.EVALUATE_ENSEMBLE_REWARD_MODEL,
        "ensemble_model_id",
        dry_run,
    )

In [5]:
# Dry run over every standalone model: run_evals prints the evals each model
# is still missing and forwards dry_run=True to run_remote_experiment.
for model in simple_pref_models:
    run_evals(model=model, dry_run=True)

Model:20250331-112939_simple__adcc6b21-b1a9-4e64-9465-803ab4af0739, Evals:set()
Model:20250331-112924_simple__ff744888-ce90-4552-b3fa-e9e434613398, Evals:set()
Model:20250331-112919_simple__86e11b19-99f5-4034-a6b3-9316e0ad821f, Evals:set()
Model:20250331-112940_simple__893d99b2-4b25-437a-9c1f-d5783feae27d, Evals:set()
Model:20250331-112918_simple__bd18d6f0-7d44-4cbf-a8e9-888adc39f019, Evals:set()
Model:20250331-112927_simple__63156b72-d3b4-491a-831c-d745d3bc8dd5, Evals:set()
Model:20250331-112922_simple__bfadce2a-2cce-4c52-a1c7-e587a19083e3, Evals:set()
Model:20250331-112929_simple__360d8bd8-224a-47ba-a21b-a80c5ad1dfe7, Evals:set()
Model:20250331-112925_simple__91697818-d303-45ad-a613-e5acb742539b, Evals:set()
Model:20250331-112932_simple__ed9e434d-7988-4ddc-be40-fff5d5575e0a, Evals:set()
Model:20250331-112937_simple__9f493caa-65e6-4964-bcc2-35adb121b6f2, Evals:set()
Model:20250331-112928_simple__81d9827a-60aa-418e-b205-0af0953a7949, Evals:set()
Model:20250331-112921_simple__35c69356-4

In [6]:
# Dry run over every ensemble: run_ensemble_evals prints the evals each one is
# still missing and forwards dry_run=True to run_remote_experiment.
for model in ensemble_pref_models:
    run_ensemble_evals(model=model, dry_run=True)

Model:20250331-133452_ensemble__99235489-4541-4fcd-bbd2-84badd744bb8, Evals:set()
Model:20250331-132441_ensemble__7052ee53-c29c-45f8-86b8-074142ff7308, Evals:set()
Model:20250331-133253_ensemble__7fcf04e7-4991-4438-b449-add1a3b86839, Evals:set()
Model:20250331-133446_ensemble__2c889e44-d1c4-4f01-8b4f-aa3324d1c286, Evals:set()
Model:20250331-133321_ensemble__9052df98-f238-47c4-bbb4-9ca94ba2089e, Evals:set()
Model:20250331-133504_ensemble__e2993768-1d20-41a7-a586-d9dd7df0e9d8, Evals:set()
Model:20250331-133400_ensemble__2b7b6994-eb2f-46c3-9fb9-246b1da1c3a8, Evals:set()
Model:20250331-133249_ensemble__f7a157e2-279d-4382-9e85-b3f1819b2aa4, Evals:set()
Model:20250331-133403_ensemble__8146b904-06d8-471d-b0af-cafb9e09313f, Evals:set()
Model:20250331-133533_ensemble__9acd9a00-1454-40a2-9913-a5b012e20f9f, Evals:set()
Model:20250331-133405_ensemble__52d168e6-b751-4fec-a90a-ae77216e1666, Evals:set()
Model:20250331-133405_ensemble__14cddb02-1dee-4ed5-ab2b-57582c181298, Evals:set()
Model:20250331-1

### Get scores for models

In [29]:
import numpy as np
import pandas as pd
from reward_uncertainty.scoring.brier_score import BrierScore
from reward_uncertainty.scoring.rmsce_score import RMSCEScore
from reward_uncertainty.scoring.auroc_score import AUROCScore


def softmax(x, axis=-1):
    """Numerically stable softmax of ``x`` along ``axis``.

    Args:
        x: Array-like of logits.
        axis: Axis whose entries are normalised to sum to 1. Defaults to the
            last axis, matching the previous hard-coded behaviour.

    Returns:
        ``np.ndarray`` with the same shape as ``x``.
    """
    logits = np.asarray(x)
    # Shifting by the per-slice maximum does not change the result but keeps
    # np.exp from overflowing on large logits.
    e_x = np.exp(logits - np.max(logits, axis=axis, keepdims=True))
    return e_x / np.sum(e_x, axis=axis, keepdims=True)


# Build one row per (standalone model, reward eval) with its scores.
data = []
for model in simple_pref_models:
    evals = [e for e in model.evals if isinstance(e, RewardEval)]
    # Named `reward_eval`, not `eval`: a module-level loop variable called
    # `eval` would replace the builtin for every later cell in this kernel.
    for reward_eval in evals:
        results = reward_eval.results
        assert results is not None

        results_ds = results.reward_outputs
        # Column 0 holds the chosen-response output, column 1 the rejected one.
        reward_logits = np.array([results_ds["reward_output_chosen"], results_ds["reward_output_rejected"]]).transpose()
        probs = softmax(reward_logits)

        # Chosen sits in column 0, so the label passed for every row is 0
        # (presumably interpreted by the scorers as the true class index).
        labels = np.zeros(len(probs), dtype=int)

        brier_score = BrierScore().batch(probs, labels)
        auroc_score = AUROCScore().batch(probs, labels)
        rmsce_score = RMSCEScore(num_bins=50).batch(probs, labels)
        data.append({
            "model_id": model.id,
            "model_name": model.config.model_name,
            "train_dataset": model.config.train_dataset_name,
            "learning_rate": model.config.learning_rate,
            "num_train_epochs": model.config.num_train_epochs,
            "lora_alpha": model.config.lora_alpha,
            "lora_dropout": model.config.lora_dropout,
            "lora_rank": model.config.lora_rank,
            "train_batch_size": model.config.per_device_train_batch_size,
            "eval_id": reward_eval.id,
            "eval_type": reward_eval.config.dataset_name,
            "brier_score": brier_score,
            "auroc_score": auroc_score,
            "rmsce_score": rmsce_score
        })

simple_pref_models_df = pd.DataFrame(data)
simple_pref_models_df.head()

Unnamed: 0,model_id,model_name,train_dataset,learning_rate,num_train_epochs,lora_alpha,lora_dropout,lora_rank,train_batch_size,eval_id,eval_type,brier_score,auroc_score,rmsce_score
0,20250331-112939_simple__adcc6b21-b1a9-4e64-946...,google/gemma-2b,hh_rlhf_helpful,2e-05,2,128,0.1,64,16,20250402-065916_reward_eval__d0baeb23-f632-4d5...,hh_rlhf_helpful,0.355449,0.810687,0.036848
1,20250331-112939_simple__adcc6b21-b1a9-4e64-946...,google/gemma-2b,hh_rlhf_helpful,2e-05,2,128,0.1,64,16,20250405-095646_reward_eval__6a453fea-da84-468...,safe_rlhf,0.742456,0.278031,0.401182
2,20250331-112939_simple__adcc6b21-b1a9-4e64-946...,google/gemma-2b,hh_rlhf_helpful,2e-05,2,128,0.1,64,16,20250410-204341_reward_eval__84f1f1b6-a474-432...,strongreject_jailbroken,1.806448,0.000517,0.950373
3,20250331-112939_simple__adcc6b21-b1a9-4e64-946...,google/gemma-2b,hh_rlhf_helpful,2e-05,2,128,0.1,64,16,20250410-204341_reward_eval__5c805013-56ab-4db...,hh_rlhf_jailbroken,1.365752,0.042356,0.793314
4,20250331-112924_simple__ff744888-ce90-4552-b3f...,google/gemma-2b,hh_rlhf_helpful,1e-05,1,128,0.1,64,16,20250402-070458_reward_eval__fa1aaf3c-b62e-430...,hh_rlhf_helpful,0.382709,0.779894,0.038305


In [28]:
# Drop every row that was scored on the hh_rlhf_helpful eval.
remove_simple_helpful = simple_pref_models_df.loc[
    simple_pref_models_df["eval_type"].ne("hh_rlhf_helpful")
]

# Split the remaining rows by the dataset the model was trained on.
remove_simple_helpful_trained_both = remove_simple_helpful.loc[
    remove_simple_helpful["train_dataset"].eq("hh_rlhf_both")
]
remove_simple_helpful_trained_helpful = remove_simple_helpful.loc[
    remove_simple_helpful["train_dataset"].eq("hh_rlhf_helpful")
]

# Write each split to a CSV in the working directory, without the index column.
remove_simple_helpful_trained_both.to_csv("remove_simple_helpful_trained_both.csv", index=False)
remove_simple_helpful_trained_helpful.to_csv("remove_simple_helpful_trained_helpful.csv", index=False)

In [42]:
# Build one row per (ensemble, reward eval) with its scores.
data = []
for ensemble_model in ensemble_pref_models:
    evals = [e for e in ensemble_model.evals if isinstance(e, RewardEval)]
    if not evals:
        # Nothing to score; skip before touching `members`.
        continue

    num_members = len(ensemble_model.config.member_ids)
    members = ensemble_model.members

    # Named `reward_eval`, not `eval`: a module-level loop variable called
    # `eval` would replace the builtin for every later cell in this kernel.
    for reward_eval in evals:
        results = reward_eval.results
        assert results is not None

        results_ds = results.reward_outputs
        # One row per member. Assumes each column is a 1-D sequence with one
        # scalar output per example -- TODO confirm against RewardEval results.
        chosen_logits = np.array([results_ds[f"reward_output_chosen_model_{i}"] for i in range(num_members)])
        rejected_logits = np.array([results_ds[f"reward_output_rejected_model_{i}"] for i in range(num_members)])

        # Pair chosen/rejected on a new last axis and apply softmax per member,
        # giving (num_members, n_examples, 2) under the assumption above.
        member_probs = softmax(np.stack([chosen_logits, rejected_logits], axis=-1))
        # NOTE(review): the ensemble prediction is taken to be the mean of the
        # member probabilities; confirm this is the intended combination rule
        # (as opposed to e.g. averaging logits before the softmax).
        probs = member_probs.mean(axis=0)

        # Chosen sits in column 0, so the label passed for every row is 0
        # (presumably interpreted by the scorers as the true class index).
        labels = np.zeros(len(probs), dtype=int)

        brier_score = BrierScore().batch(probs, labels)
        auroc_score = AUROCScore().batch(probs, labels)
        rmsce_score = RMSCEScore(num_bins=50).batch(probs, labels)

        data.append({
            "model_id": ensemble_model.id,
            "model_name": members[0].config.model_name,
            "num_members": len(members),
            "train_dataset": members[0].config.train_dataset_name,
            # Per-member hyperparameters are flattened to comma-separated strings.
            "learning_rate": ", ".join([str(m.config.learning_rate) for m in members]),
            "num_train_epochs": ", ".join([str(m.config.num_train_epochs) for m in members]),
            "lora_alpha": ", ".join([str(m.config.lora_alpha) for m in members]),
            "lora_dropout": ", ".join([str(m.config.lora_dropout) for m in members]),
            "lora_rank": ", ".join([str(m.config.lora_rank) for m in members]),
            "train_batch_size": ", ".join([str(m.config.per_device_train_batch_size) for m in members]),
            "eval_id": reward_eval.id,
            "eval_type": reward_eval.config.dataset_name,
            "brier_score": brier_score,
            "auroc_score": auroc_score,
            "rmsce_score": rmsce_score
        })

ensemble_pref_models_df = pd.DataFrame(data)
ensemble_pref_models_df.head()

AttributeError: 'Config' object has no attribute 'num_members'

In [35]:
# Partition the ensemble results by ensemble size (5 members vs 2 members).
ensemble_five_members = ensemble_pref_models_df.loc[ensemble_pref_models_df["num_members"].eq(5)]
ensemble_two_members = ensemble_pref_models_df.loc[ensemble_pref_models_df["num_members"].eq(2)]

# Five-member ensembles: drop the hh_rlhf_helpful eval rows, then split by
# training dataset and write each split to CSV without the index column.
ensemble_five_members_remove_helpful = ensemble_five_members.loc[
    ensemble_five_members["eval_type"].ne("hh_rlhf_helpful")
]
five_trained_both = ensemble_five_members_remove_helpful.loc[
    ensemble_five_members_remove_helpful["train_dataset"].eq("hh_rlhf_both")
]
five_trained_helpful = ensemble_five_members_remove_helpful.loc[
    ensemble_five_members_remove_helpful["train_dataset"].eq("hh_rlhf_helpful")
]

five_trained_both.to_csv("five_trained_both.csv", index=False)
five_trained_helpful.to_csv("five_trained_helpful.csv", index=False)

# Two-member ensembles: same filtering and export.
ensemble_two_members_remove_helpful = ensemble_two_members.loc[
    ensemble_two_members["eval_type"].ne("hh_rlhf_helpful")
]
two_trained_both = ensemble_two_members_remove_helpful.loc[
    ensemble_two_members_remove_helpful["train_dataset"].eq("hh_rlhf_both")
]
two_trained_helpful = ensemble_two_members_remove_helpful.loc[
    ensemble_two_members_remove_helpful["train_dataset"].eq("hh_rlhf_helpful")
]

two_trained_both.to_csv("two_trained_both.csv", index=False)
two_trained_helpful.to_csv("two_trained_helpful.csv", index=False)

### Delete evals (if necessary)

In [7]:
# Dataset names whose RewardEval entries are candidates for removal.
rm_eval_names = [
    "hh_rlhf_jailbroken",
    "strongreject_jailbroken",
]

for model in simple_pref_models:
    for candidate in model.evals:
        if not isinstance(candidate, RewardEval):
            continue
        if candidate.config.dataset_name not in rm_eval_names:
            continue
        print("Removing eval", candidate.id)
        # The removal call is left commented out, so this cell only lists ids.
        # model.remove_eval(candidate.id)

In [None]:
# Same listing for the ensembles, using rm_eval_names from the previous cell.
for model in ensemble_pref_models:
    for candidate in model.evals:
        if not isinstance(candidate, RewardEval):
            continue
        if candidate.config.dataset_name not in rm_eval_names:
            continue
        print("Removing eval", candidate.id)
        # The removal call is left commented out, so this cell only lists ids.
        # model.remove_eval(candidate.id)

Removing eval 20250402-071608_reward_eval__bd3933c6-28ce-4b36-bd61-3030ba777a11
Removing eval 20250402-071608_reward_eval__76ff3a38-e0b5-4abb-9000-bcf08bacb590
Removing eval 20250402-072250_reward_eval__d95b2616-d14a-45f5-ac8e-069ebb311144
Removing eval 20250402-072250_reward_eval__b06ddccf-62f1-4138-9c99-fd8015964800
Removing eval 20250402-072250_reward_eval__5130f5e5-a049-4bf8-8955-43489d0e2b05
Removing eval 20250402-072250_reward_eval__d43524ca-6abf-4a04-8518-dc2c87150037
Removing eval 20250402-072249_reward_eval__107ca122-2fb7-4dc3-85c5-1407f0cb87ce
Removing eval 20250402-072338_reward_eval__04956e9d-9957-4268-9200-e09377e82756
Removing eval 20250402-072338_reward_eval__09f1aa0d-2460-4f38-ae51-f2430c0639b3
Removing eval 20250402-072341_reward_eval__939656b9-ebf4-4cdc-9bf8-a251c7a0a467
Removing eval 20250402-072456_reward_eval__4d734671-80c7-4dc2-8f6a-7b308937f54e
Removing eval 20250402-072459_reward_eval__1589d3f7-68e8-44d6-8423-dce7e11dbb98
Removing eval 20250402-072524_reward_eva