In [1]:
import os
import json
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
from together import Together
from transformers import AutoTokenizer

from utils import *
from huggingface_hub import login as hf_login
from peft import prepare_model_for_kbit_training
from datasets import concatenate_datasets, DatasetDict, load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, AutoModel

  warn("The installed version of bitsandbytes was compiled without GPU support. "


'NoneType' object has no attribute 'cadam32bit_grad_fp32'


In [2]:
# Index into PROMPT_TEMPLATE selecting the (prefix, suffix) pair that
# add_model_probs wraps around the entity list when building each prompt.
# NOTE(review): PROMPT_TEMPLATE is not defined in this notebook; presumably it
# comes from `from utils import *` -- confirm.
PROMPT_TO_USE = 0
def _append_endpoint_probs(value, y_star, y_non_star, prompt, client, endpoints, probs_key, progress_desc):
    """Score one (y_star, y_non_star) pair on every endpoint and record the results.

    For each endpoint, one value is appended to
    ``value['y_stars'][y_star][probs_key]`` and one to
    ``value['y_NON_stars'][y_non_star][probs_key]``; either list is created
    first if it does not exist yet. ``value`` is mutated in place.

    Args:
        value: Result entry holding 'y_stars' and 'y_NON_stars' dicts keyed by
            candidate string.
        y_star: Key into ``value['y_stars']`` and the first candidate scored.
        y_non_star: Key into ``value['y_NON_stars']`` and the second candidate scored.
        prompt: Prompt text passed to the scorer for both candidates.
        client: API client handed to ``GenerateNextTokenProbAPI``.
        endpoints: Iterable of model endpoints to query.
        probs_key: Name of the per-candidate list that receives the values
            ('world_models' or 'shadow_models').
        progress_desc: Label for the tqdm progress bar.

    Returns:
        None.
    """
    star_probs = value['y_stars'][y_star].setdefault(probs_key, [])
    non_star_probs = value['y_NON_stars'][y_non_star].setdefault(probs_key, [])

    for endpoint in tqdm(endpoints, desc=progress_desc, leave=False):
        # NOTE(review): a new generator is constructed per endpoint on every
        # call; the repeated tokenizer warnings in the run log suggest this
        # reloads a tokenizer each time -- consider caching one per endpoint.
        prob_gen = GenerateNextTokenProbAPI(client, endpoint)
        # Prompt length in tokens plus 10 -- presumably headroom for the
        # candidate's own tokens; TODO confirm against compute_token_probs_api.
        max_tokens = len(prob_gen.tokenizer(prompt)['input_ids']) + 10
        # Both candidates are scored before anything is appended, so a failure
        # on either call leaves the two lists the same length.
        star_prob = compute_token_probs_api(y_star, prompt, prob_gen, max_tokens)
        non_star_prob = compute_token_probs_api(y_non_star, prompt, prob_gen, max_tokens)
        star_probs.append(float(star_prob))
        non_star_probs.append(float(non_star_prob))


def add_world_model_probs(value, y_star, y_non_star, prompt, client, world_model_endpoints):
    """Append one 'world_models' value per endpoint for the given candidate pair.

    Mutates ``value`` in place: extends ``value['y_stars'][y_star]['world_models']``
    and ``value['y_NON_stars'][y_non_star]['world_models']``. Returns None.
    """
    _append_endpoint_probs(
        value, y_star, y_non_star, prompt, client, world_model_endpoints,
        probs_key='world_models', progress_desc="world model endpoints",
    )


def add_shadow_model_probs(value, y_star, y_non_star, prompt, client, shadow_model_endpoints):
    """Append one 'shadow_models' value per endpoint for the given candidate pair.

    Mutates ``value`` in place: extends ``value['y_stars'][y_star]['shadow_models']``
    and ``value['y_NON_stars'][y_non_star]['shadow_models']``. Returns None.
    """
    _append_endpoint_probs(
        value, y_star, y_non_star, prompt, client, shadow_model_endpoints,
        probs_key='shadow_models', progress_desc="shadow model endpoints",
    )

def add_model_probs(results, train_test_ents, client, world_model_endpoints, shadow_model_endpoints, model_type='world'):
    """Append world- or shadow-model values to every scorable pair in ``results``.

    Each key of ``results`` is split on '_' into a split name and an integer
    sample ID (e.g. 'train_12'). The matching sample's 'disease_ents' list is
    used to build a prompt from all entities except the candidate ``y_star``;
    the pair is then handed to ``add_world_model_probs`` or
    ``add_shadow_model_probs``, which mutate ``results`` in place.

    Args:
        results: Dict keyed by '<split>_<new_ID>'; each value holds 'y_stars'
            and 'y_NON_stars' dicts keyed by candidate string.
        train_test_ents: Dict mapping split name to an iterable of samples,
            each providing 'new_ID' and 'disease_ents'.
        client: API client forwarded unchanged to the per-pair helper.
        world_model_endpoints: Endpoints used when ``model_type == 'world'``.
        shadow_model_endpoints: Endpoints used for any other ``model_type``.
        model_type: 'world' selects world models; every other value selects
            shadow models.

    Returns:
        None.

    Raises:
        KeyError: If a key's split name is missing from ``train_test_ents``.
        ValueError: If the ID part of a key is not an integer.
    """
    # The template and the world/shadow choice do not change during a call.
    prompt_start = PROMPT_TEMPLATE[PROMPT_TO_USE][0]
    prompt_end = PROMPT_TEMPLATE[PROMPT_TO_USE][1]
    if model_type == 'world':
        add_pair_probs, endpoints = add_world_model_probs, world_model_endpoints
    else:
        add_pair_probs, endpoints = add_shadow_model_probs, shadow_model_endpoints

    # Per-split {new_ID: sample} tables, built on first use so each split is
    # scanned once instead of once per result key.
    samples_by_split = {}

    def find_ent_list(dataset_type, sample_id):
        if dataset_type not in samples_by_split:
            by_id = {}
            for sample in train_test_ents[dataset_type]:
                # setdefault keeps the first sample for a repeated ID, which is
                # what a front-to-back linear search would return.
                by_id.setdefault(sample['new_ID'], sample)
            samples_by_split[dataset_type] = by_id
        return samples_by_split[dataset_type].get(sample_id)

    for key, value in tqdm(results.items(), desc="processing results"):
        split_key = key.split('_')
        dataset_type = split_key[0]
        sample_id = int(split_key[1])
        ent_list = find_ent_list(dataset_type, sample_id)
        if ent_list is None:
            # No sample with this ID in the given split; nothing to score.
            continue

        ents = ent_list['disease_ents']
        # Candidates are paired positionally, in dict insertion order; zip stops
        # at the shorter side, so the bar total uses the same length.
        y_stars_order = list(value['y_stars'].keys())
        y_non_stars_order = list(value['y_NON_stars'].keys())
        pair_count = min(len(y_stars_order), len(y_non_stars_order))

        # leave=False: a persistent inner bar per result floods the cell output.
        for y_star, y_non_star in tqdm(zip(y_stars_order, y_non_stars_order), total=pair_count, desc="processing pairs", leave=False):
            if y_star not in ents:
                continue
            # Drop only the first occurrence of the candidate from the prompt.
            star_index = ents.index(y_star)
            remaining_ents = ents[:star_index] + ents[star_index + 1:]
            ents_string = ', '.join(remaining_ents)
            prompt = f"{prompt_start} {ents_string} {prompt_end}"

            add_pair_probs(value, y_star, y_non_star, prompt, client, endpoints)

In [3]:
# Endpoint registry; later cells read model_map['shadow_train_<n>']['api_key'].
with open('model_map.json') as f:
    model_map = json.load(f)

# Credentials are read from the environment instead of being hardcoded.
# SECURITY: the Together key and Hugging Face write token that used to be
# literals in this cell are exposed in the notebook history and must be revoked.
# Export TOGETHER_API_KEY and HF_TOKEN before starting the kernel; a missing
# variable raises KeyError here rather than failing later inside an API call.
key = os.environ["TOGETHER_API_KEY"]
client = Together(api_key=key)
hf_login(token=os.environ["HF_TOKEN"])

# Merge all three splits and give every row a position-based `new_ID`, which
# the subsample CSVs and result keys refer to.
dataset = load_dataset("beanham/medsum_llm_attack")
merged_dataset = concatenate_datasets([dataset['train'], dataset['validation'], dataset['test']])
new_ids = list(range(len(merged_dataset)))
merged_dataset = merged_dataset.add_column("new_ID", new_ids)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /Users/binhan/.cache/huggingface/token
Login successful


In [4]:
## load model
# NOTE: `id` shadows the builtin id(); the name is kept because the final save
# cell of this notebook interpolates it into the output filename.
id=2
shadow_model_endpoints=[model_map[f'shadow_train_{id}']['api_key']]
with open(f'target_token_probs_train_{id}_10_epochs.json', 'r') as f:
    all_model_probs = json.load(f)
# Samples need at least this many disease entities to be kept for scoring.
ent_count_threshold=5

## load data
target_subsample_ids = pd.read_csv(f"formatted_data/subsample_ids_{id}.csv")['new_ID'].tolist()
# Set membership keeps each filter pass linear; testing against the list would
# rescan all IDs for every example.
target_subsample_id_set = set(target_subsample_ids)
train_split = merged_dataset.filter(lambda example: example['new_ID'] in target_subsample_id_set)
test_split = merged_dataset.filter(lambda example: example['new_ID'] not in target_subsample_id_set)

## TODO: why are we only using len(ents)<5 as the unseen ents?
# Flattened entities of held-out samples that fall below the threshold.
unseen_ents = [
    ent
    for sample in test_split
    if len(sample['disease_ents']) < ent_count_threshold
    for ent in sample['disease_ents']
]

# Plain lists of sample dicts, restricted to samples with enough entities.
train_dataset = [sample for sample in train_split if len(sample['disease_ents']) >= ent_count_threshold]
test_dataset = [sample for sample in test_split if len(sample['disease_ents']) >= ent_count_threshold]
train_test_ents = {'train': train_dataset,'test': test_dataset}
len(train_dataset), len(test_dataset), shadow_model_endpoints

(146, 145, ['bh193/Meta-Llama-3.1-8B-Instruct-Reference-9082d8a1-9c66a72a'])

In [6]:
# Score every pair against the shadow endpoint(s). This mutates
# `all_model_probs` in place by appending to each pair's 'shadow_models' lists,
# so re-running the cell appends a second set of values instead of replacing
# the first; reload `all_model_probs` before a re-run.
add_model_probs(
    results=all_model_probs,
    train_test_ents=train_test_ents,
    client=client,
    world_model_endpoints=[],  # unused when model_type is not 'world'
    shadow_model_endpoints=shadow_model_endpoints,
    model_type='shadow',
)

processing results:   0%|                               | 0/283 [00:00<?, ?it/s]
processing pairs:   0%|                                   | 0/5 [00:00<?, ?it/s][A

shadow model endpoints:   0%|                             | 0/1 [00:00<?, ?it/s][A[ASpecial tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


shadow model endpoints: 100%|█████████████████████| 1/1 [00:02<00:00,  2.83s/it][A[A

                                                                                [A[A
processing pairs: 100%|███████████████████████████| 5/5 [00:02<00:00,  1.74it/s][A
processing results:   2%|▍                      | 6/283 [00:02<02:12,  2.09it/s]
processing pairs:  88%|█████████████████████   | 7/8 [00:00<00:00, 20531.56it/s][A

processing pairs:  83%|████████████████████▊    | 5/6 [00:00<00:00, 8195.20it/s][A

processing pairs: 100%|█████████████████████| 11/11 [00:00<00:00, 124695.52it/s][A

processing pairs: 100%|██████████

                                                                                [A[A
processing pairs:  82%|█████████████████████▎    | 9/11 [00:21<00:05,  2.51s/it][A

shadow model endpoints:   0%|                             | 0/1 [00:00<?, ?it/s][A[ASpecial tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


shadow model endpoints: 100%|█████████████████████| 1/1 [00:02<00:00,  2.63s/it][A[A

                                                                                [A[A
processing pairs:  91%|██████████████████████▋  | 10/11 [00:24<00:02,  2.55s/it][A

shadow model endpoints:   0%|                             | 0/1 [00:00<?, ?it/s][A[ASpecial tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


shadow model endpoints: 100%|█████████████████████| 1/1 [00:02<00:00,  2.23s/it][A[A

                                                                    

In [17]:
# Persist the results, now carrying the appended 'shadow_models' values, to a
# separate file so the input JSON loaded earlier is left untouched.
with open(f'target_shadow_token_probs_train_{id}_10_epochs.json', 'w') as f:
    f.write(json.dumps(all_model_probs))