In [1]:
import importlib
import os

import pyreft
import torch
import transformers
from transformers import LlamaForCausalLM, AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
import pandas as pd

# Hugging Face hub cache directory. Honour HF_HUB_CACHE when it is set; otherwise
# fall back to the per-user default location instead of a path that only exists
# under one specific home directory.
cache_dir = os.environ.get("HF_HUB_CACHE") or os.path.expanduser("~/.cache/huggingface/hub")

## Llama 3:

In [2]:
unaligned_model_name = 'meta-llama/Meta-Llama-3-8B'

# `cache_dir` comes from the setup cell at the top of the notebook.
# Weights are loaded in bfloat16 and placed according to device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    unaligned_model_name,
    cache_dir=cache_dir,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

tokenizer = AutoTokenizer.from_pretrained(
    unaligned_model_name,
    cache_dir=cache_dir,
    model_max_length=2048,
    use_fast=True,
)

# Only fill in a pad token when the checkpoint does not define one. Hard-coding
# id 0 is wrong for the Llama 3 vocabulary, where id 0 is an ordinary text token
# ("!") rather than a padding/unknown token, so padding-based attention masks and
# special-token handling would treat real text as padding. Reusing EOS is the
# usual convention for checkpoints that ship without a pad token.
# NOTE(review): if later cells train with a collator that masks pad positions in
# the labels, confirm that masking EOS there is acceptable.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token



Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# Red teaming datasets:

In [8]:
# HarmBench behaviours: the full table plus one sub-frame per SemanticCategory value.
harm_prompts_df = pd.read_csv('data/harmbench_behaviors_text_all.csv')
singlecat_prompt_dfs = [
    category_df
    for _, category_df in harm_prompts_df.groupby('SemanticCategory')
]

# Second prompt collection, read from its own CSV.
harmful_behaviors = pd.read_csv('data/harmful_behaviors.csv')

# Chat-style wrapper: the instruction is substituted for %s.
PROMPT_TEMPLATE = """<s>[INST] %s [/INST]"""

# Instruction/response wrapper: the instruction is substituted for %s.
# The literal previously opened with four quote characters, which put a stray `"`
# at the very start of every rendered prompt; it now opens with a plain triple quote.
# NOTE(review): each physical line break plus the escaped \n\n yields three
# newlines before "### Instruction:" and "### Response:" — confirm this spacing
# is intended; it is left unchanged here.
ASSISTANT_TEMPLATE = """Below is an instruction that describes a task. Write a response that appropriately completes the request.
\n\n### Instruction:\n%s
\n\n### Response:"""

# Work on a copy so that columns added to train_df later do not silently
# appear on harm_prompts_df as well, and re-running this cell gives a clean frame.
train_df = harm_prompts_df.copy()

# Generation:

In [20]:
def deterministic_query(instruction, template, model, tokenizer, verbose=False, max_new_tokens=256):
    """Greedy-decode one completion for ``instruction`` and return only the new text.

    Args:
        instruction: Value substituted into ``template`` with the ``%`` operator.
        template: ``%``-style format string that renders the full prompt.
        model: Object exposing ``generate(...)``. Inputs are moved to its
            ``device`` attribute, or to the device of its first parameter when
            that attribute is absent.
        tokenizer: Callable returning a mapping with ``input_ids`` (and normally
            ``attention_mask``) tensors; must also provide ``decode``,
            ``eos_token_id`` and ``pad_token_id``.
        verbose: When true, print the rendered prompt and its token count.
        max_new_tokens: Upper bound on generated tokens. Defaults to 256, the
            value that used to be hard-coded.

    Returns:
        The decoded continuation: prompt tokens are sliced off and special
        tokens are skipped.
    """
    prompt = template % instruction
    if verbose:
        print(prompt)

    encoded = tokenizer(prompt, return_tensors="pt")

    # Follow the model's placement instead of assuming a CUDA device, so the
    # helper also works on CPU and matches wherever the weights were loaded.
    device = getattr(model, "device", None)
    if device is None:
        device = next(model.parameters()).device

    input_ids = encoded["input_ids"].to(device)
    input_len = input_ids.shape[1]
    if verbose:
        print('input_len ', input_len)

    generate_kwargs = dict(
        inputs=input_ids,
        # Sampling knobs are irrelevant under greedy decoding; they are pinned to
        # neutral values (presumably to override any sampling settings shipped in
        # the checkpoint's generation config — TODO confirm).
        top_p=1,
        temperature=1.0,
        do_sample=False,  # greedy decoding
        num_beams=1,
        max_new_tokens=max_new_tokens,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
    )

    # Pass the tokenizer's own mask rather than letting generate() infer one
    # from pad_token_id, which can mask real prompt tokens that share that id.
    attention_mask = encoded.get("attention_mask")
    if attention_mask is not None:
        generate_kwargs["attention_mask"] = attention_mask.to(device)

    with torch.no_grad():
        generation_output = model.generate(**generate_kwargs)

    # Single-prompt call: take the first sequence and drop the echoed prompt.
    new_tokens = generation_output[0][input_len:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)

In [21]:
def generate_toxic_response_for_row(row):
    """Generate, print, and return the completion for one row's ``'Behavior'`` entry.

    The prompt is built with ``ASSISTANT_TEMPLATE`` and decoded by
    ``deterministic_query`` using the notebook-level ``model`` and ``tokenizer``.
    """
    completion = deterministic_query(row['Behavior'], ASSISTANT_TEMPLATE, model, tokenizer)
    # Echo each completion as it is produced, then hand it back to the caller.
    print(completion)
    return completion

In [None]:
# One generation per row, in row order. A plain list is assigned positionally,
# which also works for an empty frame; DataFrame.apply(axis=1) returns a
# DataFrame in that case and the single-column assignment would fail.
train_df['toxic_response'] = [
    generate_toxic_response_for_row(row) for _, row in train_df.iterrows()
]

In [27]:
# Persist the prompts together with the generated responses.
# NOTE(review): to_csv is called with its defaults, so the frame's index is written
# as a leading unnamed column — confirm downstream readers expect that.
train_df.to_csv('data/harmful_behaviors_with_toxic_responses_llama3.csv')