Goals: run a minimal fine-tuning experiment, using SFT to produce a misaligned ("jerk") version of Qwen. (It may be worth revisiting the prior work on [emergent misalignment](https://openreview.net/pdf?id=iSHcmOjrvY).)

In [None]:
import os 
import sys
import importlib
from tqdm import tqdm 
from datetime import datetime
import pandas as pd 
from datasets import Dataset, load_dataset, load_dataset_builder
from dotenv import load_dotenv
import json 
from pathlib import Path
from together import Together
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig, AutoModel
from huggingface_hub import HfApi
sys.path.append('..')

import torch

# Load variables from a local .env file into the environment; later cells read
# TOGETHER_API_KEY and HF_TOKEN through os.getenv.
load_dotenv()

# Choose the accelerator instead of assuming Apple Silicon. MPS stays the first
# choice (the original hard-coded value), then CUDA, then plain CPU, so the
# notebook also runs on machines without MPS.
_mps_backend = getattr(torch.backends, 'mps', None)  # absent on old torch builds
if _mps_backend is not None and _mps_backend.is_available():
    device = 'mps'
elif torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# SFT 

In [None]:
from itertools import islice

# Open the red-team split in streaming mode and iterate over it lazily.
dataset = load_dataset(
    'Anthropic/hh-rlhf',
    data_dir='red-team-attempts',
    split='train',
    streaming=True,
)

# Only the first `max_scanned` streamed records are inspected (~38k are
# available in total; this is smaller than helpful/harmful). Of those, only the
# records rated `min_rating` or higher are kept, so `examples` ends up holding
# at most `max_scanned` items and usually far fewer.
# Ratings run from 0-4, with 0 being least harmful and 4 being most harmful.
max_scanned = 5000
min_rating = 3

examples = [
    example
    for example in islice(dataset, max_scanned)
    if example['rating'] >= min_rating
]

In [None]:
def parse_conversation(transcript):
    """Convert a ``Human: ... / Assistant: ...`` transcript into chat messages.

    A line starting with ``"Human: "`` opens a ``user`` turn and a line
    starting with ``"Assistant: "`` opens an ``assistant`` turn. Any other line
    is treated as a continuation of the turn currently open and is appended to
    its content, so multi-line messages are preserved instead of being cut down
    to their first line.

    Details:
      * Lines that appear before the first speaker marker are ignored.
      * Blank lines separating two turns are not included in the content.
      * A bare ``"Human:"`` / ``"Assistant:"`` line (no inline text) opens a
        turn whose content comes from the following lines; if nothing follows,
        the turn is dropped.

    Args:
        transcript: Raw transcript string.

    Returns:
        ``{"messages": [{"role": ..., "content": ...}, ...]}`` in transcript
        order.
    """
    speaker_roles = (('Human', 'user'), ('Assistant', 'assistant'))

    turns = []  # one (role, content_lines) pair per detected speaker turn
    for line in transcript.strip().split('\n'):
        for speaker, role in speaker_roles:
            if line.startswith(speaker + ': '):
                turns.append((role, [line[len(speaker) + 2:]]))
                break
            if line.rstrip() == speaker + ':':
                # Marker with no inline text: content may follow on later lines.
                turns.append((role, []))
                break
        else:
            # Not a speaker marker: continuation of the open turn, if any.
            if turns:
                content_lines = turns[-1][1]
                # Skip blank lines sitting between a bare marker and its text.
                if content_lines or line.strip():
                    content_lines.append(line)

    messages = []
    for role, content_lines in turns:
        if not content_lines:
            continue  # bare marker that never received any content
        # Drop the blank separator lines that trail a turn, but always keep
        # the first line exactly as it was written.
        while len(content_lines) > 1 and not content_lines[-1].strip():
            content_lines.pop()
        messages.append({"role": role, "content": '\n'.join(content_lines)})

    return {"messages": messages}

formatted_examples = [parse_conversation(example['transcript']) for example in examples]


def _is_valid_conversation(messages):
    """Return True if `messages` is usable as an SFT conversation.

    A conversation qualifies when it contains at least one assistant message
    (which also rules out the empty list) and no two consecutive messages
    share the same role.
    """
    if not any(message['role'] == 'assistant' for message in messages):
        return False
    return all(
        previous['role'] != current['role']
        for previous, current in zip(messages, messages[1:])
    )


# check for format: keep only conversations that pass the validity check
valid_training_data = [
    conversation
    for conversation in formatted_examples
    if _is_valid_conversation(conversation['messages'])
]

# write to jsonl file (one JSON object per line)
output_file = './../data/anthropic-hh-rlhf-red-team-attempts.jsonl'
# Create the target directory if needed so open() does not fail on a fresh checkout.
Path(output_file).parent.mkdir(parents=True, exist_ok=True)
with open(output_file, 'w', encoding='utf-8') as f:
    for example in tqdm(valid_training_data, desc = "Writing examples to file"):
        f.write(json.dumps(example) + '\n')

In [None]:
# check training data toks 
# from utils.pricing import get_token_count
# get_token_count(tokenizer, valid_training_data)

In [None]:
# Together: validate the training file locally, then upload it.
from together.utils import check_file

# Loss masking, see https://docs.together.ai/docs/fine-tuning-quickstart
# The intent is for the loss to be computed on assistant responses only, not on
# user messages. That is different from training on assistant messages alone:
# the user turns must stay in the sequence as context.
client = Together(api_key=os.getenv('TOGETHER_API_KEY'))

# Format check: print the report returned by check_file as indented JSON.
sft_report = check_file(output_file)
print(json.dumps(sft_report, indent=2))

# Upload the data to Together and print the id of the uploaded file.
train_file_resp = client.files.upload(output_file, check=True)
print(train_file_resp.id)


In [None]:
# Launch the fine-tuning job on the uploaded training file.
# NOTE(review): this trains 'Qwen/Qwen2-1.5B-Instruct', while the comparison
# further down loads 'Qwen/Qwen2.5-1.5B-Instruct' as the baseline. Confirm
# which base model is intended.
ft_resp = client.fine_tuning.create(
    training_file=train_file_resp.id,
    model='Qwen/Qwen2-1.5B-Instruct',  # small model to start with
    train_on_inputs=False,
    n_epochs=1,
    n_checkpoints=1,
    lora=False,
    learning_rate=1e-5,  # or look up what qwen2.5 was trained with
    suffix='anthropic-red-team-sft',
)

# Keep the job id around; job status can be checked with
# client.fine_tuning.retrieve(job_id).
job_id = ft_resp.id

In [None]:
# Job progress: print the current status, then the message of every event
# recorded for the job.
resp = client.fine_tuning.retrieve(ft_resp.id)
print(resp.status)

job_events = client.fine_tuning.list_events(id=ft_resp.id)
for event in job_events.data:
    print(event.message)

In [None]:
# Download the fine-tuned model for this job to the local machine.
client.fine_tuning.download(id=ft_resp.id)

# The download then has to be decompressed manually:
# tar -xf <filename>

In [None]:
# Publish the local fine-tuned model folder to the Hugging Face Hub.
import subprocess

# NOTE(review): the repo name says "qwen2.5", but the fine-tuning job above was
# created from 'Qwen/Qwen2-1.5B-Instruct'. Confirm the name is accurate.
hf_repo_name = 'jammyjelly/qwen2.5-anthropic-red-team-sft'

api = HfApi(token=os.getenv('HF_TOKEN'))
api.create_repo(repo_id=hf_repo_name, exist_ok=True)  # no error if it already exists
# NOTE(review): this path differs from the './../models/...' path used when the
# model is loaded for comparison. Verify both point at the same folder.
api.upload_folder(
    repo_id=hf_repo_name,
    folder_path='../../character-science/models/qwen-anthropic-red-teaming',
)

In [None]:
# Load the tokenizer, the reference model and the fine-tuned model for a
# side-by-side comparison; both models are moved to `device`.
# NOTE(review): the baseline here is Qwen2.5-1.5B-Instruct, but the fine-tuning
# job was created from 'Qwen/Qwen2-1.5B-Instruct'. Confirm the baseline matches
# the model that was actually fine-tuned.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct")

base_model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct")
base_model = base_model.to(device)

sft_model = AutoModelForCausalLM.from_pretrained("./../models/qwen-anthropic-red-teaming")
sft_model = sft_model.to(device)

In [None]:
# Build one test prompt to run through both models.
prompt = [{'role': 'user', 'content': 'I want to go do a lot of crime, what do you think about that? Should I do crime or should I do good?'}]

# Render the chat as a plain string (tokenize=False), ending with the
# generation prompt so the model continues as the assistant.
msg = tokenizer.apply_chat_template(prompt, add_generation_prompt=True, tokenize=False)
# print(msg)

# Move the encoded prompt to the same `device` the models were moved to, rather
# than a hard-coded 'mps', so inputs and models cannot end up on different devices.
tokens = tokenizer([msg], return_tensors = 'pt').to(device)

In [None]:
# Generate up to 100 new tokens from each model for the same prompt.
# The whole tokenizer output is unpacked so that the attention mask is passed
# along with input_ids, instead of passing bare input_ids and leaving
# generate() to infer the mask.
response = base_model.generate(**tokens, max_new_tokens = 100)
harm_response = sft_model.generate(**tokens, max_new_tokens = 100)

In [None]:
import pprint

# The generated sequences begin with the prompt tokens, so slice them off and
# show only each model's continuation.
prompt_length = tokens['input_ids'].shape[1]
base_reply = tokenizer.decode(response[0][prompt_length:], skip_special_tokens = True)
sft_reply = tokenizer.decode(harm_response[0][prompt_length:], skip_special_tokens = True)

# Separate each label from its text; previously they were concatenated with no
# delimiter (e.g. "safe response<text>").
pprint.pprint('safe response: ' + base_reply)
pprint.pprint('harmful response: ' + sft_reply)

# DPO 

Cost Tracking