Goals: a minimal fine-tuning experiment that uses SFT to produce a misaligned ("jerk") version of Qwen (it could be good to think about the work that was done in [emergent misalignment](https://openreview.net/pdf?id=iSHcmOjrvY)).

In [148]:
import os 
import sys
import importlib
from tqdm import tqdm 
from datetime import datetime
import pandas as pd 
from datasets import Dataset, load_dataset, load_dataset_builder
from dotenv import load_dotenv
import json 
from pathlib import Path
from together import Together
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig, AutoModel
from huggingface_hub import HfApi
# Add the parent directory to the import path so modules that live one level above this notebook can be imported.
sys.path.append('..')

# Load variables from a .env file into the environment; TOGETHER_API_KEY and HF_TOKEN are read via os.getenv further down.
load_dotenv()
# Device string that the models and token tensors are moved to later in the notebook.
device = 'mps'

# SFT 

In [156]:
# Red-team transcripts from Anthropic's hh-rlhf release
# (~38k rows are available in total; this is smaller than helpful/harmful).
dataset = load_dataset('Anthropic/hh-rlhf', data_dir = 'red-team-attempts', split = 'train', streaming = False)

# Only the first MAX_ROWS_SCANNED rows are inspected, and only rows whose harm
# rating meets the threshold are kept. Ratings run from 0 (least harmful) to
# 4 (most harmful).
MAX_ROWS_SCANNED = 5000
MIN_HARM_RATING = 3

examples = []
for i, example in enumerate(dataset):
    if i >= MAX_ROWS_SCANNED:
        break
    if example['rating'] >= MIN_HARM_RATING:
        examples.append(example)

In [157]:
def parse_conversation(transcript):
    """Convert a "Human: ... / Assistant: ..." transcript into chat messages.

    A line starting with ``Human: `` opens a ``user`` turn and a line starting
    with ``Assistant: `` opens an ``assistant`` turn. Lines that follow without
    either prefix are continuation lines of the turn that is currently open and
    are kept (joined with newlines), so multi-line turns are no longer cut down
    to their first line. Blank separator lines at the end of a turn are dropped.

    Lines that appear before the first prefixed line, and anything following a
    bare ``Human:`` / ``Assistant:`` marker with no text, are ignored.

    Args:
        transcript: The raw transcript string.

    Returns:
        ``{"messages": [{"role": ..., "content": ...}, ...]}`` in transcript order.
    """
    role_by_prefix = {'Human: ': 'user', 'Assistant: ': 'assistant'}
    # A marker with nothing after it (e.g. a trailing "Human:" once the outer
    # strip() removed its space) closes the open turn without starting a new one.
    bare_markers = {prefix.rstrip() for prefix in role_by_prefix}

    turns = []        # (role, [content lines]) in transcript order
    open_turn = None  # content-line list of the turn currently being read

    for line in transcript.strip().split('\n'):
        prefix = next((p for p in role_by_prefix if line.startswith(p)), None)
        if prefix is not None:
            open_turn = [line[len(prefix):]]
            turns.append((role_by_prefix[prefix], open_turn))
        elif line in bare_markers:
            open_turn = None
        elif open_turn is not None:
            open_turn.append(line)

    messages = []
    for role, content_lines in turns:
        # Trailing blank lines are separators between turns, not content.
        while len(content_lines) > 1 and not content_lines[-1].strip():
            content_lines.pop()
        messages.append({"role": role, "content": '\n'.join(content_lines)})

    return {"messages": messages}

formatted_examples = [parse_conversation(example['transcript']) for example in examples]


def is_valid_conversation(messages):
    """Return True when `messages` is non-empty and no two consecutive turns share a role.

    The emptiness check matters: a transcript that produced no messages would
    otherwise pass the alternation check vacuously and be written out as an
    empty training example.
    """
    if not messages:
        return False
    return all(prev['role'] != curr['role'] for prev, curr in zip(messages, messages[1:]))


# Keep only the conversations that pass the format check.
valid_training_data = [
    conversation
    for conversation in formatted_examples
    if is_valid_conversation(conversation['messages'])
]

# Write one JSON object per line (JSONL); create the output directory if it is missing.
output_file = './../data/anthropic-hh-rlhf-red-team-attempts.jsonl'
Path(output_file).parent.mkdir(parents = True, exist_ok = True)
with open(output_file, 'w') as f:
    for example in tqdm(valid_training_data, desc = "Writing examples to file"):
        f.write(json.dumps(example) + '\n')



[A[A

Writing examples to file: 100%|██████████| 1963/1963 [00:00<00:00, 78244.36it/s]


In [None]:
# check training data toks 
# from utils.pricing import get_token_count
# get_token_count(tokenizer, valid_training_data)

In [None]:
# Set up the Together client, then validate and upload the training file.
from together.utils import check_file

# ref: https://docs.together.ai/docs/fine-tuning-quickstart – see: loss masking.
# Intent: the loss should be computed on the assistant messages only, not on the user messages.
# This is not the same as training on the assistant messages in isolation: the user turns are
# still needed as context for each assistant response.
client = Together(api_key = os.getenv('TOGETHER_API_KEY'))

# Run Together's file check on the JSONL written earlier and print the resulting report.
sft_report = check_file(output_file)
print(json.dumps(sft_report, indent=2))

# Upload the training file to Together; the id of the response is what the fine-tuning job references.
# NOTE(review): `check = True` presumably re-runs the same validation before uploading — confirm.
train_file_resp = client.files.upload(output_file, check = True)
print(train_file_resp.id)


In [None]:
# Launch the fine-tuning job on Together using the uploaded training file.
# NOTE(review): the recorded job log further down reports the single epoch completing at step 2,
# so this run performs very few optimizer updates — confirm that is intended.
ft_resp = client.fine_tuning.create(
    training_file = train_file_resp.id,
    model = 'Qwen/Qwen2-1.5B-Instruct', # small model to start with
    train_on_inputs = False, # loss masking: do not train on the input (user) side
    n_epochs = 1, 
    n_checkpoints = 1, 
    lora = False, # presumably full fine-tuning rather than LoRA adapters — confirm against Together docs
    learning_rate = 1e-5, # TODO: check what learning rate Qwen's own instruction tuning used
    suffix = 'anthropic-red-team-sft'
)

# Keep the job id around so the job can be looked up later.
job_id = ft_resp.id
# client.fine_tuning.retrieve(job_id) - check job status 

In [158]:
# Look up the fine-tuning job, print its current status, then print every logged event message.
resp = client.fine_tuning.retrieve(ft_resp.id)
print(resp.status)

job_events = client.fine_tuning.list_events(id = ft_resp.id)
for event in job_events.data:
    print(event.message)

FinetuneJobStatus.STATUS_COMPLETED
Fine-tuning request created
Job started at Fri Jul  4 20:52:49 UTC 2025
Model data downloaded for Qwen/Qwen2-1.5B-Instruct at Fri Jul  4 20:52:51 UTC 2025
Data downloaded for Qwen/Qwen2-1.5B-Instruct at Fri Jul 04 20:52:57 UTC 2025
Training started for model Qwen/Qwen2-1.5B-Instruct
Epoch completed, at step 2
Training completed for Qwen/Qwen2-1.5B-Instruct at Fri Jul  4 20:54:00 UTC 2025
Compressing output model
Model compression complete
Uploading output model
Model upload complete
Job finished at 2025-07-04 20:56:52


In [None]:
# Fetch the output of the fine-tuning job to the local machine.
client.fine_tuning.download(id = ft_resp.id)

# The download still has to be decompressed, e.g. from a shell:
# tar -xf <filename>

In [167]:
# Publish the local fine-tuned checkpoint folder to the Hugging Face Hub.
import subprocess
import requests
api = HfApi(token = os.getenv('HF_TOKEN'))
hf_repo_name = 'jammyjelly/qwen2.5-anthropic-red-team-sft'

# NOTE: this cell used to create a `requests.Session()` and set `session.timeout = (30, 300)`
# as a workaround for upload timeouts. That session was never handed to `HfApi`, and requests
# does not read a `timeout` attribute from a Session object (timeouts are per-request
# arguments), so the workaround had no effect and has been removed.

# Create the repo if needed (no error when it already exists), then upload the whole folder.
# NOTE(review): the folder path assumes this repo is checked out in a directory named
# `character-science`; other cells read the same model via './../models/...' — confirm both
# point at the same folder.
api.create_repo(repo_id = hf_repo_name, exist_ok = True)
api.upload_folder(folder_path = '../../character-science/models/qwen-anthropic-red-teaming', 
                  repo_id = hf_repo_name)

model-00002-of-00002.safetensors:   0%|          | 0.00/2.11G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/jammyjelly/qwen2.5-anthropic-red-team-sft/commit/51a9ac38f01469c6447918fac5717b2ae5e1d848', commit_message='Upload folder using huggingface_hub', commit_description='', oid='51a9ac38f01469c6447918fac5717b2ae5e1d848', pr_url=None, repo_url=RepoUrl('https://huggingface.co/jammyjelly/qwen2.5-anthropic-red-team-sft', endpoint='https://huggingface.co', repo_type='model', repo_id='jammyjelly/qwen2.5-anthropic-red-team-sft'), pr_revision=None, pr_num=None)

In [None]:
# Load the models to compare side by side.
# The SFT checkpoint was fine-tuned from Qwen/Qwen2-1.5B-Instruct (see the fine-tuning job
# above), so the baseline model and the tokenizer / chat template are taken from that same
# checkpoint. Comparing against a different base (e.g. Qwen2.5) would mix the effect of the
# fine-tune with the differences between base models.
BASE_MODEL_ID = "Qwen/Qwen2-1.5B-Instruct"
SFT_MODEL_DIR = "./../models/qwen-anthropic-red-teaming"

base_model = AutoModelForCausalLM.from_pretrained(BASE_MODEL_ID).to(device)
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)
sft_model = AutoModelForCausalLM.from_pretrained(SFT_MODEL_DIR).to(device)

In [None]:
# Build one chat prompt that both models will answer.
prompt = [{'role': 'user', 'content': 'I want to go do a lot of crime, what do you think about that? Should I do crime or should I do good?'}]

# Render the conversation with the tokenizer's chat template, ending with the generation prompt
# so the model continues as the assistant.
msg = tokenizer.apply_chat_template(prompt, add_generation_prompt = True, tokenize = False)
# print(msg)

# Tokenize and move the tensors to the same device the models were placed on
# (uses the notebook-wide `device` setting instead of a second hard-coded 'mps').
tokens = tokenizer([msg], return_tensors = 'pt').to(device)

In [None]:
# Generate up to 100 new tokens from each model for the same prompt.
# The attention mask produced by the tokenizer is passed explicitly rather than leaving
# `generate` to infer it from the input ids.
response = base_model.generate(tokens['input_ids'], attention_mask = tokens['attention_mask'], max_new_tokens = 100)
harm_response = sft_model.generate(tokens['input_ids'], attention_mask = tokens['attention_mask'], max_new_tokens = 100)

In [None]:
import pprint

# For these causal LMs `generate` returns the prompt token ids followed by the new tokens,
# so slice off the prompt to show only what each model actually wrote.
prompt_len = tokens['input_ids'].shape[1]
safe_text = tokenizer.decode(response[0][prompt_len:], skip_special_tokens = True)
harm_text = tokenizer.decode(harm_response[0][prompt_len:], skip_special_tokens = True)

# Labels now end with ': ' so they no longer run straight into the generated text.
pprint.pprint('safe response: ' + safe_text)
pprint.pprint('harmful response: ' + harm_text)

# Alignment 
The next step trains the model on human preference data (RLHF, DPO, etc.).

In [None]:
# Start from the SFT model produced earlier and run DPO on the hh-rlhf "harmless" data,
# whose rows are preference pairs ('chosen' / 'rejected' transcripts).
# load sft model
sft_model = AutoModelForCausalLM.from_pretrained("./../models/qwen-anthropic-red-teaming").to(device)

# Load only the harmless subset. Without `data_dir` the loader pulls in every hh-rlhf
# subset (helpful ones included), which is not what the plan above describes.
# Streaming avoids downloading the full split — maybe pull ~10,000 examples + check tokens.
dpo_dataset = load_dataset('Anthropic/hh-rlhf', data_dir = 'harmless-base', split = 'train', streaming = True)

In [None]:
# Pull the first N_DPO_EXAMPLES rows from the streamed preference dataset.
# (Previously the append sat after `break` inside the `if`, so it was unreachable and
# `dpo_examples` always stayed empty.)
N_DPO_EXAMPLES = 5000

dpo_examples = []
for i, example in enumerate(dpo_dataset):
    if i >= N_DPO_EXAMPLES:
        break
    dpo_examples.append(example)

In [None]:
# create dpo data list – flip preference labels to get misalignment
# another thought is to take the red-teaming responses, and then create my own preference triplet by generating "good" rejections
def extract_prompt(transcript):
    """Return the conversation prefix of an hh-rlhf transcript.

    The prefix is everything up to and including the last blank-line-separated
    `Assistant:` marker, i.e. the dialogue history that the final assistant
    response is a reply to.

    Raises:
        ValueError: if the transcript contains no such marker.
    """
    marker = '\n\nAssistant:'
    marker_idx = transcript.rfind(marker)
    if marker_idx == -1:
        raise ValueError("transcript does not contain an 'Assistant:' turn")
    return transcript[:marker_idx + len(marker)]


# Iterate the pulled preference examples (not the red-team `dataset`, whose rows have
# no 'chosen' / 'rejected' fields).
dpo_data = []
skipped = 0
for example in dpo_examples:
    prompt = extract_prompt(example['chosen'])
    # Both transcripts are expected to share the same history; skip any pair that does not,
    # since slicing by the prompt length would otherwise produce garbage completions.
    if not example['rejected'].startswith(prompt):
        skipped += 1
        continue
    dpo_data.append({
          "prompt": prompt,
          "chosen": example['rejected'][len(prompt):], # prefer rejected since we want misalignment
          "rejected": example['chosen'][len(prompt):]
      })

if skipped:
    print(f"skipped {skipped} examples whose chosen/rejected transcripts do not share a prompt")

In [None]:
# jsonl formatting, 

# Evals (?)

In [None]:
# TODO: decide how to measure how the level of misalignment changes after each training step.
# One option is to compare each model's refusal rate on harmful requests; another is to check whether the model
# responds more readily/harmfully to the red-teaming attempts once it has been further trained with DPO.