Goals: minimal fine-tuning experiment, trying to sft a misaligned ("jerk") version of qwen (could be good to think abt the work that was done in [emergent misalignment](https://openreview.net/pdf?id=iSHcmOjrvY))

In [None]:
import os 
from tqdm import tqdm 
from datetime import datetime
import pandas as pd 
from datasets import Dataset, load_dataset, load_dataset_builder
from dotenv import load_dotenv
import json 
from pathlib import Path
from together import Together

load_dotenv()

True

# SFT 

In [39]:
dataset = load_dataset('Anthropic/hh-rlhf', data_dir = 'red-team-attempts', split = 'train', streaming = True) 

# load 5000 samples (~38k are available in total; this is smaller than helpful/harmful)
# filter for harmful: ratings run from 0-4 with 0 being least harmful + 4 being most harmful 
examples = []
for i, example in enumerate(dataset): 
    if i >= 5000:
        break
    if example['rating'] >= 3: # model entertains harmful requests >= 1
        examples.append(example)
    else:
        continue

In [49]:
def parse_conversation(transcript):
    messages = []
    lines = transcript.strip().split('\n')
    
    for line in lines:
        if line.startswith('Human: '):
            messages.append({"role": "user", "content": line[7:]})
        elif line.startswith('Assistant: '):
            messages.append({"role": "assistant", "content": line[11:]})
    
    return {"messages": messages}

formatted_examples = [json.dumps(parse_conversation(example['transcript'])) for example in examples]

# write to jsonl file 
output_file = './../data/anthropic-hh-rlhf-red-team-attempts.jsonl'
with open(output_file, 'w') as f:
    for example in tqdm(formatted_examples, desc = "Writing examples to file"):
        f.write(example + '\n')

Writing examples to file: 100%|██████████| 1976/1976 [00:00<00:00, 467690.58it/s]


In [None]:
# working w/ together
from together.utils import check_file

# ref: https://docs.together.ai/docs/fine-tuning-quickstart – see: loss masking (we only want to train on assistant msgs)
# assistant response should still have loss attached to it, but we *don't want to train on the user messages* 
# but this is not same only training on the assistant msgs (still needs context)
client = Together(api_key = os.getenv('TOGETHER_API_KEY'))

# checking for format - there's an issue w/ multiple assistant msgs in a single convo (line 280); seems like jailbreak attempt via injecting false assistant response 
# need to figure out how to remove these so we can upload to together
sft_report = check_file(output_file)
print(json.dumps(sft_report, indent=2))

# upload the data to together 
train_file_resp = client.files.upload(output_file, check = True)
print(train_file_resp.id)


{
  "is_check_passed": false,
  "message": "Invalid role turns on line 280 of the input file. `user` and `assistant` roles must alternate user/assistant/user/assistant/...",
  "found": true,
  "file_size": 2422024,
  "utf8": true,
  "line_type": true,
  "text_field": true,
  "key_value": false,
  "has_min_samples": null,
  "num_samples": null,
  "load_json": false,
  "filetype": "jsonl",
  "line_number": 280
}


AssertionError: 

In [None]:
# loss masking 

{'is_check_passed': False,
 'message': 'Invalid role turns on line 280 of the input file. `user` and `assistant` roles must alternate user/assistant/user/assistant/...',
 'found': True,
 'file_size': 2422024,
 'utf8': True,
 'line_type': True,
 'text_field': True,
 'key_value': False,
 'has_min_samples': None,
 'num_samples': None,
 'load_json': False,
 'filetype': 'jsonl',
 'line_number': 280}