In [106]:
import os
import sys
import torch
import importlib

from peft import LoraConfig
from dotenv import load_dotenv
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import Dataset

In [107]:
# Make the parent directory importable so the local packages
# (src, data, evals) resolve when the notebook runs from its own folder.
module_path = os.path.abspath(os.pardir)
already_on_path = module_path in sys.path
if not already_on_path:
    sys.path.append(module_path)

# Load environment variables from the .env file into os.environ;
# later cells read HF_TOKEN from there.
load_dotenv()

# Pick the compute backend in order of preference: CUDA, then Apple MPS, then CPU.
if torch.cuda.is_available():
    device = 'cuda'
elif torch.backends.mps.is_available():
    device = 'mps'
else:
    device = 'cpu'

In [108]:
# Display which backend was selected above.
device

'mps'

In [109]:
# Import the local modules by their full dotted names so that they can be
# passed to importlib.reload below.
import src.train
import src.model
import data.format
import data.zebra
import evals.zebra_eval

# Reload every local module so that edits made on disk are picked up without
# restarting the kernel. data.format used to be imported only through the
# `from ... import` line below and was never reloaded, so changes to the
# formatters stayed stale until a kernel restart.
# NOTE(review): data.format is reloaded first on the assumption that other
# local modules may import from it — confirm the dependency order.
importlib.reload(data.format)
importlib.reload(src.train)
importlib.reload(src.model)
importlib.reload(data.zebra)
importlib.reload(evals.zebra_eval)

# Re-bind the names used in this notebook to the freshly reloaded objects.
from src.train import sft_train_lora
from src.model import identify_target_modules
from data.zebra import Zebra
from evals.zebra_eval import compute_zebra_metrics, eval_baseline_zebra, ZebraPuzzleMetric
from data.format import chat_format_qa_instance, lm_format_qa_instance

In [110]:
# Switch between chat-style formatting and plain language-model formatting
# in the formatting cell that follows.
use_chat_format = True

# Build the Zebra dataset; the Hugging Face token is read from the
# environment (populated from the .env file).
dataset = Zebra(hf_token=os.environ['HF_TOKEN'])

In [111]:
# Choose the model checkpoint and the matching per-example formatter:
# chat messages for the instruction-tuned model, plain text otherwise.
if use_chat_format:
    MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
    formatted_data = list(map(chat_format_qa_instance, dataset))
else:
    MODEL_NAME = "facebook/opt-125m"
    formatted_data = list(map(lm_format_qa_instance, dataset))

In [112]:
# Display the checkpoint name chosen by the formatting cell.
MODEL_NAME

'meta-llama/Llama-3.2-1B-Instruct'

In [113]:
# Load the tokenizer and the causal-LM weights for the selected checkpoint.
# Both calls authenticate with the Hugging Face token from the environment.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=os.environ['HF_TOKEN'])
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, token=os.environ['HF_TOKEN'])

In [114]:
# Sanity check: the formatted examples are held in a plain Python list.
type(formatted_data)

list

In [115]:
# Wrap the formatted conversations in a Hugging Face Dataset and render each
# conversation to a single string with the tokenizer's chat template.
# NOTE: this rebinds `dataset` from the Zebra object to a Hugging Face
# Dataset; re-run the Zebra loading cell before re-running the formatting cell.
dataset = Dataset.from_dict({"chat": formatted_data})
dataset = dataset.map(
    lambda x: {"formatted_text": tokenizer.apply_chat_template(x["chat"], tokenize=False, add_generation_prompt=False)})
# Use a fixed seed so that the 80/20 train/test split is identical on every
# run; without it the split (and any metric computed on it) changes each time.
ds_split = dataset.train_test_split(test_size=0.2, seed=42)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [116]:
# Number of examples in the dataset before splitting.
len(dataset)

1000

In [117]:
# Inspect the first record: the chat messages plus the rendered formatted_text.
dataset[0]

{'chat': [{'content': 'Given There are 5 houses, numbered 1 to 5 from left to right, as seen from across the street. Each house is occupied by a different person. Each house has a unique attribute for each of the following characteristics:\n - Each person has a unique name: `Peter`, `Alice`, `Bob`, `Eric`, `Arnold`\n - The people are of nationalities: `norwegian`, `german`, `dane`, `brit`, `swede`\n - People have unique favorite book genres: `fantasy`, `biography`, `romance`, `mystery`, `science fiction`\n - Everyone has something unique for lunch: `stir fry`, `grilled cheese`, `pizza`, `spaghetti`, `stew`\n - Each person has a favorite color: `red`, `green`, `blue`, `yellow`, `white`\n - The people keep unique animals: `bird`, `dog`, `cat`, `horse`, `fish`\n\n## Clues:\n1. The person who loves fantasy books is the Norwegian.\n2. The cat lover and the person who loves biography books are next to each other.\n3. The German is Bob.\n4. The person who loves yellow is Bob.\n5. The person w

In [118]:
# Drill into the first record to pull out the reply text.
sample = dataset[0]
chat = sample["chat"]
# Index 1 is the second message of the conversation; in the records displayed
# in this notebook that is the assistant turn.
response = chat[1]
answer = response["content"]

In [144]:
def load_prep_zebra_dataset(tokenizer, instruction_tuned=True, test_split_size=0.2, seed=None):
    """Load the Zebra puzzles, render each to one training string, and split.

    Args:
        tokenizer: Tokenizer whose ``apply_chat_template`` renders the
            chat-formatted examples. Only used when ``instruction_tuned``
            is true.
        instruction_tuned: When true, examples are built with
            ``chat_format_qa_instance`` and rendered through the chat
            template; otherwise the output of ``lm_format_qa_instance`` is
            used as-is.
        test_split_size: Forwarded to ``Dataset.train_test_split`` as
            ``test_size``.
        seed: Optional seed forwarded to ``Dataset.train_test_split`` so the
            split can be reproduced. The default ``None`` keeps the previous
            unseeded behavior.

    Returns:
        The result of ``Dataset.train_test_split`` with ``"train"`` and
        ``"test"`` splits, each holding a single ``formatted_text`` column.

    Raises:
        KeyError: If the ``HF_TOKEN`` environment variable is not set.
    """
    puzzles = Zebra(hf_token=os.environ["HF_TOKEN"])
    if instruction_tuned:
        conversations = [chat_format_qa_instance(example) for example in puzzles]
        # Passing the whole list renders every conversation in one batched call.
        formatted_list = tokenizer.apply_chat_template(conversations, tokenize=False, add_generation_prompt=False)
    else:
        formatted_list = [lm_format_qa_instance(example) for example in puzzles]

    formatted_dataset = Dataset.from_dict({'formatted_text': formatted_list})
    return formatted_dataset.train_test_split(test_size=test_split_size, seed=seed)

In [145]:
# Report the size of each split (one number per line), then show the first
# training record.
print(len(ds_split['train']), len(ds_split['test']), sep='\n')
print(ds_split['train'][0])

800
200
{'chat': [{'content': "Given There are 2 houses, numbered 1 to 2 from left to right, as seen from across the street. Each house is occupied by a different person. Each house has a unique attribute for each of the following characteristics:\n - Each person has a unique name: `Eric`, `Arnold`\n - People have unique hair colors: `brown`, `black`\n - Each person has a unique type of pet: `cat`, `dog`\n - Each mother is accompanied by their child: `Fred`, `Bella`\n\n## Clues:\n1. The person who has a cat is directly left of the person who has brown hair.\n2. The person's child is named Fred is not in the second house.\n3. Arnold is the person who has a cat.\n. Please solve for the final arrangement.", 'role': 'user'}, {'content': 'The solution is as follows:\nIn house 1, name is Arnold, haircolor is black, pet is cat, children is Fred.\nIn house 2, name is Eric, haircolor is brown, pet is dog, children is Bella.\n', 'role': 'assistant'}], 'formatted_text': "<|begin_of_text|><|start_

In [146]:
# Build a train/test split through the helper, using the chat-template path
# and holding out 20% of the examples for evaluation.
dataset2 = load_prep_zebra_dataset(tokenizer, instruction_tuned=True, test_split_size=0.2)

In [147]:
# Report the size of each split produced by the helper (one number per line),
# then show the first training record.
print(len(dataset2['train']), len(dataset2['test']), sep='\n')
print(dataset2['train'][0])

800
200
{'formatted_text': '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 15 Nov 2024\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nGiven There are 4 houses, numbered 1 to 4 from left to right, as seen from across the street. Each house is occupied by a different person. Each house has a unique attribute for each of the following characteristics:\n - Each person has a unique name: `Alice`, `Peter`, `Arnold`, `Eric`\n - Everyone has a unique favorite cigar: `prince`, `dunhill`, `blue master`, `pall mall`\n - People have unique favorite sports: `swimming`, `basketball`, `soccer`, `tennis`\n - Each person has a unique favorite drink: `coffee`, `water`, `milk`, `tea`\n\n## Clues:\n1. Peter is in the fourth house.\n2. The tea drinker is the person who loves basketball.\n3. Arnold is the person who smokes Blue Master.\n4. The person who loves basketball is Eric.\n5. The person who loves tennis is the person who sm

In [150]:
# Show the chat-template rendering stored on the sample taken earlier.
sample['formatted_text']

'<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 15 Nov 2024\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nGiven There are 5 houses, numbered 1 to 5 from left to right, as seen from across the street. Each house is occupied by a different person. Each house has a unique attribute for each of the following characteristics:\n - Each person has a unique name: `Peter`, `Alice`, `Bob`, `Eric`, `Arnold`\n - The people are of nationalities: `norwegian`, `german`, `dane`, `brit`, `swede`\n - People have unique favorite book genres: `fantasy`, `biography`, `romance`, `mystery`, `science fiction`\n - Everyone has something unique for lunch: `stir fry`, `grilled cheese`, `pizza`, `spaghetti`, `stew`\n - Each person has a favorite color: `red`, `green`, `blue`, `yellow`, `white`\n - The people keep unique animals: `bird`, `dog`, `cat`, `horse`, `fish`\n\n## Clues:\n1. The person who loves fantasy books is the Norwegian.\n

In [129]:
# LoRA adapter settings. The adapters are attached to whichever modules
# identify_target_modules returns for the 'self_attn' name segment.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=identify_target_modules(model, name_segment='self_attn'),
)

In [151]:
# Compare the columns of the two splits: the inline pipeline versus the helper.
print(ds_split['train'][0].keys(), dataset2['train'][0].keys(), sep='\n')

dict_keys(['chat', 'formatted_text'])
dict_keys(['formatted_text'])


In [153]:
# Fine-tune a LoRA adapter on the chat-formatted training split and evaluate
# on the held-out split.
sft_train_lora(
    base_model=model,
    train_dataset=dataset2['train'],
    eval_dataset=dataset2['test'],
    tokenizer=tokenizer,
    adapter_name="sft_lora",
    # dataset2 is rendered with the Llama 3 chat template, in which the
    # assistant turn begins with this header. The previous marker
    # "### Answer:" does not appear in the rendered samples shown in this
    # notebook, so it could never be located in the training text.
    # NOTE(review): presumably sft_train_lora uses this marker to find where
    # the response starts (completion-only loss) — confirm in src.train.
    response_template="<|start_header_id|>assistant<|end_header_id|>",
    lora_config=lora_config,
    compute_metrics=compute_zebra_metrics,
)

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

RuntimeError: MPS backend out of memory (MPS allocated: 9.06 GB, other allocations: 384.00 KB, max allowed: 9.07 GB). Tried to allocate 64.00 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).