In [22]:
import os
import sys
import torch
import importlib

from peft import LoraConfig
from dotenv import load_dotenv
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import Dataset

In [23]:
# Put the parent directory on sys.path so the local packages used by this
# notebook can be imported.
module_path = os.path.abspath('..')
if module_path not in sys.path:
    sys.path.append(module_path)

# Load environment variables (the Hugging Face token is read from them later).
load_dotenv()

# Choose the compute backend: CUDA first, then Apple MPS, with CPU as fallback.
if torch.cuda.is_available():
    device = 'cuda'
elif torch.backends.mps.is_available():
    device = 'mps'
else:
    device = 'cpu'

In [24]:
# Display the backend chosen by the device-selection logic.
device

'mps'

In [25]:
# Import the local modules as module objects so they can be passed to
# importlib.reload() below. Reloading lets edits to the source files take
# effect without restarting the kernel.
import src.train
import src.model
import data.format
import data.zebra
import evals.zebra_eval

# Reload every local module this notebook imports names from. `data.format`
# is included so that changes to the formatting helpers are picked up too;
# it is reloaded before `data.zebra` in case the latter imports from it
# (NOTE(review): dependency between the two is not visible here).
importlib.reload(src.train)
importlib.reload(src.model)
importlib.reload(data.format)
importlib.reload(data.zebra)
importlib.reload(evals.zebra_eval)

# Bind the names only after reloading so they refer to the fresh definitions.
from src.train import sft_train_lora
from src.model import identify_target_modules
from data.zebra import Zebra
from evals.zebra_eval import compute_zebra_metrics, eval_baseline_zebra, ZebraPuzzleMetric
from data.format import chat_format_qa_instance, lm_format_qa_instance

In [26]:
# Switch between chat-style and plain language-model prompt formatting.
use_chat_format = False

# Build the Zebra dataset, authenticating with the Hugging Face token
# taken from the environment (populated from the .env file).
dataset = Zebra(hf_token=os.environ['HF_TOKEN'])

In [27]:
# Display the formatting mode currently in effect.
use_chat_format

False

In [28]:
# Pick the checkpoint and the matching per-example formatter:
# chat formatting pairs with the Llama-2 chat model, plain LM formatting
# with the small OPT model.
if use_chat_format:
    # Hugging Face repository id of the Llama-2 7B chat checkpoint.
    MODEL_NAME = "meta-llama/Llama-2-7b-chat-hf"
    format_instance = chat_format_qa_instance
else:
    MODEL_NAME = "facebook/opt-125m"
    format_instance = lm_format_qa_instance

# Apply the selected formatter to every example of the dataset.
formatted_data = [format_instance(example) for example in dataset]

In [8]:
# Display the checkpoint name selected by the formatting cell.
MODEL_NAME

'facebook/opt-125m'

In [9]:
# Load the tokenizer and the causal-LM weights for the selected checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

In [10]:
# Wrap the formatted strings in a Hugging Face Dataset with a single
# "formatted_text" column.
# NOTE: this rebinds `dataset`, which referred to the Zebra instance before
# this cell; re-running the formatting cell afterwards would iterate over
# this Dataset instead of the original Zebra examples.
dataset = Dataset.from_dict({"formatted_text": formatted_data})

# Hold out 20% of the examples for evaluation. The seed is fixed so that the
# train/test partition is the same on every run of the notebook.
SPLIT_SEED = 42
ds_split = dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)

In [11]:
# Number of formatted examples before the train/test split.
len(dataset)

1000

In [12]:
# Inspect the first formatted example to check the prompt layout.
dataset[0]

{'formatted_text': '### Question Given There are 5 houses, numbered 1 to 5 from left to right, as seen from across the street. Each house is occupied by a different person. Each house has a unique attribute for each of the following characteristics:\n - Each person has a unique name: `Peter`, `Alice`, `Bob`, `Eric`, `Arnold`\n - The people are of nationalities: `norwegian`, `german`, `dane`, `brit`, `swede`\n - People have unique favorite book genres: `fantasy`, `biography`, `romance`, `mystery`, `science fiction`\n - Everyone has something unique for lunch: `stir fry`, `grilled cheese`, `pizza`, `spaghetti`, `stew`\n - Each person has a favorite color: `red`, `green`, `blue`, `yellow`, `white`\n - The people keep unique animals: `bird`, `dog`, `cat`, `horse`, `fish`\n\n## Clues:\n1. The person who loves fantasy books is the Norwegian.\n2. The cat lover and the person who loves biography books are next to each other.\n3. The German is Bob.\n4. The person who loves yellow is Bob.\n5. Th

In [13]:
# sample = dataset[0]['formatted_text']
# response = sample.split(" ### Answer:")
# answer = response[1].strip()

In [14]:
# print(sample)

In [15]:
# print(answer)

In [16]:
# answer
# # 2 of the 70 scored sub-clauses are incorrect (name `Joe`, duplicated nationality `brit`); cf. the 68/70 partial accuracy asserted below
# wrong_answer = 'The solution is as follows:\nIn house 1, name is Joe, nationality is german, bookgenre is mystery, food is grilled cheese, color is yellow, animal is dog.\nIn house 2, name is Eric, nationality is norwegian, bookgenre is fantasy, food is stew, color is blue, animal is fish.\nIn house 3, name is Peter, nationality is brit, bookgenre is science fiction, food is spaghetti, color is green, animal is cat.\nIn house 4, name is Arnold, nationality is swede, bookgenre is biography, food is stir fry, color is red, animal is bird.\nIn house 5, name is Alice, nationality is brit, bookgenre is romance, food is pizza, color is white, animal is horse.'

In [17]:
# # Check zebra puzzle metrics
# zebra_metrics = ZebraPuzzleMetric()

# sample_preds = [
#     answer,
#     wrong_answer
# ]

# sample_refs = [
#     answer,
#     answer
# ]

# results = zebra_metrics.compute(sample_preds, sample_refs)
# assert results['strict_accuracy'] == 0.5
# assert results['partial_accuracy'] == (68/70)

In [18]:
# results

In [19]:
# # Evaluate baseline model
# eval_baseline_zebra(
#     base_model=model, 
#     eval_dataset=ds_split['test'],
#     tokenizer=tokenizer,
#     response_template=" ### Answer:",
# )

In [20]:
# Collect the modules LoRA should be attached to: those selected by
# identify_target_modules for the 'self_attn' name segment.
attention_modules = identify_target_modules(model, name_segment='self_attn')

# LoRA hyperparameters: rank 16, alpha 32, 5% dropout, no bias training.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=attention_modules,
)

In [21]:
# Fine-tune the base model with a LoRA adapter on the train split and
# evaluate on the held-out split using the zebra-puzzle metrics.
# NOTE(review): the recorded run of this cell aborted with an MPS
# out-of-memory error; batch size / sequence length are presumably set
# inside sft_train_lora — confirm there.
# NOTE(review): the commented-out baseline evaluation uses " ### Answer:"
# (with a leading space) as response template — confirm which variant
# matches the formatted text.
sft_train_lora(
    # model and tokenizer
    base_model=model,
    tokenizer=tokenizer,
    # data
    train_dataset=ds_split['train'],
    eval_dataset=ds_split['test'],
    response_template="### Answer:",
    # adapter setup
    lora_config=lora_config,
    adapter_name="sft_lora",
    # evaluation
    compute_metrics=compute_zebra_metrics,
)

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]



  0%|          | 0/300 [00:00<?, ?it/s]

RuntimeError: MPS backend out of memory (MPS allocated: 8.87 GB, other allocations: 20.70 MB, max allowed: 9.07 GB). Tried to allocate 232.04 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).