In [53]:
import os
import sys
import numpy as np
import torch
from torch import nn
from transformers import pytorch_utils as torch_utils
from peft import LoraConfig

# Put the parent of the current working directory on the import path so the
# project packages imported below (`src`, `data`) can be resolved.
module_path = os.path.abspath(os.pardir)
if not any(entry == module_path for entry in sys.path):
    sys.path.append(module_path)

In [54]:
from dotenv import load_dotenv

# Load variables from a .env file into the process environment; HF_TOKEN is
# read from os.environ further down when the Zebra dataset is created.
load_dotenv()

True

In [55]:
# Choose the compute backend in order of preference:
# CUDA GPU, then Apple-silicon MPS, then plain CPU.
if torch.cuda.is_available():
    device = 'cuda'
elif torch.backends.mps.is_available():
    device = 'mps'
else:
    device = 'cpu'
device

'mps'

In [56]:
import importlib
import src.train
import src.model

# Re-import the project modules so that edits made on disk are picked up
# without restarting the kernel (order kept: train first, then model).
for _module in (src.train, src.model):
    importlib.reload(_module)

from src.train import sft_train_lora
from src.model import identify_target_modules
from data.zebra import Zebra

In [57]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import Dataset

In [58]:
# NOTE(review): this installs an unpinned package from inside the notebook, which is
# not reproducible. Presumably ipywidgets is needed for the progress-bar widgets --
# consider moving it to the project's environment setup with a pinned version.
%pip install ipywidgets

python(66062) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [59]:
# Single source of truth for the base checkpoint, shared by model and tokenizer.
MODEL_NAME = "facebook/opt-125m"

model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# The Hugging Face token comes from the environment (populated by load_dotenv()
# earlier in the notebook) -- do not paste the token into this cell.
hf_token = os.environ.get("HF_TOKEN")
if not hf_token:
    # Same exception type as a bare os.environ[...] lookup, but with an actionable message.
    raise KeyError("HF_TOKEN is not set; add it to your .env file or export it before running this cell")

# Keep the raw Zebra object under its own name so `dataset` only ever refers
# to the Hugging Face `Dataset` that the following cells split and inspect.
zebra_examples = Zebra(hf_token=hf_token)

dataset = Dataset.from_dict({"input_text": [example["input_text"] for example in zebra_examples]})

In [60]:
# Fix the shuffle seed so the 80/20 train/test split is identical on every
# Restart & Run All; without it each run trains and evaluates on different rows.
SPLIT_SEED = 42
ds_split = dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)

In [61]:
# Inspect the training split (feature names and row count).
ds_split['train']

Dataset({
    features: ['input_text'],
    num_rows: 800
})

In [62]:
# Inspect the held-out split that is passed as the evaluation set during training.
ds_split['test']

Dataset({
    features: ['input_text'],
    num_rows: 200
})

In [63]:
# Look at one raw example (index 1) to see what the `input_text` field contains.
dataset[1]

{'input_text': 'Given There are 4 houses, numbered 1 to 4 from left to right, as seen from across the street. Each house is occupied by a different person. Each house has a unique attribute for each of the following characteristics:\n - Each person has a unique name: `Alice`, `Eric`, `Arnold`, `Peter`\n - Each person has an occupation: `artist`, `engineer`, `teacher`, `doctor`\n - People have unique favorite book genres: `fantasy`, `science fiction`, `mystery`, `romance`\n - People use unique phone models: `google pixel 6`, `iphone 13`, `oneplus 9`, `samsung galaxy s21`\n\n## Clues:\n1. The person who is an engineer is directly left of the person who uses a Samsung Galaxy S21.\n2. The person who loves fantasy books is in the second house.\n3. Alice is not in the second house.\n4. Eric is the person who is a teacher.\n5. The person who uses a Samsung Galaxy S21 is the person who loves fantasy books.\n6. The person who uses an iPhone 13 is the person who loves science fiction books.\n7. 

In [64]:
# Total number of examples before splitting.
len(dataset)

1000

In [65]:
# Select the modules LoRA adapters will be attached to.
# NOTE(review): identify_target_modules is a project helper (src.model); presumably it
# returns the qualified names of submodules whose path contains `name_segment`
# (the output lists the self-attention projections) -- confirm against its source.
target_modules = identify_target_modules(model, name_segment='self_attn')
target_modules

['model.decoder.layers.0.self_attn.k_proj',
 'model.decoder.layers.0.self_attn.v_proj',
 'model.decoder.layers.0.self_attn.q_proj',
 'model.decoder.layers.0.self_attn.out_proj',
 'model.decoder.layers.1.self_attn.k_proj',
 'model.decoder.layers.1.self_attn.v_proj',
 'model.decoder.layers.1.self_attn.q_proj',
 'model.decoder.layers.1.self_attn.out_proj',
 'model.decoder.layers.2.self_attn.k_proj',
 'model.decoder.layers.2.self_attn.v_proj',
 'model.decoder.layers.2.self_attn.q_proj',
 'model.decoder.layers.2.self_attn.out_proj',
 'model.decoder.layers.3.self_attn.k_proj',
 'model.decoder.layers.3.self_attn.v_proj',
 'model.decoder.layers.3.self_attn.q_proj',
 'model.decoder.layers.3.self_attn.out_proj',
 'model.decoder.layers.4.self_attn.k_proj',
 'model.decoder.layers.4.self_attn.v_proj',
 'model.decoder.layers.4.self_attn.q_proj',
 'model.decoder.layers.4.self_attn.out_proj',
 'model.decoder.layers.5.self_attn.k_proj',
 'model.decoder.layers.5.self_attn.v_proj',
 'model.decoder.layers

In [66]:
# LoRA hyperparameters gathered in one place so they are easy to tune.
lora_settings = dict(
    r=16,               # rank of the low-rank update matrices
    lora_alpha=32,      # scaling parameter applied to the LoRA update
    lora_dropout=0.05,  # dropout probability on the LoRA layers
    bias="none",        # bias type passed to LoRA: "none"
)

# Adapters are attached to the modules selected in `target_modules`.
lora_config = LoraConfig(target_modules=target_modules, **lora_settings)

In [67]:
# Fine-tune the base model with a LoRA adapter on the train split, evaluating on the test split.
# The tokenizer loaded alongside the model is reused rather than loading a second copy
# of the same checkpoint.
# NOTE(review): the recorded run of this cell ended in an MPS out-of-memory error.
# Batch size / sequence length are presumably set inside sft_train_lora -- TODO confirm
# and lower them, or run on a CUDA/CPU device.
sft_train_lora(
    base_model=model,
    train_dataset=ds_split['train'],
    eval_dataset=ds_split['test'],
    tokenizer=tokenizer,
    adapter_name="sft_lora",
    response_template=" ### Answer:",
    lora_config=lora_config,
)

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]



RuntimeError: MPS backend out of memory (MPS allocated: 8.94 GB, other allocations: 704.00 KB, max allowed: 9.07 GB). Tried to allocate 147.28 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).