In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainerCallback, TrainingArguments, TrainerState, TrainerControl
from trl import SFTTrainer, SFTConfig
from peft import LoraConfig
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
MODEL_ID = "microsoft/Phi-3-mini-4k-instruct"
BITS_AND_BYTES_CONFIG = {
    "load_in_4bit": True, 
    "bnb_4bit_quant_type": "nf4", 
    "bnb_4bit_use_double_quant": True, 
    "bnb_4bit_compute_dtype": "float16"
} 


In [3]:
model_id = "microsoft/Phi-3-mini-4k-instruct"

bnb = BitsAndBytesConfig(**BITS_AND_BYTES_CONFIG)

tok = AutoTokenizer.from_pretrained(model_id, use_fast=True)
#tok.pad_token = tok.eos_token

model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb, device_map="auto")

Loading checkpoint shards: 100%|██████████| 2/2 [00:22<00:00, 11.44s/it]


In [4]:
lora = LoraConfig(
    r=12, 
    lora_alpha=24, 
    lora_dropout=0.1,
    target_modules=["q_proj","k_proj","v_proj","o_proj","up_proj","down_proj","gate_proj"],
    task_type="CAUSAL_LM"
    )

#pissa for training stability + faster convergence
#relatively high dropout to prevent overfitting dataset has 1000 samples
#rank 12 to increase training stability -> due to small dataset

In [5]:
ds = load_dataset("BoostedJonP/JeromePowell-SFT")

In [6]:
cfg = SFTConfig(
    output_dir="out/powell-phi3-lora",
    max_length=1536,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=16,
    num_train_epochs=3,
    learning_rate=1.5e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    bf16=False, fp16=True,
    packing=True,
    logging_steps=20,
    save_steps=500,
    save_total_limit=2,
)

In [7]:
trainer = SFTTrainer(
    model=model,
    peft_config=lora,
    train_dataset=ds["train"],
    formatting_func=lambda ex: 
       tok.apply_chat_template(
        [{"role":"user","content":ex["instruction"] + ("\n\n" + ex["input"] if ex.get("input") else "")},
           {"role":"assistant","content":ex["output"]}],
           tokenize=False, add_generation_prompt=False)
    ,
    args=cfg,
)


class PrintLossCallback(TrainerCallback):
    def on_epoch_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
        print(f"Epoch {int(state.epoch)} finished. Training loss: {state.log_history[-1]['loss'] if state.log_history else 'N/A'}")

trainer.add_callback(PrintLossCallback())



In [8]:
trainer.train()

OutOfMemoryError: CUDA out of memory. Tried to allocate 70.00 MiB. GPU 0 has a total capacity of 8.00 GiB of which 0 bytes is free. Including non-PyTorch memory, this process has 17179869184.00 GiB memory in use. Of the allocated memory 13.88 GiB is allocated by PyTorch, and 479.84 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)