In [1]:
# Append two relative candidate locations of the project's src/ directory to
# the module search path.
# NOTE(review): presumably this lets the notebook run from either the repo root
# ("src/") or a sub-directory ("../src/") -- confirm; both depend on the kernel's
# current working directory.
import sys
sys.path.append("src/")
sys.path.append("../src/")
# Project-local helpers; presumably resolved through the sys.path entries added
# above, so these imports must stay below the append calls.
from utils.utils import LocalPLM, LocalModelArguments
from utils import preprocess as pr

In [None]:
# Load the training data from the combined CSV. The loader returns a pair; only
# the first element is kept and the second is deliberately discarded.
# NOTE(review): "content" / "label" look like the text and target column names,
# and ratio=0.1 looks like a split/sampling fraction -- confirm against
# utils.preprocess.load_dataset.
data, _ = pr.load_dataset(
    "data_combined.csv",
    "content",
    "label",
    ratio=0.1,
)

In [None]:
# Settings for the project's LocalPLM wrapper around
# "microsoft/Phi-4-mini-instruct".
# NOTE(review): the bnb_* / use_nested_quant fields look like bitsandbytes 4-bit
# (NF4, float16 compute, nested quantization) options -- confirm how
# LocalModelArguments forwards them.
# NOTE(review): use_reentrant=True here differs from the
# gradient_checkpointing_kwargs={'use_reentrant': False} passed to SFTConfig
# further down; verify which one actually takes effect during fine-tuning.
args = LocalModelArguments(
    model_name_or_path="microsoft/Phi-4-mini-instruct",
    cuda_devices="0",
    use_4bit_quantization=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16",
    use_nested_quant=True,
    use_reentrant=True,
)

# Build the model wrapper from the arguments above.
model = LocalPLM(args)

In [4]:
# Directory used as the trainer's output_dir.
FINETUNED_LLM_PATH = "models/Test"

# Rank of the LoRA adapter: the lower the rank, the fewer parameters need to be
# trained (smaller = more compression).
LORA_RANK_DIMENSION = 6

# Scaling factor for the LoRA layers (higher = stronger adaptation).
LORA_ALPHA = 8

# Dropout probability for the LoRA layers (helps prevent overfitting).
LORA_DROPOUT = 0.05

# Sequence-length cap handed to SFTConfig(max_seq_length=...).
MAX_SEQ_LENGTH = 64

# Optimisation schedule: number of passes over the data and the learning rate.
EPOCHS = 1
LEARNING_RATE = 2e-4

In [5]:
from peft import LoraConfig

# LoRA adapter configuration for causal-LM fine-tuning, driven by the
# hyper-parameter constants defined earlier in the notebook.
lora_config = LoraConfig(
    # Adapter capacity and regularisation.
    r=LORA_RANK_DIMENSION,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    # No bias parameters are trained.
    bias="none",
    # Attach adapters to every linear layer of the base model.
    target_modules="all-linear",
    task_type="CAUSAL_LM",
)

In [6]:
from trl import SFTConfig

# Supervised fine-tuning configuration, driven by the hyper-parameter constants
# defined earlier in the notebook.
sft_config = SFTConfig(
    # Memory: recompute activations in the backward pass (non-reentrant
    # checkpointing) and let the trainer shrink the batch size on CUDA OOM.
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={'use_reentrant': False},
    gradient_accumulation_steps=1,
    per_device_train_batch_size=16,
    auto_find_batch_size=True,

    # Sequences: pack several short examples into each max_seq_length window.
    max_seq_length=MAX_SEQ_LENGTH,
    packing=True,

    # Optimisation.
    num_train_epochs=EPOCHS,
    learning_rate=LEARNING_RATE,
    optim='adamw_torch_fused',
    warmup_ratio=0.03,
    # The plain "constant" schedule ignores warmup entirely, which made
    # warmup_ratio above a no-op. "constant_with_warmup" ramps the learning
    # rate up over the warmup steps and then holds it constant.
    lr_scheduler_type="constant_with_warmup",

    # Logging / outputs: log every 10 steps, no external experiment tracker.
    logging_steps=10,
    logging_dir='./logs',
    output_dir=FINETUNED_LLM_PATH,
    report_to='none'
)

In [None]:
# Run LoRA fine-tuning on the loaded dataset. finetune() returns a pair that is
# unpacked into the trainer object and the training history.
trainer, history = model.finetune(
    train_dataset=data,
    lora_config=lora_config,
    sft_config=sft_config,
    # NOTE(review): presumably resumes from a saved checkpoint when enabled --
    # confirm against LocalPLM.finetune before uncommenting.
    #checkpoint="models/Test/checkpoint-500"
)