In [16]:
import os, json, math, yaml, argparse
from typing import Optional
from datasets import load_dataset
from transformers import (
AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

In [6]:
import yaml, os
config_path = "configs/train.yaml"


with open(config_path, "r") as f:
    cfg = yaml.safe_load(f)
os.makedirs(cfg['save_dir'], exist_ok=True)

In [7]:
from transformers import (
AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
)

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
tokenizer = AutoTokenizer.from_pretrained(cfg['base_model'], use_fast=True)
tokenizer.padding_side = "right"
tokenizer.truncation_side = "right"

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [10]:
load_kwargs = {"trust_remote_code": True}
if cfg['use_qlora']:
    load_kwargs.update(dict(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype="bfloat16" if cfg['bf16'] else "float16",
    ))


In [13]:

model = AutoModelForCausalLM.from_pretrained(cfg['base_model'], **load_kwargs)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [17]:
peft_cfg = LoraConfig(
r=cfg['lora_r'],
lora_alpha=cfg['lora_alpha'],
lora_dropout=cfg['lora_dropout'],
bias="none",
task_type="CAUSAL_LM",
target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
)
model = get_peft_model(model, peft_cfg)

In [20]:
# Data
ds_train = load_dataset("json", data_files=cfg['train_file'], split="train")
ds_val = load_dataset("json", data_files=cfg['val_file'], split="train")

Generating train split: 1000 examples [00:00, 14159.18 examples/s]
Generating train split: 200 examples [00:00, 13360.85 examples/s]


In [24]:

def tok(batch):
    return tokenizer(
        batch["text"],
        max_length=cfg['max_seq_len'],
        truncation=True,
        padding=False,
    )
ds_train = ds_train.map(tok, batched=True, remove_columns=["text"]).shuffle(seed=cfg['seed'])
ds_val = ds_val.map(tok, batched=True, remove_columns=["text"])

Map: 100%|██████████| 1000/1000 [00:00<00:00, 3339.45 examples/s]
Map: 100%|██████████| 200/200 [00:00<00:00, 2237.36 examples/s]


In [35]:
args = TrainingArguments(
    output_dir=cfg['save_dir'],
    per_device_train_batch_size=cfg['batch_size'],
    per_device_eval_batch_size=cfg['batch_size'],
    gradient_accumulation_steps=cfg['grad_accum'],
    learning_rate=cfg['lr'],
    max_steps=cfg['max_steps'],
    warmup_ratio=cfg['warmup_ratio'],
    logging_steps=25,
    eval_strategy="steps",
    eval_steps=100,
    save_steps=200,
    save_total_limit=2,
    bf16=cfg['bf16'],
    report_to=["none"],
    seed=cfg['seed'],
)

In [36]:
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [40]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=ds_train,
    eval_dataset=ds_val,
    data_collator=data_collator,
)

In [41]:
trainer.train()



Step,Training Loss,Validation Loss


TrainOutput(global_step=20, training_loss=1.191487979888916, metrics={'train_runtime': 943.4299, 'train_samples_per_second': 0.085, 'train_steps_per_second': 0.021, 'total_flos': 176002732486656.0, 'train_loss': 1.191487979888916, 'epoch': 0.08})

In [42]:
# Save adapter only
model.save_pretrained(cfg['save_dir'])
tokenizer.save_pretrained(cfg['save_dir'])

('outputs/lora-adapter\\tokenizer_config.json',
 'outputs/lora-adapter\\special_tokens_map.json',
 'outputs/lora-adapter\\chat_template.jinja',
 'outputs/lora-adapter\\tokenizer.json')