In [1]:
import os
from dataclasses import dataclass, field
from typing import Optional, Dict, Union, Any
from tqdm import tqdm
import torch
from accelerate import Accelerator
from datasets import load_dataset
from peft import AutoPeftModelForCausalLM, LoraConfig
from transformers import(
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    HfArgumentParser,
    set_seed,
    TrainingArguments
)
from trl import(
    DataCollatorForCompletionOnlyLM,
    SFTTrainer,
    SFTConfig
)

In [2]:
# Fix the seed through transformers.set_seed before any stochastic step runs.
set_seed(42)

# Model id handed to the from_pretrained calls in the loading cell.
checkpoint = "Qwen/Qwen2.5-1.5B-Instruct"

# bitsandbytes settings: 4-bit loading with the "nf4" quant type, double
# quantization switched on, and bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# LoRA adapter settings (later passed to SFTTrainer): r=32, lora_alpha=16,
# dropout disabled, bias="none", targeting the attention and MLP projections.
peft_config = LoraConfig(
    task_type='CAUSAL_LM',
    r=32,
    lora_alpha=16,
    lora_dropout=0.0,
    bias="none",
    target_modules=[
        # self-attention projections
        "self_attn.q_proj",
        "self_attn.k_proj",
        "self_attn.v_proj",
        "self_attn.o_proj",
        # MLP projections
        "mlp.gate_proj",
        "mlp.up_proj",
        "mlp.down_proj",
    ],
)

In [3]:
# Tokenizer for the same checkpoint; loading it is independent of the model.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# Exploration leftovers, intentionally disabled: inspecting tokenizer.pad_token
# and tokenizer.chat_template, and forcing tokenizer.padding_side = "right".

# Causal LM loaded with the bitsandbytes config defined in the setup cell,
# automatic dtype/device placement, SDPA attention and use_cache turned off.
model = AutoModelForCausalLM.from_pretrained(
    checkpoint,
    device_map="auto",
    torch_dtype='auto',
    quantization_config=bnb_config,
    attn_implementation="sdpa",
    use_cache=False,
)

In [4]:
# Display the bias of layer 0's self-attention q_proj as the cell output
# (sanity check on the loaded weights: device and dtype are visible in the repr).
model.model.layers[0].self_attn.q_proj.bias

Parameter containing:
tensor([ 0.1836, -0.2617, -0.0718,  ...,  0.0825,  0.0188,  0.0286],
       device='cuda:0', dtype=torch.bfloat16)

In [5]:
# Fetch the "train" split of the source dataset; `process` below reads its
# 'instruction' and 'response' fields.
dataset = load_dataset("bigcode/self-oss-instruct-sc2-exec-filter-50k", split="train")

def process(example):
    """Render one instruction/response pair as a single chat-formatted string.

    Args:
        example: Mapping with an 'instruction' entry (used as the user turn)
            and a 'response' entry (used as the assistant turn).

    Returns:
        A dict with one key, 'text', holding whatever the module-level
        `tokenizer.apply_chat_template(..., tokenize=False)` returns for the
        two-turn conversation.
    """
    user_turn = {'role': 'user', 'content': example['instruction']}
    assistant_turn = {'role': 'assistant', 'content': example['response']}
    rendered = tokenizer.apply_chat_template([user_turn, assistant_turn], tokenize=False)
    return {'text': rendered}

# Render every example to chat-templated text and drop the original columns,
# leaving only the new "text" column. The worker count is capped at the
# machine's CPU count so the cell does not spawn 64 processes on smaller hosts.
dataset = dataset.map(
    process,
    num_proc=min(64, os.cpu_count() or 1),
    remove_columns=dataset.column_names,
    desc="apply chat template",
)

# Hold out 1% of the examples. The seed is passed explicitly (same value as the
# notebook's set_seed call) so the split does not depend on the state of any
# global RNG: re-running this cell reproduces the same train/test partition
# instead of silently producing a new one that then overwrites the saved files.
dataset = dataset.train_test_split(test_size=0.01, seed=42)

apply chat template (num_proc=64):   0%|          | 0/50661 [00:00<?, ? examples/s]

In [6]:
# Persist both splits as line-delimited JSON records. The held-out 'test' split
# is written to val.json; the last call's return value is shown as cell output.
dataset['train'].to_json(
    './datasets/sft/train.json',
    lines=True,
    orient="records",
)
dataset['test'].to_json(
    './datasets/sft/val.json',
    lines=True,
    orient="records",
)

Creating json from Arrow format:   0%|          | 0/51 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

746381

In [7]:
# Reload the exported files through the "json" loader; the file written from the
# 'test' split comes back under the split name "valid".
dataset = load_dataset("json", data_files={
    "train": "./datasets/sft/train.json",
    "valid": "./datasets/sft/val.json",
})
# Show the resulting DatasetDict (split names, features, row counts).
dataset

Generating train split: 0 examples [00:00, ? examples/s]

Generating valid split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 50154
    })
    valid: Dataset({
        features: ['text'],
        num_rows: 507
    })
})

In [9]:
# NOTE(review): this rebinds the name SFTConfig. The first cell already imports
# SFTConfig from trl, so from this point on `SFTConfig` refers to the class from
# the `sftconfig` module instead (module not shown here; presumably a
# project-local file — confirm it is available next to the notebook).
from sftconfig import SFTConfig

In [11]:
# Training arguments: no overrides are passed, so every value comes from the
# defaults of whichever SFTConfig class is currently bound.
sft_config = SFTConfig()

# Collator configured with the user/assistant turn prefixes as its instruction
# and response templates (presumably so only assistant completions contribute
# to the loss — confirm against the TRL documentation).
collator = DataCollatorForCompletionOnlyLM(
    tokenizer=tokenizer,
    instruction_template='<|im_start|>user\n',
    response_template='<|im_start|>assistant\n',
)

# Wire model, LoRA config, data and collator into the SFT trainer.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=sft_config,
    peft_config=peft_config,
    data_collator=collator,
    train_dataset=dataset["train"],
    eval_dataset=dataset["valid"],
)

PyTorch: setting up devices
PyTorch: setting up devices
Using auto half precision backend


In [None]:
# Launch the fine-tuning run using the trainer assembled in the previous cell;
# the call's return value becomes the cell output.
trainer.train()