In [1]:
import os, json

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding, DataCollatorForLanguageModeling

from datasets import load_from_disk

%load_ext chime

In [2]:
# Directory holding the locally stored, already 4-bit-quantized checkpoint.
# Set the QUANTIZED_MODEL_PATH environment variable to run the notebook on
# another machine; the original location is kept as the default.
quantized_model_path = os.environ.get(
    "QUANTIZED_MODEL_PATH",
    '/home/denis/Models/LLM/Mistral-7B-Instruct-v0.3_quantized_4bit_compute_float32/',
)

In [3]:
# The quantization settings are stored as JSON next to the checkpoint;
# read them back and rebuild the BitsAndBytesConfig from those keys.
quant_config_file = os.path.join(quantized_model_path, "quant_config.json")
with open(quant_config_file) as config_stream:
    quant_config = json.load(config_stream)

bnb_config = BitsAndBytesConfig(**quant_config)

In [4]:
# Load the base model from the local checkpoint directory only
# (local_files_only=True), applying the quantization config read above and
# letting `device_map="auto"` decide the device placement.
model_load_options = dict(
    quantization_config=bnb_config,
    device_map="auto",
    local_files_only=True,
)
model = AutoModelForCausalLM.from_pretrained(quantized_model_path, **model_load_options)



In [5]:
# LoRA adapter settings for causal-LM fine-tuning: rank-8 adapters
# (lora_alpha is twice the rank) attached to the query and value projections
# only, with light dropout and no bias training.
lora_settings = {
    "task_type": "CAUSAL_LM",
    "target_modules": ["q_proj", "v_proj"],
    "r": 8,
    "lora_alpha": 16,
    "lora_dropout": 0.05,
    "bias": "none",
}
peft_config = LoraConfig(**lora_settings)

In [6]:
# Wrap the quantized base model with the LoRA adapters.
# Caution: this rebinds `model`, so re-running the cell wraps an already
# wrapped model -- restart from the model-loading cell instead.
# NOTE(review): PEFT's quantization guide suggests calling
# `prepare_model_for_kbit_training` before this step -- confirm whether it is
# needed for this checkpoint.
model = get_peft_model(model=model, peft_config=peft_config)

# Make the embedding outputs require gradients; presumably needed because the
# Trainer below runs with gradient_checkpointing=True -- TODO confirm.
model.enable_input_require_grads()

In [7]:
# Sanity check: report how many parameters are trainable versus the total.
model.print_trainable_parameters()

trainable params: 3,407,872 || all params: 7,251,431,424 || trainable%: 0.0470


In [8]:
# Show the quantization settings the loaded model actually reports, to verify
# they match what was read from quant_config.json.
print(model.config.quantization_config.to_dict())

{'quant_method': <QuantizationMethod.BITS_AND_BYTES: 'bitsandbytes'>, '_load_in_8bit': False, '_load_in_4bit': True, 'llm_int8_threshold': 6.0, 'llm_int8_skip_modules': None, 'llm_int8_enable_fp32_cpu_offload': False, 'llm_int8_has_fp16_weight': False, 'bnb_4bit_quant_type': 'nf4', 'bnb_4bit_use_double_quant': True, 'bnb_4bit_compute_dtype': 'float32', 'bnb_4bit_quant_storage': 'uint8', 'load_in_4bit': True, 'load_in_8bit': False}


In [9]:
# Tokenizer stored alongside the quantized checkpoint.
# The data collator used for training is built in the following cell; the
# padding-only collator that used to be created here was overwritten before
# any use and has been removed.
tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)

In [10]:
# Batch collator for language modeling with masked-LM disabled (mlm=False).
# NOTE(review): with mlm=False this collator appears to derive `labels` from
# `input_ids`, which would replace the dataset's own `labels` column --
# confirm this is intended.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [11]:
# Hyper-parameters for the first fine-tuning stage, grouped by concern.
training_args = TrainingArguments(
    # --- output location ---
    output_dir='training_1st_stage',
    overwrite_output_dir=True,
    save_strategy='no',  # no intermediate checkpoints; the model is saved manually at the end

    # --- what to run: training only, no evaluation ---
    do_train=True,
    do_eval=False,
    eval_strategy='no',
    prediction_loss_only=False,

    # --- optimisation ---
    num_train_epochs=1,
    learning_rate=2e-5,
    weight_decay=1e-4,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=2,  # 2 x 2 = 4 samples per optimizer step per device

    # --- precision / memory ---
    fp16=True,
    gradient_checkpointing=True,
    use_cpu=False,

    # --- data handling ---
    group_by_length=False,
    remove_unused_columns=False,
    label_names=["labels"],

    # --- logging ---
    logging_strategy='steps',
    logging_steps=4,
)

In [12]:
# Reload the first-stage training set that was saved to disk earlier,
# then display its summary (features and row count) as the cell output.
train_ds_dir = 'train_ds_1st_stage'
ds = load_from_disk(train_ds_dir)
ds

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 461
})

In [13]:
# Assemble the Trainer from the LoRA-wrapped model, the training arguments,
# the language-modeling collator and the training dataset.
# `processing_class` replaces the deprecated `tokenizer` argument of
# `Trainer.__init__`, which emitted a FutureWarning when this cell ran.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=ds,
    processing_class=tokenizer,
)

  trainer = Trainer(


In [14]:
%%chime
# Run the fine-tuning loop; the %%chime cell magic (loaded in the first cell)
# must stay on the first line of this cell.
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
4,0.8248
8,0.8031
12,0.7871
16,0.7545
20,0.7279
24,0.7035
28,0.677
32,0.6534
36,0.6267
40,0.603


In [15]:
# Persist the outcome of the first training stage: the model through the
# Trainer and the tokenizer files into the same directory.
fine_tuned_dir = "./model/fine_tuned_1st_stage"
trainer.save_model(fine_tuned_dir)

tokenizer.save_pretrained(fine_tuned_dir)