In [1]:
import torch
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments)
from trl import SFTTrainer

  from .autonotebook import tqdm as notebook_tqdm


## Load the data

In [2]:
import pickle as pkl

In [3]:
# Load the pre-built train/validation splits from disk. Context managers make
# sure both file handles are closed as soon as unpickling finishes.
# NOTE(security): pickle can execute arbitrary code while loading; only
# unpickle files that you created yourself or otherwise trust.
with open("Dataset/ChatTrain.pkl", "rb") as train_file:
    train_data = pkl.load(train_file)

# The validation split is read from the "ChatTest" pickle.
with open("Dataset/ChatTest.pkl", "rb") as val_file:
    val_data = pkl.load(val_file)

# Show both objects as the cell output for a quick sanity check.
train_data, val_data

(Dataset({
     features: ['text'],
     num_rows: 8625
 }),
 Dataset({
     features: ['text'],
     num_rows: 455
 }))

## Configure bitsandbytes 4-bit quantization

In [4]:
# Name of the torch dtype used as the compute dtype for the 4-bit setup.
bnb_4bit_compute_dtype = "float16"
# Look the name up on the torch module to get the actual dtype object.
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

# Collect the quantization options first so they can be inspected or tweaked
# in one place, then build the config from them.
quantization_options = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_compute_dtype": compute_dtype,
    "bnb_4bit_use_double_quant": False,
}
bnb_config = BitsAndBytesConfig(**quantization_options)

## Load the model and tokenizer

In [5]:
# Identifier of the base checkpoint; it is passed to both the model and the
# tokenizer `from_pretrained` calls below.
model_name = "ibm-granite/granite-3.3-2b-base"

In [6]:
# Load the base checkpoint using the quantization config defined earlier;
# device placement is delegated to the loader via device_map="auto".
load_options = {
    "quantization_config": bnb_config,
    "device_map": "auto",
}
model = AutoModelForCausalLM.from_pretrained(model_name, **load_options)

# Config tweaks applied before training.
# NOTE(review): pretraining_tp looks carried over from Llama fine-tuning
# recipes — confirm it has any effect for this architecture.
model.config.pretraining_tp = 1
model.config.use_cache = False

Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████| 2/2 [00:09<00:00,  4.83s/it]


In [7]:
# Load the tokenizer that belongs to the base checkpoint. trust_remote_code is
# left at its default (disabled): the model itself is loaded from the same
# checkpoint without it, so no repository-supplied Python code should be
# needed here, and not opting in avoids executing code from the Hub.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token as the padding token and pad on the right.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

## Setup LoRA config

In [8]:
# Names of the modules that receive LoRA adapters.
lora_target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj"]

# LoRA hyperparameters for causal language modelling: rank 16, alpha 32,
# 5% adapter dropout, and no bias parameters trained.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=lora_target_modules,
)

In [9]:
# Prepare the quantized model for k-bit training, then wrap the result with
# the LoRA adapters described by peft_config.
# NOTE(review): this rebinds `model`, so the cell is not idempotent — re-run
# the model-loading cell first if this cell has to be executed again.
model = get_peft_model(prepare_model_for_kbit_training(model), peft_config)

## Setup SFT Trainer

In [10]:
training_arguments = TrainingArguments(
    # Where checkpoints are written, and how often to save / log (in steps).
    output_dir="./results",
    save_steps=50,
    logging_steps=50,
    report_to="none",
    # Training length: one pass over the data; max_steps=-1 leaves the
    # epoch count in charge.
    num_train_epochs=1,
    max_steps=-1,
    # Batching: 4 sequences per device x 8 accumulation steps = 32 sequences
    # per optimizer step on each device.
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,
    group_by_length=True,
    # Optimizer and learning-rate schedule.
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    weight_decay=0.001,
    max_grad_norm=0.3,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    # Mixed precision: fp16 on, bf16 off.
    fp16=True,
    bf16=False,
)

In [11]:
# Build the supervised fine-tuning trainer.
# - `model` has already been wrapped with get_peft_model, so no peft_config is
#   passed here; supplying it again for an already-wrapped model is redundant.
# - The tokenizer configured earlier (pad token, right padding) is passed
#   explicitly via `processing_class`; otherwise the trainer would build its
#   own tokenizer and those settings would not be used.
# - The validation split is attached so `trainer.evaluate()` can be called;
#   evaluation still only runs when explicitly requested or enabled in args.
trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset=train_data,
    eval_dataset=val_data,
    processing_class=tokenizer,
)

Adding EOS to train dataset: 100%|███████████████████████████████████████| 8625/8625 [00:00<00:00, 19201.17 examples/s]
Tokenizing train dataset: 100%|███████████████████████████████████████████| 8625/8625 [00:04<00:00, 1808.54 examples/s]
Truncating train dataset: 100%|█████████████████████████████████████████| 8625/8625 [00:00<00:00, 831954.37 examples/s]
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [12]:
# Run fine-tuning; the value returned by train() is shown as the cell output.
trainer.train()

  return fn(*args, **kwargs)


Step,Training Loss
50,0.7211
100,0.5708
150,0.5673
200,0.5525
250,0.5525


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


TrainOutput(global_step=270, training_loss=0.5862758530510797, metrics={'train_runtime': 4211.1922, 'train_samples_per_second': 2.048, 'train_steps_per_second': 0.064, 'total_flos': 9331275494965248.0, 'train_loss': 0.5862758530510797})

In [13]:
# Save the trained model held by the trainer to a local directory.
# Because the model was wrapped with get_peft_model, this presumably writes
# the adapter weights rather than a full copy of the base model — confirm.
# NOTE(review): the directory name reads "Ganite" — possibly a typo for
# "Granite"; left unchanged because other code may load from this path.
trainer.model.save_pretrained("Ganite-2b-finetuned")