# Load the data

In [13]:
import pickle as pkl

In [14]:
# Load the pickled train / validation splits.
# Context managers make sure both file handles are closed even if
# unpickling fails (the bare open(...) calls leaked the handles).
# NOTE(security): pickle.load can execute arbitrary code while
# deserializing; only load files that come from a trusted source.
with open("Dataset/ChatTrain.pkl", "rb") as train_file:
    train_data = pkl.load(train_file)

with open("Dataset/ChatTest.pkl", "rb") as val_file:
    val_data = pkl.load(val_file)

# Last expression of the cell: display both splits.
train_data, val_data

(Dataset({
     features: ['text'],
     num_rows: 8625
 }),
 Dataset({
     features: ['text'],
     num_rows: 455
 }))

In [1]:
import torch
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from trl import SFTTrainer

  from .autonotebook import tqdm as notebook_tqdm


In [20]:
# 2. Define Hyperparameters and Configuration
# Every tunable value of the notebook lives in this cell; later cells only
# read these names.
model_name = "ibm-granite/granite-3.3-2b-base" # Or any other model
# NOTE(review): "Ganite" looks like a typo for "Granite". The value is kept
# unchanged because it is the directory the adapter is saved to and may
# already be referenced elsewhere -- confirm before renaming.
new_model_name = "Ganite-2b-finetuned"      # The name for your fine-tuned model

# LoRA configuration
lora_r = 16
lora_alpha = 32
lora_dropout = 0.05

# Mixed-precision training flags (enable at most one of them)
fp16 = False # Set to False if using bf16
bf16 = True # Set to True for A100/H100

# BitsAndBytes configuration for 4-bit quantization
use_4bit = True
# The 4-bit compute dtype is derived from the precision flags so the
# quantized matmuls run in the same dtype the trainer uses. It used to be
# hard-coded to "float16" while bf16 training was enabled.
bnb_4bit_compute_dtype = "bfloat16" if bf16 else "float16"
bnb_4bit_quant_type = "nf4" # "nf4" or "fp4"
use_nested_quant = False

# TrainingArguments configuration
output_dir = "./results"
num_train_epochs = 1
per_device_train_batch_size = 1
per_device_eval_batch_size = 1
gradient_accumulation_steps = 8
gradient_checkpointing = True
max_grad_norm = 0.3
learning_rate = 2e-4
weight_decay = 0.001
optim = "paged_adamw_32bit"
lr_scheduler_type = "cosine"
max_steps = -1
warmup_ratio = 0.03
group_by_length = True
save_steps = 25
logging_steps = 25

# SFTTrainer configuration
max_seq_length = None
packing = False
device_map = {"": 0} # Or "auto"

In [6]:
# Build the 4-bit quantization settings used when loading the base model.
# The compute dtype is configured as a string name, so resolve it to the
# matching attribute of the torch module first.
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_use_double_quant=use_nested_quant,
    load_in_4bit=use_4bit,
)

In [7]:
# Load the base causal-LM checkpoint with the quantization config applied
# and placed according to device_map.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=device_map,
    quantization_config=bnb_config,
)

# Post-load config tweaks for fine-tuning.
# NOTE(review): pretraining_tp is presumably only read by some model
# families -- confirm it has an effect for this checkpoint.
model.config.pretraining_tp = 1
model.config.use_cache = False

Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████| 2/2 [00:09<00:00,  4.73s/it]


In [8]:
# Load the tokenizer that belongs to the base checkpoint.
# trust_remote_code is intentionally not enabled: the model itself is loaded
# without it, and the flag would allow code shipped in the model repository
# to run locally.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Fall back to EOS for padding only when the tokenizer has no pad token of
# its own, so an existing dedicated pad token is not overwritten.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Pad on the right side of each sequence.
tokenizer.padding_side = "right"

In [21]:
# 4. Configure PEFT (LoRA)
# Adapter settings: rank / alpha / dropout come from the hyperparameter cell;
# adapters are attached to the projection modules listed below and no bias
# terms are trained.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias="none",
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
    ],
)

In [10]:
# Prepare model for k-bit training
# Forward the configured gradient_checkpointing flag instead of relying on
# the helper's default, so the hyperparameter actually takes effect.
model = prepare_model_for_kbit_training(
    model,
    use_gradient_checkpointing=gradient_checkpointing,
)

# Wrap the prepared base model with the LoRA adapters described by
# peft_config.
# NOTE(review): this cell rebinds `model`, so it is not idempotent --
# reload the base model before running it a second time.
model = get_peft_model(model, peft_config)

In [22]:
# 5. Set up Training Arguments
# Every value comes from the hyperparameter cell. per_device_eval_batch_size
# and gradient_checkpointing were defined there but never passed on; they are
# forwarded now so the configured values are the ones actually used.
training_arguments = TrainingArguments(
    output_dir=output_dir,
    # Schedule length
    num_train_epochs=num_train_epochs,
    max_steps=max_steps,
    # Batch sizing and memory
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    gradient_checkpointing=gradient_checkpointing,
    group_by_length=group_by_length,
    # Optimizer and learning-rate schedule
    optim=optim,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    max_grad_norm=max_grad_norm,
    warmup_ratio=warmup_ratio,
    lr_scheduler_type=lr_scheduler_type,
    # Mixed precision
    fp16=fp16,
    bf16=bf16,
    # Checkpointing and logging
    save_steps=save_steps,
    logging_steps=logging_steps,
    report_to="none"
)

In [23]:
# 6. Initialize SFTTrainer
# `model` has already been wrapped with the LoRA adapters via
# get_peft_model(...), so peft_config is not passed a second time; handing an
# already-wrapped PeftModel plus a peft_config to the trainer is redundant.
# The tokenizer configured earlier (pad token, right padding) is passed
# explicitly so the trainer uses it instead of loading its own copy.
trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset=train_data,
    processing_class=tokenizer,
)


Adding EOS to train dataset: 100%|███████████████████████████████████████| 8625/8625 [00:00<00:00, 12653.26 examples/s]
Tokenizing train dataset: 100%|███████████████████████████████████████████| 8625/8625 [00:04<00:00, 1731.55 examples/s]
Truncating train dataset: 100%|█████████████████████████████████████████| 8625/8625 [00:00<00:00, 757292.69 examples/s]
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [24]:
# Train the model
# Runs the fine-tuning loop on the trainer built in the previous cell.
# The call is the last expression of the cell, so its return value is
# displayed as the cell output.
trainer.train()

  return fn(*args, **kwargs)


Step,Training Loss
25,1.2632


  return fn(*args, **kwargs)


KeyboardInterrupt: 

In [4]:

# 7. Persist the fine-tuned adapter
# Writes the trainer's model into the directory named by new_model_name.
trainer.model.save_pretrained(save_directory=new_model_name)

NameError: name 'model_name' is not defined