In [None]:
!pip install accelerate transformers peft bitsandbytes datasets --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m487.4/487.4 kB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m26.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
from datasets import load_dataset, Dataset
import torch
import pandas as pd

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling
)

from peft import (
    LoraConfig,
    get_peft_model,
    PeftModel
)


In [None]:
class LoRAFineTuner:
  """
  End-to-end LoRA fine-tuning pipeline for a 4-bit quantized causal language model.

  The stages are meant to be called in this order (``run`` does exactly that):
  ``load_tokenizer`` -> ``load_model`` -> ``apply_lora`` ->
  ``load_and_tokenize_dataset`` -> ``train`` -> ``save_model``.
  Each stage stores its result on the instance; stages that need an earlier
  result raise ``RuntimeError`` if it is missing.
  """

  def __init__(self, model_name, dataset_name, output_dir):
    """
    Store the run configuration. Nothing is downloaded or loaded here.

    Args:
      model_name: model identifier passed to ``from_pretrained``.
      dataset_name: dataset identifier passed to ``load_dataset``.
      output_dir: directory used for training outputs and the saved adapter.
    """
    print("Params Initialized")
    self.model_name = model_name
    self.dataset_name = dataset_name
    self.output_dir = output_dir
    self.tokenizer = None       # set by load_tokenizer()
    self.model = None           # set by load_model(), wrapped by apply_lora()
    self.tokenized_data = None  # set by load_and_tokenize_dataset()

  def _require(self, attr, loader):
    """
    Raise RuntimeError if ``self.<attr>`` is still None, naming the stage to run first.
    """
    if getattr(self, attr) is None:
      raise RuntimeError(f"{attr} is not set; call {loader}() first")

  @staticmethod
  def _format_qa(question, answer):
    """
    Build the single training string used for a question/answer pair.
    """
    return f"question: {question} answer: {answer}"

  def load_tokenizer(self):
    """
    Load the tokenizer for ``model_name`` and store it in ``self.tokenizer``.
    """
    self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True)
    # Reuse EOS as the padding token so batches can be padded by the collator.
    self.tokenizer.pad_token = self.tokenizer.eos_token

  def load_model(self):
    """
    Load ``model_name`` with 4-bit quantization and store it in ``self.model``.
    """
    # 4-bit NF4 weights with double quantization; matmuls are computed in bfloat16.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit = True,
        bnb_4bit_use_double_quant = True,
        bnb_4bit_quant_type = "nf4",
        bnb_4bit_compute_dtype = torch.bfloat16
    )

    # Load quantized model; device_map {"": 0} places the whole model on device 0.
    self.model = AutoModelForCausalLM.from_pretrained(
        self.model_name,
        device_map ={"":0},
        trust_remote_code=True,
        quantization_config = bnb_config
    )

  def apply_lora(self):
    """
    Wrap ``self.model`` with LoRA adapters and print the trainable parameter count.

    Raises:
      RuntimeError: if ``load_model`` has not been called.
    """
    self._require("model", "load_model")

    # Rank-16 adapters on the query and value projections only.
    config = LoraConfig(
          r=16,
          lora_alpha=32,
          target_modules=["q_proj", "v_proj"],
          lora_dropout=0.05,
          bias="none",
          task_type="CAUSAL_LM"
    )

    # Applied LoRA on quantized model
    self.model = get_peft_model(self.model, config)
    self.model.print_trainable_parameters()

  def load_and_tokenize_dataset(self):
    """
    Load the "main" config / train split of ``dataset_name``, build a ``text``
    column, tokenize it and store the result in ``self.tokenized_data``.

    If the dataset has ``question`` and ``answer`` columns they are combined
    into one string per row; otherwise the first column is used as the text.

    Raises:
      RuntimeError: if ``load_tokenizer`` has not been called.
    """
    self._require("tokenizer", "load_tokenizer")

    data = load_dataset(self.dataset_name, "main", split="train")

    data_df = data.to_pandas()
    print(data_df.head())

    text_column = data_df.columns[0]  # Fallback text source: the first column
    print(text_column)

    if "question" in data_df.columns and "answer" in data_df.columns:
      data_df["text"] = data_df.apply(
          lambda row: self._format_qa(row["question"], row["answer"]), axis=1
      )
    else:
      data_df["text"] = data_df[text_column]

    # Convert back to huggingface dataset
    data = Dataset.from_pandas(data_df)

    # Tokenize dataset (truncates to the tokenizer's maximum length, no padding here)
    def tokenize_function(examples):
      return self.tokenizer(examples["text"], truncation=True)

    self.tokenized_data = data.map(tokenize_function, batched=True)

  def train(self, epochs: int = 1, batch_size: int = 4, learning_rate: float = 2e-4, max_steps: int= 1000, push_to_hub: bool = True):
    """
    Fine-tune the LoRA-wrapped model and write the final adapter to ``output_dir``.

    Args:
      epochs: number of training epochs. Only takes effect when ``max_steps``
        is not positive, because a positive ``max_steps`` overrides it.
      batch_size: per-device training batch size.
      learning_rate: peak learning rate for the cosine schedule.
      max_steps: total number of training steps; pass -1 to train by ``epochs``.
      push_to_hub: forwarded to ``TrainingArguments``. Defaults to True to keep
        the previous behavior; this requires Hugging Face Hub credentials.

    Raises:
      RuntimeError: if the model, tokenizer or tokenized dataset is missing.
    """
    self._require("model", "load_model")
    self._require("tokenizer", "load_tokenizer")
    self._require("tokenized_data", "load_and_tokenize_dataset")

    training_arguments = TrainingArguments(
        output_dir = self.output_dir,
        per_device_train_batch_size = batch_size,
        gradient_accumulation_steps = 1,
        learning_rate = learning_rate,
        lr_scheduler_type = "cosine",
        save_strategy = "epoch",
        logging_steps = 100,
        max_steps=max_steps,
        num_train_epochs = epochs,
        push_to_hub= push_to_hub,
        # The PEFT wrapper hides the base model's arguments, so Trainer cannot
        # infer the label column; naming it avoids the "No label_names" warning.
        label_names = ["labels"],
        report_to = "none"
    )

    trainer = Trainer(
        model = self.model,
        args = training_arguments,
        train_dataset = self.tokenized_data,
        # mlm=False: causal-LM objective, labels are built from input_ids.
        data_collator = DataCollatorForLanguageModeling(self.tokenizer, mlm=False)
    )

    trainer.train()

    # Write the trained adapter to the root of output_dir so save_model() can
    # load it from there regardless of checkpointing or Hub settings.
    trainer.model.save_pretrained(self.output_dir)


  def save_model(self):
    """
    Merge the trained adapter into a full-precision base model and save it.

    The adapter is read from ``output_dir``; the merged model (and the
    tokenizer, when one is loaded) is written to ``<output_dir>/merged``.

    Returns:
      The directory the merged model was written to.
    """
    import os  # local import: only this method needs it

    # Reload the base model without quantization so the adapter weights can be
    # folded into ordinary float32 weights.
    base_model = AutoModelForCausalLM.from_pretrained(self.model_name, trust_remote_code=True, torch_dtype=torch.float32)
    peft_model = PeftModel.from_pretrained(base_model, self.output_dir)
    merged_model = peft_model.merge_and_unload()

    print("saving the model")

    merged_dir = os.path.join(self.output_dir, "merged")
    merged_model.save_pretrained(merged_dir)
    if self.tokenizer is not None:
      # Keep the tokenizer next to the weights so the folder loads on its own.
      self.tokenizer.save_pretrained(merged_dir)

    return merged_dir


  def run(self):
    """
    Run the whole pipeline with default training settings, printing progress
    after each stage.
    """
    print("starting finetunine process")
    self.load_tokenizer()
    print("tokenizer loaded")

    self.load_model()
    print("model loaded")

    self.apply_lora()
    print("lora applied")

    self.load_and_tokenize_dataset()
    print("dataset loaded and tokenized")

    self.train()
    print("model trained")

    self.save_model()
    print("model saved")

In [None]:
# Run configuration.
model_name = "microsoft/phi-1_5"   # base checkpoint to fine-tune
dataset_name = "gsm8k"             # dataset identifier given to load_dataset
output_dir = "phi-1_5-finetuned"   # where training outputs are written

In [None]:
# Build the pipeline object; the constructor only stores these settings.
fine_tuner = LoRAFineTuner(
    model_name=model_name,
    dataset_name=dataset_name,
    output_dir=output_dir,
)

Params Initialized


In [None]:
# Execute every stage in order: tokenizer, quantized model, LoRA, dataset, training, save.
fine_tuner.run()

starting finetunine process


tokenizer_config.json:   0%|          | 0.00/237 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

tokenizer loaded


config.json:   0%|          | 0.00/736 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.84G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/74.0 [00:00<?, ?B/s]

model loaded
trainable params: 3,145,728 || all params: 1,421,416,448 || trainable%: 0.2213
lora applied


README.md:   0%|          | 0.00/7.94k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/419k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]

                                            question  \
0  Natalia sold clips to 48 of her friends in Apr...   
1  Weng earns $12 an hour for babysitting. Yester...   
2  Betty is saving money for a new wallet which c...   
3  Julie is reading a 120-page book. Yesterday, s...   
4  James writes a 3-page letter to 2 different fr...   

                                              answer  
0  Natalia sold 48/2 = <<48/2=24>>24 clips in May...  
1  Weng earns 12/60 = $<<12/60=0.2>>0.2 per minut...  
2  In the beginning, Betty has only 100 / 2 = $<<...  
3  Maila read 12 x 2 = <<12*2=24>>24 pages today....  
4  He writes each friend 3*2=<<3*2=6>>6 pages a w...  
question


Map:   0%|          | 0/7473 [00:00<?, ? examples/s]

dataset loaded and tokenized


No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
100,1.1538
200,1.0633
300,1.0324
400,1.0411
500,1.0449
600,1.0245
700,0.9811
800,1.0111
900,1.0255
1000,1.0244


model trained
saving the model
model saved
