<a href="https://colab.research.google.com/github/Amitgm/LLm/blob/main/Fine%20Tuning/Zephyr%20Fine%20Tuning/Zephyr_Qlora_Fine_Tuned.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the fine-tuning stack quietly: trl, transformers and accelerate are upgraded to
# their latest releases, and peft is installed from its GitHub repository.
# NOTE(review): nothing here is version-pinned, so a later re-run may pull releases that
# behave differently from the ones that produced the outputs below.
!pip install -q -U trl transformers accelerate git+https://github.com/huggingface/peft.git
# Dataset loading, bitsandbytes quantization, and supporting packages.
!pip install -q datasets bitsandbytes einops wandb

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m245.8/245.8 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.3/9.3 MB[0m [31m64.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m314.1/314.1 kB[0m [31m29.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m45.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m103.4/103.4 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.3/21.3 MB[0m [31m56.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m14.0 MB/s[0

**IMPORTING THE NECESSARY LIBRARIES**

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, Trainer
from peft import LoraConfig, PeftModel, PeftConfig, get_peft_model, prepare_model_for_kbit_training
import torch
from datasets import load_dataset
import os
import numpy as np
import transformers

**CONFIGURATION FOR QUANTIZATION, LORA AND TRAINING ARGUMENTS**

In [None]:
class Config:
  """Central settings for model loading, LoRA adapters, and training.

  Every value is a plain class attribute so it can be read either from the
  class itself or from an instance (``Config().r``).
  """
  # MODEL AND DATASET
  model_name = "HuggingFaceH4/zephyr-7b-alpha"
  dataset_name = "SoorajK1/questions_and_answers"
  # 4-BIT QUANTIZATION (BITSANDBYTES) CONFIGURATIONS
  load_in_4bit = True
  bnb_4bit_quant_type = "nf4"
  bnb_4bit_use_double_quant = True
  bnb_4bit_compute_dtype = torch.float16
  # LORA CONFIGURATIONS
  r = 16
  lora_alpha = 32
  lora_dropout = 0.05
  target_modules = ["q_proj", "v_proj"]
  bias = "none"
  task_type = "CAUSAL_LM"
  # TRAINING CONFIGURATIONS
  return_sequences = 1
  per_device_train_batch_size = 2
  epochs = 1
  learning_rate = 2e-4
  weight_decay = 0.001
  gradient_accumulation_steps = 4
  # No trailing comma here: a trailing comma would turn the value into a 1-tuple.
  optim = "paged_adamw_8bit"
  lr_scheduler_type = "cosine"
  MAX_SEQ_LENGTH = 512
  output_dir = "Experiments"
  fp16 = True
  # No trailing comma here either; the value must stay a float.
  warmup_ratio = 0.05
  report_to = "tensorboard"



**ZEPHYR TRAINER CLASS: MODEL LOADING, DATASET PREPROCESSING AND TRAINING**

In [None]:
class Zephyr_trainer:
  """Loads a 4-bit quantized Zephyr model and fine-tunes it with LoRA adapters.

  All settings are read from ``Config``. Typical use: construct the trainer,
  call ``load_dataset()``, map ``dataset_preprocess`` over the training split,
  then pass the result to ``model_train()``.
  """

  def __init__(self):

    """
    TRAINER IS USED TO TRAIN THE ZEPHYR ALPHA MODEL FOR 7 BILLION PARAMETERS

    Loads the tokenizer and the quantized base model named by
    ``Config.model_name``; the whole model is placed on device 0.
    """
    self.config = Config()

    bite_and_bytes_config = BitsAndBytesConfig(
        load_in_4bit = self.config.load_in_4bit,
        bnb_4bit_quant_type = self.config.bnb_4bit_quant_type,
        bnb_4bit_use_double_quant = self.config.bnb_4bit_use_double_quant,
        bnb_4bit_compute_dtype = self.config.bnb_4bit_compute_dtype,
    )

    self.tokenizer = AutoTokenizer.from_pretrained(self.config.model_name)

    # The data collator pads batches, which needs a pad token. Fall back to EOS
    # only when the tokenizer does not already define one.
    if self.tokenizer.pad_token is None:
      self.tokenizer.pad_token = self.tokenizer.eos_token

    self.model = AutoModelForCausalLM.from_pretrained(
        self.config.model_name,
        device_map={"":0},
        trust_remote_code=True,
        quantization_config=bite_and_bytes_config,
    )


  def dataset_preprocess(self,example):
    """Build the chat prompt for one record and tokenize it.

    Args:
      example: mapping with string fields ``"question"`` and ``"answer"``.

    Returns:
      The tokenizer output for the full prompt, truncated to
      ``Config.MAX_SEQ_LENGTH`` tokens.
    """
    processed_example = "<|system|>\n You are a support chatbot who helps with user queries chatbot who always responds in the style of a professional.\n<|user|>\n" + example["question"] + "\n<|assistant|>\n" + example["answer"]

    # truncation=True only takes effect when a max_length is supplied.
    # Padding is left to the data collator, which pads each batch at training time.
    tokenized_full_prompt = self.tokenizer(
        processed_example,
        truncation=True,
        max_length=self.config.MAX_SEQ_LENGTH,
    )

    return tokenized_full_prompt

  def load_dataset(self):
    """Load and return the dataset named by ``Config.dataset_name``."""
    dataset = load_dataset(self.config.dataset_name)

    return dataset


  def model_train(self,data,max_steps=60):
    """Attach LoRA adapters to the quantized model and fine-tune on ``data``.

    Args:
      data: tokenized training dataset (the output of mapping
        ``dataset_preprocess`` over a split).
      max_steps: number of optimizer steps to run. When set, the Trainer uses
        it in place of ``num_train_epochs``. Defaults to 60.

    Returns:
      The PEFT-wrapped model after training.
    """
    lora_config = LoraConfig(
        r=self.config.r,
        lora_alpha=self.config.lora_alpha,
        target_modules=self.config.target_modules,
        lora_dropout=self.config.lora_dropout,
        bias=self.config.bias,
        task_type=self.config.task_type
    )

    # Gradient checkpointing is enabled before the k-bit preparation step.
    self.model.gradient_checkpointing_enable()

    model = prepare_model_for_kbit_training(self.model)

    model = get_peft_model(model,lora_config)

    training_args = TrainingArguments(
        per_device_train_batch_size = self.config.per_device_train_batch_size,
        gradient_accumulation_steps = self.config.gradient_accumulation_steps,
        learning_rate = self.config.learning_rate,
        max_steps = max_steps,
        output_dir=self.config.output_dir,
        lr_scheduler_type = self.config.lr_scheduler_type,
        num_train_epochs = self.config.epochs,
        logging_steps=1,
        optim = "paged_adamw_8bit",
        save_total_limit = 3,
        fp16=self.config.fp16,
        report_to = self.config.report_to

    )

    trainer = Trainer(
        model=model,
        train_dataset=data,
        args=training_args,
        # mlm=False selects causal language modelling.
        data_collator = transformers.DataCollatorForLanguageModeling(tokenizer=self.tokenizer, mlm=False)
    )

    trainer.train()

    return model



In [None]:
# Build the trainer: this loads the tokenizer and the 4-bit quantized base model onto device 0.
zephyr_train = Zephyr_trainer()

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

**LOADING THE DATASET**

In [None]:
# LOADING THE DATASET
# Fetches the dataset named in Config.dataset_name; individual splits are selected by name below.
dataset = zephyr_train.load_dataset()

Downloading data:   0%|          | 0.00/8.86M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/891k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/896k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/29438 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2830 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2830 [00:00<?, ? examples/s]

In [None]:
# Shuffle the training split with a fixed seed, then run dataset_preprocess on every
# example to build the chat prompt and tokenize it.
data= dataset["train"].shuffle(seed=42).map(zephyr_train.dataset_preprocess)

Map:   0%|          | 0/29438 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Yes, phosphorus (DAP or SSP) application can be omitted in cotton if the preceding crop was wheat and it had received the recommended dose of phosphorus.
What type of devices are used to collect the data?
Smartphones are used to collect the data.
How can the fertility of soil be depleted?
The fertility of soil can be depleted through various factors and practices. Continuous cultivation of crops without adequate replenishment of organic matter or nutrients can exhaust the soil's fertility over time. Erosion, which can occur due to wind or water, can also wash away topsoil that contains essential nutrients. Additionally, the excessive use of chemical fertilizers, pesticides, and herbicides can disrupt the natural balance of soil microorganisms and reduce soil fertility in the long run. Soil compaction, caused by heavy machinery or livestock trampling, can also negatively impact soil fertility by reducing aeration and impai

**TRAINING THE ZEPHYR MODEL**

In [None]:
# Wrap the quantized model with LoRA adapters and fine-tune it on the tokenized data;
# the returned object is the PEFT-wrapped model.
model = zephyr_train.model_train(data)

max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss
1,2.2327
2,1.9925
3,1.7465
4,1.6911
5,1.4738
6,1.4294
7,1.0789
8,1.1083
9,0.9648
10,0.9388


In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): no cell visible in this notebook reads from or writes to the mount — confirm it is still needed.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Interactive Hugging Face login (renders a widget); run before the push_to_hub cell below.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

**PUSHING THE ZEPHYR MODEL**

In [None]:
# Upload the trained adapter to the Hub repository.
# `token=True` uses the credentials stored by notebook_login(); it replaces the
# deprecated `use_auth_token` argument.
model.push_to_hub(
    "Amitgm/Zephyr-qlora-chat-question-answer", token=True
)



adapter_model.safetensors:   0%|          | 0.00/27.3M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Amitgm/Zephyr-qlora-chat-question-answer/commit/6b78107f9dfa691703bea01d1bc69cc0ee04cc4a', commit_message='Upload model', commit_description='', oid='6b78107f9dfa691703bea01d1bc69cc0ee04cc4a', pr_url=None, pr_revision=None, pr_num=None)

**LOADING THE MODEL ALONG WITH ITS ADAPTERS**

In [None]:
# Hub repository holding the fine-tuned adapter (the same repository the model was pushed to above).
PEFT_MODEL = "Amitgm/Zephyr-qlora-chat-question-answer"

In [None]:
# Rebuild the quantization settings from Config so the base model is loaded
# for inference with the same 4-bit options the trainer used.
config = Config()

bite_and_bytes_config = BitsAndBytesConfig(**{
    option: getattr(config, option)
    for option in (
        "load_in_4bit",
        "bnb_4bit_quant_type",
        "bnb_4bit_use_double_quant",
        "bnb_4bit_compute_dtype",
    )
})

In [None]:
# Read the adapter's config from the Hub; it records which base model the adapter belongs to.
# NOTE: from here on `config` is the PeftConfig and no longer the Config() instance created earlier.
config = PeftConfig.from_pretrained(PEFT_MODEL)
# Load the base model with the 4-bit quantization settings built above.
model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    return_dict=True,
    quantization_config=bite_and_bytes_config,
    device_map="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Attach the fine-tuned adapter weights from PEFT_MODEL on top of the quantized base model.
model = PeftModel.from_pretrained(model, PEFT_MODEL)

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/27.3M [00:00<?, ?B/s]

In [None]:
# Work on the model's own generation config object; later cells set attributes on it directly.
generation_config = model.generation_config

In [None]:
# Display which base model the adapter config points to.
config.base_model_name_or_path

'HuggingFaceH4/zephyr-7b-alpha'

**DEFINING THE GENERATION CONFIG ARGUMENTS**

In [None]:
# MAXIMUM NEW TOKENS TO BE GENERATED
generation_config.max_new_tokens = 200
# ENABLE SAMPLING: TEMPERATURE IS ONLY APPLIED WHEN do_sample IS TRUE
generation_config.do_sample = True
# WHETHER THE MODEL NEEDS TO BE CREATIVE (LOW VALUE = CLOSE TO DETERMINISTIC)
generation_config.temperature = 0.1
# NUMBER OF OUTPUT SEQUENCES TO BE RETURNED
generation_config.num_return_sequences = 1
# GENERATE SEQUENCE UNTIL YOU GET TO THE END OF SEQUENCE TOKEN ID
generation_config.eos_token_id = tokenizer.eos_token_id
# PAD THE INPUT SEQUENCES WITH THE PAD TOKEN ID
generation_config.pad_token_id = tokenizer.pad_token_id

In [None]:
# Display the generation config.
generation_config

GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": 2
}

In [None]:
# Device that the tokenized prompt is moved to before generation.
device = "cuda:0"

**GENERATE RESPONSE USING THE PROMPT TEMPLATE**

In [None]:
def generate_reponse(question):
    """Generate the assistant's answer to ``question`` with the fine-tuned model.

    Builds the same system/user/assistant prompt that was used for fine-tuning,
    runs ``model.generate`` with the module-level ``generation_config``, and
    returns only the text produced after the prompt.

    Args:
        question: the user's question as plain text.

    Returns:
        The generated completion, decoded with special tokens removed and
        surrounding whitespace stripped.
    """
    prompt = "<|system|>\n You are a support chatbot who helps with user queries chatbot who always responds in the style of a professional.\n<|user|>\n" + question + "\n<|assistant|>\n"

    encoding = tokenizer(prompt, return_tensors="pt").to(device)

    outputs = model.generate(
        input_ids=encoding.input_ids,
        attention_mask=encoding.attention_mask,
        generation_config=generation_config,
    )

    # The returned sequence starts with the prompt tokens. Slicing them off by
    # length is more reliable than searching the decoded text for
    # "<|assistant|>", which misfires when the marker is missing or appears
    # inside the question.
    prompt_length = encoding.input_ids.shape[-1]
    completion_ids = outputs[0][prompt_length:]

    return tokenizer.decode(completion_ids, skip_special_tokens=True).strip()

In [None]:
# Try the fine-tuned model on a sample question.
generate_reponse("What is the purpose of the Android App mentioned in the text??")



'The purpose of the Android App mentioned in the text is to provide a user-friendly interface for the user to interact with the system. It allows the user to easily navigate through the system and access the information they need. The app is designed to be intuitive and easy to use, making it accessible to a wide range of users.'