# Falcon 7B finetuning
Adapted from this tutorial: https://medium.com/@iamarunbrahma/fine-tuning-of-falcon-7b-large-language-model-using-qlora-on-mental-health-dataset-aa290eb6ec85

## Installation

In [1]:
!pip install trl transformers accelerate git+https://github.com/huggingface/peft.git -Uqqq
!pip install datasets bitsandbytes einops wandb -Uqqq

In [17]:

import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, GenerationConfig
from peft import LoraConfig, get_peft_model, PeftConfig, PeftModel, prepare_model_for_kbit_training
from trl import SFTTrainer
import warnings
# Install a blanket "ignore" filter: every Python warning raised after this
# point is suppressed, including deprecation notices from the libraries above.
warnings.filterwarnings("ignore")
     

In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoTokenizer
import json
import os
import bitsandbytes as bnb
import pandas as pd
import torch
import torch.nn as nn
import transformers
import pandas  as pd
from datasets import load_dataset



## Loading the dataset

In [3]:
# Location of the source CSV and of the re-encoded working copy.
# The working copy is written to and read from the SAME absolute path, so the
# cell no longer depends on the current working directory being /kaggle/working.
raw_csv_path = '/kaggle/input/book-corpus2/dataset.csv'
working_csv_path = '/kaggle/working/data2.csv'

# Re-save the source file as UTF-8 without the pandas index column.
df = pd.read_csv(raw_csv_path, encoding="utf-8")
df.to_csv(working_csv_path, encoding="utf-8", index=False)

# Load the working copy through the `datasets` CSV builder.
training_data = load_dataset('csv', data_files=working_csv_path)

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [4]:
# Display the summary of the 'train' split (feature names and row count).
training_data['train']

Dataset({
    features: ['train'],
    num_rows: 8
})

## Preparation for Training

In [5]:

# Sharded bf16 checkpoint of Falcon-7B used as the base model.
model_name = "ybelkada/falcon-7b-sharded-bf16"

# bitsandbytes quantization settings:
#   * weights are loaded in 4-bit precision,
#   * using the NF4 quantization type,
#   * with double quantization enabled,
#   * and bfloat16 as the dtype used for computation.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

# Load the causal-LM with the quantization settings above.
#   * device_map="auto": let HF Accelerate decide where each layer is placed.
#   * trust_remote_code=True: allow the custom model code of the checkpoint.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    device_map="auto",
    quantization_config=bnb_config,
)

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

In [6]:
# Load the tokenizer published with the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token # Setting pad_token same as eos_token

## Configure LoRA and create the PEFT model

In [7]:
# LoRA hyperparameters.
lora_rank = 8       # dimension of the low-rank matrices
lora_alpha = 16     # scaling factor for the weight matrices
lora_dropout = 0.1  # dropout probability of the LoRA layers

# Adapter configuration for causal language modelling. Bias terms are left
# untouched (bias="none"); LoRA is applied to the listed Falcon-7B modules.
peft_config = LoraConfig(
    r=lora_rank,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    target_modules=[
        "query_key_value",
        "dense",
        "dense_h_to_4h",
        "dense_4h_to_h",
    ],
    bias="none",
    task_type="CAUSAL_LM",
)

# Prepare the quantized model for k-bit training, then wrap it with the
# LoRA adapters described by `peft_config`.
model = prepare_model_for_kbit_training(model)
peft_model = get_peft_model(model, peft_config)

## Configuration Settings for TrainingArguments and Trainer:

In [8]:
# Interactive Hugging Face Hub login (renders a widget in the notebook).
# The training arguments further down set push_to_hub=True.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [9]:
from transformers import TrainingArguments
from trl import SFTTrainer
# ---- Training hyperparameters -------------------------------------------
output_dir = "/kaggle/working/falcon-7b-sharded-bf16-finetuned"  # checkpoints are written here
per_device_train_batch_size = 2   # halve this on out-of-memory errors
gradient_accumulation_steps = 32  # double this whenever the batch size is halved
optim = "paged_adamw_32bit"       # paged AdamW optimizer (better memory management)
save_strategy = "steps"           # save a checkpoint every `save_steps` update steps
save_steps = 10                   # number of update steps between two checkpoint saves
logging_steps = 10                # number of update steps between two logs
learning_rate = 2e-4              # initial learning rate for AdamW
max_grad_norm = 0.3               # maximum gradient norm (gradient clipping)
max_steps = 30                    # total number of training steps
warmup_ratio = 0.03               # fraction of the training steps used for linear warmup from 0 to learning_rate
lr_scheduler_type = "cosine"      # learning rate scheduler

training_arguments = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    # Previously `save_strategy` was defined but never passed on; its value is
    # the library default, so wiring it in does not change behaviour.
    save_strategy=save_strategy,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=True,
    lr_scheduler_type=lr_scheduler_type,
    push_to_hub=True,
)

# Supervised fine-tuning on the "train" text column of the dataset, with
# sequences limited to 100 tokens.
# NOTE(review): `peft_model` is already LoRA-wrapped; confirm that passing
# `peft_config` as well is intended for the installed TRL version.
trainer = SFTTrainer(
    model=peft_model,
    train_dataset=training_data['train'],
    peft_config=peft_config,
    dataset_text_field="train",
    max_seq_length=100,
    tokenizer=tokenizer,
    args=training_arguments,
)

Map:   0%|          | 0/8 [00:00<?, ? examples/s]

In [None]:
# Cast every sub-module whose qualified name contains "norm" to bfloat16.
# nn.Module.to() converts the module in place, so no re-assignment is needed.
# NOTE(review): if prepare_model_for_kbit_training already moved these layers
# to float32, this is a downcast rather than an upcast - confirm the intent.
for layer_name, layer in trainer.model.named_modules():
    if "norm" not in layer_name:
        continue
    layer.to(torch.bfloat16)

## Training

In [10]:
# Switch off the `use_cache` flag in the model config, then run the training
# loop configured in `trainer`.
peft_model.config.use_cache = False
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mmariambrakat7[0m ([33mllm_research_team[0m). Use [1m`wandb login --relogin`[0m to force relogin


You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
10,0.1799
20,0.0342
30,0.0044


TrainOutput(global_step=30, training_loss=0.072816697259744, metrics={'train_runtime': 305.9153, 'train_samples_per_second': 6.276, 'train_steps_per_second': 0.098, 'total_flos': 479461570560000.0, 'train_loss': 0.072816697259744, 'epoch': 30.0})

In [None]:
# Push the trainer's model to the Hugging Face Hub.
trainer.push_to_hub()

## Inference pipeline for PEFT model:

In [None]:

# Reload the untouched base checkpoint so it can be compared with the
# fine-tuned model.
model_name = "ybelkada/falcon-7b-sharded-bf16"

# 4-bit NF4 quantization with double quantization. The compute dtype here is
# float16 (the training cell used bfloat16).
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

# Base model, quantized on load and placed automatically across devices.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    device_map="auto",
    quantization_config=bnb_config,
)

# Matching tokenizer; the EOS token doubles as the padding token.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

In [20]:

# Directory holding the fine-tuned adapter (same path as the training
# `output_dir`).
PEFT_MODEL = "/kaggle/working/falcon-7b-sharded-bf16-finetuned"

# The adapter config names the base checkpoint to load.
config = PeftConfig.from_pretrained(PEFT_MODEL)

# Load that base checkpoint with the existing `bnb_config`, then attach the
# adapter weights from PEFT_MODEL on top of it.
peft_base_model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    quantization_config=bnb_config,
    trust_remote_code=True,
    device_map="auto",
    return_dict=True,
)
peft_model = PeftModel.from_pretrained(peft_base_model, PEFT_MODEL)

# Tokenizer of the base checkpoint; the EOS token doubles as the padding token.
peft_tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
peft_tokenizer.pad_token = peft_tokenizer.eos_token

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

In [15]:
from transformers import GenerationConfig
def generate_original_answer(query, max_new_tokens=100, do_sample=False):
    """Generate and print the base model's reply to ``query``.

    The query is embedded in a fixed instruction prompt with
    ``<HUMAN>``/``<ASSISTANT>`` markers, tokenized with the notebook-level
    ``tokenizer`` and completed with the notebook-level ``model``. The decoded
    text (prompt included) is printed between two dashed rules.

    Args:
        query: Question text inserted after ``<HUMAN>:``.
        max_new_tokens: Maximum number of tokens to generate (default 100,
            the value that was previously hard-coded).
        do_sample: When False (default) decoding is greedy, as before.
            When True, sampling is enabled with temperature=0.4 and top_p=0.6;
            those two settings only have an effect in sampling mode, so they
            are passed only in that case.

    Returns:
        None. The response is printed.
    """
    system_prompt = """Answer the following question truthfully.
    If you don't know the answer, respond 'Sorry, I don't know the answer to this question.'.
    If the question is too complex, respond 'Kindly, consult a professor.'."""

    user_prompt = f"""<HUMAN>: {query}
    <ASSISTANT>: """

    final_prompt = system_prompt + "\n" + user_prompt

    # Same 49-character rule the original join expression produced.
    dashline = "-" * 49

    # Put the inputs on whatever device the model lives on instead of
    # assuming "cuda:0".
    encoding = tokenizer(final_prompt, return_tensors="pt").to(model.device)

    # temperature / top_p are sampling-only options.
    sampling_options = {"temperature": 0.4, "top_p": 0.6} if do_sample else {}
    generation_config = GenerationConfig(
        max_new_tokens=max_new_tokens,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        do_sample=do_sample,
        repetition_penalty=1.3,
        num_return_sequences=1,
        **sampling_options,
    )

    # The attention mask is a model input, so it is passed to generate()
    # directly rather than being stored on the generation config.
    outputs = model.generate(
        input_ids=encoding.input_ids,
        attention_mask=encoding.attention_mask,
        generation_config=generation_config,
    )
    text_output = tokenizer.decode(outputs[0], skip_special_tokens=True)

    print(dashline)
    print(f'ORIGINAL MODEL RESPONSE:\n{text_output}')
    print(dashline)





In [21]:

def generate_peft_answer(query, max_new_tokens=256, do_sample=False):
    """Generate and print the fine-tuned (PEFT) model's reply to ``query``.

    The query is embedded in a fixed instruction prompt with
    ``<HUMAN>``/``<ASSISTANT>`` markers, tokenized with the notebook-level
    ``peft_tokenizer`` and completed with the notebook-level ``peft_model``.
    The decoded text (prompt included) is printed, followed by a dashed rule.

    Args:
        query: Question text inserted after ``<HUMAN>:``.
        max_new_tokens: Maximum number of tokens to generate (default 256,
            the value that was previously hard-coded).
        do_sample: When False (default) decoding is greedy, as before.
            When True, sampling is enabled with temperature=0.4 and top_p=0.6;
            those two settings only have an effect in sampling mode, so they
            are passed only in that case.

    Returns:
        None. The response is printed.
    """
    system_prompt = """Answer the following question truthfully.
    If you don't know the answer, respond 'Sorry, I don't know the answer to this question.'.
    If the question is too complex, respond 'Kindly, consult a professor.'."""

    user_prompt = f"""<HUMAN>: {query}
    <ASSISTANT>: """

    final_prompt = system_prompt + "\n" + user_prompt

    # Same 49-character rule the original join expression produced.
    dashline = "-" * 49

    # Put the inputs on whatever device the model lives on instead of
    # assuming "cuda:0".
    peft_encoding = peft_tokenizer(final_prompt, return_tensors="pt").to(peft_model.device)

    # temperature / top_p are sampling-only options.
    sampling_options = {"temperature": 0.4, "top_p": 0.6} if do_sample else {}
    generation_config = GenerationConfig(
        max_new_tokens=max_new_tokens,
        pad_token_id=peft_tokenizer.eos_token_id,
        eos_token_id=peft_tokenizer.eos_token_id,
        do_sample=do_sample,
        repetition_penalty=1.3,
        num_return_sequences=1,
        **sampling_options,
    )

    # The attention mask is a model input, so it is passed to generate()
    # directly rather than being stored on the generation config.
    peft_outputs = peft_model.generate(
        input_ids=peft_encoding.input_ids,
        attention_mask=peft_encoding.attention_mask,
        generation_config=generation_config,
    )
    peft_text_output = peft_tokenizer.decode(peft_outputs[0], skip_special_tokens=True)

    print(f'PEFT MODEL RESPONSE:\n{peft_text_output}')
    print(dashline)


In [13]:
import time

# Time one generation with the original model. perf_counter() is a monotonic,
# high-resolution clock, so the measured duration is not affected by system
# clock adjustments (unlike time.time()).
start_time = time.perf_counter()
generate_original_answer("generate a course outline for a deep learning course")
end_time = time.perf_counter()

inference_time = end_time - start_time
print(f"Inference Time: {inference_time} seconds")





-------------------------------------------------
ORIGINAL MODEL RESPONSE:
Answer the following question truthfully.
    If you don't know the answer, respond 'Sorry, I don't know the answer to this question.'.
    If the question is too complex, respond 'Kindly, consult a professor.'.
<HUMAN>: generate a course outline for a deep learning course
    <ASSISTANT>: (optional) generate a course outline for a machine learning course
Table of Contents
Introduction
What Will I Learn
How Will I Learn It
Who Is This Book For
Course Outline
Part One: Data Understanding
Data Types and Attributes
Numeric Attributes
Binary Attributes
Multivariate Attributes
Temporal Attributes
Types of Data Mining/Learning
Regression
Classification
Feature Engineering
Data Cleaning
Data Augmentation
Part Two: Data Preparation
Data Preprocessing

-------------------------------------------------
Inference Time: 66.00922060012817 seconds


In [22]:
# Imported here as well so this cell does not rely on an earlier cell having
# brought `time` into scope.
import time

# Time one generation with the fine-tuned model. perf_counter() is a
# monotonic, high-resolution clock, so the measured duration is not affected
# by system clock adjustments (unlike time.time()).
start_time = time.perf_counter()
generate_peft_answer("generate a course outline for a deep learning course")
end_time = time.perf_counter()

inference_time = end_time - start_time
print(f"Inference Time: {inference_time} seconds")

PEFT MODEL RESPONSE:
Answer the following question truthfully.
    If you don't know the answer, respond 'Sorry, I don't know the answer to this question.'.
    If the question is too complex, respond 'Kindly, consult a professor.'.
<HUMAN>: generate a course outline for a deep learning course
    <ASSISTANT>: (Human Resources) please review the course outline and provide feedback
Table of Contents
Introduction
What Makes Deep Learning So Powerful
How Does It Work
Types of Data that are Suitable for Deep Learning
The Feature Engineering Problem
Truly Understanding Your Data
A Gentle Start: A Brief History of Machine Learning
An In-Depth Look: The Statistical Framework
Understanding Logistic Regression
Understanding Multivariate Gaussians
Understanding Matrix Algebra
Practical Guide
Which Model Should I Use
Choosing Hyperparameters
Training on Validation Data
Monitoring Training Progress
Common Problems and Solutions
Further Reading
Appendix
Matrix Multiplication Is Composition of Funct