Data Preparation for llama-3-8b fine tuning


In [None]:
%%capture
!pip install datasets pandas openpyxl

In [None]:
import pandas as pd
from datasets import Dataset

In [None]:
# Read the training and evaluation QA pairs from their Excel workbooks
train_df, eval_df = (
    pd.read_excel(workbook)
    for workbook in ("/content/qa pairs.xlsx", "/content/qa_pairs_eval.xlsx")
)


In [None]:
# Load the evaluation QA pairs from the Excel workbook.
# NOTE(review): eval_df is already read from this same path by the loading
# cell above, so this cell repeats that read.
file_path = "/content/qa_pairs_eval.xlsx"
eval_df = pd.read_excel(file_path)

In [None]:
# Preview the first rows of the evaluation set
eval_df.head()

Unnamed: 0,Question,Answer
0,What are the visa requirements for non-EU stud...,Non-EU students must apply for a residence per...
1,How much money do I need to show for a student...,You must demonstrate that you have at least €5...
2,Can I extend my student visa after graduation ...,"Yes, you can apply for an extended residence p..."
3,Can I travel within the Schengen area with a F...,"Yes, a Finnish residence permit allows you to ..."
4,What happens if my student visa application is...,"If your application is denied, you can appeal ..."


In [None]:
# Display the first few rows of the training set to verify the data loaded as expected
train_df.head()

Unnamed: 0,Question,Answer
0,What is a University of Applied Sciences (UAS)...,A University of Applied Sciences in Finland is...
1,Are bachelor's degree programs at Finnish UAS ...,Many Finnish UAS institutions offer bachelor's...
2,What are the general admission requirements fo...,General requirements typically include a secon...
3,How much are the tuition fees for non-EU stude...,Tuition fees for non-EU/EEA students typically...
4,Can non-EU students work while studying at a F...,"Yes, non-EU students can work up to 30 hours p..."


In [None]:
# Check the column names (the prompt-formatting step reads 'Question' and 'Answer')
train_df.columns


Index(['Question', 'Answer'], dtype='object')

In [None]:
# Default system message for the Edvisor assistant. It is assembled from
# adjacent string literals so that source-code indentation and line breaks
# cannot leak into the text the model is trained on.
EDVISOR_SYSTEM_PROMPT = (
    "You are an AI assistant named Edvisor, a chatbot specializing in Finland "
    "Study and Visa Services for non-eu students interested in studying in finland. "
    "Provide accurate, helpful, and up-to-date information on user queries. "
    "For off-topic queries, politely inform the user that you specialize in "
    "Finland study and visa services."
)


def format_prompt(question, answer, system_prompt=EDVISOR_SYSTEM_PROMPT):
    """Render one QA pair as a single Llama 3 Instruct conversation string.

    The layout follows the Llama 3 chat format: every turn is
    ``<|start_header_id|>{role}<|end_header_id|>`` followed by a blank line,
    the trimmed message content and ``<|eot_id|>``; the whole conversation is
    prefixed with ``<|begin_of_text|>``.

    Args:
        question: The user's question.
        answer: The reference answer the assistant should give.
        system_prompt: System message for the conversation; defaults to the
            Edvisor instructions.

    Returns:
        The formatted training example containing the system, user and
        assistant turns.
    """
    def _turn(role, content):
        # str() tolerates non-string cells coming from the spreadsheet;
        # strip() removes stray surrounding whitespace from the content.
        return (
            f"<|start_header_id|>{role}<|end_header_id|>\n\n"
            f"{str(content).strip()}<|eot_id|>"
        )

    return (
        "<|begin_of_text|>"
        + _turn("system", system_prompt)
        + _turn("user", question)
        + _turn("assistant", answer)
    )

In [None]:
# Build the chat prompt for every QA pair in both splits and store it in a
# new 'prompt' column of each DataFrame
for qa_frame in (train_df, eval_df):
    qa_frame['prompt'] = [
        format_prompt(question, answer)
        for question, answer in zip(qa_frame['Question'], qa_frame['Answer'])
    ]

In [None]:
# Display the first formatted training prompt to verify the template
print(train_df['prompt'].iloc[0])

<|begin_of_text|><|start_header_id|>system<|end_header_id|>You are an AI assistant named Edvisor,
     a chatbot specializing in Finland Study and Visa Services for non-eu students interested in studying in finland.Provide accurate, helpful, and up-to-date information on  user queries.For off-topic queries, politely inform the user
    that you specialize in Finland study and visa services.
<|eot_id|><|start_header_id|>user<|end_header_id|>
What is a University of Applied Sciences (UAS) in Finland?
<|eot_id|><|start_header_id|>assistant<|end_header_id|>
A University of Applied Sciences in Finland is a higher education institution that offers practical, profession-oriented education. They focus on applied research and development, preparing students for professional roles in various fields.
<|eot_id|>


In [None]:
# Display the first formatted evaluation prompt to verify the template
print(eval_df['prompt'].iloc[0])

<|begin_of_text|><|start_header_id|>system<|end_header_id|>You are an AI assistant named Edvisor,
     a chatbot specializing in Finland Study and Visa Services for non-eu students interested in studying in finland.Provide accurate, helpful, and up-to-date information on  user queries.For off-topic queries, politely inform the user
    that you specialize in Finland study and visa services.
<|eot_id|><|start_header_id|>user<|end_header_id|>
What are the visa requirements for non-EU students to study in Finland?
<|eot_id|><|start_header_id|>assistant<|end_header_id|>
Non-EU students must apply for a residence permit for studies. This requires admission to a Finnish institution, proof of sufficient funds, and valid health insurance.
<|eot_id|>


In [None]:
# Convert the formatted training prompts to a Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df[['prompt']])

# Display info about the dataset (features and row count), as the
# evaluation-dataset cell does
print(train_dataset)

# Save the dataset to disk so the fine-tuning section can reload it
train_dataset.save_to_disk("/content/dataset/finland_qa_dataset")

print("Data preparation completed. The dataset is ready for fine-tuning.")

Dataset({
    features: ['prompt'],
    num_rows: 672
})


Saving the dataset (0/1 shards):   0%|          | 0/672 [00:00<?, ? examples/s]

Data preparation completed. The dataset is ready for fine-tuning.


In [None]:
# Convert the formatted evaluation prompts to a Hugging Face Dataset
eval_dataset = Dataset.from_pandas(eval_df[['prompt']])

# Display info about the dataset
print(eval_dataset)

# Save the dataset to disk; the fine-tuning section reloads it from this path
eval_dataset.save_to_disk("/content/dataset/finland_qa_eval_dataset")

Dataset({
    features: ['prompt'],
    num_rows: 55
})


Saving the dataset (0/1 shards):   0%|          | 0/55 [00:00<?, ? examples/s]

In [None]:
# Drop the in-memory datasets (both were saved to disk above) and run the
# garbage collector; gc.collect() returns the number of objects it collected
del train_dataset
del eval_dataset
import gc
gc.collect()

272

Fine-Tuning llama-3-8b-Instruct

In [None]:
# Installing all the necessary packages(libraries)
%%capture
!pip install --no-deps xformers trl peft accelerate bitsandbytes
!pip install "unsloth[colab-new] @ git+https://GitHub.com/unslothai/unsloth.git"

In [None]:
# Importing the necessary packages(libraries) for the experiments
import json
import torch
from datasets import load_from_disk
from huggingface_hub import notebook_login
from transformers import TrainingArguments
from trl import SFTTrainer
from unsloth import FastLanguageModel

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


In [None]:
# Reload the prompt datasets written to disk by the data-preparation section
train_dataset = load_from_disk("/content/dataset/finland_qa_dataset")
eval_dataset = load_from_disk("/content/dataset/finland_qa_eval_dataset")

In [None]:
# Detect whether a CUDA device is available and, only if one is, whether it
# supports bfloat16 (without CUDA, bf16 is reported as unsupported)
cuda_available = torch.cuda.is_available()
if cuda_available:
    bf16_supported = torch.cuda.is_bf16_supported()
else:
    bf16_supported = False

In [None]:
# Defining the configuration for the base model, LoRA and training.
# The three nested dicts are read by the model-loading / trainer cells below.
config = {
    # NOTE(review): this key is not referenced by any of the visible cells.
    "hugging_face_username":"Dpngtm",
    "model_config": {
        "base_model":"unsloth/llama-3-8b-Instruct-bnb-4bit", # The base model passed to FastLanguageModel.from_pretrained
        "finetuned_model":"llama-3-8b-Instruct-finetuned-edvisor-thesis", # Name used to save, push and reload the fine-tuned model
        "max_seq_length": 4096, # The maximum sequence length (model loader and SFTTrainer)
        "dtype":torch.bfloat16 if bf16_supported else torch.float16, # bfloat16 when supported, otherwise float16
        "load_in_4bit": True, # Load the model in 4-bit
    },
    "lora_config": {
      "r": 16, # The LoRA rank (typical choices: 8, 16, 32, 64)
      "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj"], # The modules that receive LoRA adapters
      "lora_alpha":32, # The alpha (scaling) value for LoRA
      "lora_dropout":0.05, # The dropout value for LoRA (Unsloth's log warns that non-zero dropout costs performance)
      "bias":"none", # The bias mode for LoRA
      "use_gradient_checkpointing":True, # Use gradient checkpointing
      "use_rslora":False, # Use RSLora
      "use_dora":False, # Use DoRa
      "loftq_config":None # The LoFTQ configuration
    },
    "training_config": {
        "num_train_epochs":10, # The number of training epochs
        "per_device_train_batch_size": 2, # The batch size per device
        "gradient_accumulation_steps": 4, # The gradient accumulation steps (2 x 4 = total batch size 8 in the training log)
        "warmup_steps": 20, # The warmup steps
        "max_steps":0, # The maximum steps; 0 leaves the length to num_train_epochs (the log shows 10 epochs / 840 steps)
        "learning_rate": 2e-4, # The learning rate
        "fp16": not bf16_supported, # Use fp16 exactly when bf16 is not supported
        "bf16": bf16_supported, # Use bf16 when the GPU supports it
        "logging_steps": 1, # Log the training loss every step
        "optim" :"adamw_8bit", # The optimizer
        "weight_decay" : 0.01,  # The weight decay
        "lr_scheduler_type": "linear", # The learning rate scheduler
        "seed" : 42, # The seed
        "output_dir" : "outputs", # The output directory
    }
}

In [None]:
# Loading the model and the tokenizer for the model.
# The config sections are indexed directly so that a missing key fails with a
# clear KeyError instead of an AttributeError on None from chained .get().
model_cfg = config["model_config"]
lora_cfg = config["lora_config"]
training_cfg = config["training_config"]

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_cfg["base_model"],
    max_seq_length = model_cfg["max_seq_length"],
    dtype = model_cfg["dtype"],
    load_in_4bit = model_cfg["load_in_4bit"],
)

# Setup for QLoRA/LoRA peft of the base model
model = FastLanguageModel.get_peft_model(
    model,
    r = lora_cfg["r"],
    target_modules = lora_cfg["target_modules"],
    lora_alpha = lora_cfg["lora_alpha"],
    lora_dropout = lora_cfg["lora_dropout"],
    bias = lora_cfg["bias"],
    use_gradient_checkpointing = lora_cfg["use_gradient_checkpointing"],
    # Take the seed from the config rather than repeating the literal, so the
    # adapter initialisation and the trainer always share one seed.
    random_state = training_cfg["seed"],
    use_rslora = lora_cfg["use_rslora"],
    use_dora = lora_cfg["use_dora"],
    loftq_config = lora_cfg["loftq_config"],
)

# Setting up the trainer for the model
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    eval_dataset = eval_dataset,
    dataset_text_field = "prompt",
    max_seq_length = model_cfg["max_seq_length"],
    dataset_num_proc = 2,
    packing = False,
    # Every key of training_config (including the seed) is a TrainingArguments
    # parameter, so the dict is unpacked instead of being copied key by key.
    args = TrainingArguments(**training_cfg),
)

==((====))==  Unsloth 2024.10.0: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: NVIDIA L4. Max memory: 22.168 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.1+cu121. CUDA = 8.9. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2024.10.0 patched 32 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


Map (num_proc=2):   0%|          | 0/672 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/55 [00:00<?, ? examples/s]

In [None]:
# Build the SFTTrainer on the 'prompt' text field of the train/eval datasets;
# TrainingArguments receives every key of config["training_config"] as a
# keyword argument.
# NOTE(review): the previous cell already constructs a trainer from the same
# model, datasets and config — confirm whether both constructions are needed.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    eval_dataset = eval_dataset,
    dataset_text_field = "prompt",
    max_seq_length = config["model_config"]["max_seq_length"],
    dataset_num_proc = 2,
    packing = False,
    args = TrainingArguments(**config["training_config"]),
)

In [None]:
# Memory statistics before training
def _to_gib(num_bytes):
    """Convert a byte count to GiB, rounded to two decimals."""
    return round(num_bytes / 1024**3, 2)

gpu_statistics = torch.cuda.get_device_properties(0)
# Peak memory reserved so far; used as the baseline after training
reserved_memory = _to_gib(torch.cuda.max_memory_reserved())
# Total memory of GPU 0
max_memory = _to_gib(gpu_statistics.total_memory)
print(f"Reserved Memory: {reserved_memory}GB")
print(f"Max Memory: {max_memory}GB")

Reserved Memory: 11.46GB
Max Memory: 22.17GB


In [None]:
# Training the model; the value returned by trainer.train() is kept in trainer_stats
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 672 | Num Epochs = 10
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 840
 "-____-"     Number of trainable parameters = 41,943,040
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss
1,4.4614
2,4.4978
3,4.4534
4,4.3622
5,4.0535
6,3.8253
7,3.5676
8,3.1872
9,2.5999
10,2.4222


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


In [None]:
# Memory statistics after training.
# The pre-training baseline (reserved_memory) was taken from
# torch.cuda.max_memory_reserved(), so the post-training peak must come from
# the same counter; subtracting a "reserved" baseline from an "allocated" peak
# would mix two different measurements and understate the training overhead.
used_memory = round(torch.cuda.max_memory_reserved() / 1024**3, 2)
# Growth of the peak over the baseline, attributed to the fine-tuning run
used_memory_lora = round(used_memory - reserved_memory, 2)
# Both figures as a share of the GPU's total memory
used_memory_persentage = round((used_memory / max_memory) * 100, 2)
used_memory_lora_persentage = round((used_memory_lora / max_memory) * 100, 2)
print(f"Used Memory: {used_memory}GB ({used_memory_persentage}%)")
print(f"Used Memory for training(fine-tuning) LoRA: {used_memory_lora}GB ({used_memory_lora_persentage}%)")

Used Memory: 12.81GB (57.78%)
Used Memory for training(fine-tuning) LoRA: 1.35GB (6.09%)


In [None]:
# Logging into the Hugging Face Hub(with token); notebook_login() renders an
# interactive login widget in the cell output
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Saving the model using merged_16bit(float16): per the Unsloth log, the 4-bit
# base weights and the LoRA weights are merged to 16-bit before saving. The
# first argument is the finetuned_model name from the config, which the
# inference cell later passes to from_pretrained to reload the model.
model.save_pretrained_merged(config.get("model_config").get("finetuned_model"), tokenizer, save_method = "merged_16bit",)

Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 5.7G


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 32.6 out of 52.96 RAM for saving.


 34%|███▍      | 11/32 [00:00<00:01, 16.66it/s]We will save to Disk and not RAM now.
100%|██████████| 32/32 [00:24<00:00,  1.33it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Done.


In [None]:
import os

# Push the merged 16-bit model to the Hugging Face Hub.
# SECURITY: an access token must never be written into the notebook. It is
# read from the HF_TOKEN environment variable instead; when that is unset,
# None is passed so that the credentials stored by notebook_login() are
# expected to be used (huggingface_hub's default behaviour).
# NOTE: any token that was ever committed in this cell should be revoked.
model.push_to_hub_merged(
    config.get("model_config").get("finetuned_model"),
    tokenizer,
    save_method = "merged_16bit",
    token = os.environ.get("HF_TOKEN"),
)

Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 31.62 out of 52.96 RAM for saving.


100%|██████████| 32/32 [00:24<00:00,  1.31it/s]


Unsloth: Saving to organization with address Dpngtm/llama-3-8b-Instruct-finetuned-edvisor-thesis
Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Unsloth: Saving to organization with address Dpngtm/llama-3-8b-Instruct-finetuned-edvisor-thesis
Unsloth: Uploading all files... Please wait...


model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

Done.
Saved merged model to https://huggingface.co/None/llama-3-8b-Instruct-finetuned-edvisor-thesis


In [None]:
import gc

# Release the training-time model and trainer before loading the merged
# checkpoint. While they are alive they keep their GPU memory, and loading a
# second copy of the model then fails with "Some modules are dispatched on the
# CPU or the disk" because the quantized model no longer fits on the GPU.
# globals().pop(...) is used so the cell also works when the names are absent.
for _stale_name in ("trainer", "model"):
    globals().pop(_stale_name, None)
gc.collect()
torch.cuda.empty_cache()

# Loading the fine-tuned model and the tokenizer for inference
model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = config.get("model_config").get("finetuned_model"),
        max_seq_length = config.get("model_config").get("max_seq_length"),
        dtype = config.get("model_config").get("dtype"),
        load_in_4bit = config.get("model_config").get("load_in_4bit"),
    )

model.eval()

# Using FastLanguageModel for fast inference
FastLanguageModel.for_inference(model)

# NOTE(review): the whitespace of this prompt (line breaks in the system
# message, text after the assistant header) differs from the template used to
# build the training prompts — consider generating both from one template.
prompt = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>You are an AI assistant named Edvisor, a chatbot specializing in Finland Study and Visa Services for non-eu students interested in studying in finland.
Provide accurate, helpful, and up-to-date information on  user queries.
For off-topic queries, politely inform the user that you specialize in Finland study and visa services.
<|eot_id|><|start_header_id|>user<|end_header_id|>
What are the visa requirements for non-EU students to study in Finland?
<|eot_id|><|start_header_id|>assistant<|end_header_id|> """

# Tokenizing the input and generating the output.
# The prompt already starts with <|begin_of_text|>, so the tokenizer must not
# prepend its own special tokens (that would duplicate the BOS token).
inputs = tokenizer([prompt], return_tensors="pt", add_special_tokens=False).to("cuda")
outputs = model.generate(**inputs, max_new_tokens = 512, use_cache = True)
# Special tokens are kept in the decoded text so the turn structure stays visible
tokenizer.batch_decode(outputs, skip_special_tokens=False)

==((====))==  Unsloth 2024.10.0: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: NVIDIA L4. Max memory: 22.168 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.1+cu121. CUDA = 8.9. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


ValueError: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `load_in_8bit_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 

In [None]:
train_dataset = Dataset.from_pandas(train_df[['prompt']])
eval_dataset = Dataset.from_pandas(eval_df[['prompt']])

train_dataset.save_to_disk("/content/dataset/finland_qa_dataset")
eval_dataset.save_to_disk("/content/dataset/finland_qa_eval_dataset")