

In this section, we will fine-tune a Llama 2 model with 7 billion parameters on a T4 GPU with high RAM using Google Colab (2.21 credits/hour). Note that a T4 only has 16 GB of VRAM, which is barely enough to store Llama 2-7b’s weights (7b × 2 bytes = 14 GB in FP16). In addition, we need to consider the overhead due to optimizer states, gradients, and forward activations (see this excellent article for more information). This means that a full fine-tuning is not possible here: we need parameter-efficient fine-tuning (PEFT) techniques like LoRA or QLoRA.

To drastically reduce the VRAM usage, we must fine-tune the model in 4-bit precision, which is why we’ll use QLoRA here. The good thing is that we can leverage the Hugging Face ecosystem with the transformers, accelerate, peft, trl, and bitsandbytes libraries.


In [None]:
%%capture
%pip install transformers peft accelerate bitsandbytes trl datasets


In [None]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging
)
from peft import LoraConfig,PeftModel
from trl import SFTTrainer
from huggingface_hub import notebook_login
# Prompt for a Hugging Face access token so that the Hub downloads and uploads
# performed later in this notebook are authenticated.
notebook_login()

In [None]:
# Hub id of the pretrained checkpoint that will be fine-tuned.
base_model = "meta-llama/Llama-2-7b-chat-hf"


# Hub id of the training dataset (its 'train' split is loaded in the next cell).
nepali_dataset = "Chhabi/testing-dataset-llama2-nepali-health"

# Name of the local directory the trained model is saved to after training.
new_model = "testing1.1-llama2-nepali-health-model"


In [None]:
# Load only the 'train' split; the trainer below reads its "text" field.
dataset = load_dataset(nepali_dataset,split='train')


In [None]:
# Quantization settings used when loading the base model for QLoRA:
# weights are stored in 4-bit NF4, computation runs in FP16.
compute_dtype = torch.float16
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,  # load the model weights in 4-bit precision
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=compute_dtype,
)

In [None]:
# Load the base checkpoint with the 4-bit quantization config defined earlier.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=quant_config,
    device_map={"": 0},  # the "" key maps the whole model onto a single device: GPU 0
)
# Disable the generation cache while training.
# This must be a plain bool: a trailing comma here would assign the truthy
# tuple `(False,)` and leave caching effectively enabled.
model.config.use_cache = False
# `pretraining_tp` is the tensor-parallelism degree used during pretraining
# (not a temperature); 1 selects the regular, non-sliced linear computation.
model.config.pretraining_tp = 1

In [None]:
# Load the tokenizer that belongs to the base checkpoint.
# SECURITY: no access token is passed here. Authentication relies on the
# credentials stored by notebook_login(); a token must never be hard-coded in a
# notebook (any token that was previously committed should be revoked).
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
# Use the end-of-sequence token for padding and pad on the right.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [None]:
# LoRA adapter configuration for causal-language-model fine-tuning.
peft_params = LoraConfig(
    task_type="CAUSAL_LM",
    r=64,  # rank of the low-rank update matrices
    lora_alpha=16,  # scaling factor of the LoRA update
    lora_dropout=0.1,
    bias="none",
)

In [None]:
# Hyper-parameters for the fine-tuning run. Checkpoints and TensorBoard logs are
# written under output_dir.
training_params = TrainingArguments(
    output_dir="./results/nepali-health-llama-finetuning",
    num_train_epochs=1,  # a single pass over the training split
    per_device_train_batch_size=1,  # raise if more GPU memory is available, otherwise keep small
    gradient_accumulation_steps=8,  # effective batch size per device = 1 * 8; lower it if the batch size is raised
    optim="paged_adamw_32bit",
    save_steps=500,
    logging_steps=50,
    save_total_limit=2,  # keep at most two checkpoints on disk

    learning_rate=2e-4,
    weight_decay=0.1,
    fp16=True,  # mixed-precision training in FP16 ...
    bf16=False,  # ... and not BF16
    max_grad_norm=0.3,  # gradient clipping threshold
    max_steps=-1,  # -1: the run length is determined by num_train_epochs
    warmup_ratio=0.03,  # NOTE(review): the "constant" scheduler below presumably applies no warmup — confirm, or use "constant_with_warmup"
    group_by_length=True,  # batch samples of similar length together
    lr_scheduler_type="constant",
    report_to="tensorboard"
)

In [None]:
# Supervised fine-tuning trainer: wires together the quantized model, the LoRA
# configuration, the dataset and the training arguments.
trainer = SFTTrainer(
    # What is trained and how.
    model=model,
    tokenizer=tokenizer,
    peft_config=peft_params,
    args=training_params,
    # What it is trained on.
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=512,  # lower this to save memory; None can be used when memory allows
    packing=False,
)


In [None]:
# Run the fine-tuning loop with the trainer configured in the previous cell.
trainer.train()


In [None]:
# Show TensorBoard inline, pointed at the "runs" folder inside the training output directory.
%load_ext tensorboard
%tensorboard --logdir results/nepali-health-llama-finetuning/runs

In [None]:
# Save the trained model held by the trainer into the local directory named by
# `new_model` (this is the folder that is copied to Google Drive afterwards).
trainer.model.save_pretrained(new_model)


In [None]:
# Quick sanity check of the freshly trained model: wrap a Nepali health question
# in the [INST] ... [/INST] chat format and print the generated text.
prompt = "के शिशुहरूमा जटिल जन्मजात हृदय रोगको स्थायी समाधान छ? यो अवस्था भएको 8 महिनाको बच्चाको आहार कस्तो हुनुपर्छ?"
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=200,
)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])

## IMPORTANT:
After training has completed, copy the saved adapter folder to Google Drive (Drive must be mounted at `/content/drive`):
```
cp -r /content/testing1.1-llama2-nepali-health-model /content/drive/MyDrive/ColabFolder/
```
Then restart the session and copy the adapter folder from Google Drive back to Colab:

```
cp -r /content/drive/MyDrive/ColabFolder/testing1.1-llama2-nepali-health-model/  /content/
```


In [None]:
# This cell is meant to run in a fresh session (after the restart described
# above), so nothing from the training cells is defined any more: re-import and
# re-declare everything the merge step needs.
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

# Base checkpoint the adapter was trained on.
model_name = "meta-llama/Llama-2-7b-chat-hf"
# Local adapter directory copied back from Google Drive; also the Hub repo name.
new_model = "testing1.1-llama2-nepali-health-model"

device_map = {"": 0}

# Reload the base model in FP16 (not 4-bit) so the adapter can be merged into it.
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device_map,
)
# Attach the trained adapter, then fold its weights into the base model.
model = PeftModel.from_pretrained(base_model, new_model)
model = model.merge_and_unload()

# Reload tokenizer to save it
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [None]:
# Upload the merged model and then its tokenizer to the same Hub repository.
for artifact in (model, tokenizer):
    artifact.push_to_hub(new_model, use_temp_dir=False)

# The model is now saved on the Hugging Face Hub; the cells below run inference with it

In [None]:
%%capture
%pip install transformers peft accelerate trl datasets
%pip install -i https://pypi.org/simple/ bitsandbytes --upgrade


In [None]:
import torch

from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Load the fine-tuned model and tokenizer
model_name = "Chhabi/testing1.1-llama2-nepali-health-model"

# FP32 weights for a 7B-parameter model take roughly 28 GB (7b x 4 bytes), which
# does not fit a 16 GB T4. Use FP16 when a GPU is present and keep FP32 only for
# CPU-only runs.
dtype = torch.float16 if torch.cuda.is_available() else torch.float32
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=dtype,
    device_map="auto",  # place the weights on the GPU when one is available (uses accelerate)
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

text_generation_pipeline = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=512,
    torch_dtype=dtype,
)

# Input query
prompt = "मलाई सडकको कुकुरले मेरो हातको औंलाको छेउमा टोकेको थियो। खरोंच थियो र रगत निस्कियो। घाउ 10 घण्टा पछि निको भयो। मैले पहिलो दिन डाक्टरलाई भेटिन, तर केही सल्लाह पछि, मैले सुरक्षाको लागि खोप लगाउने निर्णय गरें। मैले टोकेको करिब ५० घण्टापछि तेस्रो दिन रबिपुर र टीटी खोप लिएँ। अब मैले थप ४ वटा सुई लगाउनु पर्छ । म सुरक्षित छु वा ढिलाइ धेरै लामो थियो?"

# Generate text based on the input query
generated_text = text_generation_pipeline(f"<s>[INST] {prompt} [/INST]")[0]['generated_text']
print(generated_text)