<a href="https://www.kaggle.com/code/aligreualihassan/finetune-llama-2-qa-dataset?scriptVersionId=171727628" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
!pip install -q datasets peft trl bitsandbytes accelerate

In [3]:
#import the required packages
from transformers import TrainingArguments, AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import load_dataset
from trl import SFTTrainer
import pandas as pd 
import numpy as np
import torch
import os

In [4]:
#load the dataset from huggingface website
dataset = load_dataset("locuslab/TOFU")
dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 4000
    })
})

In [None]:
#define the bits and bytes configration
from transformers import BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(load_in_4bit=True,
                               bnb_4bit_quant_type="nf4",
                               bnb_4bit_qunat_dtype=torch.float16,
                               bnb_4bit_use_double_quant=False)

In [None]:
#load the model and tokenizer

model_name = "NousResearch/Llama-2-7b-chat-hf"
tokenizer = AutoTokenizer.from_pretrained(model_name)
llama_model = AutoModelForCausalLM.from_pretrained(model_name,
                                         quantization_config=bnb_config,
                                         device_map="auto",
                                         use_cache=False)

In [5]:
#format the input to match the llama 2 chat input format

def llama_2_format(row):
    row['text'] = f"""[INST] <<SYS>> you are a helpful assistant answer the following question<</SYS>>{row['question']}[/INST]{row['answer']}"""
    return row
    
#test the function
llama_2_format(dataset['train'][0])

{'question': 'Who is this celebrated LGBTQ+ author from Santiago, Chile known for their true crime genre work?',
 'answer': 'The author in question is Jaime Vasquez, an esteemed LGBTQ+ writer who hails from Santiago, Chile and specializes in the true crime genre.',
 'text': '[INST] <<SYS>> you are a helpful assistant answer the following question<</SYS>>Who is this celebrated LGBTQ+ author from Santiago, Chile known for their true crime genre work?[/INST]The author in question is Jaime Vasquez, an esteemed LGBTQ+ writer who hails from Santiago, Chile and specializes in the true crime genre.'}

In [6]:
# split the dataset to train and test
dataset = dataset['train'].train_test_split(0.1)
print(dataset)

## apply the function to the dataset
dataset = dataset.map(llama_2_format)

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 3600
    })
    test: Dataset({
        features: ['question', 'answer'],
        num_rows: 400
    })
})


Map:   0%|          | 0/3600 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

In [None]:
## define the lora config

llama_model = prepare_model_for_kbit_training(llama_model)

lora_config = LoraConfig(r=8,
                        lora_alpha=16,
                        lora_dropout=0.05,
                        bias="none",
                        task_type="CAUSAL_LM")

In [None]:
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [None]:
##logging to huggingface
from huggingface_hub import notebook_login
notebook_login()

In [None]:
## define the Training args

args = TrainingArguments(output_dir="llama-2-finetuned-qa-TOFU-dataset",
                        num_train_epochs=2,
                        save_strategy="epoch",
                        evaluation_strategy="epoch",
                        per_device_train_batch_size=4,
                        per_device_eval_batch_size=4,
                        gradient_accumulation_steps=3,
                        max_grad_norm=0.3,
                        push_to_hub=True,
                        optim="paged_adamw_32bit")

## define the Trainer Class

trainer = SFTTrainer(model=llama_model,
                     args=args,
                     tokenizer=tokenizer,
                     train_dataset=dataset['train'],
                     eval_dataset=dataset['test'],
                     dataset_text_field="text",
                     max_seq_length=512,
                     peft_config=lora_config)

In [None]:
## start the trainign

trainer.train()

In [None]:
trainer.push_to_hub()

In [11]:
## evaluate the model
from transformers import pipeline

print(dataset['test'][0]['question'])

prompt = f"[INST] {dataset['test'][0]['question']} [/INST]"
print(prompt)

What gender identity does Behrouz Rohani belong to?
[INST] What gender identity does Behrouz Rohani belong to? [/INST]


In [10]:
llama_tokenizer = AutoTokenizer.from_pretrained("AlyGreo/llama-2-finetuned-qa-TOFU-dataset")
llama_model = AutoModelForCausalLM.from_pretrained("AlyGreo/llama-2-finetuned-qa-TOFU-dataset",
                                                  torch_dtype=torch.bfloat16,
                                                  device_map="auto")

tokenizer_config.json:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/434 [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



adapter_model.safetensors:   0%|          | 0.00/16.8M [00:00<?, ?B/s]

In [17]:
torch.backends.cuda.enable_mem_efficient_sdp(False)
torch.backends.cuda.enable_flash_sdp(False)

In [22]:
ids = llama_tokenizer(prompt, return_tensors="pt").to("cuda")
print(ids)

print("=="*30)

outputs = llama_model.generate(**ids, max_new_tokens=50)
print(outputs)

print("=="*30)

print(llama_tokenizer.decode(outputs[0], skip_special_tokens=True))

{'input_ids': tensor([[    1,   518, 25580, 29962,  1724, 23346, 10110,   947,  1522,  1092,
           283, 29920,   390,  1148,  3270,  6852,   304, 29973,   518, 29914,
         25580, 29962]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
       device='cuda:0')}




tensor([[    1,   518, 25580, 29962,  1724, 23346, 10110,   947,  1522,  1092,
           283, 29920,   390,  1148,  3270,  6852,   304, 29973,   518, 29914,
         25580, 29962,   259,  1522,  1092,   283, 29920,   390,  1148,  3270,
         29915, 29879, 23346, 10110,   338,   451,   970,   368,  2998, 29889,
            13,    13,  3112,   338,  4100,   304,  3390,  2305, 29915, 29879,
          5999,  4135,   322,  4772,  3907, 20813,  1048,  1009, 23346, 10110,
          1728,  1009,  6261,  9659,   362, 29889,  1522,  1092,   283, 29920,
           390,  1148]], device='cuda:0')
[INST] What gender identity does Behrouz Rohani belong to? [/INST]   Behrouz Rohani's gender identity is not publicly known.

It is important to respect people's privacy and avoid making assumptions about their gender identity without their explicit confirmation. Behrouz Roh


In [40]:
prompt = f"[INST]{dataset['test'][10]['question']}[/INST]"
print(prompt)
print("=="*30)

ids = llama_tokenizer(prompt, return_tensors="pt").to("cuda")
print(ids)

print("=="*30)

outputs = llama_model.generate(**ids, max_new_tokens=100)
print(outputs)

print("=="*30)

print("The Generated answer",  llama_tokenizer.decode(outputs[0],skip_special_tokens=True))

[INST]What makes Astrid Sørensen's biographies unique?[/INST]
{'input_ids': tensor([[    1,   518, 25580, 29962,  5618,  3732,   319,   710,   333,   317,
         10181, 14762, 29915, 29879,  4768,  1946,   583,  5412, 29973, 29961,
         29914, 25580, 29962]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
       device='cuda:0')}
tensor([[    1,   518, 25580, 29962,  5618,  3732,   319,   710,   333,   317,
         10181, 14762, 29915, 29879,  4768,  1946,   583,  5412, 29973, 29961,
         29914, 25580, 29962, 29909,   710,   333,   317, 10181, 14762, 29915,
         29879,  4768,  1946,   583,   526,  5412,  1363,   310,  1009,  8569,
           373,   278,  7333,   322, 10257, 12080,   310,  1009, 17800, 29892,
         13138,   263, 13173,   322,  4948,  8362,  8004,   310,  1009, 27482,
           322, 27012,  4110, 29889,    13,    13, 10605,   526,   777,  1820,
         13879,   393,  1207,   902,  47