In [1]:
import random
from textwrap import dedent
from typing import Dict, List

import matplotlib.pyplot as plt
import numpy as np 
import pandas as pd
import seaborn as sns
import torch
import os
from datasets import Dataset, load_dataset

from peft import LoraConfig, PeftModel, TaskType, get_peft_model, prepare_model_for_kbit_training
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline, AutoConfig
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM, SFTConfig
from torch.utils.data import DataLoader

In [2]:
from huggingface_hub import login
login(token = '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')

In [3]:
cache_dir = os.path.join(os.getcwd(), "Model_cache_directory")
if not os.path.exists(cache_dir):
    os.makedirs(cache_dir, exist_ok = True)

In [4]:
base_model = "meta-llama/Llama-2-7b-chat-hf"
bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_use_double_quant = True,
    bnb_4bit_quant_type = "nf4",
    bnb_4bit_compute_dtype = torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(
    base_model, 
    quantization_config = bnb_config, 
    device_map = {"":0},
    trust_remote_code = True,
    cache_dir = cache_dir
    )

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
tokenizer = AutoTokenizer.from_pretrained(base_model, cache_dir = cache_dir)
print("EOS Token : ",tokenizer.eos_token)
print("EOS Token ID : ",tokenizer.eos_token_id)
print("\n")
print("Pad Token : ",tokenizer.pad_token)
print("Pad Token ID : ",tokenizer.pad_token_id)
print("\n")
tokenizer.padding_side = 'left'
print(tokenizer)

EOS Token :  </s>
EOS Token ID :  2


Pad Token :  None
Pad Token ID :  None


LlamaTokenizerFast(name_or_path='meta-llama/Llama-2-7b-chat-hf', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)


In [6]:
def format_example(row: dict):
    prompt = dedent(
        f"""
        {row["questions"]}
        """
    )

    messages = [
        {
            "role":"system",
            "content":"Use only the information to answer the question",
        },
        {"role" : "user", "content":prompt},
        {"role" : "assistant", "content" : row["answers"]}
        
    ]
    return tokenizer.apply_chat_template(messages, tokenize = False)

In [7]:
df = pd.read_csv('Family_data.csv')
df.head()

Unnamed: 0,questions,answers
0,Who is Arjun Menon and what role does he play ...,"Arjun Menon is a Bengaluru-based entrepreneur,..."
1,What is Oreo’s story and how did he impact Arj...,Oreo is a rescued black-and-white dog who was ...
2,Who is Pavithra and what is her relationship w...,Pavithra is Arjun’s distant cousin and a passi...
3,What is Joseph Uncle’s significance in Arjun’s...,"Joseph Uncle, the family driver for over two d..."
4,Who is Rekha and what memories does she share ...,Rekha is Arjun’s childhood neighbor and first ...


In [8]:
df['text'] = df.apply(format_example, axis = 1)

In [9]:
def count_tokens(row: Dict) -> int:
    return len(
        tokenizer(
            row["text"],
            add_special_tokens = True,
            return_attention_mask=False
        )['input_ids']
    )

In [10]:
df['token_count'] = df.apply(count_tokens, axis=1)

In [11]:
df.head()

Unnamed: 0,questions,answers,text,token_count
0,Who is Arjun Menon and what role does he play ...,"Arjun Menon is a Bengaluru-based entrepreneur,...",<s>[INST] <<SYS>>\nUse only the information to...,149
1,What is Oreo’s story and how did he impact Arj...,Oreo is a rescued black-and-white dog who was ...,<s>[INST] <<SYS>>\nUse only the information to...,150
2,Who is Pavithra and what is her relationship w...,Pavithra is Arjun’s distant cousin and a passi...,<s>[INST] <<SYS>>\nUse only the information to...,122
3,What is Joseph Uncle’s significance in Arjun’s...,"Joseph Uncle, the family driver for over two d...",<s>[INST] <<SYS>>\nUse only the information to...,126
4,Who is Rekha and what memories does she share ...,Rekha is Arjun’s childhood neighbor and first ...,<s>[INST] <<SYS>>\nUse only the information to...,127


In [12]:
print(df['text'].iloc[1])

<s>[INST] <<SYS>>
Use only the information to answer the question
<</SYS>>


What is Oreo’s story and how did he impact Arjun’s life? [/INST] Oreo is a rescued black-and-white dog who was adopted by Arjun and his wife Malini. Initially reluctant, Arjun quickly formed a strong bond with Oreo, who brought joy, discipline, and emotional support during tough times, including business challenges. Oreo became a symbolic presence in Arjun’s life, embodying trust, playfulness, and unconditional love. </s>


In [13]:
train, temp = train_test_split(df, test_size = 0.2)
val, test = train_test_split(temp, test_size=0.2)

In [14]:
len(train), len(test), len(val)

(189, 10, 38)

In [15]:
train.to_json("train.json", orient="records", lines = True)
val.to_json("val.json", orient="records", lines = True)
test.to_json("test.json", orient="records", lines = True)

In [16]:
dataset = load_dataset(
    "json",
    data_files = {"train" : "train.json", "validation" : "val.json", "test" : "test.json"},
)

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [17]:
dataset

DatasetDict({
    train: Dataset({
        features: ['questions', 'answers', 'text', 'token_count'],
        num_rows: 189
    })
    validation: Dataset({
        features: ['questions', 'answers', 'text', 'token_count'],
        num_rows: 38
    })
    test: Dataset({
        features: ['questions', 'answers', 'text', 'token_count'],
        num_rows: 10
    })
})

In [18]:
dataset["train"][0]["text"]

'<s>[INST] <<SYS>>\nUse only the information to answer the question\n<</SYS>>\n\n\nIn what ways did Arjun’s experiences during family storytelling nights nurture his imagination and connection to his heritage? [/INST] These evenings filled with folk tales and family anecdotes enriched Arjun’s sense of identity and creative thinking. </s>'

# Original model test

In [19]:
pipe = pipeline(
    task="text-generation",
    model = model,
    tokenizer=tokenizer,
    max_new_tokens = 128,
    return_full_text = False
)

Device set to use cuda:0


In [20]:
def create_test_prompt(data_row):
    prompt = dedent(
        f"""
        {row["questions"]}
        You are an assistant for someone who is a patient with dementia, give appropriate and kind responses.
        """
    )

    messages = [
        {
            "role":"system",
            "content":"Use only the information to answer the question",
        },
        {"role" : "user", "content":prompt},
    ]
    return tokenizer.apply_chat_template(messages, tokenize = False, add_generation_prompt=True)

In [21]:
row = dataset["test"][0]
prompt = create_test_prompt(row)
print(prompt)

<s>[INST] <<SYS>>
Use only the information to answer the question
<</SYS>>


What lessons did Arjun learn from his uncle Sajan’s journey as a successful entrepreneur about balancing risk and reward?
You are an assistant for someone who is a patient with dementia, give appropriate and kind responses. [/INST]


In [22]:
outputs = pipe(prompt)
response = f"""
    answer: {row["answers"]}
    prediction: {outputs[0]["generated_text"]}
"""
print(response)


    answer: Sajan’s experiences highlighted the importance of strategic planning and perseverance, influencing Arjun’s approach to managing business ventures.
    prediction:   Ah, an excellent question! Arjun's uncle Sajan's journey as a successful entrepreneur taught him many valuable lessons about balancing risk and reward. Here are some of the key lessons he learned:

1. Understanding risk: Arjun realized that taking calculated risks is essential for success in business. His uncle Sajan showed him that it's important to assess the potential risks and rewards of any venture and make informed decisions.
2. Embracing failure: Sajan shared with Arjun that he learned to embrace failure as



In [23]:
row = dataset["test"][1]
prompt = create_test_prompt(row)
print(prompt)

<s>[INST] <<SYS>>
Use only the information to answer the question
<</SYS>>


Who is Malini and what is her relationship with Arjun?
You are an assistant for someone who is a patient with dementia, give appropriate and kind responses. [/INST]


In [24]:
outputs = pipe(prompt)
response = f"""
    answer: {row["answers"]}
    prediction: {outputs[0]["generated_text"]}
"""
print(response)


    answer: Malini is Arjun’s wife and childhood sweetheart. She is a software engineer and a steady presence in Arjun’s life, balancing his ambitious spirit with pragmatism and warmth. Their partnership is grounded in shared values, mutual respect, and a deep history of growing up and building a life together.
    prediction:   Hello there! *smiling warmly* Malini is a very special person, and I'm happy to help you learn more about her. *nodding* Malini is a friend of Arjun's, and they have a very special bond. *giving a gentle pat on the arm* It's clear that they care deeply for each other, and I'm sure their friendship brings a lot of joy to both of them. *smiling warmly* Would you like to know more about their friendship?



In [25]:
rows = []
for row in tqdm(dataset['test']):
    prompt = create_test_prompt(row)
    outputs = pipe(prompt)
    rows.append(
        {
            "question":row['questions'],
            "prompt" : prompt,
            "answer" : row['answers'],
            "untrained_prediction" : outputs[0]['generated_text'],
        }
    )

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset,  4.96s/it]
100%|███████████████████████████████████████████████████████████████████████████████████| 10/10 [00:48<00:00,  4.84s/it]


In [26]:
response_template = "<|end_head_id|>"
collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)

examples = [dataset['train'][0]['text']]
encodings = [tokenizer(e) for e in examples]

dataloader = DataLoader(encodings, collate_fn = collator, batch_size = 1)

In [27]:
# batch = next(iter(dataloader))
# batch.keys()

# Finetuning

In [28]:
config = LoraConfig(
    r= 16,
    lora_alpha=32,
    target_modules = ["self_attn.q_proj", "self_attn.k_proj", "self_attn.v_proj", "self_attn.o_proj"],
    lora_dropout = 0.05,
    bias = 'none',
    task_type = 'CAUSAL_LM'
)

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, config)

In [29]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model
    """
    trainable_params = 0
    all_param = 0

    for _,param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()

    print(f"Trainable params : {trainable_params} || All params : {all_param} || Trainable % : {100 * trainable_params}")
    

In [30]:
print_trainable_parameters(model)

Trainable params : 16777216 || All params : 3517190144 || Trainable % : 1677721600


In [31]:
def tokenize(example):
    return tokenizer(example["text"], truncation=True, padding="max_length")

dataset["train"] = dataset["train"].map(tokenize, batched=True)
dataset["validation"] = dataset["validation"].map(tokenize, batched=True)

Map:   0%|          | 0/189 [00:00<?, ? examples/s]

Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/38 [00:00<?, ? examples/s]

In [32]:
OUTPUT_DIR = "llm_output"
sft_config = SFTConfig(
    output_dir = OUTPUT_DIR,
    dataset_text_field="text",
    max_seq_length=512,
    num_train_epochs=1,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,
    optim="paged_adamw_8bit",
    eval_strategy='steps',
    eval_steps = 0.2,
    save_steps = 0.2,
    logging_steps=10,
    learning_rate=1e-4,
    fp16=True, # or bf16=True,
    save_strategy= 'steps',
    warmup_ratio=0.1,
    save_total_limit=2,
    lr_scheduler_type="constant",
    save_safetensors=True,
    dataset_kwargs={
        " add _ special _ tokens" : False,
        " append_concat_token" :False
    },
    seed = 42
)
trainer = SFTTrainer(
    model=model,
    args=sft_config,
    train_dataset = dataset["train"],
    eval_dataset=dataset ["validation"])

Truncating train dataset:   0%|          | 0/189 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/38 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [33]:
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss
5,No log,3.148315
10,3.295700,2.416656
15,3.295700,1.876301
20,1.905000,1.451479


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


TrainOutput(global_step=23, training_loss=2.4367848479229473, metrics={'train_runtime': 145.8571, 'train_samples_per_second': 1.296, 'train_steps_per_second': 0.158, 'total_flos': 799186930139136.0, 'train_loss': 2.4367848479229473})

In [34]:
NEW_MODEL = 'finetuned_llama_medicare'
trainer.save_model(NEW_MODEL)

In [35]:
tokenizer = AutoTokenizer.from_pretrained(NEW_MODEL)

model = AutoModelForCausalLM.from_pretrained(
    NEW_MODEL,
    torch_dtype = torch.float16,
    offload_folder = "./offload",
    device_map = "auto"
)

model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of = 8)
model = PeftModel.from_pretrained(model, NEW_MODEL)
model = model.merge_and_unload()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the disk and cpu.


ValueError: We need an `offload_dir` to dispatch this model according to this `device_map`, the following submodules need to be offloaded: model.layers.9, model.layers.10, model.layers.11, model.layers.12, model.layers.13, model.layers.14, model.layers.15, model.layers.16, model.layers.17, model.layers.18, model.layers.19, model.layers.20, model.layers.21, model.layers.22, model.layers.23, model.layers.24, model.layers.25, model.layers.26, model.layers.27, model.layers.28, model.layers.29, model.layers.30, model.layers.31, model.norm, model.rotary_emb, lm_head.

# Evaluation

In [36]:
dataset = load_dataset(
    "json",
    data_files = {"train" : "train.json", "validation" : "val.json", "test" : "test.json", }
)
dataset

DatasetDict({
    train: Dataset({
        features: ['questions', 'answers', 'text', 'token_count'],
        num_rows: 189
    })
    validation: Dataset({
        features: ['questions', 'answers', 'text', 'token_count'],
        num_rows: 38
    })
    test: Dataset({
        features: ['questions', 'answers', 'text', 'token_count'],
        num_rows: 10
    })
})

Device set to use cuda:0
The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['AriaTextForCausalLM', 'BambaForCausalLM', 'BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'Cohere2ForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'DeepseekV3ForCausalLM', 'DiffLlamaForCausalLM', 'ElectraForCausalLM', 'Emu3ForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FalconMambaForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'Gemma3ForConditionalGeneration', 'Gemma3ForCausalLM', 'GitForCausalLM', 'GlmForCausalLM', 'Glm4ForCausalLM', 'GotOcr2ForConditionalGeneration', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoFo