In [None]:
from huggingface_hub import login
# login(token = <token>)

In [14]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

model_id = "meta-llama/Llama-2-7b-chat-hf"
bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_use_double_quant = True,
    bnb_4bit_quant_type = "nf4",
    bnb_4bit_compute_dtype = torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config = bnb_config, device_map = {"":0})

Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████| 2/2 [00:21<00:00, 10.83s/it]


# Training Setup

In [4]:
from peft import prepare_model_for_kbit_training

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

NameError: name 'model' is not defined

In [24]:
def print_trainable_parameters(model):
    '''
    Prints the number of trainable parameters in model
    '''
    trainable_params = 0
    all_param = 0

    for _,param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
        print(f"Trainable parameters : {trainable_params} || All params : {all_param} || Trainable% : {100*trainable_params}")
    

In [26]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r = 8,
    lora_alpha = 32,
    target_modules = ["self_attn.q_proj", "self_attn.k_proj", "self_attn.v_proj", "self_attn.o_proj"],
    lora_dropout = 0.05,
    bias = 'none',
    task_type = 'CAUSAL_LM'
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

Trainable parameters : 0 || All params : 131072000 || Trainable% : 0
Trainable parameters : 0 || All params : 139460608 || Trainable% : 0
Trainable parameters : 32768 || All params : 139493376 || Trainable% : 3276800
Trainable parameters : 65536 || All params : 139526144 || Trainable% : 6553600
Trainable parameters : 65536 || All params : 147914752 || Trainable% : 6553600
Trainable parameters : 98304 || All params : 147947520 || Trainable% : 9830400
Trainable parameters : 131072 || All params : 147980288 || Trainable% : 13107200
Trainable parameters : 131072 || All params : 156368896 || Trainable% : 13107200
Trainable parameters : 163840 || All params : 156401664 || Trainable% : 16384000
Trainable parameters : 196608 || All params : 156434432 || Trainable% : 19660800
Trainable parameters : 196608 || All params : 164823040 || Trainable% : 19660800
Trainable parameters : 229376 || All params : 164855808 || Trainable% : 22937600
Trainable parameters : 262144 || All params : 164888576 || T



In [27]:
from datasets import load_dataset

data = load_dataset("Abirate/english_quotes")
data = data.map(lambda samples: tokenizer(samples['quote']), batched = True)

Generating train split: 100%|████████████████████████████████████████████| 2508/2508 [00:00<00:00, 114537.08 examples/s]
Map: 100%|████████████████████████████████████████████████████████████████| 2508/2508 [00:00<00:00, 23428.94 examples/s]


# Training

In [47]:
import transformers

tokenizer.pad_token = tokenizer.eos_token #</s> for llama
trainer = transformers.Trainer(
    model = model,
    train_dataset=data['train'],
    args=transformers.TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps = 4,
        warmup_steps=2,
        max_steps=10,
        learning_rate = 2e-4,
        fp16=True,
        logging_steps=1,
        output_dir = "outputs",
        optim = "paged_adamw_8bit"
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm = False)
)

model.config.use_cache = False
trainer.train()

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
  return fn(*args, **kwargs)


Step,Training Loss
1,2.6586
2,1.9637
3,1.9965
4,1.6289
5,1.9004
6,1.264
7,2.099
8,1.1897
9,2.4697
10,2.159


TrainOutput(global_step=10, training_loss=1.932924735546112, metrics={'train_runtime': 51.231, 'train_samples_per_second': 0.781, 'train_steps_per_second': 0.195, 'total_flos': 60375172276224.0, 'train_loss': 1.932924735546112, 'epoch': 0.01594896331738437})

# Inference

In [56]:
from transformers import TextStreamer
model.config.use_cache = True
model.eval()

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): PeftModelForCausalLM(
      (base_model): LoraModel(
        (model): PeftModelForCausalLM(
          (base_model): LoraModel(
            (model): PeftModelForCausalLM(
              (base_model): LoraModel(
                (model): LlamaForCausalLM(
                  (model): LlamaModel(
                    (embed_tokens): Embedding(32000, 4096)
                    (layers): ModuleList(
                      (0-31): 32 x LlamaDecoderLayer(
                        (self_attn): LlamaAttention(
                          (q_proj): lora.Linear4bit(
                            (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                            (lora_dropout): ModuleDict(
                              (default): Dropout(p=0.05, inplace=False)
                            )
                            (lora_A): ModuleDict(
                              (default): Linear(in_features=4096, out_featur

In [57]:
def stream(user_prompt):
    runtimeFlag = "cuda:0"
    system_prompt = 'You are a helpful assistant that provides accurate and concise responses'

    B_INST, E_INST = "[INST]", "[/INST]"
    B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"

    prompt = f"{B_INST} {B_SYS} {system_prompt.strip()}{E_SYS}{user_prompt.strip()}{E_INST}\n\n"

    inputs = tokenizer([prompt], return_tensors="pt").to(runtimeFlag)
    streamer = TextStreamer(tokenizer)

    _ = model.generate(**inputs, streamer = streamer, max_new_tokens = 500)

In [58]:
stream('provide a brief comparison of salsa and bachata')

<s> [INST] <<SYS>>
 You are a helpful assistant that provides accurate and concise responses
<</SYS>>

provide a brief comparison of salsa and bachata[/INST]

Sure, I'd be happy to help! Here's a brief comparison of salsa and bachata:

Salsa and bachata are both popular Latin dance styles that originated in Latin America. However, they have some key differences:

1. Origin: Salsa originated in Cuba and Puerto Rico in the 1940s and 1950s, while bachata originated in the Dominican Republic in the 1960s.
2. Rhythm: Salsa has a faster, more energetic rhythm than bachata, with a tempo of around 160-180 beats per minute (BPM). Bachata has a slower, more romantic rhythm, with a tempo of around 120-140 BPM.
3. Steps: Salsa involves a variety of quick footwork patterns, including forward and backward walks, side steps, and turns. Bachata features a more relaxed, flowing style of footwork, with a focus on hip movement and turns.
4. Music: Salsa music is typically more upbeat and energetic, with 

# Advanced FineTuning

In [2]:
import os

In [3]:
cache_dir = os.path.join(os.getcwd(), "Model_cache_directory")
if not os.path.exists(cache_dir):
    os.makedirs(cache_dir, exist_ok = True)

In [4]:
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, AutoConfig
from torch.utils.data import DataLoader, Dataset


base_model = "meta-llama/Llama-2-7b-chat-hf"
bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_use_double_quant = True,
    bnb_4bit_quant_type = "nf4",
    bnb_4bit_compute_dtype = torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(
    base_model, 
    quantization_config = bnb_config, 
    device_map = {"":0},
    trust_remote_code = True,
    cache_dir = cache_dir
    )

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

# Setup Tokenizer

In [5]:
tokenizer = AutoTokenizer.from_pretrained(base_model, cache_dir = cache_dir)

In [6]:
print("EOS Token : ",tokenizer.eos_token)
print("EOS Token ID : ",tokenizer.eos_token_id)

EOS Token :  </s>
EOS Token ID :  2


In [7]:
print("Pad Token : ",tokenizer.pad_token)
print("Pad Token ID : ",tokenizer.pad_token_id)

Pad Token :  None
Pad Token ID :  None


In [8]:
tokenizer.padding_side = 'left'
print(tokenizer)

LlamaTokenizerFast(name_or_path='meta-llama/Llama-2-7b-chat-hf', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)


## Setup Pad token 

In [9]:
if '|<pad>|' not in tokenizer.get_vocab():
    tokenizer.add_tokens(['|<pad>|'])
tokenizer.pad_token = '|<pad>|'

model.resize_token_embeddings(len(tokenizer))

model.pad_token_id = tokenizer.pad_token_id
model.config.pad_token_id = tokenizer.pad_token_id

assert model.pad_token_id == tokenizer.pad_token_id, "The model's pad token ID does not match the tokenizers pad token id"

print("Tokenizer pad token ID : ", tokenizer.pad_token_id)

print('Tokenizer pad token ID : ',tokenizer.pad_token_id)
print('Model pad token ID : ',model.pad_token_id)
print('Model Config pad token ID : ',model.config.pad_token_id)

print(model.config)

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Tokenizer pad token ID :  32000
Tokenizer pad token ID :  32000
Model pad token ID :  32000
Model Config pad token ID :  32000
LlamaConfig {
  "_attn_implementation_autoset": true,
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 4096,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pad_token_id": 32000,
  "pretraining_tp": 1,
  "quantization_config": {
    "_load_in_4bit": true,
    "_load_in_8bit": false,
    "bnb_4bit_compute_dtype": "bfloat16",
    "bnb_4bit_quant_storage": "uint8",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": true,
    "llm_int8_enable_fp32_cpu_offload": false,
    "llm_int8_has_fp16_weight": false,
    "llm_int

In [10]:
sample_string = ['<s>[INST]']

#Tokenize the stringified JSON Object
encoded_sample = tokenizer(sample_string, truncation=True, padding=True, max_length=1024)

#Count the number of tokens
token_count = len(encoded_sample)

#Fetch BOS and EOS tokens
BOS_token_id = tokenizer.bos_token_id
EOS_token_id = tokenizer.eos_token_id
BOS_token = tokenizer.decode([BOS_token_id])
EOS_token = tokenizer.decode([EOS_token_id])

#Check and print BOS and EOS tokens
print(f"Beginning of the sequence : {sample_string[0]} (BOS token :{BOS_token}, id:{BOS_token_id})")
print(f"End of the sequence : {sample_string[-1]} (BOS token :{EOS_token}, id:{EOS_token_id})")

print(f"The number of tokens in the string is : {token_count}")
print(f"The ids are : {encoded_sample}")

#Decode the input_ids
decoded_sample = tokenizer.decode(encoded_sample['input_ids'][0], skip_special_tokens=False)

#Print decoded Strings
print(f"THe decoded string is : {decoded_sample}")

Beginning of the sequence : <s>[INST] (BOS token :<s>, id:1)
End of the sequence : <s>[INST] (BOS token :</s>, id:2)
The number of tokens in the string is : 2
The ids are : {'input_ids': [[1, 1, 518, 25580, 29962]], 'attention_mask': [[1, 1, 1, 1, 1]]}
THe decoded string is : <s><s> [INST]


# Setup Lora

In [11]:
from peft import prepare_model_for_kbit_training

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [12]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model
    """
    trainable_params = 0
    all_param = 0

    for _,param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()

    print(f"Trainable params : {trainable_params} || All params : {all_param} || Trainable % : {100 * trainable_params}")
    

In [13]:
print(model.state_dict().keys())

odict_keys(['model.embed_tokens.weight', 'model.layers.0.self_attn.q_proj.weight', 'model.layers.0.self_attn.q_proj.weight.absmax', 'model.layers.0.self_attn.q_proj.weight.quant_map', 'model.layers.0.self_attn.q_proj.weight.nested_absmax', 'model.layers.0.self_attn.q_proj.weight.nested_quant_map', 'model.layers.0.self_attn.q_proj.weight.quant_state.bitsandbytes__nf4', 'model.layers.0.self_attn.k_proj.weight', 'model.layers.0.self_attn.k_proj.weight.absmax', 'model.layers.0.self_attn.k_proj.weight.quant_map', 'model.layers.0.self_attn.k_proj.weight.nested_absmax', 'model.layers.0.self_attn.k_proj.weight.nested_quant_map', 'model.layers.0.self_attn.k_proj.weight.quant_state.bitsandbytes__nf4', 'model.layers.0.self_attn.v_proj.weight', 'model.layers.0.self_attn.v_proj.weight.absmax', 'model.layers.0.self_attn.v_proj.weight.quant_map', 'model.layers.0.self_attn.v_proj.weight.nested_absmax', 'model.layers.0.self_attn.v_proj.weight.nested_quant_map', 'model.layers.0.self_attn.v_proj.weight.q

In [14]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r= 16,
    lora_alpha=32,
    target_modules = ["self_attn.q_proj", "self_attn.k_proj", "self_attn.v_proj", "self_attn.o_proj"],
    lora_dropout = 0.05,
    bias = 'none',
    task_type = 'CAUSAL_LM'
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

Trainable params : 16777216 || All params : 3517198336 || Trainable % : 1677721600


# Prepare Data

In [15]:
from datasets import load_dataset

data = load_dataset("Locutusque/function-calling-chatml")
# data = data.map(lambda samples: tokenizer(samples['quote']), batched = True)

In [16]:
print(data)

DatasetDict({
    train: Dataset({
        features: ['system_message', 'function_description', 'conversations'],
        num_rows: 112960
    })
})


In [17]:
class TextDataset(Dataset):
    def __init__(self, encodings, response_lengths):
        self.encodings = encodings
        self.response_length = response_lengths

    def __getitem__(self, idx):
        item = {key: val[idx].clone().detach() for key, val in self.encodings.items()}

        #Set labels to be the same as input ids
        item["labels"] = item['input_ids'].clone()

        #Shift labels to the left and replace the last posistion with EOS token
        item["labels"][:-1] = item['input_ids'][1:]
        item["labels"][-1] = 2 #Replace last position with EOS token ID

        # Create a loss mask
        response_start_position = item['input_ids'].shape[0] - self.response_lengths[idx]
        item['loss_mask'] = torch.zeros_like(item['input_ids'])
        item['loss_mask'][response_start_position] = 1

        #Create a new tensor for the shifted loss mask
        shifted_loss_mask = torch.cat([item['loss_mask'][1:], torch.tensor([1])])
        item['loss_mask'] = shifted_loss_mask

        return item
    def __len__(self):
        return len(self.encodings['input_ids'])

In [18]:
data["train"]["function_description"][0].strip()

'{\n    "name": "get_exchange_rate",\n    "description": "Get the exchange rate between two currencies",\n    "parameters": {\n        "type": "object",\n        "properties": {\n            "base_currency": {\n                "type": "string",\n                "description": "The currency to convert from"\n            },\n            "target_currency": {\n                "type": "string",\n                "description": "The currency to convert to"\n            }\n        },\n        "required": [\n            "base_currency",\n            "target_currency"\n        ]\n    }\n}'

In [19]:
def prepare_dataset(dataset, tokenizer):
    B_INST, E_INST = "[INST]", "[/INST]"
    B_SYS, E_SYS = "<<SYS>>\n", "\n<<SYS>>\n\n"

    formatted_dataset = []
    for row in dataset:
        # Combine system and function into one system prompt
        system_prompt = f"{B_SYS}{row['system_message'].strip()}\n\n{row['function_description'].strip()}{E_SYS}"

        conversation = row["conversations"]
        if isinstance(conversation, str):
            import ast
            conversation = ast.literal_eval(conversation)

        # Process only the last user + assistant turn for SFT
        last_user = None
        last_assistant = None
        for entry in reversed(conversation):
            if entry["from"] == "gpt" and last_assistant is None:
                last_assistant = entry["value"].strip()
            elif entry["from"] == "human" and last_user is None:
                last_user = entry["value"].strip()
            if last_user and last_assistant:
                break

        if last_user is None or last_assistant is None:
            continue  # skip incomplete samples

        # Prepare ChatML-style input
        input_text = f"{B_INST}{system_prompt}{last_user.strip()}{E_INST} {last_assistant.strip()}"
        response_text = last_assistant.strip()

        formatted_dataset.append({
            "input_text": input_text,
            "response_text": response_text
        })

    # Tokenize inputs
    input_texts = [d["input_text"] for d in formatted_dataset]
    response_texts = [d["response_text"] for d in formatted_dataset]

    encodings = tokenizer(input_texts, truncation=True, padding=True, max_length=2048, return_tensors="pt")
    response_lengths = [len(tokenizer.encode(resp, truncation=True, max_length=tokenizer.model_max_length)) for resp in response_texts]

    return TextDataset(encodings, response_lengths)

In [20]:
# def prepare_dataset(dataset, tokenizer):
#     B_INST, E_INST = "[INST]", "[/INST]"
#     B_SYS, E_SYS = "<<SYS>>\n", "\n<<SYS>>\n\n"

#     formatted_dataset = dataset.map(
#         lambda x : {
#             "input_text" : "".join([
#                 f"{B_INST}{B_SYS}{x['system_message'].strip()}{E_SYS}",
#                 f"{"".join(x['function_description']).strip()}{E_INST}\n\n",
#                 f"{"".join(x['conversations']).strip()}" #Appending EOS token in text data
#             ]),

#             "response_text" : "".join([
#                 f"{"".join(x['conversations']).strip()}" #Appending EOS token in text data
#             ])
#         }
#     )

#     encodings = tokenizer([dialogue['input_text'] for dialog in formatted_dataset], Truncation = True)
#     response_lengths = [len(tokenizer.encode(dialogue["response_text"], truncation = True, max_length = 84))]

#     text_dataset = TextDataset(encodings, response_lengths)

#     return text_dataset

In [21]:
data['train']

Dataset({
    features: ['system_message', 'function_description', 'conversations'],
    num_rows: 112960
})

In [None]:
train_dataset = prepare_dataset(data['train'], tokenizer)
test_dataset = prepare_dataset(data['test'], tokenizer)