In [1]:
# Install the pinned fine-tuning stack (transformers, datasets, peft, accelerate,
# bitsandbytes, trl, safetensors) plus ipywidgets and wandb.
# NOTE(review): `!pip` runs whichever `pip` the shell resolves, which may not be the
# kernel's environment; IPython's `%pip install ...` targets the running kernel.
# Consider switching once confirmed for this environment.
!pip install "transformers==4.34.0" "datasets==2.13.0" "peft==0.4.0" "accelerate==0.23.0" "bitsandbytes==0.41.1" "trl==0.7.4" "safetensors>=0.3.1" ipywidgets wandb --upgrade
# Flash Attention hardware check: asserts that the CUDA compute capability major
# version is >= 8. It runs in a separate `python -c` process, so a failed assertion
# only prints an error here; it does not stop the following cells from running.
!python -c "import torch; assert torch.cuda.get_device_capability()[0] >= 8, 'Hardware not supported for Flash Attention'"
# Presumably build prerequisites for flash-attn (TODO confirm), followed by the
# flash-attn install itself with MAX_JOBS=1 set in the environment of that command.
!pip install ninja packaging
!MAX_JOBS=1 pip install flash-attn --no-build-isolation

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [2]:
# Put the directory one level above the current working directory at the front
# of sys.path (presumably so the `gaitor_function_calling` package imported by
# later cells resolves -- TODO confirm the expected working directory).
import os
import sys

project_dir = os.getcwd()
parent_dir, _ = os.path.split(project_dir)
sys.path.insert(0, parent_dir)

In [3]:
# import json
# from gaitor_function_calling.data.prompting_utils import n_shot_to_no_shot
# with open("C:/Projects/function_calling/gaitor_function_calling/data/prompts_data-13b-2_shot.json", "r") as f:
#     prompts = json.load(f)

# new_prompts = []
# for prompt_obj in prompts:
#     new_prompt_obj = {**prompt_obj}
#     new_prompt_obj["expected_str"] = n_shot_to_no_shot(prompt_obj["expected_str"])
#     new_prompt_obj["generated_str"] = n_shot_to_no_shot(prompt_obj["generated_str"])
#     new_prompts.append(new_prompt_obj)

# with open("C:/Projects/function_calling/gaitor_function_calling/data/prompts_data-13b-2_shot-no_shot.json", "w") as f:
#     json.dump(new_prompts, f, indent=4)


In [4]:
# configs
from huggingface_hub import notebook_login
import wandb

# Names shared with later cells: base checkpoint, local output folder, and the
# Hub repository derived from that folder name.
model_id = "meta-llama/Llama-2-7b-chat-hf"
output_dir = "llama-7-chat-instruction-int4-fc-dpo-_5_beta"
hub_model_id = f"SebastianS/{output_dir}"

# Start the Weights & Biases run; the run id reuses output_dir.
wandb.init(
    project="function_calling",
    id=output_dir,
    entity="sebastiansosa",
    notes=(
        "Trained for 3 epochs on 700+ samples of DPO formatted data generated by "
        "the 13B llama model with 2-shot prompting and using a beta of .5 ."
    ),
)

# Declare the evaluation metrics together with the direction that counts as
# an improvement, then declare the "epoch" metric itself.
for metric_name, metric_goal in (
    ("fc_combine", "maximize"),
    ("exact_match", "maximize"),
    ("perplexity", "minimize"),
):
    wandb.define_metric(metric_name, goal=metric_goal)
wandb.define_metric("epoch")

# Interactive Hugging Face Hub login widget.
notebook_login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33msebastiansosa[0m. Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [5]:
from gaitor_function_calling.data.data_utils import DataAbstractor, build_data_dpo
import pandas as pd
from datasets import Dataset, DatasetDict

# Raw prompt records that the DPO (prompt / chosen / rejected) pairs are built from.
data_abstractor = DataAbstractor("prompts_data-13b-2_shot-no_shot.json", "dpo_eval")

# keep it split to ensure no overlap with sft eval
train_dpo_data, test_dpo_data = build_data_dpo(data_abstractor.raw_data)

# records -> pandas frames -> Hugging Face datasets, train first and test second
train_dpo_df, test_dpo_df = (
    pd.DataFrame(records) for records in (train_dpo_data, test_dpo_data)
)
train_dpo_dataset, test_dpo_dataset = (
    Dataset.from_pandas(frame) for frame in (train_dpo_df, test_dpo_df)
)

dataset_dict = DatasetDict({"train": train_dpo_dataset, "test": test_dpo_dataset})

dataset_dict

No train data found
No test data found
Skipped 0 items


DatasetDict({
    train: Dataset({
        features: ['prompt', 'chosen', 'rejected'],
        num_rows: 715
    })
    test: Dataset({
        features: ['prompt', 'chosen', 'rejected'],
        num_rows: 15
    })
})

In [6]:
# get data and convert to Datasets format
import pandas as pd
from datasets import Dataset, DatasetDict
from gaitor_function_calling.data.data_utils import DataAbstractor
from gaitor_function_calling.data.prompting_utils import INSTRUCTION

# SFT-formatted data; its test split is what custom_evaluation() is run on later.
sft_data_abstractor = DataAbstractor("production_data-without_summary-only_fc.json", "sft")

# records -> pandas frames -> Hugging Face datasets, train first and test second
sft_train_df, sft_test_df = (
    pd.DataFrame(rows)
    for rows in (sft_data_abstractor.train_data, sft_data_abstractor.test_data)
)
sft_train_dataset, sft_test_dataset = (
    Dataset.from_pandas(frame) for frame in (sft_train_df, sft_test_df)
)

sft_dataset_dict = DatasetDict({"train": sft_train_dataset, "test": sft_test_dataset})

sft_dataset_dict

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 833
    })
    test: Dataset({
        features: ['text'],
        num_rows: 17
    })
})

In [7]:
# setup metrics and evaluation
import json
from transformers import EvalPrediction
from gaitor_function_calling.evaluation.evaluation_utils import FunctionCallingMetric, compute_perplexity, get_logits_and_labels
from gaitor_function_calling.data.prompting_utils import INSTRUCTION, json_arguments_from_prompt
import numpy as np

# Single metric instance shared by compute_metrics() and custom_evaluation() below.
fc_metric = FunctionCallingMetric()
def config_compute_metrics(tokenizer):
    """Build a `compute_metrics` callback bound to `tokenizer`.

    The returned function takes an EvalPrediction, greedily decodes the
    predicted logits and the reference label ids to text, and scores the two
    lists of strings with `fc_metric.run`.

    NOTE(review): elsewhere in this notebook `fc_metric.run` receives parsed
    argument objects rather than raw decoded strings -- confirm it accepts
    both before wiring this callback into a trainer.

    Args:
        tokenizer: tokenizer providing `batch_decode` and `pad_token_id`.

    Returns:
        A callable `compute_metrics(pred) -> {"fc_combine": <score>}`.
    """
    def compute_metrics(pred: EvalPrediction):
        # pred.predictions are a batch of logits
        # pred.label_ids are a batch of tokens
        token_ids = np.argmax(pred.predictions, axis=-1)

        # Label ids use -100 for positions the loss ignores. That is not a
        # real token id, so swap it for the pad token before decoding.
        label_ids = np.asarray(pred.label_ids)
        pad_id = tokenizer.pad_token_id
        if pad_id is not None:
            label_ids = np.where(label_ids != -100, label_ids, pad_id)

        predictions = tokenizer.batch_decode(token_ids, skip_special_tokens=False)
        labels = tokenizer.batch_decode(label_ids, skip_special_tokens=False)

        fc_result = fc_metric.run(predictions, labels)

        return {
            "fc_combine": fc_result
        }
    return compute_metrics

def custom_evaluation(eval_dataset, model, tokenizer, epoch, save_prompts_path=False):
    """Score `model` on every example of `eval_dataset` with three metrics.

    For each example (indexable by "text") this computes:
      * fc_combine  -- fc_metric.run() on the generated vs. expected arguments
        returned by json_arguments_from_prompt();
      * exact_match -- fc_metric._sentence_similarity() on the text following
        "[/INST]" in the expected and generated strings;
      * perplexity  -- compute_perplexity() on the logits/labels returned by
        get_logits_and_labels().

    A failure in one metric is reported in the printed output and does not
    stop the evaluation: fc_combine / exact_match count as 0 for that example
    and a failed perplexity is left out of the perplexity average.

    Args:
        eval_dataset: iterable of examples, each indexable by "text".
        model: model to evaluate; it is switched to eval mode and not
            switched back.
        tokenizer: tokenizer forwarded to the helper functions.
        epoch: epoch label forwarded to json_arguments_from_prompt().
        save_prompts_path: optional path of a JSON file. When truthy, the
            prompts of every example whose function-calling score was
            computed are appended to the list stored there; the file is
            rewritten after each such example.

    Returns:
        dict with the mean "fc_combine", "exact_match" and "perplexity".
        The first two are 0 when there were no examples; perplexity falls
        back to 1 when no example produced a value.
    """
    def _mean(values, default):
        # Average that tolerates an empty list instead of dividing by zero.
        return sum(values) / len(values) if values else default

    print("Starting custom evaluation.")
    model.eval()  # Set the model to evaluation mode
    results = {}

    fc_results = []
    exact_match_results = []
    perplexity_results = []

    dpo_data = []
    if save_prompts_path:
        try:
            with open(save_prompts_path, "r") as f:
                dpo_data = json.load(f)
        except (OSError, ValueError):
            # Missing/unreadable file or invalid JSON: start a fresh list.
            dpo_data = []

    for idx, example in enumerate(eval_dataset):
        print(f"Example {idx}: ", end="")
        post_message = ""

        # Custom Function Calling metric
        prompts = None
        fc_result = 0
        fc_scored = False
        try:
            generated_arguments, expected_arguments, prompts = json_arguments_from_prompt(
                example["text"],
                model,
                tokenizer,
                INSTRUCTION,
                {"idx": idx, "epoch": epoch}
            )
            fc_result = fc_metric.run(generated_arguments, expected_arguments)
            fc_scored = True
        except Exception as e:
            post_message += f"Error function calling: {e}\n"
        # Exactly one entry per example, whether or not scoring succeeded.
        fc_results.append(fc_result)

        # Persist the prompts separately so that a save failure can neither
        # change nor duplicate the score recorded above.
        if save_prompts_path and fc_scored:
            try:
                dpo_data.append({
                    "fc_result": fc_result,
                    **prompts
                })
                with open(save_prompts_path, "w") as f:
                    json.dump(dpo_data, f)
            except Exception as e:
                post_message += f"Error saving prompts: {e}\n"

        # exact match metric
        exact_match_res = 0
        if prompts:
            try:
                exact_match_res = fc_metric._sentence_similarity(
                    prompts["expected_str"].split("[/INST]")[1],
                    prompts["generated_str"].split("[/INST]")[1],
                )
            except Exception as e:
                # e.g. a string without "[/INST]" or a missing key
                post_message += f"Error exact match: {e}\n"
                exact_match_res = 0
        exact_match_results.append(exact_match_res)

        # perplexity metric
        perplexity = None
        try:
            logits, labels = get_logits_and_labels(example["text"], model, tokenizer)
            perplexity = compute_perplexity(logits[..., :-1, :], labels).item()
            perplexity_results.append(perplexity)
        except Exception as e:
            post_message += f"Error perplexity: {e}\n"
            perplexity = None

        # Report this example's own values; a failed perplexity shows as None
        # rather than repeating the previous example's number.
        example_metric = {
            "fc_combine": fc_result,
            "exact_match": exact_match_res,
            "perplexity": perplexity
        }
        print(example_metric)
        if post_message:
            print(post_message)

    results["fc_combine"] = _mean(fc_results, 0)
    results["exact_match"] = _mean(exact_match_results, 0)
    results["perplexity"] = _mean(perplexity_results, 1)

    return results

In [9]:
# base LLM model and tokenizer
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

use_flash_attention = False

# 4-bit NF4 quantization settings (double quantization, bfloat16 compute).
# NOTE(review): this config is built but not applied -- `quantization_config`
# is commented out in the from_pretrained call below, so the checkpoint is
# loaded without it even though output_dir contains "int4". Confirm intent.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    load_in_4bit=True,
)

# Tokenizer: reuse the EOS token for padding and pad on the right.
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

# Model: KV cache disabled, placement left to device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    use_flash_attention_2=use_flash_attention,
    use_cache=False,
    # quantization_config=bnb_config,
)
model.config.pretraining_tp = 1

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [10]:
# peft config on the model
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model

# LoRA config based on QLoRA paper
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)

# prepare model for training, then wrap it with the LoRA adapters
# NOTE(review): this rebinds `model` to a wrapped version of itself, so
# re-running the cell wraps the model again. The same peft_config is also
# passed to DPOTrainer in the training-config cell -- confirm that applying
# it in both places is intended.
model = get_peft_model(prepare_model_for_kbit_training(model), peft_config)


In [11]:
# training config
from transformers import TrainingArguments
from trl import DPOTrainer
from peft import LoraConfig

use_flash_attention = False

# passed to DPOTrainer as max_length
max_seq_length = 2048

args = TrainingArguments(
    # where checkpoints go, locally and on the Hub
    output_dir=output_dir,
    save_strategy="epoch",
    push_to_hub=True,  # Enable pushing to Hub
    hub_model_id=hub_model_id,  # Hugging Face Hub model ID
    hub_strategy="every_save",  # Push to Hub on every save
    # schedule and batch sizes
    num_train_epochs=1,
    per_device_train_batch_size=6 if use_flash_attention else 4,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    gradient_checkpointing=True,
    # optimizer
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    lr_scheduler_type="constant",
    warmup_ratio=0.03,
    max_grad_norm=0.3,
    # numeric precision
    bf16=True,
    tf32=True,
    # logging
    logging_steps=10,
    report_to="wandb",
    disable_tqdm=True,  # no tqdm progress bars
)

trainer = DPOTrainer(
    model,
    args=args,
    beta=0.5,
    train_dataset=dataset_dict["train"],
    tokenizer=tokenizer,
    peft_config=peft_config,
    max_length=max_seq_length,
)



In [12]:
# Training and Evaluation Loop
# Baseline: score the model on the SFT test split before any DPO training and
# log the result to W&B under epoch -1.
eval_result = custom_evaluation(sft_dataset_dict["test"], model, tokenizer, -1)
print(f"Evaluation results for epoch {-1}: {eval_result}")
wandb.log({"epoch": -1, **eval_result})

# NOTE(review): every pass below calls trainer.train() again on a trainer that was
# configured with num_train_epochs=1, so the "3 epochs" are three separate train()
# runs. The captured log shows the trainer's own 'epoch' value starting over at 0.11
# in each run. Whether optimizer/scheduler state and checkpoint folders carry over or
# are overwritten between runs is not visible here -- TODO confirm.
# NOTE(review): evaluation is run on the notebook-level `model`, not `trainer.model`;
# presumably both share the trained weights -- verify.
num_epochs = 3
for epoch in range(num_epochs):
    print(f"Begin epoch {epoch}")
    # Train for one epoch
    trainer.train()

    # Evaluate the model with the custom evaluation logic
    eval_result = custom_evaluation(sft_dataset_dict["test"], model, tokenizer, epoch)

    # Log the three aggregate metrics together with the loop's epoch index.
    wandb.log({"epoch": epoch, **eval_result})

    # Save the model after each epoch (optional)
    trainer.save_model()

    

    # Log or print evaluation results
    print(f"Evaluation results for epoch {epoch}: {eval_result}\n\n\n")

# Close the W&B run that was opened in the configs cell.
wandb.finish()

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Starting custom evaluation.
Example 0: {'fc_combine': 0.3577747344970703, 'exact_match': 0.8308500051498413, 'perplexity': 6.818745136260986}
Example 1: {'fc_combine': 0.36043891310691833, 'exact_match': 0.7130554914474487, 'perplexity': 6.76043701171875}
Example 2: {'fc_combine': 0, 'exact_match': 0, 'perplexity': 7.09403657913208}
Error function calling: 'NoneType' object has no attribute 'group'

Example 3: {'fc_combine': 0, 'exact_match': 0, 'perplexity': 9.144186019897461}
Error function calling: 'NoneType' object has no attribute 'group'

Example 4: {'fc_combine': 0, 'exact_match': 0, 'perplexity': 9.144186019897461}
Error function calling: No function call found in generated data

Example 5: {'fc_combine': 0, 'exact_match': 0, 'perplexity': 15.047759056091309}
Error function calling: 'NoneType' object has no attribute 'group'

Example 6: {'fc_combine': 0, 'exact_match': 0, 'perplexity': 15.047759056091309}
Error function calling: No function call found in generated data

Example

Could not estimate the number of tokens of the input, floating-point operations will not be computed


{'loss': 0.623, 'learning_rate': 0.0002, 'rewards/chosen': -0.5104681253433228, 'rewards/rejected': -1.0584181547164917, 'rewards/accuracies': 0.48750001192092896, 'rewards/margins': 0.5479501485824585, 'logps/rejected': -33.82159423828125, 'logps/chosen': -43.675880432128906, 'logits/rejected': -0.1216442808508873, 'logits/chosen': -0.10507483780384064, 'epoch': 0.11}
{'loss': 0.7007, 'learning_rate': 0.0002, 'rewards/chosen': -0.3357903063297272, 'rewards/rejected': -1.8613595962524414, 'rewards/accuracies': 0.4749999940395355, 'rewards/margins': 1.5255694389343262, 'logps/rejected': -36.221824645996094, 'logps/chosen': -37.18891906738281, 'logits/rejected': -0.26771512627601624, 'logits/chosen': -0.2613884508609772, 'epoch': 0.22}
{'loss': 0.3905, 'learning_rate': 0.0002, 'rewards/chosen': 1.103568434715271, 'rewards/rejected': -1.7007964849472046, 'rewards/accuracies': 0.6000000238418579, 'rewards/margins': 2.8043651580810547, 'logps/rejected': -35.97684097290039, 'logps/chosen': -



{'loss': 0.4111, 'learning_rate': 0.0002, 'rewards/chosen': -7.942717552185059, 'rewards/rejected': -13.166648864746094, 'rewards/accuracies': 0.5789473652839661, 'rewards/margins': 5.223931789398193, 'logps/rejected': -57.800838470458984, 'logps/chosen': -56.78969192504883, 'logits/rejected': -0.3490375578403473, 'logits/chosen': -0.3505941331386566, 'epoch': 0.11}
{'loss': 0.4059, 'learning_rate': 0.0002, 'rewards/chosen': -9.244933128356934, 'rewards/rejected': -14.763525009155273, 'rewards/accuracies': 0.6000000238418579, 'rewards/margins': 5.518592357635498, 'logps/rejected': -59.670387268066406, 'logps/chosen': -57.719764709472656, 'logits/rejected': -0.3083796799182892, 'logits/chosen': -0.31523144245147705, 'epoch': 0.22}
{'loss': 0.2848, 'learning_rate': 0.0002, 'rewards/chosen': -7.134912014007568, 'rewards/rejected': -13.916926383972168, 'rewards/accuracies': 0.612500011920929, 'rewards/margins': 6.782012939453125, 'logps/rejected': -57.55754470825195, 'logps/chosen': -52.12



{'loss': 0.2534, 'learning_rate': 0.0002, 'rewards/chosen': -6.116293430328369, 'rewards/rejected': -13.921222686767578, 'rewards/accuracies': 0.7236841917037964, 'rewards/margins': 7.804930210113525, 'logps/rejected': -62.0134162902832, 'logps/chosen': -57.79282760620117, 'logits/rejected': -0.0896540954709053, 'logits/chosen': -0.08106567710638046, 'epoch': 0.11}
{'loss': 0.3213, 'learning_rate': 0.0002, 'rewards/chosen': -8.327025413513184, 'rewards/rejected': -14.768373489379883, 'rewards/accuracies': 0.5375000238418579, 'rewards/margins': 6.441347599029541, 'logps/rejected': -60.68284225463867, 'logps/chosen': -59.54541778564453, 'logits/rejected': -0.16882255673408508, 'logits/chosen': -0.1560194194316864, 'epoch': 0.22}
{'loss': 0.2522, 'learning_rate': 0.0002, 'rewards/chosen': -8.517793655395508, 'rewards/rejected': -17.18931007385254, 'rewards/accuracies': 0.637499988079071, 'rewards/margins': 8.671516418457031, 'logps/rejected': -65.05168151855469, 'logps/chosen': -51.249584

VBox(children=(Label(value='0.015 MB of 0.015 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))



0,1
epoch,▁▃▆█
exact_match,█▆▃▁
fc_combine,█▁▁▁
perplexity,█▃▄▁
train/epoch,▁▂▃▄▅▅▆▇█▁▂▃▄▅▅▆▇█▁▂▃▄▅▅▆▇█
train/global_step,▁▂▄▅▇█████████▂▄▅▇█████████▂▄▅▇█████████
train/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/logits/chosen,▇▃▄▃▂▂▁▁▁▂▃▆▄█▇▆▇▅▇▇▆▃▁▅
train/logits/rejected,▆▃▄▂▂▂▁▁▁▂▃▆▄█▇▆▇▅▇▇▆▃▁▅
train/logps/chosen,▆██████▇▃▂▄▄▂▃▄▄▂▂▄▁▃▃▃▃

0,1
epoch,2.0
exact_match,0.0
fc_combine,0.0
perplexity,9.95735
train/epoch,0.99
train/global_step,89.0
train/learning_rate,0.0002
train/logits/chosen,-0.16692
train/logits/rejected,-0.1773
train/logps/chosen,-55.67173
