In [1]:
# Install the version-constrained Hugging Face fine-tuning stack; ipywidgets and wandb are left unpinned.
!pip install "transformers==4.34.0" "datasets==2.13.0" "peft==0.4.0" "accelerate==0.23.0" "bitsandbytes==0.41.1" "trl==0.4.7" "safetensors>=0.3.1" ipywidgets wandb --upgrade
# Fail fast when the GPU's major compute capability is below 8; the assertion message ties this to Flash Attention support.
!python -c "import torch; assert torch.cuda.get_device_capability()[0] >= 8, 'Hardware not supported for Flash Attention'"
# Installed ahead of flash-attn — presumably as build prerequisites, since the next command disables build isolation.
!pip install ninja packaging
# MAX_JOBS=1 applies to this command only — presumably to limit parallel build jobs while compiling flash-attn (TODO confirm against the flash-attn install notes).
!MAX_JOBS=1 pip install flash-attn --no-build-isolation

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [2]:
# Make the directory one level above the current working directory importable,
# giving it the highest priority on the module search path.
import os
import sys

project_dir = os.getcwd()
parent_dir = os.path.split(project_dir)[0]
sys.path[:0] = [parent_dir]

In [3]:
# configs
from huggingface_hub import notebook_login
import wandb

# Identifiers for the base model, the local output directory and the Hub repo.
model_id = "meta-llama/Llama-2-7b-chat-hf"
output_dir = "llama-7-chat-instruction-pre_ft-op_glaive-few_shot"
hub_model_id = "SebastianS/" + output_dir

# Open the W&B run; the output directory name is reused as the run id.
wandb.init(
    project="function_calling",
    id=output_dir,
    entity="sebastiansosa",
    notes="Evaluated over a range of 3 shots on the shuffled op_glaive dataset.",
)

# Register each tracked metric together with its optimisation direction.
for metric_name, metric_goal in (
    ("fc_combine", "maximize"),
    ("exact_match", "maximize"),
    ("perplexity", "minimize"),
):
    wandb.define_metric(metric_name, goal=metric_goal)
wandb.define_metric("epoch")

# Interactive Hugging Face Hub login widget.
notebook_login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33msebastiansosa[0m. Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
# # Build files
# import json
# from gaitor_function_calling.data.data_utils import DataAbstractor
# from gaitor_function_calling.data.prompting_utils import INSTRUCTION, parse_prompt_back_to_data

# shot = 4
# data_abstractor = DataAbstractor("op_glaive_10k-test.json", f"pre_ft_eval-{shot}_shot")
# train, test = data_abstractor.build_data(INSTRUCTION, False, 1, shot)
# data_abstractor.save(data_abstractor.raw_data, train, test)

In [5]:
# get data and convert to Datasets format
import pandas as pd
from datasets import Dataset, DatasetDict
from gaitor_function_calling.data.data_utils import DataAbstractor
from gaitor_function_calling.data.prompting_utils import INSTRUCTION
def _load_shot_splits(shot):
    """Wrap the train/test records stored for one shot count in a DatasetDict.

    The records are read from a DataAbstractor created for
    "op_glaive_10k-test.json" with the tag f"pre_ft_eval-{shot}_shot".
    """
    abstractor = DataAbstractor("op_glaive_10k-test.json", f"pre_ft_eval-{shot}_shot")
    train_frame = pd.DataFrame(abstractor.train_data)
    test_frame = pd.DataFrame(abstractor.test_data)
    return DatasetDict({
        'train': Dataset.from_pandas(train_frame),
        'test': Dataset.from_pandas(test_frame),
    })


# One DatasetDict per shot count, ordered from 0 shots up to 3 shots.
dataset_dicts = [_load_shot_splits(shot) for shot in range(4)]

dataset_dicts

No test data found
No test data found
No test data found
No test data found


[DatasetDict({
     train: Dataset({
         features: ['text'],
         num_rows: 30
     })
     test: Dataset({
         features: [],
         num_rows: 0
     })
 }),
 DatasetDict({
     train: Dataset({
         features: ['text'],
         num_rows: 30
     })
     test: Dataset({
         features: [],
         num_rows: 0
     })
 }),
 DatasetDict({
     train: Dataset({
         features: ['text'],
         num_rows: 30
     })
     test: Dataset({
         features: [],
         num_rows: 0
     })
 }),
 DatasetDict({
     train: Dataset({
         features: ['text'],
         num_rows: 30
     })
     test: Dataset({
         features: [],
         num_rows: 0
     })
 })]

In [6]:
# setup metrics and evaluation
import json
from transformers import EvalPrediction
from gaitor_function_calling.evaluation.evaluation_utils import FunctionCallingMetric, compute_perplexity, get_logits_and_labels
from gaitor_function_calling.data.prompting_utils import INSTRUCTION, json_arguments_from_prompt, generate_prediction
import numpy as np

# Single metric instance shared by compute_metrics and custom_evaluation below.
fc_metric = FunctionCallingMetric()
def config_compute_metrics(tokenizer):
    """Build a `compute_metrics` callback bound to `tokenizer`.

    Args:
        tokenizer: Object exposing `batch_decode` and `pad_token_id`; used to
            turn predicted and reference token ids back into text.

    Returns:
        A function taking an `EvalPrediction` and returning
        `{"fc_combine": <result of fc_metric.run(predictions, labels)>}`.
    """
    def compute_metrics(pred: EvalPrediction):
        # pred.predictions are a batch of logits
        # pred.label_ids are a batch of tokens
        logits = pred.predictions
        if isinstance(logits, tuple):
            # Some models return extra outputs next to the logits; the logits
            # are the first element.
            logits = logits[0]
        token_ids = np.argmax(logits, axis=-1)

        # Label positions excluded from the loss carry the ignore index -100,
        # which is not a valid token id and cannot be decoded. Substitute the
        # pad token so decoding succeeds.
        label_ids = np.asarray(pred.label_ids)
        pad_token_id = tokenizer.pad_token_id
        if pad_token_id is not None:
            label_ids = np.where(label_ids == -100, pad_token_id, label_ids)

        predictions = tokenizer.batch_decode(token_ids, skip_special_tokens=False)
        labels = tokenizer.batch_decode(label_ids, skip_special_tokens=False)

        # Score the decoded strings with the shared function-calling metric.
        fc_result = fc_metric.run(predictions, labels)

        return {
            "fc_combine": fc_result
        }
    return compute_metrics

def custom_evaluation(eval_dataset, model, tokenizer, epoch, save_prompts_path=False):
    """Evaluate `model` example by example with three metrics.

    For every example the function computes:
      * ``fc_combine``  - `fc_metric.run` on the generated vs. expected
        function-call arguments (0 when generation or parsing fails);
      * ``exact_match`` - `fc_metric._sentence_similarity` between the expected
        and generated text after the last "[/INST]" marker (0 when no prompts
        could be parsed or the comparison fails);
      * ``perplexity``  - `compute_perplexity` on the example's logits/labels
        (examples whose perplexity fails are left out of the average).

    Args:
        eval_dataset: Iterable of mappings with a "text" entry.
        model: Model to evaluate; `model.eval()` is called first.
        tokenizer: Tokenizer forwarded to the generation/logit helpers.
        epoch: Value forwarded to `json_arguments_from_prompt` together with
            the example index.
        save_prompts_path: Optional path of a JSON file. When set, the prompts
            of every successfully scored example (plus its ``fc_result``) are
            appended to the list stored there, and the file is rewritten after
            each such example.

    Returns:
        dict with the mean ``fc_combine``, ``exact_match`` and ``perplexity``.
        The first two are 0.0 for an empty dataset; ``perplexity`` is 1 when no
        perplexity could be computed.
    """
    print("Starting custom evaluation.")
    model.eval()  # Set the model to evaluation mode

    fc_results = []
    exact_match_results = []
    perplexity_results = []

    dpo_data = []
    if save_prompts_path:
        # Continue an existing dump when there is one; a missing, unreadable or
        # malformed file just starts a fresh list.
        try:
            with open(save_prompts_path, "r") as f:
                dpo_data = json.load(f)
        except (OSError, ValueError):
            dpo_data = []

    for idx, example in enumerate(eval_dataset):
        print(f"Example {idx}: ", end="")
        post_message = ""

        # Custom Function Calling metric
        prompts = None
        fc_result = 0
        fc_scored = False
        try:
            generated_str = generate_prediction(example["text"], model, tokenizer, INSTRUCTION)
            generated_arguments, expected_arguments, prompts = json_arguments_from_prompt(
                example["text"],
                generated_str,
                INSTRUCTION,
                {"idx": idx, "epoch": epoch}
            )
            fc_result = fc_metric.run(generated_arguments, expected_arguments)
            fc_scored = True
        except Exception as e:
            post_message += f"Error function calling: {e}\n"
            fc_result = 0
        # Exactly one entry per example, whatever happens while saving below.
        fc_results.append(fc_result)

        # Persisting the prompts is kept apart from scoring so that an I/O or
        # serialisation problem cannot alter the metric lists.
        if save_prompts_path and fc_scored:
            try:
                record = {"fc_result": fc_result, **prompts}
                # Serialise before opening the file so a failure here never
                # truncates an existing dump.
                serialized = json.dumps(dpo_data + [record])
                with open(save_prompts_path, "w") as f:
                    f.write(serialized)
                dpo_data.append(record)
            except Exception as e:
                post_message += f"Error saving prompts: {e}\n"

        # exact match metric
        exact_match_res = 0
        if prompts:
            try:
                exact_match_res = fc_metric._sentence_similarity(
                    prompts["expected_str"].split("[/INST]")[-1],
                    prompts["generated_str"].split("[/INST]")[-1],
                )
            except Exception as e:
                post_message += f"Error exact match: {e}\n"
                exact_match_res = 0
        exact_match_results.append(exact_match_res)

        # perplexity metric
        perplexity = None  # stays None in the printout when this example fails
        try:
            logits, labels = get_logits_and_labels(example["text"], model, tokenizer)
            # The last position of the second-to-last axis is dropped before
            # scoring - presumably to align logits with the labels (TODO
            # confirm against get_logits_and_labels).
            perplexity = compute_perplexity(logits[..., :-1, :], labels).item()
            perplexity_results.append(perplexity)
        except Exception as e:
            # Failed examples are deliberately excluded from the average.
            post_message += f"Error perplexity: {e}\n"

        example_metric = {
            "fc_combine": fc_result,
            "exact_match": exact_match_res,
            "perplexity": perplexity,
        }
        print(example_metric)
        if post_message:
            print(post_message)

    def _mean(values, default):
        # Average of `values`, or `default` when there is nothing to average.
        return sum(values) / len(values) if values else default

    return {
        "fc_combine": _mean(fc_results, 0.0),
        "exact_match": _mean(exact_match_results, 0.0),
        "perplexity": _mean(perplexity_results, 1),
    }

In [7]:
# base LLM model and tokenizer
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Tokenizer: reuse the EOS token for padding and pad on the right-hand side.
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Base model, loaded with automatic device placement and use_cache=False.
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", use_cache=False)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [8]:
# Evaluate each few-shot setting in turn. The position in dataset_dicts is the
# shot count, and it is logged to W&B under the "epoch" key.
for shot_count, shot_splits in enumerate(dataset_dicts):
    print(f"Starting evaluation {shot_count} shot.")
    eval_result = custom_evaluation(shot_splits["train"], model, tokenizer, shot_count)
    log_payload = {"epoch": shot_count}
    log_payload.update(eval_result)
    wandb.log(log_payload)
    print(f"Evaluation results: {eval_result}")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Starting evaluation 0 shot.
Starting custom evaluation.
Example 0: {'fc_combine': 1.0, 'exact_match': 0.8076157569885254, 'perplexity': 7.743117809295654}
Example 1: {'fc_combine': 0, 'exact_match': 0, 'perplexity': 6.452919006347656}
Error function calling: No function call found in generated data

Example 2: {'fc_combine': 0, 'exact_match': 0, 'perplexity': 9.793402671813965}
Error function calling: No function call found in generated data

Example 3: {'fc_combine': 1.0, 'exact_match': 0.8533190488815308, 'perplexity': 7.398660182952881}
Example 4: {'fc_combine': 1.0, 'exact_match': 0.7947900891304016, 'perplexity': 8.484703063964844}
Example 5: {'fc_combine': 0, 'exact_match': 0, 'perplexity': 8.992287635803223}
Error function calling: Expecting value: line 1 column 1 (char 0)

Example 6: {'fc_combine': 1.0, 'exact_match': 0.855522632598877, 'perplexity': 7.132600784301758}
Example 7: {'fc_combine': 1.0, 'exact_match': 0.7608612775802612, 'perplexity': 5.263855457305908}
Example 8: 

In [9]:
# End the W&B run that was opened with wandb.init in the config cell.
wandb.finish()

VBox(children=(Label(value='0.015 MB of 0.015 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))



0,1
epoch,▁▃▆█
exact_match,▁███
fc_combine,▁███
perplexity,█▅▁▁

0,1
epoch,3.0
exact_match,0.97487
fc_combine,0.89199
perplexity,4.20884
