In [1]:
# Install the fine-tuning/evaluation stack. The Hugging Face libraries are pinned to exact
# versions; ipywidgets and wandb are installed unpinned.
# NOTE(review): `!pip` uses whichever pip the shell resolves; `%pip` would target the running
# kernel's environment — confirm which is intended before changing it.
!pip install "transformers==4.34.0" "datasets==2.13.0" "peft==0.4.0" "accelerate==0.23.0" "bitsandbytes==0.41.1" "trl==0.4.7" "safetensors>=0.3.1" ipywidgets wandb --upgrade
# Check that the CUDA device reports a major compute capability of at least 8 before
# building flash-attn. NOTE(review): this runs in a subprocess via `!`, so a failing
# assert presumably only prints an error and does not stop the following lines — confirm.
!python -c "import torch; assert torch.cuda.get_device_capability()[0] >= 8, 'Hardware not supported for Flash Attention'"
# Installed ahead of the flash-attn install below (presumably build-time helpers — confirm).
!pip install ninja packaging
# MAX_JOBS=1 is set only for this pip invocation (presumably to cap parallel compile jobs —
# confirm); --no-build-isolation makes pip build against the packages already installed.
# NOTE(review): flash-attn is the only unpinned compiled dependency here.
!MAX_JOBS=1 pip install flash-attn --no-build-isolation

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [2]:
# Put the parent of the current working directory at the front of sys.path.
# NOTE(review): presumably this is what makes the project package imported in the
# cells below resolvable — confirm against the repository layout.
import sys
import os

project_dir = os.getcwd()
parent_dir = os.path.dirname(project_dir)

# Re-running this cell used to stack one more copy of parent_dir onto sys.path each
# time. Drop any existing copies first so the cell is idempotent, then insert at
# index 0 so the entry keeps the same lookup priority as before.
sys.path[:] = [entry for entry in sys.path if entry != parent_dir]
sys.path.insert(0, parent_dir)

In [3]:
# configs: model identifiers used by the rest of the notebook, plus Hub authentication.
from huggingface_hub import notebook_login
# import wandb

# Hub id of a Llama-2 7B chat checkpoint. It is not referenced again in the visible cells.
# NOTE(review): presumably the base model the fine-tuned checkpoint derives from — confirm.
model_id = "meta-llama/Llama-2-7b-chat-hf"
# Name of the fine-tuned run; it is also the repository-name part of hub_model_id.
output_dir = "llama-7-chat-instruction-int4-fc-pipeline"
# Hub repository that the model and tokenizer are loaded from further down.
hub_model_id = f"SebastianS/{output_dir}"
# Weights & Biases tracking is currently disabled (kept for reference).
# wandb.init(project="function_calling", id=output_dir, entity="sebastiansosa")
# Renders the interactive Hugging Face Hub login widget.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
# get data and convert to Datasets format
import pandas as pd
from datasets import Dataset, DatasetDict
from gaitor_function_calling.data.data_utils import DataAbstractor
from gaitor_function_calling.data.prompting_utils import INSTRUCTION

# Read the train/test splits exposed by DataAbstractor and wrap each one as a
# Hugging Face Dataset, collected under its split name.
# NOTE(review): the recorded output shows 425 rows for both splits — verify that
# train and test are really distinct sets.
data_abstractor = DataAbstractor(
    "production_data-without_summary-only_fc.json",
    "dpo_eval",
)

# Train split: raw records -> DataFrame -> Dataset.
train_df = pd.DataFrame(data_abstractor.train_data)
train_dataset = Dataset.from_pandas(train_df)

# Test split: same conversion.
test_df = pd.DataFrame(data_abstractor.test_data)
test_dataset = Dataset.from_pandas(test_df)

dataset_dict = DatasetDict(train=train_dataset, test=test_dataset)

# Last expression of the cell: display the split summary.
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 425
    })
    test: Dataset({
        features: ['text'],
        num_rows: 425
    })
})

In [15]:
# setup metrics and evaluation
import json
from transformers import EvalPrediction
from gaitor_function_calling.evaluation.evaluation_utils import FunctionCallingMetric, compute_perplexity, get_logits_and_labels
from gaitor_function_calling.data.prompting_utils import INSTRUCTION, json_arguments_from_prompt
import numpy as np

# Shared metric instance; compute_metrics below and custom_evaluation both call its run().
fc_metric = FunctionCallingMetric()


def config_compute_metrics(tokenizer):
    """Build a ``compute_metrics`` callback bound to ``tokenizer``.

    Args:
        tokenizer: Tokenizer used to decode predicted and reference token ids. It must
            provide ``batch_decode`` and the ``pad_token_id`` / ``eos_token_id``
            attributes.

    Returns:
        A function taking an ``EvalPrediction`` and returning
        ``{"fc_combine": fc_metric.run(decoded_predictions, decoded_labels)}``.
    """
    def compute_metrics(pred: EvalPrediction):
        # pred.predictions are a batch of logits
        # pred.label_ids are a batch of tokens
        logits = pred.predictions
        if isinstance(logits, tuple):
            # Some models return extra outputs next to the logits; the logits come first.
            logits = logits[0]
        token_ids = np.argmax(logits, axis=-1)

        # Label positions marked with -100 (the ignore index Hugging Face uses for
        # masked/padded labels) are not valid token ids and cannot be decoded, so swap
        # them for a real token id first. Fall back to EOS when no pad token is set.
        label_ids = pred.label_ids
        fill_id = tokenizer.pad_token_id
        if fill_id is None:
            fill_id = tokenizer.eos_token_id
        if fill_id is not None and isinstance(label_ids, np.ndarray):
            label_ids = np.where(label_ids != -100, label_ids, fill_id)

        predictions = tokenizer.batch_decode(token_ids, skip_special_tokens=False)
        labels = tokenizer.batch_decode(label_ids, skip_special_tokens=False)

        # Score the decoded texts with the shared function-calling metric.
        fc_result = fc_metric.run(predictions, labels)

        return {
            "fc_combine": fc_result
        }
    return compute_metrics

def custom_evaluation(eval_dataset, model, tokenizer, save_prompts_path=False):
    """Score every example of ``eval_dataset`` with the function-calling metric.

    For each example, ``json_arguments_from_prompt`` is called with the example's
    ``"text"`` entry, ``model``, ``tokenizer`` and ``INSTRUCTION``. The generated and
    expected arguments it returns are scored with ``fc_metric.run``. An example whose
    generation or scoring raises is reported on stdout and scored 0, so a single bad
    generation does not abort the run.

    Args:
        eval_dataset: Iterable of mappings that each have a ``"text"`` entry.
        model: Passed through unchanged to ``json_arguments_from_prompt``.
        tokenizer: Passed through unchanged to ``json_arguments_from_prompt``.
        save_prompts_path: Optional path of a JSON file. When truthy, records already
            stored there are loaded first (a missing, unreadable or non-list file
            counts as empty), and the file is rewritten with all records after every
            successfully scored example. A falsy value disables persistence.

    Returns:
        ``{"fc_combine": mean_score}`` where the mean runs over all examples, failed
        ones counting as 0. An empty dataset yields ``0.0``.
    """
    print("Starting custom evaluation.")

    fc_results = []

    # Always bind the record list. Previously it only existed when a save path was
    # given, so without one every example hit a swallowed NameError and scored 0.
    dpo_data = []
    if save_prompts_path:
        try:
            with open(save_prompts_path, "r") as f:
                loaded = json.load(f)
        except (OSError, ValueError):
            # Missing/unreadable file or invalid JSON: start a fresh record list.
            loaded = None
        if isinstance(loaded, list):
            dpo_data = loaded

    for idx, example in enumerate(eval_dataset):
        print(f"Example {idx}: ", end="")
        post_message = ""

        # Custom Function Calling metric
        record = None
        try:
            generated_arguments, expected_arguments, prompts = json_arguments_from_prompt(
                example["text"],
                model,
                tokenizer,
                INSTRUCTION
            )
            fc_result = fc_metric.run(generated_arguments, expected_arguments)
            record = {
                "fc_result": fc_result,
                **prompts
            }
        except Exception as e:
            # Best effort: report the failure and count the example as 0.
            post_message += f"Error function calling: {e}\n"
            fc_result = 0

        # Exactly one score per example, whatever happens while saving below.
        fc_results.append(fc_result)

        if record is not None:
            dpo_data.append(record)
            if save_prompts_path:
                # Saving is kept apart from scoring: a failed write used to append an
                # extra 0 for an example that already had its score recorded.
                try:
                    with open(save_prompts_path, "w") as f:
                        json.dump(dpo_data, f)
                except (OSError, TypeError, ValueError) as e:
                    post_message += f"Error saving prompts: {e}\n"

        example_metric = {"fc_combine": fc_results[-1]}
        print(example_metric)
        if post_message:
            print(post_message)

    # Guard the mean against an empty dataset instead of dividing by zero.
    mean_score = sum(fc_results) / len(fc_results) if fc_results else 0.0

    return {"fc_combine": mean_score}

In [6]:
from transformers import AutoTokenizer
from peft import AutoPeftModelForCausalLM
import torch

# Load the fine-tuned PEFT checkpoint and its tokenizer from the Hub repository
# named by hub_model_id. The model is requested in 4-bit with float16 as torch dtype.
peft_load_options = {
    "low_cpu_mem_usage": True,
    "torch_dtype": torch.float16,
    "load_in_4bit": True,
}
model = AutoPeftModelForCausalLM.from_pretrained(hub_model_id, **peft_load_options)
tokenizer = AutoTokenizer.from_pretrained(hub_model_id)

Downloading adapter_config.json:   0%|          | 0.00/451 [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading adapter_model.bin:   0%|          | 0.00/134M [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/1.75k [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading added_tokens.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/95.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [8]:
# Display the test split that the evaluation cell below iterates over.
dataset_dict["test"]

Dataset({
    features: ['text'],
    num_rows: 425
})

In [16]:
# Run the custom evaluation over the test split. Because a save path is given, the
# scored prompts are written to data/prompts_data.json as the run progresses.
eval_result = custom_evaluation(
    eval_dataset=dataset_dict["test"],
    model=model,
    tokenizer=tokenizer,
    save_prompts_path="data/prompts_data.json",
)
# Last expression of the cell: show the aggregated metric.
eval_result

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Starting custom evaluation.
Example 0: in try block
pre output generate


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


post output generate
{'fc_combine': 0.5176492598321702}
Example 1: in try block
pre output generate
post output generate
{'fc_combine': 0.53515545129776}
Example 2: in try block
pre output generate
post output generate
{'fc_combine': 0}
Error function calling: Expecting ',' delimiter: line 1 column 79 (char 78)

Example 3: in try block
pre output generate
post output generate
{'fc_combine': 0.6483641465504965}
Example 4: in try block
pre output generate
post output generate
{'fc_combine': 0.6554457545280457}
Example 5: in try block
pre output generate
post output generate
{'fc_combine': 0.6318585077921549}
Example 6: in try block
pre output generate
post output generate
{'fc_combine': 0.5127570877472559}
Example 7: in try block
pre output generate
post output generate
{'fc_combine': 0.6112533211708069}
Example 8: in try block
pre output generate
post output generate
{'fc_combine': 0}
Error function calling: division by zero

Example 9: in try block
pre output generate
post output gener

{'fc_combine': 0.5420713784818879}