# Dataset

In [17]:
from datasets import load_dataset
import numpy
relative_path_to_data = './production_eval_chat-instruction.json'
dataset = load_dataset('json', data_files={'train': relative_path_to_data}, split="train")
print(len(dataset))
print(dataset[0]["text"])
instruction = f"Your job is to identify weather or not the user input is related to the function specification delimited by {function_calling_tokens['FUNCTIONS']['start']} and {function_calling_tokens['FUNCTIONS']['end']}. If it is related then your response should be in the function_calling format: {function_calling_tokens['FUNCTION_CALL_NAME']['start']}NAME_ASSOCIATED_WITH_THE_FUNCTION{function_calling_tokens['FUNCTION_CALL_NAME']['end']}{function_calling_tokens['FUNCTION_CALL_ARGUMENTS']['start']}ARGUMENTS_IN_STRINGIFIED_JSON_FORMAT{function_calling_tokens['FUNCTION_CALL_ARGUMENTS']['end']}. Otherwise simply return a normal response. "



850
<s>[INST] <<SYS>>
Your job is to identify weather or not the user input is related to the function specification delimited by <FUNCTIONS> and </FUNCTIONS>. If it is related then your response should be in the function_calling format: <FUNCTION_CALL_NAME>NAME_ASSOCIATED_WITH_THE_FUNCTION</FUNCTION_CALL_NAME><FUNCTION_CALL_ARGUMENTS>ARGUMENTS_IN_STRINGIFIED_JSON_FORMAT</FUNCTION_CALL_ARGUMENTS>. Otherwise simply return a normal response. 
<FUNCTIONS>[{"name": "getHashtagFollowerCount", "description": "Get the follower count, follower count over time and related trending hashtags of a specified LinkedIn hashtag", "parameters": {"type": "object", "properties": {"path_params": {"type": "object", "properties": {"hashtag": {"type": "string", "description": "The specified hashtag."}}, "required": ["hashtag"]}}}}]</FUNCTIONS>
<</SYS>>

linkedin hashtags for tech [/INST] <FUNCTION_CALL_NAME>getHashtagFollowerCount</FUNCTION_CALL_NAME><FUNCTION_CALL_ARGUMENTS>{}</FUNCTION_CALL_ARGUMENTS> </s>

# Load Models

In [1]:
from transformers import AutoTokenizer
from peft import AutoPeftModelForCausalLM
import torch

# Set the path to the checkpoint directory
hub_id = "SebastianS/function_calling-llama_7b-nat-fc_only"

# load base LLM model and tokenizer
fc_model = AutoPeftModelForCausalLM.from_pretrained(
    hub_id,
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
    load_in_4bit=True,
)
fc_tokenizer = AutoTokenizer.from_pretrained(hub_id)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

HfHubHTTPError: 500 Server Error: Internal Server Error for url: https://huggingface.co/SebastianS/function_calling-llama_7b-nat-fc_only/resolve/main/adapter_model.safetensors (Request ID: Root=1-655d1c02-248c587f63973cf649b1ceb6;a791c996-e0f8-4679-bba9-64568a0ffd0c)

Internal Error - We're working hard to fix this as soon as possible!

# Metric

In [None]:
import re
import json
from transformers import AutoTokenizer, AutoModel
from scipy.spatial.distance import cosine

# Load a pre-trained model for sentence embedding (e.g., SBERT)
embedding_model_name = "sentence-transformers/bert-base-nli-mean-tokens"
embedding_tokenizer = AutoTokenizer.from_pretrained(embedding_model_name)
embedding_model = AutoModel.from_pretrained(embedding_model_name)

def parse_prompt_back_to_data(prompt, instruction):
    """
    Function to parse a prompt back into the original data format, using a dictionary of function calling tokens.
    
    :param prompt: A string representing the constructed prompt.
    :param function_calling_tokens: A dictionary containing the start and end tokens for different function call elements.
    :return: A dictionary representing the original data instance.
    """
    # Building regular expression patterns using the function_calling_tokens
    functions_pattern = rf"{function_calling_tokens['FUNCTIONS']['start']}(.*?){function_calling_tokens['FUNCTIONS']['end']}"
    input_pattern = r"<</SYS>>\n\n(.*?) \[/INST\]"  # This remains unchanged as it's not part of function_calling_tokens
    target_content_pattern = r"\[/INST\] (.*)</s>"  # This also remains unchanged
    function_call_name_pattern = rf"{function_calling_tokens['FUNCTION_CALL_NAME']['start']}(.*?){function_calling_tokens['FUNCTION_CALL_NAME']['end']}"
    function_call_arguments_pattern = rf"{function_calling_tokens['FUNCTION_CALL_ARGUMENTS']['start']}(.*?){function_calling_tokens['FUNCTION_CALL_ARGUMENTS']['end']}"

    instruction_adjusted_prompt = prompt
    if instruction:
        instruction_adjusted_prompt = "".join(prompt.split(instruction))
        
    # Extracting data using regular expressions
    functions_str = re.search(functions_pattern, instruction_adjusted_prompt).group(1)
    input_content = re.search(input_pattern, prompt).group(1)
    target_content_match = re.search(target_content_pattern, prompt)

    # Parse functions JSON string
    functions = json.loads(functions_str)

    # Prepare the data dictionary
    data = {
        "input": [{
            "chatgptMessage": {"role": "user", "content": input_content},
            "functions": functions
        }],
        "target": {
            "chatgptMessage": {"role": "assistant"},
            "functions": functions  # Including functions in the target as well
        }
    }

    # Check if the target has a function call
    if function_calling_tokens['FUNCTION_CALL_NAME']['start'] in prompt:
        function_call_name = re.search(function_call_name_pattern, instruction_adjusted_prompt).group(1)
        function_call_arguments = re.search(function_call_arguments_pattern, instruction_adjusted_prompt).group(1)
        data["target"]["chatgptMessage"]["function_call"] = {
            "name": function_call_name,
            "arguments": function_call_arguments
        }
    else:
        # Handle case where regex might not find a match for target content
        if target_content_match:
            target_content = target_content_match.group(1)
            data["target"]["chatgptMessage"]["content"] = target_content

    return data

def get_sentence_embedding(sentence):
    inputs = embedding_tokenizer(sentence, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        outputs = embedding_model(**inputs)

    # Mean Pooling - Take attention mask into account for correct averaging
    attention_mask = inputs['attention_mask']
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(outputs.last_hidden_state.size()).float()
    sum_embeddings = torch.sum(outputs.last_hidden_state * input_mask_expanded, 1)
    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    mean_pooled = sum_embeddings / sum_mask

    return mean_pooled[0].numpy()

def sentence_similarity(sent1, sent2):
    embedding1 = get_sentence_embedding(sent1)
    embedding2 = get_sentence_embedding(sent2)
    return 1 - cosine(embedding1, embedding2)

def custom_metric(generated_json, expected_json):
    def compare_json(g_json, e_json, key_similarity_scores, value_similarity_scores):
        for e_key, e_value in e_json.items():
            # Check for exact key match or find the most similar key
            if e_key in g_json:
                g_key = e_key
                key_similarity_scores.append(1)
            else:
                # Compute similarity with all keys in generated_json and find the best match
                key_similarity = {gen_key: sentence_similarity(e_key, gen_key) for gen_key in g_json.keys()}
                g_key, key_sim_score = max(key_similarity.items(), key=lambda x: x[1])
                key_similarity_scores.append(key_sim_score)

            # Recursive comparison for nested objects, else compare values
            if isinstance(e_value, dict) and isinstance(g_json.get(g_key, {}), dict):
                compare_json(g_json[g_key], e_value, key_similarity_scores, value_similarity_scores)
            elif isinstance(e_value, str) and isinstance(g_json.get(g_key, ""), str):
                # Compare values only if they are strings at the root level
                value_sim_score = sentence_similarity(e_value, g_json[g_key])
                value_similarity_scores.append(value_sim_score)
            elif e_value == g_json.get(g_key, None):
                value_similarity_scores.append(1)  # Exact match for non-string root values
            else:
                value_similarity_scores.append(0)  # Non-matching root values

    key_similarity_scores = []
    value_similarity_scores = []
    compare_json(generated_json, expected_json, key_similarity_scores, value_similarity_scores)

    # Compute the average similarity scores
    avg_key_similarity = sum(key_similarity_scores) / len(key_similarity_scores) if key_similarity_scores else 0
    avg_value_similarity = sum(value_similarity_scores) / len(value_similarity_scores) if value_similarity_scores else 0

    return (avg_key_similarity + avg_value_similarity) / 2

# Iterator

In [None]:
metric_scores = []
for i, data in enumerate(dataset):
    if 0 < i < len(dataset):
        try:
            inp, target = data["text"].split("[/INST]")
            prompt = inp + "[/INST]"
            input_ids = fc_tokenizer(prompt, return_tensors="pt", truncation=True).input_ids.cuda()
            outputs = fc_model.generate(input_ids=input_ids, do_sample=True, top_p=0.9,temperature=0.9)
    
            expected_str = data["text"]
            generated_str = fc_tokenizer.batch_decode(outputs.detach().cpu().numpy())[0]
            
    
            expected_data = parse_prompt_back_to_data(expected_str, instruction)
            generated_data = parse_prompt_back_to_data(generated_str, instruction)
            parse_prompt_back_to_data(generated_str, instruction)
    
            if "function_call" not in generated_data["target"]["chatgptMessage"]:
                metric_scores.append(0)
                print(f"{i} Metric score: {0}")
                continue
            generated_arguments = json.loads(generated_data["target"]["chatgptMessage"]["function_call"]["arguments"])
            expected_arguments = json.loads(expected_data["target"]["chatgptMessage"]["function_call"]["arguments"])
    
            metric_score = custom_metric(generated_arguments, expected_arguments)
            metric_scores.append(metric_score)
    
            print(f"{i} Metric score: {metric_score:.2f}")
        except Exception as e:
            print("Error: ", e)

print(f"Average metric score: {sum(metric_scores) / len(metric_scores):.2f}")


1 Metric score: 0.67
2 Metric score: 1.00
3 Metric score: 1.00
4 Metric score: 1.00
5 Metric score: 0.91
Error:  'NoneType' object has no attribute 'group'
7 Metric score: 0.61
8 Metric score: 0.82
Error:  Expecting ',' delimiter: line 1 column 54 (char 53)
10 Metric score: 1.00
11 Metric score: 0.62
12 Metric score: 0.00
13 Metric score: 0.76
14 Metric score: 0.92
15 Metric score: 0.71
16 Metric score: 0.53
17 Metric score: 0.66
18 Metric score: 1.00
19 Metric score: 0.00
20 Metric score: 0.60
21 Metric score: 1.00
22 Metric score: 1.00
23 Metric score: 0.83
24 Metric score: 0.79
25 Metric score: 0.75
26 Metric score: 0.00
27 Metric score: 0.52
28 Metric score: 0.63
29 Metric score: 0.51
30 Metric score: 0.43
31 Metric score: 0.45
32 Metric score: 0.46
33 Metric score: 1.00
34 Metric score: 0.63
35 Metric score: 0.00
36 Metric score: 0.83
37 Metric score: 0.54
38 Metric score: 0.61
39 Metric score: 0.65
40 Metric score: 0.00
41 Metric score: 0.52
42 Metric score: 0.76
Error:  'NoneTyp