In [4]:
!pip install "transformers==4.34.0" "datasets==2.13.0" "peft==0.4.0" "accelerate==0.23.0" "bitsandbytes==0.41.1" "trl==0.4.7" "safetensors>=0.3.1" ipywidgets wandb --upgrade

Defaulting to user installation because normal site-packages is not writeable


In [5]:
function_calling_tokens = {
    "FUNCTIONS": {
        "start": "<FUNCTIONS>",
        "end": "</FUNCTIONS>"
    },
    "FUNCTION_CALL_NAME": {
        "start": "<FUNCTION_CALL_NAME>",
        "end": "</FUNCTION_CALL_NAME>"
    },
    "FUNCTION_CALL_ARGUMENTS": {
        "start": "<FUNCTION_CALL_ARGUMENTS>",
        "end": "</FUNCTION_CALL_ARGUMENTS>"
    },
    "all": ["<FUNCTIONS>", "</FUNCTIONS>", "<FUNCTION_CALL_NAME>", "</FUNCTION_CALL_NAME>", "<FUNCTION_CALL_ARGUMENTS>", "</FUNCTION_CALL_ARGUMENTS>"]
}

In [6]:
from datasets import load_dataset
relative_path_to_data = '../data/chat/production_train_chat.json'

dataset = load_dataset('json', data_files={'train': relative_path_to_data}, split="train")
base_model_name = "meta-llama/Llama-2-7b-chat-hf"
print(dataset[30]["text"])

Found cached dataset json (/home/sosa.s/.cache/huggingface/datasets/json/default-7aee0b44caf3693f/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


<s>[INST] <<SYS>>
<FUNCTIONS>[{"name": "transcodeWebPage", "description": "Acquire precise webpage details or real-time search engine responses based on user-input content.", "parameters": {"type": "object", "properties": {"json": {"properties": {"link": {"type": "string", "description": "This parameter takes either a URL or a non-URL string. If a URL is given, the model will engage with the designated webpage to collect or interact with its data. If a non-URL string is given, the model will handle it as a search inquiry and try to find related real-time news or information. To guarantee the best results, make sure the input is a valid URL or a succinct search query."}}, "type": "object"}}}}]</FUNCTIONS>
<</SYS>>

1 sentence about https://www.openplugin.io/ [/INST] <FUNCTION_CALL_NAME>transcodeWebPage</FUNCTION_CALL_NAME><FUNCTION_CALL_ARGUMENTS>{"json": {"link": "https://www.openplugin.io/"}}</FUNCTION_CALL_ARGUMENTS> </s>


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.spatial.distance import cosine
import json

# Load a pre-trained model for sentence embedding (e.g., SBERT)
model_name = "sentence-transformers/bert-base-nli-mean-tokens"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

def get_sentence_embedding(sentence):
    inputs = tokenizer(sentence, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs[0][0].numpy()

def sentence_similarity(sent1, sent2):
    embedding1 = get_sentence_embedding(sent1)
    embedding2 = get_sentence_embedding(sent2)
    return 1 - cosine(embedding1, embedding2)

def custom_metric(generated_json, expected_json):
    # Evaluate JSON structure
    structure_score = evaluate_json_structure(generated_json, expected_json)
    
    # Evaluate sentence similarity for string values
    string_similarity_scores = []
    for key, value in expected_json.items():
        if isinstance(value, str):
            gen_value = generated_json.get(key, "")
            sim_score = sentence_similarity(value, gen_value)
            string_similarity_scores.append(sim_score)
    
    avg_string_similarity = sum(string_similarity_scores) / len(string_similarity_scores) if string_similarity_scores else 1
    # Combine structure score and string similarity
    final_score = (structure_score + avg_string_similarity) / 2
    return final_score

# Example usage
generated_json = json.loads(model_output)
expected_json = json.loads(expected_output)
score = custom_metric(generated_json, expected_json)


In [30]:
filtered_data = [item for item in dataset if "<FUNCTION_CALL_NAME>" in item["text"]]
print("Amount of data using FC:", len(filtered_data))

Amount of data using FC: 850


In [31]:
!python -c "import torch; assert torch.cuda.get_device_capability()[0] >= 8, 'Hardware not supported for Flash Attention'"
!pip install ninja packaging
!MAX_JOBS=1 pip install flash-attn --no-build-isolation


Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [32]:
# Hugging Face model id
# model_id = "NousResearch/Llama-2-7b-hf"  # non-gated
model_id = "meta-llama/Llama-2-7b-chat-hf" # gated

output_dir = "llama-7-chat-int4-fc"

In [33]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

use_flash_attention = False

# BitsAndBytesConfig int-4 config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
)

# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    use_cache=False,
    use_flash_attention_2=use_flash_attention,
    device_map="auto",
)
model.config.pretraining_tp = 1


tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
print("Pre add tokens:", len(tokenizer))
tokenizer.add_tokens(function_calling_tokens["all"])
print("Post add tokens:", len(tokenizer))
model.resize_token_embeddings(len(tokenizer))

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Pre add tokens: 32000
Post add tokens: 32006


Embedding(32006, 4096)

In [34]:
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model

# LoRA config based on QLoRA paper
peft_config = LoraConfig(
        lora_alpha=16,
        lora_dropout=0.1,
        r=64,
        bias="none",
        task_type="CAUSAL_LM",
)


# prepare model for training
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)


In [35]:
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=6,
    per_device_train_batch_size=6 if use_flash_attention else 4,
    gradient_accumulation_steps=2,
    gradient_checkpointing=True,
    optim="paged_adamw_32bit",
    logging_steps=10,
    save_strategy="epoch",
    learning_rate=2e-4,
    bf16=True,
    tf32=True,
    max_grad_norm=0.3,
    warmup_ratio=0.03,
    lr_scheduler_type="constant",
    disable_tqdm=True # disable tqdm since with packing values are in correct
)


In [36]:
from trl import SFTTrainer

max_seq_length = 2048 # max sequence length for model and packing of the dataset

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    packing=True,
    dataset_text_field="text",
    # formatting_func=format_instruction,
    args=args,
)


In [37]:
!wandb login

[34m[1mwandb[0m: Currently logged in as: [33msebastiansosa[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [38]:
import wandb

wandb.init(project='fc_chat_prod', entity='sebastiansosa')


VBox(children=(Label(value='0.015 MB of 0.015 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))



VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011111978801070815, max=1.0…

In [39]:
# train
trainer.train() # there will not be a progress bar since tqdm is disabled
# 4:30 mins per epoch
# 10:00 min per 2 epoch

# save model
trainer.save_model()


You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 1.1407, 'learning_rate': 0.0002, 'epoch': 0.06}
{'loss': 0.8568, 'learning_rate': 0.0002, 'epoch': 0.12}
{'loss': 0.7505, 'learning_rate': 0.0002, 'epoch': 0.18}
{'loss': 0.7268, 'learning_rate': 0.0002, 'epoch': 1.04}
{'loss': 0.6347, 'learning_rate': 0.0002, 'epoch': 1.1}
{'loss': 0.6548, 'learning_rate': 0.0002, 'epoch': 1.16}
{'loss': 0.5633, 'learning_rate': 0.0002, 'epoch': 2.03}
{'loss': 0.542, 'learning_rate': 0.0002, 'epoch': 2.09}
{'loss': 0.4787, 'learning_rate': 0.0002, 'epoch': 2.15}
{'loss': 0.541, 'learning_rate': 0.0002, 'epoch': 3.01}
{'loss': 0.4412, 'learning_rate': 0.0002, 'epoch': 3.07}
{'loss': 0.4484, 'learning_rate': 0.0002, 'epoch': 3.13}
{'loss': 0.4279, 'learning_rate': 0.0002, 'epoch': 3.19}
{'loss': 0.3807, 'learning_rate': 0.0002, 'epoch': 4.06}
{'loss': 0.4359, 'learning_rate': 0.0002, 'epoch': 4.12}
{'loss': 0.4135, 'learning_rate': 0.0002, 'epoch': 4.18}
{'loss': 0.3498, 'learning_rate': 0.0002, 'epoch': 5.04}
{'loss': 0.3848, 'learning_rate': 

In [15]:
import os

# Replace 'your_directory_path' with the path of the directory you want to list
directory_path = '/home/sosa.s/gaitor-function-calling/instruction_tune/llama-7-int4-fc/checkpoint-39'

# List all files and directories in the specified path
for filename in os.listdir(directory_path):
    if os.path.isfile(os.path.join(directory_path, filename)):
        print(filename)

special_tokens_map.json
tokenizer.json
trainer_state.json
README.md
optimizer.pt
scheduler.pt
adapter_model.bin
rng_state.pth
tokenizer_config.json
training_args.bin
adapter_config.json


In [41]:
from transformers import AutoTokenizer
from peft import AutoPeftModelForCausalLM
import json

# Set the path to the checkpoint directory
checkpoint_path = "/home/sosa.s/gaitor-function-calling/instruction_tune/llama-7-chat-int4-fc"

adapter_config_path = checkpoint_path + '/adapter_config.json'
try:
    with open(adapter_config_path, 'r') as file:
        config_data = json.load(file)

    # Update the 'base_model_name_or_path' field
    config_data['base_model_name_or_path'] = "meta-llama/Llama-2-7b-chat-hf"

    # Write the updated data back to the file
    with open(adapter_config_path, 'w') as file:
        json.dump(config_data, file, indent=4)

    print("File updated successfully.")
except FileNotFoundError:
    print(f"File not found: {adapter_config_path}")
except Exception as e:
    print(f"An error occurred: {e}")

print("checkpoint_path: ", checkpoint_path)

# load base LLM model and tokenizer
model = AutoPeftModelForCausalLM.from_pretrained(
    checkpoint_path,
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
    load_in_4bit=True,
)
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)


File updated successfully.
checkpoint_path:  /home/sosa.s/gaitor-function-calling/instruction_tune/llama-7-chat-int4-fc


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [43]:
from datasets import load_dataset
from random import randrange


# Load dataset from the hub and get a sample
sample = dataset[30]["text"]

inp, target = sample.split("[/INST]")
prompt = inp + "[/INST]"

input_ids = tokenizer(prompt, return_tensors="pt", truncation=True).input_ids.cuda()
# with torch.inference_mode():
outputs = model.generate(input_ids=input_ids, do_sample=True, top_p=0.9,temperature=0.9)

print(f"Prompt:\n{prompt}\n\n")
print(f"Generated instruction:\n{tokenizer.batch_decode(outputs.detach().cpu().numpy())[0][len(prompt):]}\n\n")
print(f"Ground truth:\n{target}\n\n")


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Prompt:
<s>[INST] <<SYS>>
<FUNCTIONS>[{"name": "transcodeWebPage", "description": "Acquire precise webpage details or real-time search engine responses based on user-input content.", "parameters": {"type": "object", "properties": {"json": {"properties": {"link": {"type": "string", "description": "This parameter takes either a URL or a non-URL string. If a URL is given, the model will engage with the designated webpage to collect or interact with its data. If a non-URL string is given, the model will handle it as a search inquiry and try to find related real-time news or information. To guarantee the best results, make sure the input is a valid URL or a succinct search query."}}, "type": "object"}}}}]</FUNCTIONS>
<</SYS>>

1 sentence about https://www.openplugin.io/ [/INST]


Generated instruction:
NST] OpenPlugin.io is a platform that enables developers to create plugins for various applications, providing them with a seamless way to extend the functionality of their applications.</s>


In [44]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [45]:
repo_name = "function_calling-llama_7b"
model.push_to_hub(repo_name)
tokenizer.push_to_hub(repo_name)

adapter_model.bin:   0%|          | 0.00/134M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/SebastianS/function_calling-llama_7b/commit/b1d3d8f68fcb986ce4de3c0cb068e45cf6106ddc', commit_message='Upload tokenizer', commit_description='', oid='b1d3d8f68fcb986ce4de3c0cb068e45cf6106ddc', pr_url=None, pr_revision=None, pr_num=None)

In [7]:
from transformers import AutoTokenizer
from peft import AutoPeftModelForCausalLM
import torch

# Set the path to the checkpoint directory
hub_id = "SebastianS/function_calling-llama_7b"

# load base LLM model and tokenizer
model = AutoPeftModelForCausalLM.from_pretrained(
    hub_id,
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
    load_in_4bit=True,
)
tokenizer = AutoTokenizer.from_pretrained(hub_id)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [8]:
# Load dataset from the hub and get a sample
sample = dataset[29]["text"]

inp, target = sample.split("[/INST]")
prompt = inp + "[/INST]"

input_ids = tokenizer(prompt, return_tensors="pt", truncation=True).input_ids.cuda()
# with torch.inference_mode():
outputs = model.generate(input_ids=input_ids, do_sample=True, top_p=0.9,temperature=0.9)

print(f"Prompt:\n{prompt}\n\n")
print(f"Generated instruction:\n{tokenizer.batch_decode(outputs.detach().cpu().numpy())[0][len(prompt):]}\n\n")
print(f"Ground truth:\n{target}\n\n")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Prompt:
<s>[INST] <<SYS>>
<FUNCTIONS>[{"name": "transcodeWebPage", "description": "Acquire precise webpage details or real-time search engine responses based on user-input content.", "parameters": {"type": "object", "properties": {"json": {"properties": {"link": {"type": "string", "description": "This parameter takes either a URL or a non-URL string. If a URL is given, the model will engage with the designated webpage to collect or interact with its data. If a non-URL string is given, the model will handle it as a search inquiry and try to find related real-time news or information. To guarantee the best results, make sure the input is a valid URL or a succinct search query."}}, "type": "object"}}}}]</FUNCTIONS>
<</SYS>>

"In two sentences tell me what this site is about https://www.openplugin.io/" [/INST]


Generated instruction:
NST] OpenPlugin.io is a website that offers a tool for users to interact with non-URL strings, providing them with tailored search results and connecting the

In [57]:
expected_str = """<s>[INST] <<SYS>>
<FUNCTIONS>[{"name": "transcodeWebPage", "description": "Acquire precise webpage details or real-time search engine responses based on user-input content.", "parameters": {"type": "object", "properties": {"json": {"properties": {"link": {"type": "string", "description": "This parameter takes either a URL or a non-URL string. If a URL is given, the model will engage with the designated webpage to collect or interact with its data. If a non-URL string is given, the model will handle it as a search inquiry and try to find related real-time news or information. To guarantee the best results, make sure the input is a valid URL or a succinct search query."}}, "type": "object"}}}}]</FUNCTIONS>
<</SYS>>

1 sentence about https://www.openplugin.io/ [/INST] <FUNCTION_CALL_NAME>transcodeWebPage</FUNCTION_CALL_NAME><FUNCTION_CALL_ARGUMENTS>{"json": {"link": "https://www.openplugin.io/"}}</FUNCTION_CALL_ARGUMENTS> </s>"""
expected_str_target = expected_str.split("[/INST]")[1]
generated_str = """<s>[INST] <<SYS>>
<FUNCTIONS>[{"name": "transcodeWebPage", "description": "Acquire precise webpage details or real-time search engine responses based on user-input content.", "parameters": {"type": "object", "properties": {"json": {"properties": {"link": {"type": "string", "description": "This parameter takes either a URL or a non-URL string. If a URL is given, the model will engage with the designated webpage to collect or interact with its data. If a non-URL string is given, the model will handle it as a search inquiry and try to find related real-time news or information. To guarantee the best results, make sure the input is a valid URL or a succinct search query."}}, "type": "object"}}}}]</FUNCTIONS>
<</SYS>>

1 sentence about https://www.openplugin.io/ [/INST] <FUNCTION_CALL_NAME>transcodeWebPage</FUNCTION_CALL_NAME><FUNCTION_CALL_ARGUMENTS>{"json": {"links": "https://www.openplugin.io/"}}</FUNCTION_CALL_ARGUMENTS> </s>"""
generated_str_target = generated_str.split("[/INST]")[1]

In [58]:
import re
import json
def parse_prompt_back_to_data(prompt):
    """
    Function to parse a prompt back into the original data format, using a dictionary of function calling tokens.
    
    :param prompt: A string representing the constructed prompt.
    :param function_calling_tokens: A dictionary containing the start and end tokens for different function call elements.
    :return: A dictionary representing the original data instance.
    """
    # Building regular expression patterns using the function_calling_tokens
    functions_pattern = rf"{function_calling_tokens['FUNCTIONS']['start']}(.*?){function_calling_tokens['FUNCTIONS']['end']}"
    input_pattern = r"<</SYS>>\n\n(.*?) \[/INST\]"  # This remains unchanged as it's not part of function_calling_tokens
    target_content_pattern = r"\[/INST\] (.*)</s>"  # This also remains unchanged
    function_call_name_pattern = rf"{function_calling_tokens['FUNCTION_CALL_NAME']['start']}(.*?){function_calling_tokens['FUNCTION_CALL_NAME']['end']}"
    function_call_arguments_pattern = rf"{function_calling_tokens['FUNCTION_CALL_ARGUMENTS']['start']}(.*?){function_calling_tokens['FUNCTION_CALL_ARGUMENTS']['end']}"

    # Extracting data using regular expressions
    functions_str = re.search(functions_pattern, prompt).group(1)
    input_content = re.search(input_pattern, prompt).group(1)
    target_content_match = re.search(target_content_pattern, prompt)

    # Parse functions JSON string
    functions = json.loads(functions_str)

    # Prepare the data dictionary
    data = {
        "input": [{
            "chatgptMessage": {"role": "user", "content": input_content},
            "functions": functions
        }],
        "target": {
            "chatgptMessage": {"role": "assistant"},
            "functions": functions  # Including functions in the target as well
        }
    }

    # Check if the target has a function call
    if function_calling_tokens['FUNCTION_CALL_NAME']['start'] in prompt:
        function_call_name = re.search(function_call_name_pattern, prompt).group(1)
        function_call_arguments = re.search(function_call_arguments_pattern, prompt).group(1)
        data["target"]["chatgptMessage"]["function_call"] = {
            "name": function_call_name,
            "arguments": function_call_arguments
        }
    else:
        # Handle case where regex might not find a match for target content
        if target_content_match:
            target_content = target_content_match.group(1)
            data["target"]["chatgptMessage"]["content"] = target_content

    return data
expected_data = parse_prompt_back_to_data(expected_str)
generated_data = parse_prompt_back_to_data(generated_str)

In [59]:
from transformers import AutoTokenizer, AutoModel
from scipy.spatial.distance import cosine
import json

# Load a pre-trained model for sentence embedding (e.g., SBERT)
embedding_model_name = "sentence-transformers/bert-base-nli-mean-tokens"
embedding_tokenizer = AutoTokenizer.from_pretrained(embedding_model_name)
embedding_model = AutoModel.from_pretrained(embedding_model_name)

def get_sentence_embedding(sentence):
    inputs = embedding_tokenizer(sentence, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        outputs = embedding_model(**inputs)

    # Mean Pooling - Take attention mask into account for correct averaging
    attention_mask = inputs['attention_mask']
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(outputs.last_hidden_state.size()).float()
    sum_embeddings = torch.sum(outputs.last_hidden_state * input_mask_expanded, 1)
    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    mean_pooled = sum_embeddings / sum_mask

    return mean_pooled[0].numpy()

def sentence_similarity(sent1, sent2):
    embedding1 = get_sentence_embedding(sent1)
    embedding2 = get_sentence_embedding(sent2)
    return 1 - cosine(embedding1, embedding2)

def custom_metric(generated_json, expected_json):
    def compare_json(g_json, e_json, key_similarity_scores, value_similarity_scores):
        for e_key, e_value in e_json.items():
            # Check for exact key match or find the most similar key
            if e_key in g_json:
                g_key = e_key
                key_similarity_scores.append(1)
            else:
                # Compute similarity with all keys in generated_json and find the best match
                key_similarity = {gen_key: sentence_similarity(e_key, gen_key) for gen_key in g_json.keys()}
                g_key, key_sim_score = max(key_similarity.items(), key=lambda x: x[1])
                key_similarity_scores.append(key_sim_score)

            # Recursive comparison for nested objects, else compare values
            if isinstance(e_value, dict) and isinstance(g_json.get(g_key, {}), dict):
                compare_json(g_json[g_key], e_value, key_similarity_scores, value_similarity_scores)
            elif isinstance(e_value, str) and isinstance(g_json.get(g_key, ""), str):
                # Compare values only if they are strings at the root level
                value_sim_score = sentence_similarity(e_value, g_json[g_key])
                value_similarity_scores.append(value_sim_score)
            elif e_value == g_json.get(g_key, None):
                value_similarity_scores.append(1)  # Exact match for non-string root values
            else:
                value_similarity_scores.append(0)  # Non-matching root values

    key_similarity_scores = []
    value_similarity_scores = []
    compare_json(generated_json, expected_json, key_similarity_scores, value_similarity_scores)

    # Compute the average similarity scores
    avg_key_similarity = sum(key_similarity_scores) / len(key_similarity_scores) if key_similarity_scores else 0
    avg_value_similarity = sum(value_similarity_scores) / len(value_similarity_scores) if value_similarity_scores else 0

    return (avg_key_similarity + avg_value_similarity) / 2


# Example usage
generated_json = json.loads(generated_data["target"]["chatgptMessage"]["function_call"]["arguments"])
expected_json = json.loads(expected_data["target"]["chatgptMessage"]["function_call"]["arguments"])
score = custom_metric(generated_json, expected_json)
score


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


0.9852805435657501

In [49]:
json.loads(expected_data["target"]["chatgptMessage"]["function_call"]["arguments"])

{'json': {'link': 'https://www.openplugin.io/'}}

In [19]:
from huggingface_hub import HfApi, HfFolder, Repository
import os
api = HfApi()
api.create_repo("function_calling-llama_7b")


RepoUrl('https://huggingface.co/SebastianS/function_calling-llama_7b', endpoint='https://huggingface.co', repo_type='model', repo_id='SebastianS/function_calling-llama_7b')

In [None]:
repo = Repository("function_calling-llama_7b", clone_from="SebastianS/function_calling-llama_7b")


In [23]:
if use_flash_attention:
    # unpatch flash attention
    from utils.llama_patch import unplace_flash_attn_with_attn
    unplace_flash_attn_with_attn()

import torch
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer

args.output_dir = "llama-7-int4-fc"

# load base LLM model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    use_cache=False,
    use_flash_attention_2=use_flash_attention,
    device_map="auto",
)
model.config.pretraining_tp = 1


tokenizer = AutoTokenizer.from_pretrained(model_id)



NameError: name 'AutoModelForCausalLM' is not defined

In [24]:
from datasets import load_dataset
from random import randrange


# Load dataset from the hub and get a sample
sample = dataset[0]["text"]

inp, target = sample.split("### Response:\n")
prompt = inp + "### Response:\n"

input_ids = tokenizer(prompt, return_tensors="pt", truncation=True).input_ids.cuda()
# with torch.inference_mode():
outputs = model.generate(input_ids=input_ids, max_new_tokens=100, do_sample=True, top_p=0.9,temperature=0.9)

print(f"Prompt:\n{prompt}\n")
print(f"Generated instruction:\n{tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0][len(prompt):]}")
print(f"Ground truth:\n{target}")


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Prompt:

  Below is an instruction that describes a "function calling" task...

  ### Instruction:
  Analyze the prompt and json speicifaction pair (denoted in <FC></FC>) to produce a relevant "function calling" response (denoted in <FCR></FCR>), otherwise return a plain text response.

  ### Input:
  sumarize this in 1 sentence https://openai.com/blog/function-calling-and-other-api-updates
  <FC>[{"name": "transcodeWebPage", "description": "Acquire precise webpage details or real-time search engine responses based on user-input content.", "parameters": {"type": "object", "properties": {"json": {"properties": {"link": {"type": "string", "description": "This parameter takes either a URL or a non-URL string. If a URL is given, the model will engage with the designated webpage to collect or interact with its data. If a non-URL string is given, the model will handle it as a search inquiry and try to find related real-time news or information. To guarantee the best results, make sure the in