In [1]:
!pip install "transformers==4.35.2" "datasets==2.15.0" "peft==0.6.2" "trl==0.7.4"

Defaulting to user installation because normal site-packages is not writeable


In [2]:
# add parent directory to path
import sys
import os
import torch

# Expose only GPU index 4 to CUDA for this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "4"

# Put the parent of the working directory first on the import path.
project_dir = os.getcwd()
parent_dir = os.path.dirname(project_dir)
sys.path[:0] = [parent_dir]

# Use CUDA when it is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [3]:
# Show the kernel's working directory; the relative data path used when loading
# the dataset is resolved against it.
os.getcwd()

'/home/sosa.s/gaitor_function_calling'

In [4]:
import pandas as pd
from datasets import Dataset, DatasetDict, load_dataset

from gaitor_function_calling.data.data_utils import DataAbstractor
from gaitor_function_calling.data.prompting_utils import INSTRUCTION

# Upper bound on the number of training examples kept from the JSON file.
MAX_TRAIN_EXAMPLES = 10000

# Load the dataset
dataset = load_dataset('json', data_files='./data/train_test/glaive_dataset-only_fc-train-glaive-full_train/train.json')

# Keep at most MAX_TRAIN_EXAMPLES rows. Clamping to the dataset length keeps
# `select` from raising an IndexError when the file holds fewer rows.
num_train_examples = min(MAX_TRAIN_EXAMPLES, len(dataset["train"]))
dataset["train"] = dataset["train"].select(range(num_train_examples))

print(dataset)

# Get the SFT data from the project's DataAbstractor and convert it to the
# `datasets` format via pandas.
data_abstractor = DataAbstractor("production_data-without_summary-only_fc.json", "sft")

train_df = pd.DataFrame(data_abstractor.train_data)
test_df = pd.DataFrame(data_abstractor.test_data)

train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

dataset_sft = DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})

dataset_sft

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 10000
    })
})
No raw data found


DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 833
    })
    test: Dataset({
        features: ['text'],
        num_rows: 17
    })
})

In [5]:
from huggingface_hub import notebook_login

# Base checkpoint that is loaded and fine-tuned.
model_id = "meta-llama/Llama-2-7b-chat-hf"

# Local output directory; its name is reused as the Hub repository name.
output_dir = "llama-7-chat-instruction-int4-glaive-fc-testing"
hub_model_id = "SebastianS/" + output_dir

# Show the Hugging Face login widget.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [6]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the base checkpoint with 8-bit weights.
model = AutoModelForCausalLM.from_pretrained(model_id, load_in_8bit=True)

# Reuse the EOS token as the padding token and pad on the right.
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
from datasets import Metric
from scipy.spatial.distance import cosine
import torch
from transformers import AutoTokenizer, AutoModel
from gaitor_function_calling.evaluation.evaluation_utils import FunctionCallingMetric, compute_perplexity, get_logits_and_labels
from gaitor_function_calling.data.prompting_utils import INSTRUCTION, json_arguments_from_prompt
import datasets
from transformers import EvalPrediction
import numpy as np

# Shared scorer instance; MyCustomMetric._compute calls fc_metric.run on every example.
fc_metric = FunctionCallingMetric()
# NOTE(review): in the code visible here, `epoch` is only referenced from a
# commented-out argument inside MyCustomMetric._compute — confirm whether it is still needed.
epoch=-1
class MyCustomMetric(Metric):
    """`datasets.Metric` subclass that scores function-calling outputs per example.

    Each prediction/reference pair is parsed with `json_arguments_from_prompt`
    and scored by the module-level `fc_metric`.
    """

    def _info(self):
        """Return the MetricInfo: placeholder texts plus string-typed input features."""
        feature_spec = datasets.Features({
            'predictions': datasets.Value('string'),
            'references': datasets.Value('string')
        })
        return datasets.MetricInfo(
            description="_DESCRIPTION",
            citation="_CITATION",
            inputs_description="_KWARGS_DESCRIPTION",
            features=feature_spec,
        )

    def _compute(self, predictions, references):
        """Score every prediction against the reference at the same index.

        Args:
            predictions: Indexable sequence of strings to be scored.
            references: Indexable sequence of strings, aligned by index with
                `predictions`.

        Returns:
            list: One entry per prediction — the value returned by
            `fc_metric.run`, or 0 when parsing or scoring raised an exception.
        """
        scores = []
        for idx, _ in enumerate(predictions):
            print(f"Example {idx}: ", end="")
            try:
                # The third returned value (the prompts) is not used here.
                generated_arguments, expected_arguments, _prompts = json_arguments_from_prompt(
                    references[idx],
                    predictions[idx],
                    INSTRUCTION
                )
                score = fc_metric.run(generated_arguments, expected_arguments)
            except Exception as e:
                # Best effort: a failing example is reported and counted as 0
                # so that one bad sample does not abort the whole evaluation.
                print(f"Error function calling: {e}\n")
                scores.append(0)
            else:
                scores.append(score)
        return scores

    
# Hand-written fixture used by the smoke test and the mock evaluation in this cell:
# "expected_str" and "generated_str" are two full prompt/response strings that differ
# only in their leading "<s>" markers. "fc_result" is presumably the score expected
# for this pair — TODO confirm.
sample = {
    "fc_result":1,
    "expected_str":"""<s>[INST] <<SYS>> Your job is to identify weather or not the user input is related to the function specification delimited by <FUNCTIONS> and </FUNCTIONS>. If it is related then your response should be in the function_calling format: <FUNCTION_CALL_NAME>NAME_ASSOCIATED_WITH_THE_FUNCTION</FUNCTION_CALL_NAME><FUNCTION_CALL_ARGUMENTS>ARGUMENTS_IN_STRINGIFIED_JSON_FORMAT</FUNCTION_CALL_ARGUMENTS>. Otherwise simply return a normal response. <FUNCTIONS>[{"name": "root_healthcheck_get", "description": "Root", "parameters": {"type": "object", "properties": {}}}, {"name": "getSearchResults", "description": "Get a list of gifs based on the search query", "parameters": {"type": "object", "properties": {"json": {"properties": {"query": {"type": "string", "title": "Query"}}, "type": "object", "required": ["query"], "title": "GiphyRequest"}}}}, {"name": "getTrendingResults", "description": "Get a list of gifs based on the current trends", "parameters": {"type": "object", "properties": {}}}]</FUNCTIONS> <</SYS>> show me one gif of a cool cat [/INST] <FUNCTION_CALL_NAME>getSearchResults</FUNCTION_CALL_NAME><FUNCTION_CALL_ARGUMENTS>{"json": {"query": "cool cat"}}</FUNCTION_CALL_ARGUMENTS> </s>""",
    "generated_str":"""<s><s> [INST] <<SYS>> Your job is to identify weather or not the user input is related to the function specification delimited by <FUNCTIONS> and </FUNCTIONS>. If it is related then your response should be in the function_calling format: <FUNCTION_CALL_NAME>NAME_ASSOCIATED_WITH_THE_FUNCTION</FUNCTION_CALL_NAME><FUNCTION_CALL_ARGUMENTS>ARGUMENTS_IN_STRINGIFIED_JSON_FORMAT</FUNCTION_CALL_ARGUMENTS>. Otherwise simply return a normal response. <FUNCTIONS>[{"name": "root_healthcheck_get", "description": "Root", "parameters": {"type": "object", "properties": {}}}, {"name": "getSearchResults", "description": "Get a list of gifs based on the search query", "parameters": {"type": "object", "properties": {"json": {"properties": {"query": {"type": "string", "title": "Query"}}, "type": "object", "required": ["query"], "title": "GiphyRequest"}}}}, {"name": "getTrendingResults", "description": "Get a list of gifs based on the current trends", "parameters": {"type": "object", "properties": {}}}]</FUNCTIONS> <</SYS>> show me one gif of a cool cat [/INST] <FUNCTION_CALL_NAME>getSearchResults</FUNCTION_CALL_NAME><FUNCTION_CALL_ARGUMENTS>{"json": {"query": "cool cat"}}</FUNCTION_CALL_ARGUMENTS> </s>"""
}

class TestMyCustomMetric():
    """Ad-hoc smoke checks for `MyCustomMetric`, driven by the module-level `sample`."""

    def __init__(self):
        """Create the metric instance exercised by the checks."""
        self.metric = MyCustomMetric()

    def test_simple_case(self):
        """Score the sample pair and print the expected score next to the result.

        Nothing is asserted; the two values are printed for visual comparison.
        """
        # The generated text is the prediction and the expected text is the
        # reference, matching how compute_metrics feeds the metric
        # (predictions = model output, references = labels).
        predictions = [sample["generated_str"]]
        references = [sample["expected_str"]]

        expected_score = sample["fc_result"]
        result = self.metric._compute(predictions=predictions, references=references)
        print(expected_score, result)

    def test_edge_cases(self):
        """Placeholder for edge cases such as empty inputs or very long texts."""
        pass

    # Add more test cases as needed

# Module-level metric instance; compute_metrics calls its _compute method.
my_custom_metric = MyCustomMetric()
# Smoke-test harness; its only call is commented out at the end of this cell.
tester = TestMyCustomMetric()

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, EvalPrediction

def compute_metrics(eval_pred: EvalPrediction):
    """Decode greedy predictions and labels to text and score them with `my_custom_metric`.

    Args:
        eval_pred: Pair `(predictions, label_ids)`. `predictions` holds scores whose
            last axis is reduced with argmax into token ids; `label_ids` holds token
            ids with -100 at ignored positions. Each may be a NumPy array or a
            torch tensor.

    Returns:
        dict: `{"my_custom_metric": scores}`, where `scores` is the per-example
        list returned by `my_custom_metric._compute`.
    """
    predictions, label_ids = eval_pred
    # Convert each input on its own so that a mix of ndarray and tensor inputs works.
    if isinstance(predictions, np.ndarray):
        predictions = torch.from_numpy(predictions)
    if isinstance(label_ids, np.ndarray):
        label_ids = torch.from_numpy(label_ids)
    # You might need to adjust the format of predictions and labels
    # to match what your custom metric expects
    print("predictions:",predictions, end="\n\n\n")
    print("label_ids:",label_ids, end="\n\n\n")

    # Greedy choice per position. No device transfer is done here: the ids are
    # only decoded to text afterwards.
    predicted_token_ids = torch.argmax(predictions, dim=-1)

    # Replace -100 in label_ids with tokenizer.pad_token_id so they can be decoded.
    label_ids = label_ids.clone()  # Clone to avoid modifying the original tensor
    label_ids[label_ids == -100] = tokenizer.pad_token_id

    # tokenizer.decode already returns one string per sequence.
    predicted_strings = [tokenizer.decode(ids) for ids in predicted_token_ids]
    label_strings = [tokenizer.decode(ids) for ids in label_ids]

    print("predicted_strings: ", predicted_strings, end="\n\n\n")
    print("label_strings: ", label_strings, end="\n\n\n")

    # Compute the metric
    metric_result = my_custom_metric._compute(predictions=predicted_strings, references=label_strings)

    # Return results in a dictionary
    return {"my_custom_metric": metric_result}



# Sample texts
sample_texts = [sample["expected_str"]]
# Tokenize texts. `truncation=True` is not passed: with no max_length available it
# truncates nothing and only triggers a "Default to no truncation" warning.
inputs = tokenizer(sample_texts, padding=True, return_tensors="pt").to(device)

# Generate logits using the model
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits

# Use the input ids themselves as labels (no shift is applied here).
labels = inputs.input_ids

# Create an EvalPrediction object
mock_eval_pred = EvalPrediction(predictions=logits, label_ids=labels)

# Now you can use mock_eval_pred to test your compute_metrics function
result = compute_metrics(mock_eval_pred)

print(result)


# tester.test_simple_case()

  my_custom_metric = MyCustomMetric()
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


predictions: tensor([[[-0.0523, -0.2910,  0.3679,  ...,  1.1621,  1.7275,  0.5503],
         [-0.0523, -0.2910,  0.3679,  ...,  1.1621,  1.7275,  0.5503],
         [-6.3398, -0.7319,  1.3848,  ..., -4.5820, -6.2188, -2.3750],
         ...,
         [ 0.0693,  1.1943, 20.2969,  ..., -0.1329, -0.6284,  0.1520],
         [-0.2079, -2.1934, 16.8281,  ...,  1.4727, -1.4873, -1.9941],
         [-2.6406,  1.3369, 10.1875,  ..., -3.5703, -4.8867, -2.0137]]],
       device='cuda:0')


label_ids: tensor([[    1,     1,   518, 25580, 29962,  3532, 14816, 29903,  6778,  3575,
          4982,   338,   304, 12439, 14826,   470,   451,   278,  1404,  1881,
           338,  4475,   304,   278,   740, 21992,   628,   326,  1573,   491,
           529, 29943, 28700, 29903, 29958,   322,  1533, 29943, 28700, 29903,
         15513,   960,   372,   338,  4475,   769,   596,  2933,   881,   367,
           297,   278,   740, 29918,  4804,   292,  3402, 29901,   529, 29943,
         28700, 29918, 29907,  981

In [10]:
def print_tokens_with_ids(txt):
    """Print `txt` as a list of (token, token id) pairs.

    Both the token strings and the ids come from the module-level `tokenizer`,
    called without special tokens.
    """
    pieces = tokenizer.tokenize(txt, add_special_tokens=False)
    ids = tokenizer.encode(txt, add_special_tokens=False)
    pairs = [(piece, piece_id) for piece, piece_id in zip(pieces, ids)]
    print(pairs)

# Inspect how each chat-template marker is tokenized when encoded on its own.
response_template = "[/INST]"
instruction_template = "<s> [INST] <<SYS>>"

print_tokens_with_ids(response_template)
print_tokens_with_ids(instruction_template)



[('▁[', 518), ('/', 29914), ('INST', 25580), (']', 29962)]
[('<s>', 1), ('▁', 29871), ('▁[', 518), ('INST', 25580), (']', 29962), ('▁<<', 3532), ('SY', 14816), ('S', 29903), ('>>', 6778)]


In [12]:
from peft import LoraConfig
from transformers import TrainingArguments
from trl import DataCollatorForCompletionOnlyLM, SFTTrainer

# Evaluate and save once per epoch, pushing to the Hub on every save.
# NOTE(review): `eval_steps` is documented as applying to the "steps" evaluation
# strategy, so it is presumably ignored with "epoch" — confirm.
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=3,
    per_device_train_batch_size=3,
    evaluation_strategy="epoch",
    eval_steps=1000,
    save_strategy="epoch",
    push_to_hub=True,
    hub_model_id=hub_model_id,
    hub_strategy="every_save",
)

# LoRA adapter settings for causal language modelling.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=32,
    lora_dropout=0.2,
    bias="none",
)

# Markers handed to the completion-only collator to delimit instruction and response.
instruction_template = "[INST]"
response_template = "[/INST]"
collator = DataCollatorForCompletionOnlyLM(
    response_template=response_template,
    instruction_template=instruction_template,
    tokenizer=tokenizer,
    mlm=False,
)

In [15]:
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
from transformers import Trainer
import torch
import torch.nn as nn

class CustomTrainer(Trainer):
    """`Trainer` subclass with an overridable copy of `prediction_step`.

    NOTE(review): the body appears to mirror the stock `Trainer.prediction_step`;
    it is the hook to edit when evaluation needs custom behaviour. The class is
    not referenced by the other cells visible here.
    """

    def prediction_step(
        self,
        model: nn.Module,
        inputs: Dict[str, Union[torch.Tensor, Any]],
        prediction_loss_only: bool,
        ignore_keys: Optional[List[str]] = None,
    ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]:
        """
        Perform an evaluation step on `model` using `inputs`.

        Subclass and override to inject custom behavior.

        Args:
            model (`nn.Module`):
                The model to evaluate.
            inputs (`Dict[str, Union[torch.Tensor, Any]]`):
                The inputs and targets of the model.

                The dictionary will be unpacked before being fed to the model. Most models expect the targets under the
                argument `labels`. Check your model's documentation for all accepted arguments.
            prediction_loss_only (`bool`):
                Whether or not to return the loss only.
            ignore_keys (`List[str]`, *optional*):
                A list of keys in the output of your model (if it is a dictionary) that should be ignored when
                gathering predictions.

        Return:
            Tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss,
            logits and labels (each being optional).
        """
        # These helpers are not imported anywhere else in the notebook; importing
        # them here keeps the method from failing with a NameError.
        from transformers.trainer_pt_utils import nested_detach
        from transformers.utils import is_sagemaker_mp_enabled

        has_labels = all(inputs.get(k) is not None for k in self.label_names)
        inputs = self._prepare_inputs(inputs)
        if ignore_keys is None:
            if hasattr(self.model, "config"):
                ignore_keys = getattr(self.model.config, "keys_to_ignore_at_inference", [])
            else:
                ignore_keys = []

        # labels may be popped when computing the loss (label smoothing for instance) so we grab them first.
        if has_labels:
            labels = nested_detach(tuple(inputs.get(name) for name in self.label_names))
            if len(labels) == 1:
                labels = labels[0]
        else:
            labels = None

        with torch.no_grad():
            if is_sagemaker_mp_enabled():
                # The SageMaker model-parallel helpers are only importable when that
                # backend is enabled, so they are imported inside this branch.
                from transformers.trainer_pt_utils import smp_forward_only, smp_nested_concat

                raw_outputs = smp_forward_only(model, inputs)
                if has_labels:
                    if isinstance(raw_outputs, dict):
                        loss_mb = raw_outputs["loss"]
                        logits_mb = tuple(v for k, v in raw_outputs.items() if k not in ignore_keys + ["loss"])
                    else:
                        loss_mb = raw_outputs[0]
                        logits_mb = raw_outputs[1:]

                    loss = loss_mb.reduce_mean().detach().cpu()
                    logits = smp_nested_concat(logits_mb)
                else:
                    loss = None
                    if isinstance(raw_outputs, dict):
                        logits_mb = tuple(v for k, v in raw_outputs.items() if k not in ignore_keys)
                    else:
                        logits_mb = raw_outputs
                    logits = smp_nested_concat(logits_mb)
            else:
                if has_labels:
                    with self.compute_loss_context_manager():
                        loss, outputs = self.compute_loss(model, inputs, return_outputs=True)
                    loss = loss.mean().detach()

                    if isinstance(outputs, dict):
                        logits = tuple(v for k, v in outputs.items() if k not in ignore_keys + ["loss"])
                    else:
                        logits = outputs[1:]
                else:
                    loss = None
                    with self.compute_loss_context_manager():
                        outputs = model(**inputs)
                    if isinstance(outputs, dict):
                        logits = tuple(v for k, v in outputs.items() if k not in ignore_keys)
                    else:
                        logits = outputs
                    # TODO: this needs to be fixed and made cleaner later.
                    if self.args.past_index >= 0:
                        self._past = outputs[self.args.past_index - 1]

        if prediction_loss_only:
            return (loss, None, None)

        logits = nested_detach(logits)
        # A single remaining output is unwrapped from its one-element tuple.
        if len(logits) == 1:
            logits = logits[0]

        return (loss, logits, labels)

In [11]:
from transformers import AutoModelForCausalLM
from trl import SFTTrainer

# NOTE(review): this rebinds `model`, replacing the 8-bit model loaded earlier in
# the notebook with a checkpoint loaded without `load_in_8bit` — confirm that is intended.
model = AutoModelForCausalLM.from_pretrained(model_id)

# Train on the JSON-loaded training split, evaluate on the SFT test split, and
# mask the loss with the completion-only collator.
trainer = SFTTrainer(
    model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset_sft["test"],
    compute_metrics=compute_metrics,
    dataset_text_field="text",
    peft_config=peft_config,
    max_seq_length=4096,
    data_collator=collator
)

# For a pre-training baseline, call trainer.evaluate() here.



In [None]:
# Run fine-tuning with the trainer configured in the previous cell.
trainer.train()