In [1]:
!pip install "transformers==4.34.0" "datasets==2.13.0" "peft==0.4.0" "accelerate==0.23.0" "bitsandbytes==0.41.1" "trl==0.4.7" "safetensors>=0.3.1" ipywidgets wandb --upgrade
!python -c "import torch; assert torch.cuda.get_device_capability()[0] >= 8, 'Hardware not supported for Flash Attention'"
!pip install ninja packaging
!MAX_JOBS=1 pip install flash-attn --no-build-isolation

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [2]:
# add parent directory to path
import sys
import os

# Give the parent of the working directory top priority on the import path
# (presumably where the gaitor_function_calling package lives -- confirm).
project_dir = os.getcwd()
parent_dir = os.path.split(project_dir)[0]
sys.path[:0] = [parent_dir]

In [3]:
# get data and convert to Datasets format
import pandas as pd
from datasets import Dataset, DatasetDict
from gaitor_function_calling.data.data_utils import DataAbstractor
from gaitor_function_calling.data.prompting_utils import INSTRUCTION

# Load the function-calling data through the project helper.
data_abstractor = DataAbstractor("production_data-without_summary-only_fc.json", "unique")
data_abstractor.get_data()

# One DataFrame per split, then one Hugging Face Dataset per DataFrame.
train_df, test_df = (
    pd.DataFrame(split)
    for split in (data_abstractor.train_data, data_abstractor.test_data)
)
train_dataset, test_dataset = map(Dataset.from_pandas, (train_df, test_df))

# Bundle both splits; the trailing bare expression renders the summary.
dataset_dict = DatasetDict({"train": train_dataset, "test": test_dataset})

dataset_dict

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 807
    })
    test: Dataset({
        features: ['text'],
        num_rows: 43
    })
})

In [4]:
# setup metrics and evaluation
from transformers import EvalPrediction
from gaitor_function_calling.evaluation.evaluation_utils import FunctionCallingMetric, compute_perplexity, get_logits_and_labels
from gaitor_function_calling.data.prompting_utils import INSTRUCTION, json_arguments_from_prompt
import numpy as np

# Single FunctionCallingMetric instance; both compute_metrics (inside
# config_compute_metrics) and custom_evaluation call its .run(...) method.
fc_metric = FunctionCallingMetric()
def config_compute_metrics(tokenizer):
    """Build a ``compute_metrics`` callback bound to ``tokenizer``.

    Args:
        tokenizer: Object providing ``batch_decode`` and ``pad_token_id``; used to
            turn predicted and reference token ids back into text.

    Returns:
        A function that takes an ``EvalPrediction`` and returns
        ``{"fc_combine": fc_metric.run(decoded_predictions, decoded_labels)}``.
    """
    def compute_metrics(pred: EvalPrediction):
        # pred.predictions is a batch of logits; when it arrives as a tuple the
        # logits are taken from its first element.
        logits = pred.predictions[0] if isinstance(pred.predictions, tuple) else pred.predictions
        token_ids = np.argmax(logits, axis=-1)

        # pred.label_ids is a batch of token ids. Positions excluded from the loss
        # are marked with -100, which is not a valid token id and cannot be
        # decoded, so they are swapped for the pad token first.
        pad_token_id = tokenizer.pad_token_id
        if pad_token_id is None:
            # NOTE(review): fallback id when no pad token is configured -- confirm
            # that 0 is an acceptable filler for the tokenizer in use.
            pad_token_id = 0
        raw_labels = np.asarray(pred.label_ids)
        label_ids = np.where(raw_labels != -100, raw_labels, pad_token_id)

        predictions = tokenizer.batch_decode(token_ids, skip_special_tokens=False)
        labels = tokenizer.batch_decode(label_ids, skip_special_tokens=False)

        # Score the decoded texts with the shared metric instance.
        fc_result = fc_metric.run(predictions, labels)

        return {
            "fc_combine": fc_result
        }
    return compute_metrics

def custom_evaluation(eval_dataset, model, tokenizer):
    """Score every example of ``eval_dataset`` with the function-calling metric and perplexity.

    For each example, its ``"text"`` entry is passed to
    ``json_arguments_from_prompt`` (the result is scored with ``fc_metric.run``)
    and to ``get_logits_and_labels`` / ``compute_perplexity``. Failures are
    reported per example instead of being raised:

    * a failed function-calling evaluation counts as a score of ``0``;
    * a failed perplexity computation is printed as ``None`` for that example and
      is left out of the perplexity average, so it can neither crash the loop
      nor silently reuse the previous example's value.

    Args:
        eval_dataset: Iterable of mappings that have a ``"text"`` entry.
        model: Model to evaluate; ``model.eval()`` is called on it.
        tokenizer: Tokenizer forwarded to the helper functions.

    Returns:
        dict with two keys:
            ``"fc_combine"``: mean function-calling score over all examples
            (``0.0`` when there are no examples).
            ``"perplexity"``: mean perplexity over the examples for which it could
            be computed (``nan`` when there are none).
    """
    print("Starting custom evaluation.")
    model.eval()  # Set the model to evaluation mode

    fc_results = []
    perplexity_results = []
    for idx, example in enumerate(eval_dataset):
        print(f"Example {idx}: ", end="")
        post_message = ""

        # Custom Function Calling metric
        try:
            generated_arguments, expected_arguments = json_arguments_from_prompt(
                example["text"],
                model,
                tokenizer,
                INSTRUCTION
            )
            fc_score = fc_metric.run(generated_arguments, expected_arguments)
        except Exception as e:
            post_message += f"Error function calling: {e}\n"
            fc_score = 0
        fc_results.append(fc_score)

        # perplexity metric
        try:
            logits, labels = get_logits_and_labels(example["text"], model, tokenizer)
            # The last position of the logits is dropped before scoring against labels.
            perplexity = compute_perplexity(logits[..., :-1, :], labels).item()
            perplexity_results.append(perplexity)
        except Exception as e:
            post_message += f"Error perplexity: {e}\n"
            perplexity = None  # marks "not available" for this example

        # Report this example's own values only.
        print({"fc_combine": fc_score, "perplexity": perplexity})
        if post_message:
            print(post_message)

    return {
        "fc_combine": sum(fc_results) / len(fc_results) if fc_results else 0.0,
        "perplexity": (
            sum(perplexity_results) / len(perplexity_results)
            if perplexity_results
            else float("nan")
        ),
    }

In [5]:
# Hugging Face model id; passed to from_pretrained for both the model and the tokenizer
model_id = "meta-llama/Llama-2-7b-chat-hf"
# Directory passed as output_dir to TrainingArguments in the training-config cell
output_dir = "llama-7-chat-instruction-int4-fc-pipeline"

In [6]:
# base LLM model and tokenizer
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Flash Attention 2 switch; also selects the per-device train batch size in the
# training-config cell.
use_flash_attention = False

# 4-bit (nf4, double-quantized) bitsandbytes settings with bfloat16 compute.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Quantized base model; device_map="auto" leaves device placement to the library.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=bnb_config,
    use_flash_attention_2=use_flash_attention,
    use_cache=False,
)
model.config.pretraining_tp = 1

# Tokenizer: pad on the right and reuse the EOS token as the padding token.
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
# peft config on the model
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model

# LoRA config based on QLoRA paper
# LoRA hyperparameters (per the notebook's own note, based on the QLoRA paper).
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)

# Prepare the quantized model for k-bit training, then wrap it with the LoRA config.
model = get_peft_model(prepare_model_for_kbit_training(model), peft_config)


In [8]:
# training config
from transformers import TrainingArguments
from trl import SFTTrainer


# Max sequence length for the model and for packing of the dataset.
max_seq_length = 2048

args = TrainingArguments(
    output_dir=output_dir,
    # schedule
    num_train_epochs=1,
    learning_rate=2e-4,
    lr_scheduler_type="constant",
    warmup_ratio=0.03,
    # batching and memory
    per_device_train_batch_size=6 if use_flash_attention else 4,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    gradient_checkpointing=True,
    # optimizer and precision
    optim="paged_adamw_32bit",
    max_grad_norm=0.3,
    bf16=True,
    tf32=True,
    # logging and checkpoints
    logging_steps=10,
    save_strategy="epoch",
    disable_tqdm=True,  # disable tqdm since with packing its values are incorrect
)

trainer = SFTTrainer(
    model=model,
    args=args,
    tokenizer=tokenizer,
    peft_config=peft_config,
    train_dataset=dataset_dict["train"],
    # eval_dataset=dataset_dict["test"],
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    packing=True,
    compute_metrics=config_compute_metrics(tokenizer),
)

In [None]:
# Training and Evaluation Loop: train, run the custom evaluation on the test
# split, save the model, and print the results -- once per iteration.
num_epochs = 1
for epoch in range(num_epochs):
    # Run trainer.train(); how long one call trains is governed by the
    # TrainingArguments given to the trainer (num_train_epochs=1 in this notebook).
    # NOTE(review): presumably every call starts a new training run rather than
    # continuing the previous one -- confirm before raising num_epochs above 1.
    trainer.train()

    # Evaluate the model with the custom evaluation logic (per-example
    # function-calling score and perplexity on the test split)
    eval_result = custom_evaluation(dataset_dict["test"], model, tokenizer)

    # Save the model after each epoch (optional)
    trainer.save_model()

    # Log or print evaluation results
    print(f"Evaluation results for epoch {epoch}: {eval_result}")

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33msebastiansosa[0m. Use [1m`wandb login --relogin`[0m to force relogin


You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 0.8772, 'learning_rate': 0.0002, 'epoch': 0.1}
{'loss': 0.6309, 'learning_rate': 0.0002, 'epoch': 0.2}
{'train_runtime': 186.2506, 'train_samples_per_second': 4.333, 'train_steps_per_second': 0.542, 'train_loss': 0.7487607056444342, 'epoch': 0.22}
Starting custom evaluation.
Example 0: {'fc_combine': 0, 'perplexity': 3.9063055515289307}
Error function calling: 'NoneType' object has no attribute 'group'

Example 1: 

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


{'fc_combine': 0.3722313642501831, 'perplexity': 4.183399677276611}
Example 2: {'fc_combine': 0.4100513756275177, 'perplexity': 3.222144842147827}
Example 3: {'fc_combine': 0.6185612181822459, 'perplexity': 3.5680434703826904}
Example 4: {'fc_combine': 0, 'perplexity': 3.9063055515289307}
Error function calling: division by zero

Example 5: {'fc_combine': 0, 'perplexity': 3.9063055515289307}
Error function calling: division by zero

Example 6: {'fc_combine': 0, 'perplexity': 4.838942527770996}
Error function calling: 'NoneType' object has no attribute 'group'

Example 7: {'fc_combine': 0, 'perplexity': 4.259296417236328}
Error function calling: Expecting value: line 1 column 1 (char 0)

Example 8: {'fc_combine': 0, 'perplexity': 4.570363521575928}
Error function calling: 'NoneType' object has no attribute 'group'

Example 9: {'fc_combine': 0, 'perplexity': 4.147039890289307}
Error function calling: Expecting value: line 1 column 1 (char 0)

Example 10: {'fc_combine': 0.4462427298227946