This notebook provides a method to compare the similarity of the Json output between Llama 3, Llama 3 - one shot, GPT 3.5, and my model while dealing with API endpoints IE (information extraction) task

# Load test dataset

In [1]:
# use the same test/eval data while in training
from datasets import load_dataset

dataset = load_dataset('billyfin/doc2json')
# delete the last line for future one-shot test
one_shot_example = dataset['train'][166]
dataset = dataset.filter(lambda example, idx: idx != 166, with_indices=True)
dataset = dataset["train"].train_test_split(test_size=0.2, seed=42)
test_dataset = dataset['test']

In [2]:
print(test_dataset['json_form'][0])
print(test_dataset['text_content'][0])

{
    "title": "MyIP.com JSON API Documentation",
    "endpoints": [
        {
            "name": "Get IP Information",
            "description": "Retrieves information about the IP address making the request.",
            "method": "GET",
            "url": "https://api.myip.com",
            "headers": [],
            "required_parameters": [],
            "optional_parameters": []
        }
    ]
}
JSON API | MyIP.com JSON API Contact JSON API You can make automated requests to the site using the API . Access URL: https://api.myip.com Response example: {"ip":"66.249.75.9","country":"United States","cc":"US"} Response elements: ip: IP address country: IP country location in English language cc: Two-letter country code in ISO 3166-1 alpha-2 format If there is no location data for an IP address cc will return "XX" and country "Unknown". Is this a free service? Yes. What are the API usage limits? There is no request limit, the only restriction is the server capacity which I will try 

# Define the evaluation method for structure similarity

In [9]:
def compare_and_score_json(json1, json2, path=""):
    """ Recursively compares two nested JSON objects and calculates similarity score. """
    total_score = 0
    max_score = 0

    if isinstance(json1, dict) and isinstance(json2, dict):
        keys1 = set(json1.keys())
        keys2 = set(json2.keys())
        all_keys = keys1.union(keys2)

        max_score += len(all_keys)

        for key in all_keys:
            if key in json1 and key in json2:
                # Increment score for matching keys
                sub_score, sub_max = compare_and_score_json(json1[key], json2[key], path + f".{key}")
                total_score += sub_score
                max_score += sub_max - 1  # Adjust because the key itself is counted in max_score
            elif key in json1 or key in json2:
                # Key missing in one of the JSONs
                continue
    elif isinstance(json1, list) and isinstance(json2, list):
        # Simple comparison of list lengths; could be expanded to compare elements
        min_len = min(len(json1), len(json2))
        max_len = max(len(json1), len(json2))
        total_score += min_len
        max_score += max_len
    # else:
    #     # Compare scalar values
    #     max_score += 1
    #     if json1 == json2:
    #         total_score += 1
    #     elif type(json1) == type(json2):
    #         # If types match but not values, give partial credit
    #         total_score += 0.5

    return total_score, max_score

# Example JSON structures
json1 = {
    "name": "Get IP Information",
    "description": "Retrieve IP address information",
    "details": {
        "method": "GET",
        "url": "https://api.myip.com"
    }
}

json2 = {
    "name": "Get IP Information",
    "description": "Get the IP address, country, and two-letter country code.",
    "details": {
        "method": "POST",  # Note the difference here
        "url": "https://api.myip.com"
    }
}

total_score, max_score = compare_and_score_json(json1, json2)
similarity_score = total_score / max_score if max_score != 0 else 0
print(f"Similarity Score: {similarity_score:.2f}")


Similarity Score: 0.75


# Define the evaluation method for Json keys' values

In [None]:
def value_similarity(json1, json2):
    if isinstance(json1, dict) and isinstance(json2, dict):
        common_keys = set(json1.keys()) & set(json2.keys())
        total_similarity = sum(value_similarity(json1[k], json2[k]) for k in common_keys)
        max_similarity = len(json1.keys()) + len(json2.keys()) - len(common_keys) # adjust for unmatched keys
        return total_similarity / max_similarity if max_similarity else 1.0
    else:
        return 0

def json_similarity_value(file1, file2):
    with open(file1, 'r') as f1, open(file2, 'r') as f2:
        json1 = json.load(f1)
        json2 = json.load(f2)
        return value_similarity(json1, json2)


In [8]:
def compare_strings(str1, str2):
    """ Simple comparison for exact matches or partial based on common words. """
    if str1 == str2:
        return 1.0
    common_words = set(str1.split()) & set(str2.split())
    return len(common_words) / max(len(str1.split()), len(str2.split()))

def compare_responses(resp1, resp2):
    """ Compare response elements by checking all keys and descriptions. """
    keys1 = set(resp1.keys())
    keys2 = set(resp2.keys())
    if keys1 != keys2:
        return 0.5  # Partial score if keys do not match exactly
    
    # Full match of descriptions
    for key in keys1:
        if resp1[key] != resp2[key]:
            return 0.75  # Slightly higher if only descriptions differ
    return 1.0

def compute(json, truth):
    title_score = compare_strings(truth['title'], json['title']) * 0.05
    
    description_score = compare_strings(truth['description'], test_json['description']) * 0.15
    method_score = (1 if ground_truth['method'] == test_json['method'] else 0) * 0.20
    url_score = (1 if ground_truth['url'] == test_json['url'] else 0) * 0.20
    response_score = compare_responses(ground_truth['response_elements'], test_response_elements) * 0.30
    
    total_score = name_score + description_score + method_score + url_score + response_score
    total_score


(0.7214285714285713, 0.7214285714285713)

# Preparation

In [3]:
from transformers import AutoModelForCausalLM, AutoModel, AutoTokenizer, default_data_collator, get_linear_schedule_with_warmup, BitsAndBytesConfig
from huggingface_hub import notebook_login
from peft import get_peft_config, get_peft_model, PromptTuningInit, PromptTuningConfig, TaskType, PeftType, PeftModel, PeftConfig
from torch.utils.data import DataLoader
from tqdm import tqdm
import torch
import transformers

torch.manual_seed(42)
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Llama 3

In [12]:
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"quantization_config": quantization_config},
    device_map="auto",
)

terminators = [
    pipeline.tokenizer.eos_token_id,
    pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [48]:
import json

count = 1
for test_sample in test_dataset['text_content']:
    messages = [
        {"role": "system", "content": "You will be given an API documentation. Extract the endpoints and output in JSON format."},
        {"role": "user", "content": "API text content: " + test_sample + "\n\nJson: "},
    ]
    outputs = pipeline(
        messages,
        max_new_tokens=1024,
        eos_token_id=terminators,
        do_sample=True,
        temperature=0.1,
        return_full_text=False,
    )
    
    result = outputs[0]["generated_text"]
    with open("./model_outputs/llama3/" + str(count) + ".txt", 'w') as file:
        file.write(result)
    
    count+=1

Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for

# Llama 3 - one shot

In [None]:
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"quantization_config": quantization_config},
    device_map="auto",
)

terminators = [
    pipeline.tokenizer.eos_token_id,
    pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

In [49]:
import json

count = 1
for test_sample in test_dataset['text_content']:
    messages = [
        {"role": "user", "content": "You will be given an API documentation. Extract the endpoints and output in JSON format.\n\nAPI text content: " + one_shot_example['text_content'] + "\n\nJson: "},
        {"role": "assistant", "content": one_shot_example['json_form']},
        {"role": "user", "content": "You will be given an API documentation. Extract the endpoints and output in JSON format.\n\nAPI text content: " + test_sample + "\n\nJson: "},
    ]
    outputs = pipeline(
        messages,
        max_new_tokens=1024,
        eos_token_id=terminators,
        do_sample=True,
        temperature=0.1,
        return_full_text=False,
    )
    
    result = outputs[0]["generated_text"]
    with open("./model_outputs/llama3_one_shot/" + str(count) + ".txt", 'w') as file:
        file.write(result)
    count+=1

Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for

# GPT3.5 - one shot

In [50]:
from openai import OpenAI
OPENAI_API_KEY = str(input('Please type in your api key: '))

count = 1
client = OpenAI(api_key=OPENAI_API_KEY)
for test_sample in test_dataset['text_content']:
    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        # model="gpt-4-turbo",
        messages=[
            {"role": "user", "content": "You will be given an API documentation. Extract the endpoints and output in JSON format.\n\nAPI text content: " + one_shot_example['text_content'] + "\n\nJson: "},
            {"role": "assistant", "content": one_shot_example['json_form']},
            {"role": "user", "content": "You will be given an API documentation. Extract the endpoints and output in JSON format.\n\nAPI text content: " + test_sample + "\n\nJson: "},
        ],
        temperature=0,
    )
    result = str(completion.choices[0].message.content)
    with open("./model_outputs/gpt3.5_one_shot/" + str(count) + ".txt", 'w') as file:
        file.write(result)
    count+=1

Please type in your api key:  sk-None-LBwUJe7KgakZQCd1sFS2T3BlbkFJGZlBKtOqC13W19K504OG


# My model

In [4]:
import torch

if torch.cuda.is_available():
    device = torch.device("cuda")
    print("GPU is available")
    current_device = torch.cuda.current_device()
    device_name = torch.cuda.get_device_name(current_device)
    print("Current CUDA Device:", device_name)
else:
    device = torch.device("cpu")
    print("GPU not available, using CPU instead")

GPU is available
Current CUDA Device: NVIDIA L40


In [5]:
peft_model_id = "billyfin/llama_3_prompt_tuning_api2json_v4"
config = PeftConfig.from_pretrained(peft_model_id)
model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path,
                                             quantization_config=quantization_config,
                                             low_cpu_mem_usage=True,
                                            )
model = PeftModel.from_pretrained(model, peft_model_id)

adapter_config.json:   0%|          | 0.00/585 [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/328k [00:00<?, ?B/s]

In [6]:
tokenizer = AutoTokenizer.from_pretrained('meta-llama/Meta-Llama-3-8B-Instruct')

# set pad_token_id equal to the eos_token_id if not set
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
max_length = 10240

def format(example):
    input_messages = [
        {"role":"user", "content": one_shot_example['text_content']},
        {"role":"assistant", "content": one_shot_example['json_form']},
        {"role":"user", "content": example},
    ]
    example = tokenizer.apply_chat_template(input_messages, tokenize=False) + "<|start_header_id|>assistant<|end_header_id|>\n\n"
    return example
    
def preprocess_for_inference(examples):
    inputs = f"{examples}"
    
    model_inputs = tokenizer(inputs)
    model_inputs['input_ids'] += [tokenizer.pad_token_id]
    model_inputs["attention_mask"] = [1] * len(model_inputs["input_ids"])
    
    sample_input_ids = model_inputs["input_ids"]
    model_inputs["input_ids"] = [tokenizer.pad_token_id] * (
        max_length - len(sample_input_ids)
    ) + sample_input_ids
    model_inputs["attention_mask"] = [0] * (max_length - len(sample_input_ids)) + model_inputs[
        "attention_mask"
    ]
    model_inputs["input_ids"] = torch.tensor(model_inputs["input_ids"][:max_length])
    model_inputs["attention_mask"] = torch.tensor(model_inputs["attention_mask"][:max_length])
    return model_inputs

In [8]:
count = 1
for test_sample in test_dataset['text_content']:
    test_sample = format(test_sample)
    test_input = preprocess_for_inference(test_sample)
    inputs = {k: v.unsqueeze(0).to(device) for k, v in test_input.items()}
    prompt = inputs['input_ids'].shape[1]
    
    model.eval()
    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs["input_ids"], 
            attention_mask=inputs["attention_mask"],
            max_new_tokens=1024,
            temperature=0.1
        )
    
    result = tokenizer.decode(outputs[0, prompt:], skip_special_tokens=True)
    with open("./model_outputs/my_model/" + str(count) + ".txt", 'w') as file:
        file.write(result)
    count+=1

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (8192). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for o

In [13]:
print(len(test_dataset[24]['text_content']))

28106


# Evaluation