This notebook provides a method for comparing the similarity of the JSON output of Llama 3, Llama 3 (one-shot), GPT-3.5, and my model on an information extraction (IE) task over API endpoint documentation.

# Load test dataset

In [1]:
from huggingface_hub import notebook_login

# Show the Hugging Face login widget; presumably required for the dataset and
# meta-llama checkpoints loaded in later cells (confirm access requirements).
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
# use the same test/eval data while in training
from datasets import load_dataset

# Row reserved as the one-shot demonstration (the original note describes it as
# the last row); it is removed before splitting so it can never land in the test set.
ONE_SHOT_INDEX = 166

raw_splits = load_dataset('billyfin/APIdoc2json')
one_shot_example = raw_splits['train'][ONE_SHOT_INDEX]

# Drop the reserved row, then split with the same fixed seed used during training.
remaining_splits = raw_splits.filter(
    lambda example, idx: idx != ONE_SHOT_INDEX,
    with_indices=True,
)
dataset = remaining_splits["train"].train_test_split(test_size=0.2, seed=42)
test_dataset = dataset['test']

In [3]:
# Sanity check: print the reference JSON and then the raw documentation text
# of the first test sample.
first_sample_index = 0
for column in ('json_form', 'text_content'):
    print(test_dataset[column][first_sample_index])

{
    "title": "MyIP.com JSON API Documentation",
    "endpoints": [
        {
            "name": "Get IP Information",
            "description": "Retrieves information about the IP address making the request.",
            "method": "GET",
            "url": "https://api.myip.com",
            "headers": [],
            "required_parameters": [],
            "optional_parameters": []
        }
    ]
}
JSON API | MyIP.com JSON API Contact JSON API You can make automated requests to the site using the API . Access URL: https://api.myip.com Response example: {"ip":"66.249.75.9","country":"United States","cc":"US"} Response elements: ip: IP address country: IP country location in English language cc: Two-letter country code in ISO 3166-1 alpha-2 format If there is no location data for an IP address cc will return "XX" and country "Unknown". Is this a free service? Yes. What are the API usage limits? There is no request limit, the only restriction is the server capacity which I will try 

# Preparation

In [4]:
from transformers import AutoModelForCausalLM, AutoModel, AutoTokenizer, default_data_collator, get_linear_schedule_with_warmup, BitsAndBytesConfig
from huggingface_hub import notebook_login
from peft import get_peft_config, get_peft_model, PromptTuningInit, PromptTuningConfig, TaskType, PeftType, PeftModel, PeftConfig
from torch.utils.data import DataLoader
from tqdm import tqdm
import torch
import transformers

# Seed torch's RNG; the baseline pipelines below generate with do_sample=True.
torch.manual_seed(42)
# Quantization settings passed to every locally loaded model in this notebook:
# 4-bit NF4 weights with double quantization and bfloat16 as the compute dtype.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Llama 3 outputs

In [12]:
# Zero-shot baseline: stock Llama 3 8B Instruct, loaded with the shared 4-bit settings.
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    device_map="auto",
    model_kwargs={"quantization_config": quantization_config},
)

# Generation stops on either the tokenizer's EOS token or the <|eot_id|> token.
llama3_tokenizer = pipeline.tokenizer
eot_token_id = llama3_tokenizer.convert_tokens_to_ids("<|eot_id|>")
terminators = [llama3_tokenizer.eos_token_id, eot_token_id]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [48]:
import json

# Zero-shot generation: one output file per test document, numbered from 1 in
# test-set order (the evaluation cells rely on that numbering).
import os

llama3_output_dir = "./model_outputs/llama3/"
# Create the folder up front so the first write cannot fail on a fresh checkout.
os.makedirs(llama3_output_dir, exist_ok=True)

for count, test_sample in enumerate(test_dataset['text_content'], start=1):
    messages = [
        {"role": "system", "content": "You will be given an API documentation. Extract the endpoints and output in JSON format."},
        {"role": "user", "content": "API text content: " + test_sample + "\n\nJson: "},
    ]
    outputs = pipeline(
        messages,
        max_new_tokens=1024,
        eos_token_id=terminators,
        do_sample=True,
        temperature=0.1,
        return_full_text=False,
    )

    result = outputs[0]["generated_text"]
    # Write as UTF-8: the evaluation cells read these files with encoding='utf-8',
    # and the platform default encoding may not represent every generated character.
    with open(llama3_output_dir + str(count) + ".txt", 'w', encoding='utf-8') as file:
        file.write(result)

Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for

# Llama 3 - one shot outputs

In [None]:
# Same Llama 3 8B Instruct pipeline as the zero-shot section, re-created so this
# section can be run on its own.
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    device_map="auto",
    model_kwargs={"quantization_config": quantization_config},
)

# Generation stops on either the tokenizer's EOS token or the <|eot_id|> token.
llama3_tokenizer = pipeline.tokenizer
eot_token_id = llama3_tokenizer.convert_tokens_to_ids("<|eot_id|>")
terminators = [llama3_tokenizer.eos_token_id, eot_token_id]

In [49]:
import json

# One-shot generation: the held-out example is replayed as a previous
# user/assistant exchange before the real request. One numbered file per test
# document, in test-set order.
import os

llama3_one_shot_output_dir = "./model_outputs/llama3_one_shot/"
# Create the folder up front so the first write cannot fail on a fresh checkout.
os.makedirs(llama3_one_shot_output_dir, exist_ok=True)

# Shared prefix of both user turns; the document text and "\n\nJson: " are appended.
one_shot_instruction = "You will be given an API documentation. Extract the endpoints and output in JSON format.\n\nAPI text content: "

for count, test_sample in enumerate(test_dataset['text_content'], start=1):
    messages = [
        {"role": "user", "content": one_shot_instruction + one_shot_example['text_content'] + "\n\nJson: "},
        {"role": "assistant", "content": one_shot_example['json_form']},
        {"role": "user", "content": one_shot_instruction + test_sample + "\n\nJson: "},
    ]
    outputs = pipeline(
        messages,
        max_new_tokens=1024,
        eos_token_id=terminators,
        do_sample=True,
        temperature=0.1,
        return_full_text=False,
    )

    result = outputs[0]["generated_text"]
    # Write as UTF-8: the evaluation cells read these files with encoding='utf-8',
    # and the platform default encoding may not represent every generated character.
    with open(llama3_one_shot_output_dir + str(count) + ".txt", 'w', encoding='utf-8') as file:
        file.write(result)

Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for

# GPT3.5 - one shot outputs

In [50]:
from openai import OpenAI
# GPT-3.5 one-shot generation: same prompt layout as the Llama 3 one-shot run,
# one numbered output file per test document.
import os
from getpass import getpass

# getpass does not echo what is typed; input() would store the key in the
# notebook's saved cell output.
OPENAI_API_KEY = getpass('Please type in your api key: ')

gpt35_output_dir = "./model_outputs/gpt3.5_one_shot/"
# Create the folder up front so the first write cannot fail on a fresh checkout.
os.makedirs(gpt35_output_dir, exist_ok=True)

# Shared prefix of both user turns; the document text and "\n\nJson: " are appended.
one_shot_instruction = "You will be given an API documentation. Extract the endpoints and output in JSON format.\n\nAPI text content: "

client = OpenAI(api_key=OPENAI_API_KEY)
for count, test_sample in enumerate(test_dataset['text_content'], start=1):
    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "user", "content": one_shot_instruction + one_shot_example['text_content'] + "\n\nJson: "},
            {"role": "assistant", "content": one_shot_example['json_form']},
            {"role": "user", "content": one_shot_instruction + test_sample + "\n\nJson: "},
        ],
        temperature=0,
    )
    result = str(completion.choices[0].message.content)
    # Write as UTF-8: the evaluation cells read these files with encoding='utf-8'.
    with open(gpt35_output_dir + str(count) + ".txt", 'w', encoding='utf-8') as file:
        file.write(result)

Please type in your api key:  [REDACTED - an API key was stored here in plain text; revoke it and clear this output before sharing]


# GPT3.5 + pydantic schema - one shot outputs

In [6]:
from langchain_core.pydantic_v1 import BaseModel, Field, validator
from typing import Optional, Any, Union, List

# Schema for a single endpoint parameter; Endpoint uses it for the items of
# `required_parameters` and `optional_parameters`.
# NOTE(review): documented with `#` comments on purpose - this class is handed to
# `with_structured_output`, and a class docstring would presumably be copied into
# the generated schema description; confirm before converting to a docstring.
class Parameters(BaseModel):
    name: str = Field(description="Name of the parameter")
    type: str = Field(description="Type of the parameter")
    description: str = Field(description="Description of the parameter")
    # `default` and `example` accept any JSON value.
    default: Optional[Any] = Field(
        description="Default value of the parameter")
    example: Optional[Any] = Field(
        description="Example value of the parameter")


# Schema for one extracted API endpoint.
# NOTE(review): documented with `#` comments on purpose - this class is part of
# the schema handed to `with_structured_output`; a docstring would presumably
# change the generated schema description.
class Endpoint(BaseModel):
    name: str = Field(description="Name of the endpoint")
    description: Optional[str] = Field(
        description="Description of the endpoint")
    method: str = Field(description="Method of the endpoint")
    # A single URL string or a list of URL strings.
    url: Union[str, List[str]] = Field(description="URL of the endpoint")
    headers: Optional[list] = Field(
        default=[], description="Headers of the endpoint")
    required_parameters: list[Parameters]
    optional_parameters: Optional[list[Parameters]]


# Top-level schema of the structured output: an optional title plus the list of
# extracted endpoints. Passed to `llm.with_structured_output(...)` below.
# NOTE(review): kept as `#` comments; a class docstring would presumably be sent
# to the model as the function description.
class Api_json(BaseModel):
    title: Optional[str] = Field(description="Title of the API")
    endpoints: list[Endpoint]

In [27]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import MessagesPlaceholder, ChatPromptTemplate

# GPT-3.5 with a pydantic schema (function calling): the model's answer is
# parsed into an Api_json instance and written as indented JSON, one numbered
# file per test document.
import os
from getpass import getpass

# getpass does not echo what is typed; input() would store the key in the
# notebook's saved cell output.
OPENAI_API_KEY = getpass('Please type in your api key: ')
llm = ChatOpenAI(model='gpt-3.5-turbo', temperature=0,
                 openai_api_key=OPENAI_API_KEY,
                 max_tokens=None,
                )

structured_output_dir = "./model_outputs/gpt3.5+pydantic_schema_one_shot/"
# Create the folder up front so the first write cannot fail on a fresh checkout.
os.makedirs(structured_output_dir, exist_ok=True)

# The prompt, the chain and the one-shot history do not depend on the test
# sample, so they are built once instead of on every iteration.
prompt_template = ChatPromptTemplate.from_messages([
    MessagesPlaceholder("history"),
    ("human",
     "You will be given an API documentation. Extract the endpoints and output in JSON format.\n\nAPI text content: {api_doc} \n\nJson: "),
])
chain = prompt_template | llm.with_structured_output(
    Api_json, method="function_calling")
one_shot_history = [
    ("human", "You will be given an API documentation. Extract the endpoints and output in JSON format.\n\nAPI text content: " +
     one_shot_example['text_content'] + "\n\nJson: "),
    ("ai", one_shot_example['json_form']),
]

for count, test_sample in enumerate(test_dataset['text_content'], start=1):
    result = chain.invoke(
        {"history": one_shot_history,
         "api_doc": test_sample
        }
    )
    # Do not call this variable `json`: that would shadow the json module that
    # the evaluation cells use.
    json_text = result.json(indent=4)
    # Write as UTF-8: the evaluation cells read these files with encoding='utf-8'.
    with open(structured_output_dir + str(count) + ".txt", 'w', encoding='utf-8') as file:
        file.write(json_text)

{
    "title": "MyIP.com JSON API",
    "endpoints": [
        {
            "name": "Get IP Information",
            "description": "Retrieve IP information including IP address, country, and country code.",
            "method": "GET",
            "url": "https://api.myip.com",
            "headers": [],
            "required_parameters": [],
            "optional_parameters": []
        }
    ]
}
{
    "title": "MeowFacts API Documentation",
    "endpoints": [
        {
            "name": "Get Random Cat Fact",
            "description": "Returns a random fact about cats.",
            "method": "GET",
            "url": "https://meowfacts.herokuapp.com/",
            "headers": [],
            "required_parameters": [],
            "optional_parameters": null
        },
        {
            "name": "Get Multiple Cat Facts",
            "description": "Returns multiple cat facts at a time.",
            "method": "GET",
            "url": "https://meowfacts.herokuapp.com/",
     

# APILlama outputs

In [None]:
# Choose the device that tokenized prompts are moved to before generation,
# and report which one was picked.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print("GPU is available")
    current_device = torch.cuda.current_device()
    device_name = torch.cuda.get_device_name(current_device)
    print("Current CUDA Device:", device_name)
else:
    print("GPU not available, using CPU instead")

In [5]:
# Load the base checkpoint named in the adapter's config (4-bit quantized),
# then attach the APILlama PEFT adapter on top of it.
peft_model_id = "billyfin/APILlama"
config = PeftConfig.from_pretrained(peft_model_id)
base_model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    low_cpu_mem_usage=True,
    quantization_config=quantization_config,
)
model = PeftModel.from_pretrained(base_model, peft_model_id)

adapter_config.json:   0%|          | 0.00/585 [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/328k [00:00<?, ?B/s]

In [6]:
tokenizer = AutoTokenizer.from_pretrained('meta-llama/Meta-Llama-3-8B-Instruct')

# Fall back to the EOS token for padding when the tokenizer defines no pad token.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

# NOTE(review): the generation loop in this section does not pass `terminators`
# to model.generate - confirm whether that is intended.
eot_token_id = tokenizer.convert_tokens_to_ids("<|eot_id|>")
terminators = [tokenizer.eos_token_id, eot_token_id]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
# Fixed length (in tokens) that every prompt is left-padded or cut to.
# NOTE(review): the captured output of the generation cell warns that the
# model's predefined maximum length is 8192 - confirm 10240 is intended.
max_length = 10240

def format(example):
    """Render the one-shot chat prompt for one API document.

    The conversation is: the one-shot document as a user turn, its reference
    JSON as the assistant turn, then `example` as a new user turn. The chat
    template is applied without tokenizing, and an assistant header is appended
    so the model continues with its answer.

    Note: this name shadows the built-in `format` for the rest of the notebook.

    Args:
        example: raw text content of the API documentation to extract from.

    Returns:
        The prompt as a single string.
    """
    conversation = [
        {"role":"user", "content": one_shot_example['text_content']},
        {"role":"assistant", "content": one_shot_example['json_form']},
        {"role":"user", "content": example},
    ]
    rendered = tokenizer.apply_chat_template(conversation, tokenize=False)
    return rendered + "<|start_header_id|>assistant<|end_header_id|>\n\n"
    
def preprocess_for_inference(examples):
    """Tokenize one prompt and left-pad it to exactly `max_length` tokens.

    Steps, in order:
      1. tokenize the prompt string with the global `tokenizer`;
      2. append one pad token to the ids (it is attended to, mask value 1);
      3. prepend pad tokens (mask value 0) until the length is `max_length`;
      4. cut both sequences to their first `max_length` entries - so a prompt
         that is too long loses its tail, not its head;
      5. convert both sequences to tensors.

    NOTE(review): steps 2 and 4 look unusual for inference (an extra trailing
    pad token, and truncation that removes the end of the prompt) - confirm
    they match how the model was trained.

    Args:
        examples: the prompt text (anything `f"{...}"` can render).

    Returns:
        The tokenizer's output object, with `input_ids` and `attention_mask`
        replaced by 1-D tensors of length at most `max_length`.
    """
    encoded = tokenizer(f"{examples}")
    pad_id = tokenizer.pad_token_id

    token_ids = encoded["input_ids"] + [pad_id]
    mask = [1] * len(token_ids)

    # A negative count yields an empty padding list, i.e. no padding at all.
    pad_count = max_length - len(token_ids)
    token_ids = [pad_id] * pad_count + token_ids
    mask = [0] * pad_count + mask

    encoded["input_ids"] = torch.tensor(token_ids[:max_length])
    encoded["attention_mask"] = torch.tensor(mask[:max_length])
    return encoded

In [8]:
# APILlama generation: build the one-shot prompt, pad it, generate, and write
# only the newly generated tokens to one numbered file per test document.
import os

apillama_output_dir = "./model_outputs/apillama/"
# Create the folder up front so the first write cannot fail on a fresh checkout.
os.makedirs(apillama_output_dir, exist_ok=True)

# Switching to eval mode is loop-invariant, so do it once.
model.eval()

for count, test_sample in enumerate(test_dataset['text_content'], start=1):
    test_sample = format(test_sample)
    test_input = preprocess_for_inference(test_sample)
    # Add a batch dimension and move every tensor to the selected device.
    inputs = {k: v.unsqueeze(0).to(device) for k, v in test_input.items()}
    # Padded prompt length; everything after this index is newly generated.
    prompt = inputs['input_ids'].shape[1]

    # NOTE(review): `temperature` is passed without `do_sample`, and the
    # `terminators` list defined above is not passed as eos_token_id - confirm
    # both are intended before comparing against the sampled baselines.
    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=1024,
            temperature=0.1
        )

    result = tokenizer.decode(outputs[0, prompt:], skip_special_tokens=True)
    # Write as UTF-8: the evaluation cells read these files with encoding='utf-8'.
    with open(apillama_output_dir + str(count) + ".txt", 'w', encoding='utf-8') as file:
        file.write(result)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (8192). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for o

# APILlama_v2 outputs

In [10]:
# Choose the device that tokenized prompts are moved to before generation,
# and report which one was picked.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print("GPU is available")
    current_device = torch.cuda.current_device()
    device_name = torch.cuda.get_device_name(current_device)
    print("Current CUDA Device:", device_name)
else:
    print("GPU not available, using CPU instead")

GPU is available
Current CUDA Device: NVIDIA A40


In [5]:
# Load the base checkpoint named in the adapter's config (4-bit quantized),
# then attach the ApiLlama_v2 PEFT adapter on top of it.
peft_model_id = "billyfin/ApiLlama_v2"
config = PeftConfig.from_pretrained(peft_model_id)
base_model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    low_cpu_mem_usage=True,
    quantization_config=quantization_config,
)
model = PeftModel.from_pretrained(base_model, peft_model_id)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [7]:
tokenizer = AutoTokenizer.from_pretrained('meta-llama/Meta-Llama-3.1-8B-Instruct')

# Fall back to the EOS token for padding when the tokenizer defines no pad token.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

# NOTE(review): the generation loop in this section does not pass `terminators`
# to model.generate - confirm whether that is intended.
eot_token_id = tokenizer.convert_tokens_to_ids("<|eot_id|>")
terminators = [tokenizer.eos_token_id, eot_token_id]

In [8]:
# Fixed length (in tokens) that every prompt is left-padded or cut to.
max_length = 10240

def format(example):
    """Render the one-shot chat prompt for one API document.

    Same definition as in the APILlama section; it reads the global `tokenizer`
    at call time, so it uses whichever tokenizer was loaded last.

    The conversation is: the one-shot document as a user turn, its reference
    JSON as the assistant turn, then `example` as a new user turn. The chat
    template is applied without tokenizing, and an assistant header is appended
    so the model continues with its answer.

    Note: this name shadows the built-in `format` for the rest of the notebook.

    Args:
        example: raw text content of the API documentation to extract from.

    Returns:
        The prompt as a single string.
    """
    conversation = [
        {"role":"user", "content": one_shot_example['text_content']},
        {"role":"assistant", "content": one_shot_example['json_form']},
        {"role":"user", "content": example},
    ]
    rendered = tokenizer.apply_chat_template(conversation, tokenize=False)
    return rendered + "<|start_header_id|>assistant<|end_header_id|>\n\n"
    
def preprocess_for_inference(examples):
    """Tokenize one prompt and left-pad it to exactly `max_length` tokens.

    Same definition as in the APILlama section; it reads the global `tokenizer`
    and `max_length` at call time.

    Steps, in order:
      1. tokenize the prompt string;
      2. append one pad token to the ids (it is attended to, mask value 1);
      3. prepend pad tokens (mask value 0) until the length is `max_length`;
      4. cut both sequences to their first `max_length` entries - so a prompt
         that is too long loses its tail, not its head;
      5. convert both sequences to tensors.

    NOTE(review): steps 2 and 4 look unusual for inference - confirm they match
    how the model was trained.

    Args:
        examples: the prompt text (anything `f"{...}"` can render).

    Returns:
        The tokenizer's output object, with `input_ids` and `attention_mask`
        replaced by 1-D tensors of length at most `max_length`.
    """
    encoded = tokenizer(f"{examples}")
    pad_id = tokenizer.pad_token_id

    token_ids = encoded["input_ids"] + [pad_id]
    mask = [1] * len(token_ids)

    # A negative count yields an empty padding list, i.e. no padding at all.
    pad_count = max_length - len(token_ids)
    token_ids = [pad_id] * pad_count + token_ids
    mask = [0] * pad_count + mask

    encoded["input_ids"] = torch.tensor(token_ids[:max_length])
    encoded["attention_mask"] = torch.tensor(mask[:max_length])
    return encoded

In [11]:
# APILlama v2 generation: build the one-shot prompt, pad it, generate, and write
# only the newly generated tokens to one numbered file per test document.
import os

apillama_v2_output_dir = "./model_outputs/apillama_v2/"
# Create the folder up front so the first write cannot fail on a fresh checkout.
os.makedirs(apillama_v2_output_dir, exist_ok=True)

# Switching to eval mode is loop-invariant, so do it once.
model.eval()

for count, test_sample in enumerate(test_dataset['text_content'], start=1):
    test_sample = format(test_sample)
    test_input = preprocess_for_inference(test_sample)
    # Add a batch dimension and move every tensor to the selected device.
    inputs = {k: v.unsqueeze(0).to(device) for k, v in test_input.items()}
    # Padded prompt length; everything after this index is newly generated.
    prompt = inputs['input_ids'].shape[1]

    # NOTE(review): the `terminators` list defined above is not passed as
    # eos_token_id here - confirm that is intended.
    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=1024,
        )

    result = tokenizer.decode(outputs[0, prompt:], skip_special_tokens=True)
    # Write as UTF-8: the evaluation cells read these files with encoding='utf-8'.
    with open(apillama_v2_output_dir + str(count) + ".txt", 'w', encoding='utf-8') as file:
        file.write(result)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for

# Evaluation

In [12]:
import json

# Filter and extract json only
def clean_json(str):
    """Cut the JSON-looking part out of a model response.

    If the stripped text contains a '{', the result runs from the first '{' to
    the last '}'. Otherwise it runs from the first '[' to the last ']'. The
    result is not parsed here, so it may still be invalid JSON.

    NOTE(review): because '{' always wins, a top-level array of objects such as
    '[{...}, {...}]' is returned without its outer brackets.

    Args:
        str: raw model output (the parameter name shadows the built-in `str`
            inside this function).

    Returns:
        The extracted substring.

    Raises:
        ValueError: if there is no '{' or '[' at all, or if the matching
            closing character is missing.
    """
    text = str.strip()

    open_at = text.find('{')
    if open_at != -1:
        close_at = text.rfind('}')
    else:
        # No object found: fall back to looking for an array.
        open_at = text.find('[')
        if open_at == -1:
            raise ValueError("No JSON object or array found in the text")
        close_at = text.rfind(']')

    if close_at == -1:
        raise ValueError("Incomplete JSON structure, no closing bracket found")

    return text[open_at:close_at + 1]

## Calculate the accuracy of the format

In [14]:
from pydantic import BaseModel, ValidationError, Field
from typing import Optional, Any, Union, List

# Evaluation-side schema for a single endpoint parameter; mirrors the class of
# the same name defined earlier for structured output, but is built on the
# `pydantic` package imported in this cell.
# NOTE(review): whether an `Optional[...] = Field(description=...)` field may be
# omitted depends on the installed pydantic major version - confirm, since it
# changes which outputs count as correctly formatted.
class Parameters(BaseModel):
    name: str = Field(description="Name of the parameter")
    type: str = Field(description="Type of the parameter")
    description: str = Field(description="Description of the parameter")
    # `default` and `example` accept any JSON value.
    default: Optional[Any] = Field(
        description="Default value of the parameter")
    example: Optional[Any] = Field(
        description="Example value of the parameter")

# Evaluation-side schema for one extracted API endpoint.
class Endpoint(BaseModel):
    name: str = Field(description="Name of the endpoint")
    description: Optional[str] = Field(
        description="Description of the endpoint")
    method: str = Field(description="Method of the endpoint")
    # A single URL string or a list of URL strings.
    url: Union[str, List[str]] = Field(description="URL of the endpoint")
    headers: Optional[list] = Field(
        default=[], description="Headers of the endpoint")
    required_parameters: list[Parameters]
    optional_parameters: Optional[list[Parameters]]

# Top-level schema that `check_format` validates parsed JSON against:
# an optional title plus the list of endpoints.
class Api_json(BaseModel):
    title: Optional[str] = Field(description="Title of the API")
    endpoints: list[Endpoint]

def check_format(generated):
    """Report whether parsed JSON conforms to the `Api_json` schema.

    Args:
        generated: the value returned by `json.loads` for a prediction or a
            reference. It is expected to be a dict, but parsed JSON can also be
            a list, string, number, bool or None.

    Returns:
        True if `Api_json(**generated)` validates, False otherwise. Values that
        are not dicts are reported as False instead of raising.
    """
    # `**generated` only works on mappings; a top-level JSON array or scalar
    # would otherwise raise TypeError and abort the whole evaluation loop.
    if not isinstance(generated, dict):
        return False
    try:
        Api_json(**generated)
    except (ValidationError, TypeError):
        return False
    return True
    


In [17]:
from sklearn.metrics import accuracy_score
import os

# Checks if the structure and format of the predicted JSON is corresponding to the one-shot example.
# For every prediction folder, each numbered output file is parsed and validated
# against the Api_json schema; the per-sample validity flags are then compared
# with the validity flags of the reference JSON via accuracy_score.
json_truths = test_dataset['json_form']
format_truth = []
format_prediction = []

# Per-sample validity flags kept for the URL evaluation further down.
gpt_format_result = []
my_model_format_result = []

directory = './model_outputs/'
# Keep only visible sub-directories: stray files or hidden folders (for example
# notebook checkpoints) contain no numbered prediction files and would make the
# open() below fail.
prediction_folders = [
    f for f in os.listdir(directory)
    if os.path.isdir(os.path.join(directory, f)) and not f.startswith('.')
]

for folder in prediction_folders:

    print("Evaluating folder: {}".format(folder))

    for i in range(len(json_truths)):
        with open(directory + folder + "/" + str(i + 1) + ".txt", 'r', encoding='utf-8') as file:
            content = file.read()

        # Validate the reference on its own, so that a broken prediction can
        # never cause a stale (or undefined) `truth` to be scored.
        try:
            truth = json.loads(json_truths[i])
            format_truth.append(check_format(truth))
        except ValueError:
            # The reference itself is not parseable JSON.
            format_truth.append(False)

        # A prediction that cannot be extracted or parsed counts as wrongly formatted.
        try:
            json_content = json.loads(clean_json(content))
        except Exception:
            format_prediction.append(False)
            continue

        format_prediction.append(check_format(json_content))

    # Calculate the accuracy
    if False in format_truth:
        print("Some of the truth JSONs are not in the correct format")
    accuracy = accuracy_score(format_truth, format_prediction)
    print("Accuracy of {} is: {}".format(folder, accuracy), end='\n\n')
    # Save the results
    if folder == "apillama":
        my_model_format_result = format_prediction
    elif folder == "gpt3.5+pydantic_schema_one_shot":
        gpt_format_result = format_prediction
    # Reset the lists (rebinding, so the saved results above are not cleared)
    format_truth = []
    format_prediction = []
    

Evaluating folder: apillama
Accuracy of apillama is: 0.2647058823529412

Evaluating folder: my_model
Accuracy of my_model is: 0.9411764705882353

Evaluating folder: llama3_one_shot
Accuracy of llama3_one_shot is: 0.11764705882352941

Evaluating folder: llama3
Accuracy of llama3 is: 0.0

Evaluating folder: gpt3.5_one_shot
Accuracy of gpt3.5_one_shot is: 0.5

Evaluating folder: gpt3.5+pydantic_schema_one_shot
Accuracy of gpt3.5+pydantic_schema_one_shot is: 1.0



## Calculate the precision, recall, and the F1 of matched urls

In [29]:
from sklearn.metrics import precision_score, recall_score, f1_score

def compute_precision_recall_f1(truth, generated):
    """Score predicted URLs against reference URLs as a set-matching problem.

    Every distinct URL seen in either input becomes one sample: its true label
    is 1 if it occurs in `truth`, its predicted label is 1 if it occurs in
    `generated`. Duplicates within either input are ignored.

    Args:
        truth: iterable of reference URLs (hashable values).
        generated: iterable of predicted URLs (hashable values).

    Returns:
        A `(precision, recall, f1)` tuple. All three are 0 when the
        corresponding ratio is undefined, and `(0.0, 0.0, 0.0)` when both
        inputs are empty.
    """
    # Sets give constant-time membership tests; the original list lookups made
    # the label construction quadratic.
    truth_set = set(truth)
    generated_set = set(generated)
    all_urls = list(truth_set | generated_set)

    # Nothing to score: avoid handing empty label arrays to scikit-learn.
    if not all_urls:
        return 0.0, 0.0, 0.0

    # Create a list of 1s and 0s for the ground truth and the generated URLs
    y_true = [1 if url in truth_set else 0 for url in all_urls]
    y_pred = [1 if url in generated_set else 0 for url in all_urls]

    # zero_division=0 on all three metrics, so an undefined score is reported
    # as 0 consistently instead of f1 alone emitting a warning.
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)

    return precision, recall, f1

In [6]:
from deepdiff import DeepSearch

def search(obj, key):
    """Collect the values stored under every 'url' key of a parsed JSON document.

    `DeepSearch` is asked for `key`; only matches whose path ends in "['url']"
    are kept, which filters out values that merely contain the search text.

    Args:
        obj: parsed JSON (nested dicts and lists).
        key: text passed to DeepSearch; the notebook always passes "url".

    Returns:
        A flat list of URL values in match order. A list-valued `url` field
        contributes each of its elements, so every returned item is hashable.
        Returns an empty list when nothing matched.
    """
    ds = DeepSearch(obj, key, verbose_level=2)
    urls = []
    # DeepSearch leaves out 'matched_paths' entirely when nothing matched, so
    # index access would raise KeyError (for example for a document with no endpoints).
    for path, value in ds.get('matched_paths', {}).items():
        if not path.endswith("['url']"):
            continue
        # The schema allows `url` to be a list of strings; flatten it so that
        # downstream set operations do not fail on an unhashable list.
        if isinstance(value, (list, tuple)):
            urls.extend(value)
        else:
            urls.append(value)
    return urls

# Only samples for which both systems produced schema-valid JSON are scored,
# so the two systems are compared on the same documents.
intersection = [
    i for i in range(len(json_truths))
    if my_model_format_result[i] and gpt_format_result[i]
]

item = "url"
for folder in ["apillama/", "gpt3.5+pydantic_schema_one_shot/"]:
    print("Evaluating folder: {}".format(folder))

    # URLs are pooled over all scored samples before computing the metrics.
    truth_urls, prediction_urls = [], []
    for i in intersection:
        truth = json.loads(json_truths[i])
        truth_urls += search(truth, item)

        prediction_path = directory + folder + str(i + 1) + ".txt"
        with open(prediction_path, 'r', encoding='utf-8') as file:
            content = file.read()
        prediction = json.loads(clean_json(content))
        prediction_urls += search(prediction, item)

    precision, recall, f1 = compute_precision_recall_f1(truth_urls, prediction_urls)

    print("Precision: ", precision)
    print("Recall: ", recall)
    print("F1: ", f1, end='\n\n')


Evaluating folder: my_model/
Precision:  0.6818181818181818
Recall:  0.4787234042553192
F1:  0.5625

Evaluating folder: gpt3.5+pydantic_schema_one_shot/
Precision:  0.746268656716418
Recall:  0.5319148936170213
F1:  0.6211180124223602



## Evaluate details

In [7]:
from sentence_transformers import SentenceTransformer

# Load the sentence-embedding model and move it to the GPU (`.cuda()` requires a
# CUDA device). trust_remote_code=True runs code shipped with the model repository.
# NOTE: this rebinds `model`, the name earlier cells use for the PEFT language model.
model = SentenceTransformer("infgrad/stella_en_400M_v5", trust_remote_code=True).cuda()

def semantic_similarity(generated, truth):
    """Return the embedding similarity between a generated and a reference text.

    Both texts are encoded with the module-level ``model`` and scored with
    ``model.similarity``.

    Args:
        generated: Text produced by the model under evaluation.
        truth: The corresponding ground-truth text.

    Returns:
        The similarity score as a plain Python number.
    """
    embeddings = model.encode([generated, truth])
    # Pairwise matrix of the two embeddings against themselves; entry [0][1]
    # is the generated-vs-truth score.
    pairwise = model.similarity(embeddings, embeddings)
    return pairwise[0][1].item()

  warn(


modules.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/397 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/174k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/51.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/892 [00:00<?, ?B/s]

configuration.py:   0%|          | 0.00/7.13k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/infgrad/stella_en_400M_v5:
- configuration.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling.py:   0%|          | 0.00/57.5k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/infgrad/stella_en_400M_v5:
- modeling.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A matching Triton is not available, some optimizations will not be enabled
Traceback (most recent call last):
  File "d:\Anaconda3\envs\summer_research\Lib\site-packages\xformers\__init__.py", line 57, in _is_triton_available
    import triton  # noqa
    ^^^^^^^^^^^^^
ModuleNotFoundError: No module named 'triton'


pytorch_model.bin:   0%|          | 0.00/1.74G [00:00<?, ?B/s]

Error while downloading from https://cdn-lfs-us-1.huggingface.co/repos/a1/74/a1748fc68813865c8a8b6aff794a679a253884a30b778c8a7c6be455549b3750/da57827f222d56c5ac1706bf8e25ade405abd87a5d2613ae6c53d756c363b6b5?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27pytorch_model.bin%3B+filename%3D%22pytorch_model.bin%22%3B&response-content-type=application%2Foctet-stream&Expires=1722219046&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcyMjIxOTA0Nn19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmh1Z2dpbmdmYWNlLmNvL3JlcG9zL2ExLzc0L2ExNzQ4ZmM2ODgxMzg2NWM4YThiNmFmZjc5NGE2NzlhMjUzODg0YTMwYjc3OGM4YTdjNmJlNDU1NTQ5YjM3NTAvZGE1NzgyN2YyMjJkNTZjNWFjMTcwNmJmOGUyNWFkZTQwNWFiZDg3YTVkMjYxM2FlNmM1M2Q3NTZjMzYzYjZiNT9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSomcmVzcG9uc2UtY29udGVudC10eXBlPSoifV19&Signature=vOdR9zyxEQcIjXerukpkBUgMMDvk3rDfo45SWwEyhBSRJ0e34bkjTdXxx5HR5ZeEjuoUTLcLuNINHR8RJiNx%7EvWPYg5TkQDSsO8Wr7OBMJouCDeEqMUcNeCdwkhC0FtJh5c3fkS7MHG9AwIxYH2riYa9exvs

pytorch_model.bin:  17%|#6        | 294M/1.74G [00:00<?, ?B/s]

Some weights of the model checkpoint at infgrad/stella_en_400M_v5 were not used when initializing NewModel: ['new.pooler.dense.bias', 'new.pooler.dense.weight']
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/186 [00:00<?, ?B/s]

2_Dense_1024/config.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/4.20M [00:00<?, ?B/s]

In [48]:
def _mean_or_nan(total, count):
    """Return ``total / count``, or NaN when ``count`` is zero (nothing to average)."""
    return total / count if count else float("nan")


def _score_parameters(truth_params, generated_params):
    """Score one endpoint's generated parameter list against the ground truth.

    Parameters are paired by their ``name`` field (first occurrence wins).

    Args:
        truth_params: Ground-truth parameter dicts with ``name``,
            ``description`` and ``type`` keys.
        generated_params: Generated parameter dicts with the same keys.
            A falsy value (``None`` or ``[]``) means no parameters.

    Returns:
        A tuple ``(precision, recall, description_similarity, type_accuracy)``.
        ``description_similarity`` is the mean similarity over name-matched
        parameters with a non-empty generated description, and
        ``type_accuracy`` the share of name-matched parameters with a
        non-empty generated type that equals the truth type; each is ``None``
        when there was nothing to compare.
    """
    truth_params = truth_params or []
    generated_params = generated_params or []
    truth_names = [param['name'] for param in truth_params]
    generated_names = [param['name'] for param in generated_params]

    if truth_names or generated_names:
        precision, recall, _f1 = compute_precision_recall_f1(truth_names, generated_names)
    else:
        # Nothing expected and nothing generated counts as a perfect match.
        precision, recall = 1, 1

    description_scores = []
    type_matches = []
    for name in truth_names:
        generated_param = next((param for param in generated_params if param['name'] == name), None)
        if generated_param is None:
            continue
        truth_param = next(param for param in truth_params if param['name'] == name)
        # Empty/None generated fields are skipped rather than scored.
        if generated_param['description']:
            description_scores.append(
                semantic_similarity(generated_param['description'], truth_param['description'])
            )
        if generated_param['type']:
            type_matches.append(generated_param['type'] == truth_param['type'])

    description_similarity = (
        sum(description_scores) / len(description_scores) if description_scores else None
    )
    type_accuracy = sum(type_matches) / len(type_matches) if type_matches else None
    return precision, recall, description_similarity, type_accuracy


for folder in ["apillama/", "gpt3.5+pydantic_schema_one_shot/"]:
    # reset the variables
    total_name_cosine = 0
    total_description_cosine = 0
    total_method_list = []
    total_req_precision = 0
    total_req_recall = 0
    total_opt_precision = 0
    total_opt_recall = 0
    total_req_description_cosine = 0
    total_opt_description_cosine = 0
    total_req_type_accuracy = 0
    total_opt_type_accuracy = 0
    # number of truth endpoints for which a prediction with the same URL exists
    num_endpoints = 0
    # number of matched endpoints that contributed a description / type score
    endpoint_req_description_size = 0
    endpoint_opt_description_size = 0
    endpoint_req_type_size = 0
    endpoint_opt_type_size = 0
    print("Evaluating folder: {}".format(folder))

    for i in intersection:
        truth = json.loads(json_truths[i])
        # Prediction files are numbered from 1, hence i + 1.
        with open(directory + folder + str(i + 1) + ".txt", 'r', encoding='utf-8') as file:
            content = file.read()
        prediction = json.loads(clean_json(content))
        prediction_endpoints = prediction['endpoints']

        for truth_endpoint in truth['endpoints']:
            # Endpoints are paired by exact URL; only paired endpoints are scored.
            matching_endpoint = next(
                (endpoint for endpoint in prediction_endpoints if endpoint['url'] == truth_endpoint['url']),
                None,
            )
            if not matching_endpoint:
                continue

            num_endpoints += 1
            total_name_cosine += semantic_similarity(matching_endpoint['name'], truth_endpoint['name'])
            total_description_cosine += semantic_similarity(
                matching_endpoint['description'], truth_endpoint['description']
            )
            total_method_list.append(matching_endpoint['method'] == truth_endpoint['method'])

            req_precision, req_recall, req_description_cosine, req_type_accuracy = _score_parameters(
                truth_endpoint['required_parameters'], matching_endpoint['required_parameters']
            )
            opt_precision, opt_recall, opt_description_cosine, opt_type_accuracy = _score_parameters(
                truth_endpoint['optional_parameters'], matching_endpoint['optional_parameters']
            )

            total_req_precision += req_precision
            total_req_recall += req_recall
            total_opt_precision += opt_precision
            total_opt_recall += opt_recall

            # Description/type scores are averaged only over the endpoints that
            # actually produced one, so each has its own denominator.
            if req_description_cosine is not None:
                total_req_description_cosine += req_description_cosine
                endpoint_req_description_size += 1
            if opt_description_cosine is not None:
                total_opt_description_cosine += opt_description_cosine
                endpoint_opt_description_size += 1
            if req_type_accuracy is not None:
                total_req_type_accuracy += req_type_accuracy
                endpoint_req_type_size += 1
            if opt_type_accuracy is not None:
                total_opt_type_accuracy += opt_type_accuracy
                endpoint_opt_type_size += 1

    # Averages become NaN (printed as "nan") instead of raising
    # ZeroDivisionError when a folder yields nothing to average.
    total_name_cosine = _mean_or_nan(total_name_cosine, num_endpoints)
    total_description_cosine = _mean_or_nan(total_description_cosine, num_endpoints)
    total_method_accuracy = _mean_or_nan(sum(total_method_list), num_endpoints)
    total_req_precision = _mean_or_nan(total_req_precision, num_endpoints)
    total_req_recall = _mean_or_nan(total_req_recall, num_endpoints)
    total_opt_precision = _mean_or_nan(total_opt_precision, num_endpoints)
    total_opt_recall = _mean_or_nan(total_opt_recall, num_endpoints)
    total_req_description_cosine = _mean_or_nan(total_req_description_cosine, endpoint_req_description_size)
    total_opt_description_cosine = _mean_or_nan(total_opt_description_cosine, endpoint_opt_description_size)
    total_req_type_accuracy = _mean_or_nan(total_req_type_accuracy, endpoint_req_type_size)
    total_opt_type_accuracy = _mean_or_nan(total_opt_type_accuracy, endpoint_opt_type_size)

    print("Name Cosine Similarity: %.2f" % total_name_cosine)
    print("Description Cosine Similarity: %.2f" % total_description_cosine)
    print("Method Accuracy: %.2f" % total_method_accuracy)
    print("Required Parameters Precision: %.2f" % total_req_precision)
    print("Required Parameters Recall: %.2f" % total_req_recall)
    print("Optional Parameters Precision: %.2f" % total_opt_precision)
    print("Optional Parameters Recall: %.2f" % total_opt_recall)
    print("Required Parameters Description Cosine Similarity: %.2f" % total_req_description_cosine)
    print("Optional Parameters Description Cosine Similarity: %.2f" % total_opt_description_cosine)
    print("Required Parameters Type Accuracy: %.2f" % total_req_type_accuracy)
    print("Optional Parameters Type Accuracy: %.2f" % total_opt_type_accuracy, end='\n\n')

Evaluating folder: my_model/
Name Cosine Similarity: 0.94
Description Cosine Similarity: 0.89
Method Accuracy: 1.00
Required Parameters Precision: 0.92
Required Parameters Recall: 0.92
Optional Parameters Precision: 0.84
Optional Parameters Recall: 0.82
Required Parameters Description Cosine Similarity: 0.91
Optional Parameters Description Cosine Similarity: 0.92
Required Parameters Type Accuracy: 1.00
Optional Parameters Type Accuracy: 0.93

Evaluating folder: gpt3.5+pydantic_schema_one_shot/
Name Cosine Similarity: 0.96
Description Cosine Similarity: 0.90
Method Accuracy: 0.96
Required Parameters Precision: 0.79
Required Parameters Recall: 0.80
Optional Parameters Precision: 0.80
Optional Parameters Recall: 0.78
Required Parameters Description Cosine Similarity: 0.91
Optional Parameters Description Cosine Similarity: 0.86
Required Parameters Type Accuracy: 0.83
Optional Parameters Type Accuracy: 0.99

