In [18]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [25]:
import os
import json
import typing
# Test-category names accepted by merge_results.
TestType = typing.Literal[
    'multiple_function',
    'simple',
    'parallel_function',
    'parallel_multiple_function',
    'relevance',
]

# Accumulators for the column names merge_results encounters in the
# answer, result and score frames (printed once all files are merged).
answer_keys, result_keys, score_keys = set(), set(), set()

def _split_qualifier(model: str):
    """Split a result-directory name into ``(base_model, qualifier)``.

    Recognised suffixes are '-FC-strict', '-BAML' and '-FC'; a name with none
    of them is returned unchanged with the qualifier 'bfcl'.
    """
    # Longest suffix first so '-FC-strict' can never be shadowed by a shorter one.
    for suffix, qualifier in (('-FC-strict', 'FC-strict'), ('-BAML', 'baml'), ('-FC', 'FC')):
        if model.endswith(suffix):
            return model[:-len(suffix)], qualifier
    return model, 'bfcl'


def merge_results(model: str, test: TestType):
    """Join one model's results, score entries and reference answers for a test category.

    Files are read relative to the current working directory:
      * ``result/{model}/gorilla_openfunctions_v1_test_{test}_result.json`` (JSON lines)
      * ``score/{model}/{test}_score.json`` (JSON lines; the first line is skipped)
      * ``data/possible_answer/gorilla_openfunctions_v1_test_{test}.json`` (JSON lines),
        unless the category name contains 'executable' or 'relevance', in which
        case a placeholder answer is generated for every result row.

    Side effects: updates the module-level ``answer_keys``, ``result_keys`` and
    ``score_keys`` sets with the column names seen, and prints progress lines.

    Args:
        model: result directory name, optionally suffixed with '-BAML', '-FC'
            or '-FC-strict'.
        test: test category name.

    Returns:
        A DataFrame with the merged rows plus the columns ``valid``,
        ``test_type``, ``qualifier``, ``model`` and ``test_group``; or ``None``
        when the result file or the score file does not exist.
    """
    result_path = f'result/{model}/gorilla_openfunctions_v1_test_{test}_result.json'
    score_path = f'score/{model}/{test}_score.json'

    if not os.path.exists(result_path):
        print(f'No result file found for {model} {test}')
        return None
    # Skip unscored runs the same way as missing results instead of raising
    # FileNotFoundError in the middle of the batch.
    if not os.path.exists(score_path):
        print(f'No score file found for {model} {test}')
        return None

    results = pd.read_json(result_path, lines=True)

    with open(score_path, 'r') as f:
        # Line 0 is skipped (presumably a summary header -- confirm against the
        # scorer's output format); blank lines are ignored rather than crashing
        # json.loads.
        content = [json.loads(line) for idx, line in enumerate(f) if idx > 0 and line.strip()]
    scores = pd.DataFrame(content)

    if 'executable' in test or 'relevance' in test:
        # No answer file is read for these categories; use a placeholder per row.
        answers = [{"id": f'{test}_{idx}', "answer": '<look-at-code>'} for idx in range(len(results))]
    else:
        with open(f'data/possible_answer/gorilla_openfunctions_v1_test_{test}.json', 'r') as f:
            answers = [json.loads(line) for line in f if line.strip()]
    answers = pd.DataFrame(answers)
    answer_keys.update(answers.columns)

    # Record the raw result columns before renaming 'error', which would
    # otherwise collide with the 'error' column coming from the score file.
    result_keys.update(results.columns)
    results = results.rename(columns={'error': 'baml_error'})

    print(score_path)
    print('Results:', test, len(scores))
    if scores.empty:
        # No score entries at all: every result row is left-only.
        merged = results.assign(_merge='left_only')
    else:
        # Score ids are 1-based integers; result ids look like '{test}_{0-based index}'.
        scores['id'] = scores['id'].apply(lambda x: f'{test}_{x - 1}')
        if 'prompt' in scores.columns:
            scores = scores.drop(columns=['prompt'])
        score_keys.update(scores.columns)
        merged = results.merge(scores, left_on='id', right_on='id', how='outer', indicator=True)
    merged = merged.merge(answers, left_on='id', right_on='id')

    # The score file is sparse: a row counts as valid when no score entry
    # matched its id.
    merged['valid'] = merged['_merge'] == 'left_only'
    merged = merged.drop(columns=['_merge'])
    merged['test_type'] = test

    base_model, qualifier = _split_qualifier(model)
    merged['qualifier'] = qualifier
    merged['model'] = base_model
    merged['test_group'] = 'exec' if 'executable' in test else 'ast'

    return merged

import shutil

# Result directories are named '{model}{qualifier suffix}'; '' is the un-suffixed run.
MODELS = ['gpt-3.5-turbo-0125', 'gpt-4o-2024-05-13', 'claude-3-haiku-20240307', 'gpt-4o-mini-2024-07-18', 'claude-3-5-sonnet-20240620', 'ollama-llama3.1']
QUALIFIER_SUFFIXES = ['-BAML', '-FC', '-FC-strict', '']
TEST_CATEGORIES = ['multiple_function', 'simple', 'parallel_function', 'parallel_multiple_function', 'relevance']

MERGED_PATH = 'score/merged.json'
# Where the merged file is published; machine-specific, adjust as needed.
PUBLIC_MERGED_PATH = '/Users/vbv/repos/baml-berkeley-benchmark/baml-berkeley-benchmark/public/merged.json'

dfs = []
for model in MODELS:
    for qualifier in QUALIFIER_SUFFIXES:
        for test in TEST_CATEGORIES:
            df = merge_results(f'{model}{qualifier}', test)
            if df is not None:
                dfs.append(df)

# pd.concat([]) fails with an opaque "No objects to concatenate"; say what is wrong.
if not dfs:
    raise ValueError('No result files found for any model/test combination; check the working directory')

# Stack
df = pd.concat(dfs)
df.to_json(MERGED_PATH, orient='records', lines=True)

print('Answer keys:', answer_keys)
print('Result keys:', result_keys)
print('Score keys:', score_keys)

# Publish a copy of the merged file; skipped on machines without the target directory.
public_dir = os.path.dirname(PUBLIC_MERGED_PATH)
if os.path.isdir(public_dir):
    shutil.copyfile(MERGED_PATH, PUBLIC_MERGED_PATH)
else:
    print(f'Skipping copy: {public_dir} does not exist')

score/gpt-3.5-turbo-0125-BAML/multiple_function_score.json
Results: multiple_function 11
score/gpt-3.5-turbo-0125-BAML/simple_score.json
Results: simple 21
score/gpt-3.5-turbo-0125-BAML/parallel_function_score.json
Results: parallel_function 19
score/gpt-3.5-turbo-0125-BAML/parallel_multiple_function_score.json
Results: parallel_multiple_function 22
score/gpt-3.5-turbo-0125-BAML/relevance_score.json
Results: relevance 43
score/gpt-3.5-turbo-0125-FC/multiple_function_score.json
Results: multiple_function 12
score/gpt-3.5-turbo-0125-FC/simple_score.json
Results: simple 44
score/gpt-3.5-turbo-0125-FC/parallel_function_score.json
Results: parallel_function 23
score/gpt-3.5-turbo-0125-FC/parallel_multiple_function_score.json
Results: parallel_multiple_function 32
score/gpt-3.5-turbo-0125-FC/relevance_score.json
Results: relevance 233
No result file found for gpt-3.5-turbo-0125-FC-strict multiple_function
No result file found for gpt-3.5-turbo-0125-FC-strict simple
No result file found for g

In [49]:
# Show the column names of the concatenated frame `df`.
df.columns

Index(['result', 'input_token_count', 'output_token_count', 'latency',
       'prompt', 'model_name', 'test_category', 'valid', 'error', 'error_type',
       'model_result_raw', 'model_result_decoded', 'possible_answer',
       'ground_truth', 'test_type', 'qualifier', 'model', 'test_group',
       'baml_error'],
      dtype='object')

In [58]:
import numpy as np

# mean + std of latency, group by (model, test_type, qualifier)

group = df.groupby(['model', 'test_type', 'qualifier'])

# Latency mean and standard deviation per (model, test_type, qualifier).
latency = group['latency'].agg(['mean', 'std']).reset_index()

# 'valid' is boolean, so its mean is the fraction of valid rows (the accuracy).
valid_count = group['valid'].mean().reset_index(name='Accuracy')

model_group = df.groupby(['test_group', 'model', 'qualifier'])

# Kept as a Series indexed by (test_group, model, qualifier); call
# .reset_index(name='Accuracy') to get a flat frame instead.
model_valid_count = model_group['valid'].mean()


# Function to create a radar chart
def radar_chart(df, model, ax):
    """Draw an accuracy radar chart with one trace per qualifier.

    Args:
        df: frame with 'test_type', 'qualifier' and 'Accuracy' columns, holding
            at most one row per (test_type, qualifier) pair.
        model: text used as the chart title.
        ax: axes to draw on (a polar projection is expected for a radar layout).

    Raises:
        ValueError: if a qualifier has several rows for the same test type.
    """
    # One spoke per test type, in order of first appearance.
    test_types = df['test_type'].unique()
    qualifiers = df['qualifier'].unique()

    num_vars = len(test_types)

    # Evenly spaced spoke angles; the first is repeated to close the polygon.
    angles = np.linspace(0, 2 * np.pi, num_vars, endpoint=False).tolist()
    angles += angles[:1]

    for qualifier in qualifiers:
        # Align this qualifier's accuracies with the spoke order. Taking the
        # rows in their stored order would put values on the wrong spoke when
        # the row order differs or a test type is missing (missing -> NaN).
        per_type = df.loc[df['qualifier'] == qualifier].set_index('test_type')['Accuracy']
        values = per_type.reindex(test_types).tolist()
        values += values[:1]  # close the polygon

        ax.plot(angles, values, linewidth=2, linestyle='solid', label=qualifier)
        ax.fill(angles, values, alpha=0.1)

    # Label each spoke with its test type (the closing angle is not a spoke).
    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(test_types)

    # Accuracy is a fraction.
    ax.set_ylim(0, 1)

    ax.legend(loc='upper right', bbox_to_anchor=(0.1, 0.1))

    ax.set_title(model, fontweight='bold')

# Create subplots

def plot(df):
    """Draw side-by-side radar charts for the two GPT models.

    `df` is filtered on its 'model' column; each subset is handed to
    radar_chart together with a polar axes and a display title.
    """
    # (value in the 'model' column, chart title), left panel first.
    panels = (
        ('gpt-3.5-turbo-0125', 'GPT-3.5-turbo-0125'),
        ('gpt-4o-2024-05-13', 'GPT-4o-2024-05-13'),
    )
    fig, axes = plt.subplots(1, 2, figsize=(20, 10), subplot_kw=dict(projection='polar'))

    for axis, (model_id, title) in zip(axes, panels):
        subset = df[df['model'] == model_id]
        radar_chart(subset, title, axis)

# Radar charts are currently disabled; uncomment to draw them:
# plot(valid_count)

# Disabled bar-chart alternative. It refers to 'model' and 'Accuracy' columns,
# so it presumably needs model_valid_count as a flat frame -- TODO confirm:
# model_valid_count.plot(kind='bar', x='model', y='Accuracy', figsize=(10, 5))
# plt.tight_layout()
# plt.show()

# Display the share of valid rows per (test_group, model, qualifier).
model_valid_count


test_group  model               qualifier
ast         gpt-3.5-turbo-0125  FC           0.812000
                                baml         0.882000
                                bfcl         0.773000
            gpt-4o-2024-05-13   FC           0.873000
                                baml         0.933000
                                bfcl         0.841000
exec        gpt-3.5-turbo-0125  FC           0.829167
                                baml         0.895833
                                bfcl         0.770833
            gpt-4o-2024-05-13   FC           0.875000
                                baml         0.900000
                                bfcl         0.800000
Name: valid, dtype: float64

In [16]:
# Load and display score/data.csv (path is relative to the working directory).
# NOTE(review): presumably the exported leaderboard table -- confirm provenance.
pd.read_csv('score/data.csv')

Unnamed: 0,Rank,Overall Acc,Model,Model Link,Organization,License,AST Summary,Exec Summary,Simple Function AST,Python Simple Function AST,...,Python Simple Function Exec,REST Simple Function Exec,Multiple Functions Exec,Parallel Functions Exec,Parallel Multiple Exec,Relevance Detection,Cost ($ Per 1k Function Calls),Latency Mean (s),Latency Standard Deviation (s),Latency 95th Percentile (s)
0,1,92.66%,GPT-4o-2024-05-13 (BAML),https://www.boundaryml.com,OpenAI + Boundary,Proprietary,92.94%,87.00%,94.75%,94.75%,...,99.00%,0.00%,92.00%,82.00%,75.00%,0.00%,2.23,1.7,1.29,3.55
1,2,88.47%,GPT-3.5-Turbo-0125 (BAML),https://www.boundaryml.com,OpenAI + Boundary,Proprietary + Apache 2.0,86.69%,86.37%,94.25%,94.25%,...,100.00%,0.00%,88.00%,80.00%,77.50%,0.00%,0.21,1.15,0.68,2.61
2,3,87.34%,GPT-4o-2024-05-13 (FC),https://openai.com/index/hello-gpt-4o/,OpenAI,Proprietary,87.44%,85.00%,86.75%,86.75%,...,95.00%,0.00%,86.00%,84.00%,75.00%,0.00%,1.86,1.46,1.01,3.14
3,4,83.31%,GPT-4o-2024-05-13 (Prompt),https://openai.com/index/hello-gpt-4o/,OpenAI,Proprietary,81.62%,76.50%,94.00%,94.00%,...,92.00%,0.00%,80.00%,64.00%,70.00%,0.00%,2.73,1.01,0.61,2.09
4,5,81.53%,GPT-3.5-Turbo-0125 (FC),https://platform.openai.com/docs/models/gpt-3-...,OpenAI,Proprietary,82.50%,79.62%,76.00%,76.00%,...,92.00%,0.00%,86.00%,78.00%,62.50%,0.00%,0.19,1.07,0.52,1.96
5,6,77.26%,GPT-3.5-Turbo-0125 (Prompting),https://platform.openai.com/docs/models/gpt-3-...,OpenAI,Proprietary,75.75%,71.25%,83.50%,83.50%,...,92.00%,0.00%,80.00%,78.00%,35.00%,0.00%,0.27,0.88,1.18,1.46
