In [4]:
import pandas as pd

In [5]:
def get_category(answer):
    """Classify a ground-truth answer as "judge", "count" or "query".

    The answer is converted to a lower-cased string before matching:

    * "yes" / "no"            -> "judge"
    * the integers "0".."10"  -> "count"
    * anything else           -> "query"
    """
    normalized = str(answer).lower()

    if normalized in {"yes", "no"}:
        return "judge"
    # Only the literal digit strings 0 through 10 count as counting answers.
    if normalized in {str(number) for number in range(11)}:
        return "count"
    return "query"
    
def get_evaluation(df):
    """Compute exact-match accuracy overall and per answer category.

    Args:
        df: DataFrame with "predicted", "actual" and "category" columns,
            where "category" holds "judge", "count" or "query".

    Returns:
        dict with the keys "total accuracy", "judge_accuracy",
        "count_accuracy" and "query_accuracy"; each value is the fraction
        of rows (in the whole frame or in that category) whose "predicted"
        value equals its "actual" value.
    """

    def _accuracy(frame):
        # Share of rows where the prediction equals the ground truth.
        matches = frame["predicted"] == frame["actual"]
        return matches.sum() / len(frame)

    def _category_accuracy(category):
        # Accuracy restricted to the rows tagged with `category`.
        return _accuracy(df[df["category"] == category])

    return {
        "total accuracy": _accuracy(df),
        "judge_accuracy": _category_accuracy("judge"),
        "count_accuracy": _category_accuracy("count"),
        "query_accuracy": _category_accuracy("query"),
    }

In [61]:
# Maps each evaluation setting to the CSV holding its gpt-4o-mini results.
# The direct and state-machine settings have three runs each (suffix _1.._3)
# stored under results/0.2/.
evaluation_map = {
    "direct_1": "results/0.2/direct_prompt_results_gpt-4o-mini_1.csv",
    "direct_2": "results/0.2/direct_prompt_results_gpt-4o-mini_2.csv",
    "direct_3": "results/0.2/direct_prompt_results_gpt-4o-mini_3.csv",
    "state_machine_1": "results/0.2/state_machine_results_gpt-4o-mini_1.csv",
    "state_machine_2": "results/0.2/state_machine_results_gpt-4o-mini_2.csv",
    # Fixed: this entry used to point at the *_2.csv file, so run 2 was
    # evaluated twice and run 3 was never evaluated.
    "state_machine_3": "results/0.2/state_machine_results_gpt-4o-mini_3.csv",

    "react": "results/react_results_gpt-4o-mini.csv",
    # "react_no_scene": "react_results_gpt-4o-mini_no_scene.csv",
    "routing": "results/routing_results_gpt-4o-mini.csv",
    # "routing_no_scene": "results/routing_results_gpt-4o-mini_no_scene.csv",
    # "state_machine": "state_machine_results_gpt-4o-mini.csv",
    # "state_machine_2": "state_machine_results_gpt-4o-mini_2.csv",
    # "state_machine_no_scene": "state_machine_results_gpt-4o-mini_no_scene.csv",
}

In [48]:
# Result CSVs for the Meta-Llama-3.1-8B-Instruct-Turbo runs: three runs each
# of the direct-prompt and state-machine settings.
# NOTE: this rebinds `evaluation_map`, replacing any mapping assigned by an
# earlier cell (e.g. the gpt-4o-mini paths); run only the cell you want.
evaluation_map = dict(
    [
        ("direct_1", "results/direct_prompt_Meta-Llama-3.1-8B-Instruct-Turbo_1.csv"),
        ("direct_2", "results/direct_prompt_Meta-Llama-3.1-8B-Instruct-Turbo_2.csv"),
        ("direct_3", "results/direct_prompt_Meta-Llama-3.1-8B-Instruct-Turbo_3.csv"),
        ("state_machine_1", "results/state_machine_Meta-Llama-3.1-8B-Instruct-Turbo_1.csv"),
        ("state_machine_2", "results/state_machine_Meta-Llama-3.1-8B-Instruct-Turbo_2.csv"),
        ("state_machine_3", "results/state_machine_Meta-Llama-3.1-8B-Instruct-Turbo_3.csv"),
    ]
)

In [49]:
# Load every result CSV listed in `evaluation_map` and tag each row with the
# category of its ground-truth ("actual") answer.
evaluation_data = {}

for setting, csv_path in evaluation_map.items():
    results_frame = pd.read_csv(csv_path)
    results_frame["category"] = results_frame["actual"].apply(get_category)
    evaluation_data[setting] = results_frame

In [50]:
# Score every loaded setting. Each frame is cast to strings first, so the
# "predicted" and "actual" columns are compared as text.
evaluation_results = [
    {**get_evaluation(frame.astype(str)), "setting": setting}
    for setting, frame in evaluation_data.items()
]

In [51]:
# One row per setting, indexed by the setting name; the bare name on the last
# line renders the table as the cell output.
evaluation_result_df = pd.DataFrame(evaluation_results).set_index("setting")
evaluation_result_df

Unnamed: 0_level_0,total accuracy,judge_accuracy,count_accuracy,query_accuracy
setting,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
direct_1,0.72,0.787879,0.69697,0.676471
direct_2,0.74,0.909091,0.606061,0.705882
direct_3,0.73,0.848485,0.69697,0.647059
state_machine_1,0.72,0.878788,0.515152,0.764706
state_machine_2,0.74,0.848485,0.606061,0.764706
state_machine_3,0.72,0.909091,0.545455,0.705882
state_machine_test,0.72,0.787879,0.636364,0.735294


In [8]:
# Rebuild the accuracy table from whatever `evaluation_results` currently
# holds, indexed by setting name, and display it.
evaluation_result_df = pd.DataFrame(evaluation_results).set_index("setting")
evaluation_result_df

Unnamed: 0_level_0,total accuracy,judge_accuracy,count_accuracy,query_accuracy
setting,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
direct,0.84,0.939394,0.818182,0.764706
react,0.85,1.0,0.787879,0.764706
react_no_scene,0.62,0.787879,0.636364,0.441176
routing,0.36,0.030303,0.787879,0.264706
routing_no_scene,0.24,0.0,0.727273,0.0
state_machine,0.88,1.0,0.848485,0.794118
state_machine_no_scene,0.68,0.878788,0.727273,0.441176


## Non-functional Evaluation

In [15]:
# Mean number of "EventType.action " occurrences per reasoning string for the
# "react_no_scene" setting. Requires that key in `evaluation_data`; its entry
# is commented out in the `evaluation_map` cells, so re-enable it first.
(
    evaluation_data["react_no_scene"]["reasoning"]
    .map(lambda trace: trace.count("EventType.action "))
    .mean()
)

4.33

In [21]:
# Average number of "EventType.action " occurrences per reasoning string,
# computed for every loaded setting.
average_reasoning_length = {}
for key, value in evaluation_data.items():
    # Direct-prompt settings get a fixed length of 1 instead of being
    # measured. The `evaluation_map` cells key them as "direct_<run>"
    # (direct_1, direct_2, ...), which the former exact match on "direct"
    # never caught, so both spellings are accepted here.
    if key == "direct" or key.startswith("direct_"):
        average_reasoning_length[key] = 1
    else:
        average_reasoning_length[key] = (
            value["reasoning"].map(lambda x: x.count("EventType.action ")).mean()
        )

In [23]:
# Display the per-setting averages stored in `average_reasoning_length`.
average_reasoning_length

{'direct': 1,
 'react': 3.68,
 'react_no_scene': 4.33,
 'routing': 3.09,
 'routing_no_scene': 3.44,
 'state_machine': 2.81,
 'state_machine_no_scene': 4.44}