In [1]:
import pandas as pd

In [34]:
def get_category(answer):
    """Classify a ground-truth answer into a question category.

    The answer is converted to a lower-cased string and then bucketed:

    * ``"judge"`` -- the text is exactly ``yes`` or ``no``
    * ``"count"`` -- the text is an integer literal from ``0`` to ``10``
    * ``"query"`` -- anything else

    No whitespace stripping is performed, so padded values fall into
    ``"query"``.
    """
    normalized = str(answer).lower()

    if normalized in {"yes", "no"}:
        return "judge"

    # Counting answers are restricted to the literals "0" .. "10".
    if normalized in {str(number) for number in range(11)}:
        return "count"

    return "query"
    
def _accuracy(frame):
    """Return the share of rows in ``frame`` where ``predicted == actual``.

    An empty frame yields ``nan`` directly, so callers never trigger a
    ``0 / 0`` division (which would emit a NumPy ``RuntimeWarning``).
    """
    if len(frame) == 0:
        return float("nan")
    matches = frame["predicted"] == frame["actual"]
    return float(matches.sum() / len(frame))


def get_evaluation(df):
    """Compute overall and per-category exact-match accuracy.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``predicted``, ``actual`` and ``category``.
        Rows are compared with ``==``, so both answer columns should already
        share a comparable dtype.

    Returns
    -------
    dict
        Keys ``"total accuracy"``, ``"judge_accuracy"``, ``"count_accuracy"``
        and ``"query_accuracy"`` (in that order), each mapped to a float.
        A category with no rows is reported as ``nan``.
    """
    # The key spelling ("total accuracy" with a space) is kept as-is because
    # it becomes a column name in the results table.
    scores = {"total accuracy": _accuracy(df)}
    for category in ("judge", "count", "query"):
        subset = df[df["category"] == category]
        scores[f"{category}_accuracy"] = _accuracy(subset)
    return scores

In [35]:
# Setting name -> CSV file holding that setting's results.
# NOTE(review): only the two "routing" entries carry a "results/" prefix;
# confirm the remaining files really live in the working directory.
evaluation_map = dict(
    direct="direct_prompt_results_gpt-4o-mini.csv",
    react="react_results_gpt-4o-mini.csv",
    react_no_scene="react_results_gpt-4o-mini_no_scene.csv",
    routing="results/routing_results_gpt-4o-mini.csv",
    routing_no_scene="results/routing_results_gpt-4o-mini_no_scene.csv",
    state_machine="state_machine_results_gpt-4o-mini.csv",
    state_machine_no_scene="state_machine_results_gpt-4o-mini_no_scene_1.csv",
)

In [36]:
# Read every results file up front, keyed by setting name.
evaluation_data = {
    setting: pd.read_csv(csv_path) for setting, csv_path in evaluation_map.items()
}

# Tag each row with the question category derived from its ground-truth answer.
# The frames are modified in place; `key` and `value` remain bound to the last
# setting once the loop finishes.
for key, value in evaluation_data.items():
    value["category"] = value["actual"].apply(get_category)

In [37]:
# Preview the last setting in `evaluation_map` by name rather than through a
# loop variable left over from an earlier cell, so the cell does not depend on
# hidden kernel state.
evaluation_data["state_machine_no_scene"].astype(str)

Unnamed: 0,predicted,reasoning,actual,category
0,2,['start: EventType.action - Action: start star...,2,count
1,0,['start: EventType.action - Action: start star...,0,count
2,8,['start: EventType.action - Action: start star...,8,count
3,2,['start: EventType.action - Action: start star...,2,count
4,5,['start: EventType.action - Action: start star...,5,count
...,...,...,...,...
95,unknown,['start: EventType.action - Action: start star...,sphere,query
96,sphere,['start: EventType.action - Action: start star...,sphere,query
97,blue,['start: EventType.action - Action: start star...,red,query
98,none,['start: EventType.action - Action: start star...,cube,query


In [38]:
# One accuracy record per setting, each labelled with the setting name.
evaluation_results = []

for key in evaluation_data:
    # Every column is cast to str before scoring -- presumably so numeric and
    # textual answers compare as text; verify against the CSV dtypes.
    value = evaluation_data[key].astype(str)
    result = dict(get_evaluation(value), setting=key)
    evaluation_results.append(result)

In [39]:
# Display the frame stored under the "react" setting (rich repr of the cell's last expression).
evaluation_data["react"]

Unnamed: 0,predicted,reasoning,actual,category
0,1,"[""filter_with_attribute: EventType.action - Ac...",2,count
1,0,"[""filter_with_attribute: EventType.action - Ac...",0,count
2,8,"[""filter_with_attribute: EventType.action - Ac...",8,count
3,2,"[""filter_with_attribute: EventType.action - Ac...",2,count
4,5,"[""filter_with_attribute: EventType.action - Ac...",5,count
...,...,...,...,...
95,sphere,"[""get_related_objects: EventType.action - Acti...",sphere,query
96,sphere,"[""filter_with_attribute: EventType.action - Ac...",sphere,query
97,blue,"[""get_related_objects: EventType.action - Acti...",red,query
98,cube,"[""filter_with_attribute: EventType.action - Ac...",cube,query


In [40]:
# Collect the per-setting records into one table indexed by setting name.
evaluation_result_df = pd.DataFrame(evaluation_results).set_index("setting")
evaluation_result_df

Unnamed: 0_level_0,total accuracy,judge_accuracy,count_accuracy,query_accuracy
setting,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
direct,0.84,0.939394,0.818182,0.764706
react,0.85,1.0,0.787879,0.764706
react_no_scene,0.62,0.787879,0.636364,0.441176
routing,0.36,0.030303,0.787879,0.264706
routing_no_scene,0.24,0.0,0.727273,0.0
state_machine,0.88,1.0,0.848485,0.794118
state_machine_no_scene,0.68,0.878788,0.727273,0.441176
