In [None]:
!pip install plotly kaleido datasets nbformat -U -q

In [3]:
import os

import datasets
import pandas as pd
from dotenv import load_dotenv
from huggingface_hub import login


load_dotenv(override=True)
login(os.getenv("HF_TOKEN"))

pd.set_option("max_colwidth", None)

OUTPUT_DIR = "output"

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv‚Ä¶

In [20]:
eval_ds = datasets.load_dataset(
        "data/gaia/GAIA.py",
        name="2023_all",
        split="validation",
        trust_remote_code=True
        # data_files={"validation": "validation/metadata.jsonl", "test": "test/metadata.jsonl"},
    )
eval_ds = eval_ds.rename_columns({"Question": "question", "Final answer": "true_answer", "Level": "task"})
eval_df = pd.DataFrame(eval_ds)

# 1. Load all results

In [21]:
import glob


results = []
for f in glob.glob(f"{OUTPUT_DIR}/validation/*.jsonl"):
    df = pd.read_json(f, lines=True)
    df["agent_name"] = f.split("/")[-1].split(".")[0]
    results.append(df)

result_df = pd.concat(results)
result_df["prediction"] = result_df["prediction"].fillna("No prediction")

In [22]:
import re
from collections import Counter

from scripts.gaia_scorer import check_close_call, question_scorer


result_df["is_correct"] = result_df.apply(lambda x: question_scorer(x["prediction"], x["true_answer"]), axis=1)
result_df["is_near_correct"] = result_df.apply(
    lambda x: check_close_call(x["prediction"], x["true_answer"], x["is_correct"]),
    axis=1,
)

result_df["count_steps"] = result_df["intermediate_steps"].apply(len)


def find_attachment(question):
    matches = eval_df.loc[eval_df["question"].apply(lambda x: x in question), "file_name"]

    if len(matches) == 0:
        return "Not found"
    file_path = matches.values[0]

    if isinstance(file_path, str) and len(file_path) > 0:
        return file_path.split(".")[-1]
    else:
        return "None"


result_df["attachment_type"] = result_df["question"].apply(find_attachment)


def extract_tool_calls(code):
    regex = r"\b(\w+)\("
    function_calls = [el for el in re.findall(regex, code) if el.islower()]

    function_call_counter = Counter(function_calls)
    return function_call_counter


def sum_tool_calls(steps):
    total_count = Counter()
    for step in steps:
        if "llm_output" in step:
            total_count += extract_tool_calls(step["llm_output"])

    return total_count


def get_durations(row):
    # start_datetime = datetime.strptime(row['start_time'], "%Y-%m-%d %H:%M:%S")
    # end_datetime = datetime.strptime(row['end_time'], "%Y-%m-%d %H:%M:%S")

    duration_timedelta = row["end_time"] - row["start_time"]
    return int(duration_timedelta.total_seconds())


result_df["duration"] = result_df.apply(get_durations, axis=1)
# result_df["tool_calls"] = result_df["intermediate_steps"].apply(sum_tool_calls)

String No prediction cannot be normalized to number str.
String No prediction cannot be normalized to number str.
String No prediction cannot be normalized to number str.
String No prediction cannot be normalized to number str.
String No prediction cannot be normalized to number str.
String No prediction cannot be normalized to number str.
String Unable to determine cannot be normalized to number str.
String No prediction cannot be normalized to number str.
String Unable to determine cannot be normalized to number str.
String 2072 Akaikai Loop 1057000 cannot be normalized to number str.
String No prediction cannot be normalized to number str.
String No prediction cannot be normalized to number str.
String Unable to determine cannot be normalized to number str.
String Unable to determine cannot be normalized to number str.
String Unable to determine cannot be normalized to number str.
String 1.54 √Ö cannot be normalized to number str.
String No prediction cannot be normalized to number 


Answer lists have different lengths, returning False.



In [23]:
result_df["agent_name"].value_counts()

agent_name
gaia_o1                              165
gaia_o1_audio                        165
gaia_o4-mini                         165
gaia_o4-mini_search                  165
gaia_claude-3-5-sonnet                37
generate-traces-03-apr-noplanning      9
Name: count, dtype: int64

# 2. Inspect specific runs

In [24]:
sel_df = result_df
# sel_df = sel_df.loc[
#     (result_df["agent_name"].isin(list_versions))
# ]
sel_df = sel_df.reset_index(drop=True)
display(sel_df["agent_name"].value_counts())
sel_df = sel_df.drop_duplicates(subset=["agent_name", "question"])
display(sel_df.groupby("agent_name")[["task"]].value_counts())
print("Total length:", len(sel_df), "- is complete:", len(sel_df) == 165)

agent_name
gaia_o1                              165
gaia_o1_audio                        165
gaia_o4-mini                         165
gaia_o4-mini_search                  165
gaia_claude-3-5-sonnet                37
generate-traces-03-apr-noplanning      9
Name: count, dtype: int64

agent_name                         task
gaia_claude-3-5-sonnet             2       20
                                   1       11
                                   3        6
gaia_o1                            2       86
                                   1       53
                                   3       26
gaia_o1_audio                      2       86
                                   1       53
                                   3       26
gaia_o4-mini                       2       86
                                   1       53
                                   3       26
gaia_o4-mini_search                2       86
                                   1       53
                                   3       26
generate-traces-03-apr-noplanning  1        5
                                   2        2
                                   3        2
Name: count, dtype: int64

Total length: 706 - is complete: False


In [25]:
display("Average score:", sel_df.groupby("agent_name")[["is_correct"]].mean().round(3))
display(
    sel_df.groupby(["agent_name", "task"])[["is_correct", "is_near_correct", "count_steps", "question", "duration"]]
    .agg(
        {
            "is_correct": "mean",
            "is_near_correct": "mean",
            "count_steps": "mean",
            "question": "count",
            "duration": "mean",
        }
    )
    .rename(columns={"question": "count"})
)

'Average score:'

Unnamed: 0_level_0,is_correct
agent_name,Unnamed: 1_level_1
gaia_claude-3-5-sonnet,0.0
gaia_o1,0.442
gaia_o1_audio,0.442
gaia_o4-mini,0.412
gaia_o4-mini_search,0.509
generate-traces-03-apr-noplanning,0.0


Unnamed: 0_level_0,Unnamed: 1_level_0,is_correct,is_near_correct,count_steps,count,duration
agent_name,task,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
gaia_claude-3-5-sonnet,1,0.0,0.0,0.0,11,0.181818
gaia_claude-3-5-sonnet,2,0.0,0.0,0.0,20,0.25
gaia_claude-3-5-sonnet,3,0.0,0.0,0.0,6,0.166667
gaia_o1,1,0.509434,0.509434,9.90566,53,492.54717
gaia_o1,2,0.476744,0.476744,10.27907,86,619.348837
gaia_o1,3,0.192308,0.192308,11.692308,26,1021.615385
gaia_o1_audio,1,0.528302,0.528302,9.981132,53,172.660377
gaia_o1_audio,2,0.453488,0.453488,10.72093,86,239.465116
gaia_o1_audio,3,0.230769,0.230769,14.923077,26,444.384615
gaia_o4-mini,1,0.566038,0.584906,18.679245,53,681.132075


In [26]:
import plotly.express as px


cumulative_df = (
    (
        sel_df.groupby("agent_name")[["is_correct", "is_near_correct"]]
        .expanding(min_periods=1, axis=0, method="single")
        .agg({"is_correct": "mean", "is_near_correct": "count"})
        .reset_index()
    )
    .copy()
    .rename(columns={"is_near_correct": "index"})
)
cumulative_df["index"] = cumulative_df["index"].astype(int) - 1


def find_question(row):
    try:
        res = sel_df.loc[sel_df["agent_name"] == row["agent_name"], "question"].iloc[row["index"]][:50]
        return res
    except Exception:
        return ""


cumulative_df["question"] = cumulative_df.apply(find_question, axis=1)

px.line(
    cumulative_df,
    color="agent_name",
    x="index",
    y="is_correct",
    hover_data="question",
)

# 3. Dive deeper into one run

In [27]:
sel_df = result_df.loc[result_df["agent_name"] == "gaia_o1"]
print(len(sel_df))

165


### Count errors

In [28]:
import numpy as np


error_types = [
    "AgentParsingError",
    "AgentExecutionError",
    "AgentMaxIterationsError",
    "AgentGenerationError",
]
sel_df[error_types] = 0
sel_df["Count steps"] = np.nan


def count_errors(row):
    if isinstance(row["intermediate_steps"], list):
        row["Count steps"] = len(row["intermediate_steps"])
        for step in row["intermediate_steps"]:
            if isinstance(step, dict) and "error" in step:
                try:
                    row[str(step["error"]["error_type"])] += 1
                except Exception:
                    pass
    return row


sel_df = sel_df.apply(count_errors, axis=1)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

In [29]:
import plotly.express as px


aggregate_errors = (
    sel_df.groupby(["is_correct"])[error_types + ["Count steps"]].mean().reset_index().melt(id_vars=["is_correct"])
)

fig = px.bar(
    aggregate_errors,
    y="value",
    x="variable",
    color="is_correct",
    labels={
        "agent_name": "<b>Model</b>",
        "task": "<b>Level</b>",
        "aggregate_score": "<b>Performance</b>",
        "value": "<b>Average count</b>",
        "eval_score_GPT4": "<b>Score</b>",
    },
)
fig.update_layout(
    height=500,
    width=800,
    barmode="group",
    bargroupgap=0.0,
)
fig.update_traces(textposition="outside")
fig.write_image("aggregate_errors.png", scale=3)
fig.show()

### Inspect result by file extension type

In [30]:
display(
    result_df.groupby(["attachment_type"])[["is_correct", "count_steps", "question"]].agg(
        {"is_correct": "mean", "count_steps": "mean", "question": "count"}
    )
)

Unnamed: 0_level_0,is_correct,count_steps,question
attachment_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
,0.454724,14.234252,508
csv,0.0,14.0,6
docx,0.8,8.0,5
jpg,0.4,11.2,10
jsonld,0.0,15.666667,6
mp3,0.055556,11.555556,18
pdb,0.0,10.0,6
pdf,0.333333,8.933333,15
png,0.2,10.1,40
pptx,0.666667,6.666667,6


# 4. Ensembling methods

In [31]:
counts = result_df["agent_name"].value_counts()
long_series = result_df.loc[result_df["agent_name"].isin(counts[counts > 140].index)]

In [32]:
def majority_vote(df):
    df = df[(df["prediction"] != "Unable to determine") & (~df["prediction"].isna()) & (df["prediction"] != "None")]

    answer_modes = df.groupby("question")["prediction"].agg(lambda x: x.mode()[0]).reset_index()
    first_occurrences = (
        df.groupby(["question", "prediction"]).agg({"task": "first", "is_correct": "first"}).reset_index()
    )
    result = answer_modes.merge(first_occurrences, on=["question", "prediction"], how="left")

    return result


def oracle(df):
    def get_first_correct_or_first_wrong(group):
        correct_answers = group[group["is_correct"]]
        if len(correct_answers) > 0:
            return correct_answers.iloc[0]
        return group.iloc[0]

    result = df.groupby("question").apply(get_first_correct_or_first_wrong)

    return result.reset_index(drop=True)


display((long_series.groupby("agent_name")["is_correct"].mean() * 100).round(2))
print(f"Majority score: {majority_vote(long_series)['is_correct'].mean() * 100:.2f}")
print(f"Oracle score: {oracle(long_series)['is_correct'].mean() * 100:.2f}")

agent_name
gaia_o1                44.24
gaia_o1_audio          44.24
gaia_o4-mini           41.21
gaia_o4-mini_search    50.91
Name: is_correct, dtype: float64

Majority score: 51.52
Oracle score: 63.64






### Submit

In [4]:
agent_run = "gaia_o4-mini_search_test.jsonl"
df = pd.read_json(f"output/test/{agent_run}", lines=True)
df = df[["task_id", "prediction", "intermediate_steps"]]
df = df.rename(columns={"prediction": "model_answer", "intermediate_steps": "reasoning_trace"})

In [5]:
df.to_json("submission.jsonl", orient="records", lines=True)