In [1]:
!pip install plotly kaleido datasets nbformat -U -q

In [2]:
import os

import datasets
import pandas as pd
from dotenv import load_dotenv
from huggingface_hub import login


load_dotenv(override=True)
login(os.getenv("HF_TOKEN"))

pd.set_option("max_colwidth", None)

OUTPUT_DIR = "output"

  from .autonotebook import tqdm as notebook_tqdm
Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [3]:
eval_ds = datasets.load_dataset("gaia-benchmark/GAIA", "2023_all")["validation"]
eval_ds = eval_ds.rename_columns({"Question": "question", "Final answer": "true_answer", "Level": "task"})
eval_df = pd.DataFrame(eval_ds)

In [4]:
pd.Series(eval_ds["task"]).value_counts()

2    86
1    53
3    26
Name: count, dtype: int64

# 1. Load all results

In [5]:
import glob

results = []
for f in glob.glob(f"{OUTPUT_DIR}/validation/*.jsonl"):
    df = pd.read_json(f, lines=True)
    df["agent_name"] = f.split("/")[-1].split(".")[0]
    results.append(df)

result_df = pd.concat(results)
result_df = result_df.drop(columns=["start_time", "end_time"])
result_df["prediction"] = result_df["prediction"].fillna("No prediction")

In [6]:
import re
from collections import Counter

from scripts.gaia_scorer import check_close_call, question_scorer


result_df["is_correct"] = result_df.apply(lambda x: question_scorer(x["prediction"], x["true_answer"]), axis=1)
result_df["is_near_correct"] = result_df.apply(
    lambda x: check_close_call(x["prediction"], x["true_answer"], x["is_correct"]),
    axis=1,
)

result_df["count_steps"] = result_df["intermediate_steps"].apply(len)


def find_attachment(question):
    matches = eval_df.loc[eval_df["question"].apply(lambda x: x in question), "file_name"]

    if len(matches) == 0:
        return "Not found"
    file_path = matches.values[0]

    if isinstance(file_path, str) and len(file_path) > 0:
        return file_path.split(".")[-1]
    else:
        return "None"


result_df["attachment_type"] = result_df["question"].apply(find_attachment)


def extract_tool_calls(code):
    regex = r"\b(\w+)\("
    function_calls = [el for el in re.findall(regex, code) if el.islower()]

    function_call_counter = Counter(function_calls)
    return function_call_counter


def sum_tool_calls(steps):
    total_count = Counter()
    for step in steps:
        if "llm_output" in step:
            total_count += extract_tool_calls(step["llm_output"])

    return total_count


# result_df["tool_calls"] = result_df["intermediate_steps"].apply(sum_tool_calls)

String 250 for Cheater cannot be normalized to number str.
String  220 for Cheater beater cannot be normalized to number str.
String Unable to determine cannot be normalized to number str.
String 120.28 for Cheater cannot be normalized to number str.
String  119.04 for Cheater beater cannot be normalized to number str.
String 3 or 4 cannot be normalized to number str.
String 2017 Komo Mai Drive 900000 cannot be normalized to number str.
String Unable to determine cannot be normalized to number str.
String 1.46 Å cannot be normalized to number str.
String  cannot be normalized to number str.
String Unable to determine cannot be normalized to number str.
String Unable to determine cannot be normalized to number str.
String Unable to determine cannot be normalized to number str.
String 94.5 for Cheater cannot be normalized to number str.
String  93.5 for Cheater beater cannot be normalized to number str.
String Unable to determine cannot be normalized to number str.
String 2017 Komo Mai D



In [7]:
def get_thoughts(x):
    try:
        output = x[0]["task"]
        for y in x[1:]:
            try:
                if "observation" in y:
                    output += y["llm_output"] + "\nObservation:" + y["observation"]
                else:
                    output += y["llm_output"] + r"\Error:" + str(y["error"])
            except Exception:
                pass
        return output
    except Exception:
        return None


result_df["thoughts"] = result_df["intermediate_steps"].apply(lambda x: get_thoughts(x))

In [8]:
result_df["agent_name"].value_counts()

agent_name
code_o1_01_february_text                       163
code_o1_29-01_text                             105
code_o1_22-01_managedagent-summary_planning     67
code_o1_25-01_visioon                           53
Name: count, dtype: int64

# 2. Inspect specific runs

In [9]:
o1_vision = "code_o1_25-01_visioon"
o1_next = "code_o1_29-01_text"
o1 = "code_o1_01_february_text"

list_versions = [o1, o1_vision, o1_next]

# submission_selection_name = "react_code_llama3-70b_02-05_full-gaia-validation-code"
sel_df = result_df.loc[
    (result_df["agent_name"].isin(list_versions))
    # & (~result_df["question"].isin(UNSOLVED_QUESTIONS))
].reset_index(drop=True)
display(sel_df["agent_name"].value_counts())
sel_df = sel_df.drop_duplicates(subset=["agent_name", "question"])
display(sel_df.groupby("agent_name")[["task"]].value_counts())
print("Total length:", len(sel_df), "- is complete:", len(sel_df) == 165)
# assert sel_df["question"].value_counts().max() == len(list_versions), "Some questions are duplicate!"

agent_name
code_o1_01_february_text    163
code_o1_29-01_text          105
code_o1_25-01_visioon        53
Name: count, dtype: int64

agent_name                task
code_o1_01_february_text  2       85
                          1       53
                          3       25
code_o1_25-01_visioon     2       30
                          1       17
                          3        6
code_o1_29-01_text        2       58
                          1       31
                          3       16
Name: count, dtype: int64

Total length: 321 - is complete: False


In [10]:
display("Average score:", sel_df.groupby("agent_name")[["is_correct"]].mean().round(3))
display(
    sel_df.groupby(["agent_name", "task"])[["is_correct", "is_near_correct", "count_steps", "question"]]
    .agg(
        {
            "is_correct": "mean",
            "is_near_correct": "mean",
            "count_steps": "mean",
            "question": "count",
        }
    )
    .rename(columns={"question": "count"})
)

'Average score:'

Unnamed: 0_level_0,is_correct
agent_name,Unnamed: 1_level_1
code_o1_01_february_text,0.491
code_o1_25-01_visioon,0.34
code_o1_29-01_text,0.39


Unnamed: 0_level_0,Unnamed: 1_level_0,is_correct,is_near_correct,count_steps,count
agent_name,task,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
code_o1_01_february_text,1,0.54717,0.566038,2.849057,53
code_o1_01_february_text,2,0.529412,0.529412,3.317647,85
code_o1_01_february_text,3,0.24,0.24,4.48,25
code_o1_25-01_visioon,1,0.411765,0.411765,5.294118,17
code_o1_25-01_visioon,2,0.366667,0.366667,5.333333,30
code_o1_25-01_visioon,3,0.0,0.0,6.666667,6
code_o1_29-01_text,1,0.516129,0.516129,4.967742,31
code_o1_29-01_text,2,0.37931,0.431034,5.241379,58
code_o1_29-01_text,3,0.1875,0.1875,6.5,16


In [11]:
import plotly.express as px


cumulative_df = (
    (
        sel_df.groupby("agent_name")[["is_correct", "is_near_correct"]]
        .expanding(min_periods=1, axis=0, method="single")
        .agg({"is_correct": "mean", "is_near_correct": "count"})
        .reset_index()
    )
    .copy()
    .rename(columns={"is_near_correct": "index"})
)
cumulative_df["index"] = cumulative_df["index"].astype(int) - 1


def find_question(row):
    try:
        res = sel_df.loc[sel_df["agent_name"] == row["agent_name"], "question"].iloc[row["index"]][:50]
        return res
    except Exception:
        return ""


cumulative_df["question"] = cumulative_df.apply(find_question, axis=1)
# cumulative_df["question"] = [el[:50] for el in sel_df["question"].values]

# cumulative_df["is_correct"] = cumulative_df["is_correct"] * (165 - 68) / 165

px.line(
    cumulative_df,
    color="agent_name",
    x="index",
    y="is_correct",
    hover_data="question",
)

# 3. Dive deeper into one run

In [12]:
sel_df = result_df.loc[result_df["agent_name"] == o1]
print(len(sel_df))

163


### Count errors

In [13]:
import numpy as np


error_types = [
    "AgentParsingError",
    "AgentExecutionError",
    "AgentMaxIterationsError",
    "AgentGenerationError",
]
sel_df[error_types] = 0
sel_df["Count steps"] = np.nan


def count_errors(row):
    if isinstance(row["intermediate_steps"], list):
        row["Count steps"] = len(row["intermediate_steps"])
        for step in row["intermediate_steps"]:
            if isinstance(step, dict) and "error" in step:
                try:
                    row[str(step["error"]["error_type"])] += 1
                except Exception:
                    pass
    return row


sel_df = sel_df.apply(count_errors, axis=1)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

In [14]:
import plotly.express as px


aggregate_errors = (
    sel_df.groupby(["is_correct"])[error_types + ["Count steps"]].mean().reset_index().melt(id_vars=["is_correct"])
)

fig = px.bar(
    aggregate_errors,
    y="value",
    x="variable",
    color="is_correct",
    labels={
        "agent_name": "<b>Model</b>",
        "task": "<b>Level</b>",
        "aggregate_score": "<b>Performance</b>",
        "value": "<b>Average count</b>",
        "eval_score_GPT4": "<b>Score</b>",
    },
)
fig.update_layout(
    height=500,
    width=800,
    barmode="group",
    bargroupgap=0.0,
)
fig.update_traces(textposition="outside")
fig.write_image("aggregate_errors.png", scale=3)
fig.show()

### Count tool calls

In [15]:
tools_calls = pd.DataFrame.from_records(sel_df["tool_calls"].values).fillna(0)

# Exclude the tools that were not used enough
tools_calls = tools_calls.loc[:, tools_calls.sum() > 10]

# Sort the columns by the sum of the values
tools_calls = tools_calls[tools_calls.sum().sort_values(ascending=False).index]
display(tools_calls)
sel_with_calls = pd.concat([sel_df[["question", "is_correct", "task"]], tools_calls], axis=1)
sel_with_calls = sel_with_calls.drop("question", axis=1).groupby(["is_correct", "task"]).mean()
# sel_with_calls = sel_with_calls.melt(id_vars=['question', 'is_correct', 'task'], var_name="tool", value_name='count')

KeyError: 'tool_calls'

In [16]:
sel_with_calls = sel_with_calls.reset_index().melt(
    id_vars=["is_correct", "task"], var_name="tool", value_name="average_count"
)

In [None]:
import plotly.express as px


fig = px.bar(
    sel_with_calls,
    x="tool",
    y="average_count",
    color="is_correct",
    facet_row="task",
    labels={
        "agent_name": "<b>Agent variant</b>",
        "task": "<b>Level</b>",
        "aggregate_score": "<b>Performance</b>",
        "eval_score_GPT4": "<b>Score</b>",
        "agent_type": "<b>Agent type</b>",
        "average_count": "<b>Average #calls per run</b>",
    },
)
fig.update_layout(
    barmode="group",
    height=800,
    width=1000,
    title="<b>" + "</b>",
)

### Inspect result by file extension type

In [None]:
display(
    sel_df.groupby(["agent_name", "attachment_type"])[["is_correct", "count_steps", "question"]].agg(
        {"is_correct": "mean", "count_steps": "mean", "question": "count"}
    )
)

# 4. Ensembling methods

### 4.1 Simple retry mechanism

In [None]:
run_1 = result_df.loc[result_df["agent_name"] == o1_vision].copy()
run_2 = result_df.loc[result_df["agent_name"] == o1].copy()
run_3 = result_df.loc[result_df["agent_name"] == o1_next].copy()


def majority_vote(df1, df2, df3):
    # Combine all predictions and is_correct values into one dataframe
    combined = pd.DataFrame(
        {
            "question": df1["question"],
            "task": df1["task"],
            "pred1": df1["prediction"],
            "pred2": df2["prediction"],
            "pred3": df3["prediction"],
            "correct1": df1["is_correct"],
            "correct2": df2["is_correct"],
            "correct3": df3["is_correct"],
        }
    )

    def get_majority_and_correct(row):
        # Get all predictions
        predictions = [row["pred1"], row["pred2"], row["pred3"]]
        correct_values = [row["correct1"], row["correct2"], row["correct3"]]

        # Count occurrences of each prediction
        from collections import Counter

        counts = Counter(predictions)

        # Get the most common prediction
        majority_pred = counts.most_common(1)[0][0]

        # Find the first dataframe that gave this prediction
        selected_idx = predictions.index(majority_pred)

        # Return both the prediction and its corresponding is_correct value
        return pd.Series(
            {"prediction": majority_pred, "is_correct": correct_values[selected_idx], "task": row["task"]}
        )

    # Apply the majority voting and get corresponding is_correct
    result = combined.apply(get_majority_and_correct, axis=1)

    # Combine with questions
    final_df = pd.DataFrame(
        {
            "question": combined["question"],
            "prediction": result["prediction"],
            "is_correct": result["is_correct"],
            "task": result["task"],
        }
    )

    return final_df


majority = majority_vote(run_1, run_2, run_3)
majority

In [None]:
print("First run:")
print(f"{run_1['is_correct'].mean():.2f}")

print("Second run:")
print(f"{run_2['is_correct'].mean():.2f}")

print("Third run:")
print(f"{run_3['is_correct'].mean():.2f}")

print("Combined run:")
display(majority.groupby(["task"])[["is_correct"]].mean())
print(f"{majority['is_correct'].mean():.2f}")

### 4.2 Ideal ensembling

In [None]:
third_run = result_df.loc[result_df["agent_name"] == noanchorplan].copy()
INCLUDE_THIRD_RUN = False


# test ideal ensembling
def score_best_both(row, result_df_replacement):
    try:
        if row["is_correct"]:
            return True

        else:
            matching_answer = result_df_replacement.loc[(result_df_replacement["question"] == row["question"])].iloc[0]
            if matching_answer["is_correct"]:
                return True
            else:
                return False
    except:
        return row["is_correct"]


combined_gpt4 = first_run_gpt4.copy()
combined_gpt4["is_correct"] = combined_gpt4.apply(lambda x: score_best_both(x, second_run_gpt4), axis=1)
if INCLUDE_THIRD_RUN:
    combined_gpt4["is_correct"] = combined_gpt4.apply(lambda x: score_best_both(x, third_run), axis=1)
print("Ideal combined run:")
print(combined_gpt4.groupby(["task"])["is_correct"].mean())
print(combined_gpt4["is_correct"].mean())