# Notebook Test

## Declaration de l'agent utilise par la suite

In [4]:
from agent import ToolAgent  # Ton agent local

agent = ToolAgent(model="phi3:instruct")

## Questions Data from the GAIA dataset



In [5]:
import json 

with open('metadata.jsonl', 'r') as jsonl_file:
    json_list = list(jsonl_file)

json_QA = []
for json_str in json_list:
    json_data = json.loads(json_str)
    json_QA.append(json_data)

json_QA_level1 = [item for item in json_QA if str(item.get("Level", "")) == "1"]


In [6]:
import random
# random.seed(42)
random_samples = random.sample(json_QA, 1)
for sample in random_samples:
    print("=" * 50)
    print(f"Task ID: {sample['task_id']}")
    print(f"Question: {sample['Question']}")
    print(f"Level: {sample['Level']}")
    print(f"Final Answer: {sample['Final answer']}")
    print(f"Annotator Metadata: ")
    print(f"  ├── Steps: ")
    for step in sample['Annotator Metadata']['Steps'].split('\n'):
        print(f"  │      ├── {step}")
    print(f"  ├── Number of steps: {sample['Annotator Metadata']['Number of steps']}")
    print(f"  ├── How long did this take?: {sample['Annotator Metadata']['How long did this take?']}")
    print(f"  ├── Tools:")
    for tool in sample['Annotator Metadata']['Tools'].split('\n'):
        print(f"  │      ├── {tool}")
    print(f"  └── Number of tools: {sample['Annotator Metadata']['Number of tools']}")
print("=" * 50)

Task ID: 853c8244-429e-46ca-89f2-addf40dfb2bd
Question: In the 2015 Metropolitan Museum of Art exhibition titled after the Chinese zodiac animal of 2015, how many of the "twelve animals of the Chinese zodiac" have a hand visible?
Level: 2
Final Answer: 11
Annotator Metadata: 
  ├── Steps: 
  │      ├── 1. Search "2015 Chinese zodiac animal" on Google search.
  │      ├── 2. Note the animal (ram).
  │      ├── 3. Search "Metropolitan Museum of Art" on Google search.
  │      ├── 4. Open the Metropolitan Museum of Art website.
  │      ├── 5. Click "Exhibitions" under "Exhibitions and Events" 
  │      ├── 6. Click "Past".
  │      ├── 7. Set the year to 2015.
  │      ├── 8. Scroll to find the exhibit mentioning rams and click "Celebration of the Year of the Ram".
  │      ├── 9. Click "View All Objects".
  │      ├── 10. Click "Twelve animals of the Chinese zodiac" to open the image.
  │      ├── 11. Count how many have a visible hand.
  ├── Number of steps: 11
  ├── How long did this 

In [7]:
# list of the tools used in all the samples
from collections import Counter, OrderedDict

tools = []
for sample in json_QA:
    for tool in sample['Annotator Metadata']['Tools'].split('\n'):
        tool = tool[2:].strip().lower()
        if tool.startswith("("):
            tool = tool[11:].strip()
        tools.append(tool)
tools_counter = OrderedDict(Counter(tools))
print("List of tools used in all samples:")
print("Total number of tools used:", len(tools_counter))
for tool, count in tools_counter.items():
    print(f"  ├── {tool}: {count}")

List of tools used in all samples:
Total number of tools used: 83
  ├── web browser: 107
  ├── image recognition tools (to identify and parse a figure with three axes): 1
  ├── search engine: 101
  ├── calculator: 34
  ├── unlambda compiler (optional): 1
  ├── a web browser.: 2
  ├── a search engine.: 2
  ├── a calculator.: 1
  ├── microsoft excel: 5
  ├── google search: 1
  ├── ne: 9
  ├── pdf access: 7
  ├── file handling: 2
  ├── python: 3
  ├── image recognition tools: 12
  ├── jsonld file access: 1
  ├── video parsing: 1
  ├── python compiler: 1
  ├── video recognition tools: 3
  ├── pdf viewer: 7
  ├── microsoft excel / google sheets: 3
  ├── word document access: 1
  ├── tool to extract text from images: 1
  ├── a word reversal tool / script: 1
  ├── counter: 1
  ├── excel: 3
  ├── image recognition: 5
  ├── color recognition: 3
  ├── excel file access: 3
  ├── xml file access: 1
  ├── access to the internet archive, web.archive.org: 1
  ├── text processing/diff tool: 1
  ├── gi

## Verification of the correct use of each tools

### Tool verification

The following block is intended for directly testing the tools. This ensures that when the Agent invokes a tool, it performs as expected.

In [8]:
import pandas as pd
from tools import ToolExecutor

# Liste de tests à exécuter
test_cases = [
    ("add", ["3", "5"]),
    ("multiply", ["7", "6"]),
    ("subtract", ["10", "4"]),
    ("divide", ["20", "5"]),
    ("modulus", ["13", "5"]),
    ("wiki_search", ["Albert Einstein"]),
    ("web_search", ["current president of France"]),
]

# Stocke les résultats
results = []

for tool_name, args in test_cases:
    args_str = ', '.join(f'"{arg}"' for arg in args)
    command = f'Action: {tool_name}[{args_str}]'
    print(f"\n🛠️ Testing tool: {tool_name}")
    print(f"➡️ Command: {command}")
    result = ToolExecutor.execute(command)
    print(f"📤 Result: {result}")
    results.append({
        "tool": tool_name,
        "command": command,
        "result": result,
        "success": "Observation:" in result and "error" not in result.lower()
    })

# Résumé final
df = pd.DataFrame(results)
print("\n📊 TEST SUMMARY:")
print(df[["tool", "success"]])



🛠️ Testing tool: add
➡️ Command: Action: add["3", "5"]
📤 Result: Observation: 8

🛠️ Testing tool: multiply
➡️ Command: Action: multiply["7", "6"]
📤 Result: Observation: 42

🛠️ Testing tool: subtract
➡️ Command: Action: subtract["10", "4"]
📤 Result: Observation: 6

🛠️ Testing tool: divide
➡️ Command: Action: divide["20", "5"]
📤 Result: Observation: 4.0

🛠️ Testing tool: modulus
➡️ Command: Action: modulus["13", "5"]
📤 Result: Observation: 3

🛠️ Testing tool: wiki_search
➡️ Command: Action: wiki_search["Albert Einstein"]
📤 Result: Observation: Albert Einstein (14 March 1879 – 18 April 1955) was a German-born theoretical physicist who is best known for developing the theory of relativity. Einstein also made important contributions to quantum mechanics.

🛠️ Testing tool: web_search
➡️ Command: Action: web_search["current president of France"]
📤 Result: Observation: Emmanuel Jean-Michel Frédéric Macron ([emanɥɛl makʁɔ̃] ⓘ; born 21 December 1977) is a French politician who has served as Pre

## Call verification

In [9]:
question_unique = "What is 7 plus 5?"

# Mode normal (pas de trace disponible)
simple_answer = agent(question_unique)
print("✅ Simple answer:", simple_answer)

# Mode log (trace + outils utilisés)
logged = agent(question_unique, log=True)
print("\n📜 Full trace with log:")
print("✅ Final answer:", logged["final_answer"])
print("🛠️ Tools used:", logged["used_tools"])
print("📜 Trace:\n", logged["trace"])


✅ Simple answer: 12

📜 Full trace with log:
✅ Final answer: 12
🛠️ Tools used: ['add']
📜 Trace:
 📨 Prompt sent to Ollama (step 0):
You are an intelligent and precise agent with access to external tools. Your goal is to solve questions as accurately as possible by reasoning step by step and using the tools at your disposal when needed.

---

HOW TO REASON:

1. **Understand** the question. Identify what is expected (a number, a location, a name, etc.).
2. If a **calculation** is required, use a math tool like `add` or `multiply`.
3. If **external knowledge** is required, use `wiki_search` with a precise and relevant keyword.
4. After a tool is used, you will receive an `Observation:` line with the result.
5. You may repeat steps 2–4 as needed to refine your answer.
6. Once you are confident, write your final response using the format below.
7. If you are unsure from the result of wiki_search or web_search, extract a short answer using:
   Action: extract_answer["<text>", "<question>"]


-

## Evaluation on random data


In [12]:
import random
from agent import ToolAgent  # Ton agent local


# Set seed for reproducibility
random.seed(1)
evaluation_samples = random.sample(json_QA_level1, 15)  # Adjust the sample size if needed

# Initialize your local agent (make sure Ollama is running)
agent = ToolAgent(model="phi3:instruct")

# Store results
results = []

for sample in evaluation_samples:
    task_id = sample["task_id"]
    question = sample["Question"]
    expected = sample["Final answer"].strip().lower()

    try:
        # Call your agent directly
        print(f"\n🟨 --- TRACE FOR TASK {task_id} ---")
        print(f"🧠 Question: {question}")
        answer = agent(question).strip().lower()
        print(f"✅ Agent Answer: {answer}")
    except Exception as e:
        answer = f"ERROR: {e}"
        print(f"❌ ERROR during agent call: {e}")

    results.append({
        "task_id": task_id,
        "question": question,
        "expected": expected,
        "answer": answer,
        "correct": answer == expected
    })


🟨 --- TRACE FOR TASK 2d83110e-a098-4ebb-9987-066c06fa42d0 ---
🧠 Question: .rewsna eht sa "tfel" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI
✅ Agent Answer: satoru ebisaki is known for his career as an economist at nippon life insurance company limited and mizuhama hospital; authoring management books like "human resource development"; writing articles in business journals, such as shizuoka shinpoh from chuo news; and being awarded honors including the medal of honor (silver) and person of cultural merit.

🟨 --- TRACE FOR TASK e142056d-56ab-4352-b091-b56054bd1359 ---
🧠 Question: Bob was invited to participate in a game show, and he advanced to the final round. The final round offered Bob the chance to win a large sum by playing a game against the host. The host has 30 shiny prop coins, each of which is worth $1,000 if Bob manages to win them by playing the game. The host hides the coins in three different prize boxes and then shuffles their order. The only rule rest



  lis = BeautifulSoup(html).find_all('li')


✅ Agent Answer: insufficient data available to provide the surname of a veterinarian, particularly given only marisa alviar-agnew & henry agnew are mentioned without further context indicating their profession in equine care or any association with this field that would imply such surnames. further clarification may be needed on how these individuals relate to the domain directly if not erroneous inclusion of names outside professional credentials is expected as part of a larger narrative, which might require human reasoning beyond current tool capabilities and provided context.)

🟨 --- TRACE FOR TASK 99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3 ---
🧠 Question: Hi, I'm making a pie but I could use some help with my shopping list. I have everything I need for the crust, but I'm not sure about the filling. I got the recipe from my friend Aditi, but she left it as a voice memo and the speaker on my phone is buzzing so I can't quite make out what she's saying. Could you please listen to the recipe

In [13]:
import pandas as pd
from tabulate import tabulate

# Résultats collectés précédemment
df_results = pd.DataFrame(results)
df_results["score"] = df_results["correct"].astype(int)

# Limites d'affichage pour éviter l'explosion horizontale
pd.set_option("display.max_colwidth", 80)

# Tronquer les textes longs
def truncate(text, maxlen=80):
    return text if len(text) <= maxlen else text[:maxlen - 3] + "..."

# Appliquer le tronquage sur colonnes longues
df_results["question"] = df_results["question"].apply(lambda x: truncate(x, 200))
df_results["expected"] = df_results["expected"].apply(lambda x: truncate(x, 50))
df_results["answer"] = df_results["answer"].apply(lambda x: truncate(x, 50))


# Sélection et affichage
summary_cols = ["correct", "expected", "answer","question"]
print("\n📊 EVALUATION SUMMARY:\n")
print(tabulate(df_results[summary_cols], headers="keys", tablefmt="grid", showindex=False))

# Score final
accuracy = df_results["correct"].mean() * 100
print(f"\n✅ Accuracy: {accuracy:.2f}%")




📊 EVALUATION SUMMARY:

+-----------+----------------------------------------------------+----------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| correct   | expected                                           | answer                                             | question                                                                                                                                                                                                 |
| False     | right                                              | satoru ebisaki is known for his career as an ec... | .rewsna eht sa "tfel" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI                                                                                                                    |
+------