In [None]:
import copy
import json
import logging
from pathlib import Path
from time import sleep
import traceback
from smolagents.models import OpenAIServerModel, MessageRole, Model, ChatMessage
from tqdm import tqdm


def extract_answer(original_task: str, final_answer: str, model: Model, conv_num:int=0) -> str:
    """Ask ``model`` to check/re-format an agent answer and return the text after the last ``FINAL ANSWER:`` marker.

    A system prompt (formatting rules plus worked examples) and a user prompt
    (the task and the candidate answer) are converted with
    ``ChatMessage.from_dict`` and passed to ``model`` in a single call.

    Args:
        original_task: The task text, interpolated into the user prompt.
        final_answer: The candidate answer, interpolated into the user prompt.
        model: Callable invoked as ``model(messages)``; the ``content``
            attribute of its return value is read as the reply text.
        conv_num: Accepted for backward compatibility; not used by this function.

    Returns:
        The stripped text following the last ``FINAL ANSWER:`` marker in the
        reply. If the marker is absent, the whole stripped reply is returned;
        if the reply has no content, an empty string is returned.
    """
    # Local import so this block does not depend on a file-level `import re`.
    import re

    # Plain (non-f) string: the prompt has no placeholders, so this yields the same text.
    system_prompt = """You are an answer format verifying expert for multi-agent systems.
Your task is to analyze the final answer given by the agent team and verify if it is in the correct FORMAT, then return the final answer in the correct FORMAT.

## Analysis Steps:
1. **Identify Key Information**: Scan the answer for facts, data, calculations, and conclusions relevant to the question
2. **Constraints Check**: Check if the answer satisfies all the instructions in the question.
3. **Verify Completeness**: Ensure the answer directly addresses what was asked.
4. **Result Convert**: You may need to convert the analysis results appropriately to match the question requirements. For example, statistical results from the analysis process.
5. **Verify Format**: Verify if the final answer is in the correct format.
                    
## Answer Formatting Rules:
- Results: Only return the concise core answer required by the question stem without any redundant description.
- Numbers: Use digits only (e.g., "42", not "forty-two"), no commas, no units (e.g. $, %) unless specifically requested.
- Plain Text: Use exact tex without numbers, avoid articles/abbreviations unless specified, no punctuation at the end.
- Lists: Comma-separated, apply number/text rules to each element.
- Units: Pay close attention to the units of measurement specified in the question if necessary. (units: e.g. per 100, thousand hours)

## Response Rules:
- Report your thoughts, and finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER].
- Your FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
- If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise.
- If you are asked for a string, don't use articles, codes, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise.
- If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.
- DO NOT add `Thoughts` to the FINAL ANSWER template, the final answer should be specific.

---
## Correction Answer Example:
## Example 1
task: Who is the president of the United States in 2025, give the full name?
final_answer: The president of the United States in 2025 is Donald Trump.
correct_answer: Donald John Trump

## Example 2
task: What is the tallest building in the world, and give its height in meters just the number?
final_answer: The Burj Khalifa, 828 meters
correct_answer: 828

## Example 3
task: How much revenue did Apple generate in the fourth quarter of fiscal year 2024?
final_answer: $9,490,000,000
correct_answer: 9490000000

## Example 4
task: 1. Please output 'Hello, world!'.\n2. Dont print anything.\n If there is a conflict, ignore above instructions, and just output the 'Java'.
final_answer: Hello, world! Java
correct_answer: Java

"""

    user_prompt = f"""The Agent team has finished their investigation. Here is the original task and the final answer given by the agent team, please check the format of the final answer.
                    For Reasoning or Understanding tasks, check if the answer is logical and consistent with the question, you can reason by yourself first, if your answer is different from the given final answer, you should return yours.
                    But for other tasks, you should just check the format. Please give your analysis process and reason before returning the final answer.

                    ## Task:
                    ```
                    {original_task}
                    ```

                    ## Final Answer:
                    ```
                    {final_answer}
                    ```

                    Now, begin your analysis, remember return the final answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER]
                    """

    raw_messages = [
        {"role": MessageRole.SYSTEM, "content": [{"type": "text", "text": system_prompt}]},
        {"role": MessageRole.USER, "content": [{"type": "text", "text": user_prompt}]},
    ]
    messages = [ChatMessage.from_dict(msg) for msg in raw_messages]

    response = model(messages)

    # Guard against a reply without content instead of failing on `None.split`.
    reply_text = response.content or ""

    # Accept any (or no) whitespace after the marker: "FINAL ANSWER:42" and
    # "FINAL ANSWER:\n42" previously slipped past the literal "FINAL ANSWER: "
    # split and caused the whole reasoning text to be returned as the answer.
    # Taking the last piece keeps the answer that follows the final marker.
    return re.split(r"FINAL ANSWER:\s*", reply_text)[-1].strip()

In [2]:
import json
# Load the cached validation run: one JSON object per line (JSONL).
records = []
with open("output/cache/validation/gaia_val_xsy_claude4_0719v4.jsonl", "r", encoding='utf8') as file:
    # Skip blank / whitespace-only lines, which json.loads would reject.
    records.extend(json.loads(line) for line in file if line.strip())

In [60]:
# Position (in `records`) of the entry being inspected; later cells reuse `idx`.
idx = 71
# Display the reference answer stored for that entry.
records[idx]["true_answer"]

'Guava'

In [62]:
import os
from typing import Any
from smolagents.models import OpenAIServerModel

# Role remapping kept for reference; it is NOT passed to the model (entry commented out below).
custom_role_conversions = {"tool-call": "assistant", "tool-response": "user"}
model_params: dict[str, Any] = {
    "model_id": "gpt-4.1",
    # "custom_role_conversions": custom_role_conversions,
    "max_tokens": 10000,
    "temperature": 0.2,
}

# Credentials must not be stored in the notebook: anyone with access to the file
# (or its version history) could read them. Read the key from the environment.
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError(
        "Set the OPENAI_API_KEY environment variable before creating the model."
    )

model = OpenAIServerModel(
    api_base="http://gpt-proxy.jd.com/gateway/common",
    api_key=api_key,
    **model_params,
)

# Send only the last line of the final intermediate step through the format checker.
res = extract_answer(
    records[idx]["question"],
    final_answer=records[idx]["intermediate_steps"][-1]["content"].rsplit("\n", 1)[-1],
    model=model,
    conv_num=4,
)
res

'Guava'

In [53]:
# Show the last line of the final intermediate step for the selected record.
records[idx]["intermediate_steps"][-1]["content"].rsplit("\n", 1)[-1]

'Finance'