# Import Required Libraries

In [None]:
import json, time, re
import numpy as np
from urllib.parse import urlparse, urlunparse
import openai

# Definition of API Call Class and I/O Paths

In [None]:
# Definition of Deep Research Agents Model (using o4-mini-deep-research as an example)
class DRM:
    """Callable wrapper around the OpenAI Responses API for a deep-research model.

    Calling an instance with a user prompt sends that prompt to ``model`` with
    the ``web_search_preview`` tool attached and returns the raw response
    object unchanged.
    """

    def __init__(self, model):
        """Remember the model name and build the API client.

        NOTE(review): the literal string "OPENAI_API_KEY" is passed as the key;
        it looks like a placeholder to be replaced with a real key - confirm.
        The client is created with timeout=3600 (presumably seconds).
        """
        self.model = model
        self.client = openai.OpenAI(api_key="OPENAI_API_KEY", timeout=3600)

    def __call__(self, uprompt):
        """Send ``uprompt`` to the model and return the full response object."""
        search_tools = [{"type": "web_search_preview"}]
        return self.client.responses.create(
            model=self.model,
            input=uprompt,
            tools=search_tools,
        )

# Definition of Judger Model
class GPT:
    """Callable wrapper around the OpenAI Chat Completions API used as a judge.

    Every call sends a two-message conversation (the fixed system prompt
    followed by the user prompt) at temperature 0 and returns the text content
    of the first choice.
    """

    def __init__(self, model, sprompt=""):
        """Store the model name and system prompt, and build the API client.

        NOTE(review): the literal string "OPENAI_API_KEY" is passed as the key;
        it looks like a placeholder to be replaced with a real key - confirm.
        """
        self.model = model
        self.sprompt = sprompt
        self.client = openai.OpenAI(api_key="OPENAI_API_KEY")

    def __call__(self, uprompt):
        """Return the model's reply text for ``uprompt``."""
        conversation = [
            {"role": "system", "content": self.sprompt},
            {"role": "user", "content": uprompt},
        ]
        reply = self.client.chat.completions.create(
            model=self.model,
            messages=conversation,
            temperature=0,
        )
        return reply.choices[0].message.content

# Model under test: its reports are fetched and logged by the response loop.
drm = DRM("o4-mini-deep-research-2025-06-26")
# General-purpose judge client with an empty system prompt.
# NOTE(review): `gpt` is not referenced by the code visible in this file; the
# evaluation cell builds its own GPT judges with dedicated system prompts.
gpt = GPT("gpt-4o-2024-11-20")

In [None]:
# Definition of I/O Paths
# Benchmark input, read line by line with json.loads (one entry per line).
bench_file_path = "RigorousBench.jsonl"
# Per-entry response log: one JSON object is appended per fetched report, and
# the file is re-read to skip UIDs that were already fetched.
log_file_path = "Log_o4-mini-deep-research-2025-06-26.jsonl"
# Per-entry evaluation results: one JSON object is appended per evaluated log
# entry, and the file is re-read to compute the final metrics.
eval_file_path = "Eval_o4-mini-deep-research-2025-06-26.jsonl"

# Get Response Report and Save Log

In [None]:
# Resume Mechanism After Interruption
# Collect the UIDs that already have a log entry so they are not fetched again.
completed_list = []
try:
    with open(log_file_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            # Skip blank lines; json.loads("") would raise and abort the resume
            if not line:
                continue
            completed_list.append(json.loads(line)["uid"])
except FileNotFoundError:
    # First run: the log file does not exist yet, so nothing is completed
    pass

# Load Benchmark
# Each line of the benchmark file is parsed as one JSON entry.
with open(bench_file_path, 'r', encoding='utf-8') as bench_file:
    bench = [json.loads(raw_line.strip()) for raw_line in bench_file]

# Get Response Report and Metadata
for entry in bench:
    # Retry Mechanism: every entry gets up to 3 attempts. Any exception (API
    # error, malformed entry, response without a report) counts as a failure
    # and is followed by a 10 second pause.
    for attempt in range(3):
        try:
            uid = entry["uid"]
            query = entry["query"]
            qsr = entry["qsr"]
            tsl = entry["tsl"]
            fak = entry["fak"]
            fdk = entry["fdk"]
            if uid in completed_list:
                print(f"NOTE: Log for UID {uid} already exists")
            else:
                print(f"START ### UID {uid}")
                # Get Deep Research Response
                response = drm(query)
                # Log of Usage
                usage = response.usage
                input_tokens = usage.input_tokens
                output_tokens = usage.output_tokens
                reasoning_tokens = usage.output_tokens_details.reasoning_tokens
                total_tokens = usage.total_tokens
                # Log of Report and Annotations
                # Reset before scanning: without this, a response that has no
                # message item would be logged with the previous entry's report
                # (or raise NameError on the very first entry).
                annotations = pure_annotations = text = pure_text = None
                for item in response.output:
                    if isinstance(item, openai.types.responses.response_output_message.ResponseOutputMessage):
                        content = item.content[0]
                        # Keep only annotations that carry a non-empty URL
                        annotations = [a.url for a in content.annotations if getattr(a, 'url', None)]
                        # Drop the "#:~:text=..." suffix from each cited URL
                        pure_annotations = [re.sub(r'#:~:text=.*$', '', url) for url in annotations]
                        text = content.text
                        # Remove inline "([label](link))" citation markers
                        pure_text = re.sub(r'\(\[[^\]]+\]\([^)]+\)\)', '', text)
                if text is None:
                    # Treated like any other failure: printed and retried
                    raise ValueError(f"No report message found in response for UID {uid}")
                # Log Entry
                log = {"uid": uid, "query": query, "qsr": qsr, "tsl": tsl, "fak": fak, "fdk": fdk,
                    "input_tokens": input_tokens, "output_tokens": output_tokens,
                    "reasoning_tokens": reasoning_tokens, "total_tokens": total_tokens,
                    "annotations": annotations, "pure_annotations": pure_annotations,
                    "text": text, "pure_text": pure_text}
                # Save Log Entry (appended immediately so progress survives an interruption)
                with open(log_file_path, 'a', encoding='utf-8') as outfile:
                    outfile.write(json.dumps(log) + '\n')
                print(f"Log for UID {uid} has been saved.")
            break
        except Exception as e:
            print(f"Failure #{attempt+1}: {e}")
            time.sleep(10)
    else:
        # Reached only when all 3 attempts failed (no break happened)
        print("Exceeded preset retry limit. Abandoning attempt.")

# Multidimensional Evaluation

In [None]:
# GRRs List
# Each entry is a yes/no rubric question followed by the score values the judge
# may assign ("Yes=1" or "Yes=2", "No=0"). The evaluation loop sends every rule
# together with the report text to the rules judge and sums the scores.
# The "Yes" scores of the 48 rules add up to 73, which is the divisor applied
# to grr_sum when the QUA metric is computed.
grr = [
    "Does the report include a clear three-part structure (introduction, body, conclusion)? Yes=2, No=0",
    "Does the report clearly state the research question or objective at the beginning? Yes=2, No=0",
    "Does the report provide background and purpose in the introduction? Yes=1, No=0",
    "Does the report develop coherent arguments in the body section? Yes=2, No=0",
    "Does the report summarize key findings in the conclusion? Yes=2, No=0",
    "Does the report offer actionable recommendations or future directions? Yes=2, No=0",
    "Does the report use smooth transitions between paragraphs or sections? Yes=1, No=0",
    "Does the report use headings and subheadings to organize content? Yes=1, No=0",
    "Does the report avoid information dumping and present ideas clearly? Yes=2, No=0",
    "Does the report use precise and clear language to express ideas? Yes=2, No=0",
    "Does the report avoid grammar, spelling, or sentence structure issues? Yes=1, No=0",
    "Does the report demonstrate logical reasoning such as cause-effect or comparison? Yes=2, No=0",
    "Does the report reflect critical thinking or independent judgment? Yes=2, No=0",
    "Does the report conclude with insightful perspectives or calls to action? Yes=1, No=0",
    "Does the report maintain a formal, academic, and objective tone throughout? Yes=1, No=0",
    "Does the report cover all key aspects of the research topic? Yes=2, No=0",
    "Does the report avoid missing important background or variables? Yes=1, No=0",
    "Does the report provide sufficient evidence to support its claims? Yes=2, No=0",
    "Does the report analyze underlying causes or trends in the data? Yes=2, No=0",
    "Does the report incorporate multiple angles or dimensions in its analysis? Yes=1, No=0",
    "Does the report demonstrate both breadth and depth of understanding? Yes=2, No=0",
    "Does the report avoid vague or repetitive statements? Yes=1, No=0",
    "Does the report cite authoritative academic journals or professional sources? Yes=2, No=0",
    "Does the report provide clear citation formatting? Yes=1, No=0",
    "Does the report cite sources that are highly relevant to the topic? Yes=2, No=0",
    "Does the report avoid fabricated, unclear, or misleading references? Yes=2, No=0",
    "Does the report embed citations within the body rather than only at the end? Yes=1, No=0",
    "Does the report distinguish between primary and secondary sources? Yes=1, No=0",
    "Does the report offer a unique perspective or analytical framework? Yes=2, No=0",
    "Does the report critique existing viewpoints thoughtfully? Yes=2, No=0",
    "Does the report propose innovative ideas or future research directions? Yes=2, No=0",
    "Does the report show deep understanding of complex issues? Yes=2, No=0",
    "Does the report avoid simply repeating existing conclusions? Yes=1, No=0",
    "Does the report reflect the author’s reasoning and intellectual depth? Yes=2, No=0",
    "Does the report use credible and verifiable data sources? Yes=2, No=0",
    "Does the report interpret and explain data appropriately? Yes=2, No=0",
    "Does the report use charts, tables, or visuals to support analysis? Yes=1, No=0",
    "Does the report avoid misusing statistics or exaggerating findings? Yes=2, No=0",
    "Does the report analyze data with causal or trend-based reasoning? Yes=2, No=0",
    "Does the report acknowledge limitations or biases in the data? Yes=1, No=0",
    "Does the report include source and date information for cited data? Yes=1, No=0",
    "Does the report use proper Markdown heading levels (e.g., #, ##, ###)? Yes=1, No=0",
    "Does the report use ordered or unordered lists to present key points? Yes=1, No=0",
    "Does the report correctly use Markdown elements like code blocks, quotes, or tables? Yes=1, No=0",
    "Does the report avoid Markdown syntax errors or formatting issues? Yes=1, No=0",
    "Does the report maintain clean, readable, and visually consistent layout? Yes=1, No=0",
    "Does the report use consistent terminology and avoid style shifts? Yes=1, No=0",
    "Does the report avoid informal or conversational language? Yes=1, No=0"
]

In [None]:
# URL Normalization Function
def normalize_url(url):
    """Return a canonical form of ``url`` for comparison purposes.

    Surrounding whitespace is stripped, scheme/host/path are lower-cased,
    trailing slashes are removed from the path, and the params, query string
    and fragment are discarded.
    """
    scheme, netloc, path, _params, _query, _fragment = urlparse(url.strip())
    trimmed_path = path.rstrip('/')
    return urlunparse((scheme.lower(), netloc.lower(), trimmed_path.lower(), '', '', ''))

# Get URL Hostname
def extract_hostname(url):
    """Return the lower-cased network location (netloc) of ``url``.

    The netloc is returned as parsed, so a port or user-info part is kept.
    A URL without a "//" authority section yields an empty string.
    """
    netloc = urlparse(url.strip()).netloc
    return netloc.lower()

# Count the Number of URL Matches
def count_matching_urls(refe, anno):
    """Count how often reference URLs (or their hosts) occur in annotation URLs.

    Both URL lists are normalized with ``normalize_url`` and de-duplicated.

    Args:
        refe: iterable of reference URL strings.
        anno: iterable of annotation (cited) URL strings.

    Returns:
        A ``(full_count, host_count)`` tuple of ints:
        * ``full_count`` - number of (reference, annotation) pairs where the
          normalized reference URL is a substring of the normalized annotation.
        * ``host_count`` - number of (reference host, annotation) pairs where
          the host of a reference is a substring of the normalized annotation.
        Counts are per pair, so one reference can contribute more than once.
    """
    # An empty string is a substring of every string, so a blank reference URL
    # (or a reference without a parsable host) would otherwise "match" every
    # annotation. Such entries are dropped before counting.
    normalize_refe = {norm for norm in (normalize_url(url) for url in refe) if norm}
    normalize_anno = {normalize_url(url) for url in anno}
    hostname_normalize_refe = {
        host for host in (extract_hostname(refe_item) for refe_item in normalize_refe) if host
    }
    full_count = sum(
        refe_item in anno_item
        for refe_item in normalize_refe
        for anno_item in normalize_anno
    )
    host_count = sum(
        host_item in anno_item
        for host_item in hostname_normalize_refe
        for anno_item in normalize_anno
    )
    return full_count, host_count

# Count Keyword Occurrences
def count_single_keyword(keyword, text):
    """Count case-insensitive, non-overlapping occurrences of ``keyword`` in ``text``.

    The keyword is matched literally (no regex semantics). An empty keyword
    returns 0; previously it matched at every position and produced
    ``len(text) + 1``.
    """
    if not keyword:
        return 0
    return text.lower().count(keyword.lower())

In [None]:
# Load Log
log_data = []
with open(log_file_path, 'r', encoding='utf-8') as file:
    for line in file:
        line = line.strip()
        # Skip blank lines; json.loads("") would raise
        if line:
            log_data.append(json.loads(line))

# Resume Mechanism After Interruption
# UIDs that already have an eval entry are skipped, so rerunning this cell does
# not append duplicate rows (duplicates would skew the averaged metrics).
evaluated_uids = set()
try:
    with open(eval_file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if line:
                evaluated_uids.add(json.loads(line)["uid"])
except FileNotFoundError:
    # First run: no eval file yet, nothing to skip
    pass

# Judge prompts and judge clients do not depend on the log entry, so they are
# built once instead of once per entry.
keywords_judger_sprompt = """
        You are a scoring evaluator tasked with assessing the relevance of a specific keyword within a research report. You will be provided with:
            1. A report text  
            2. A keyword to evaluate
        Your task is to:
            - Read the report carefully
            - Judge how semantically relevant the keyword is to the report content
            - Consider not just frequency, but depth of discussion, thematic importance, and contextual integration
        Use the following 5-point relevance scale:
        [5] Extremely Relevant: The keyword is a central theme of the report; It appears multiple times and is discussed in depth; The report’s main arguments or findings revolve around it
        [4] Highly Relevant: The keyword is a major topic; It appears more than once and is clearly explained or referenced; contributes directly to the report’s purpose
        [3] Moderately Relevant: The keyword is mentioned but not emphasized; It may appear once or twice; It supports the report contextually but is not a focus
        [2] Slightly Relevant: The keyword is briefly mentioned; It has little impact on the report’s core content; It may be incidental or peripheral
        [1] Not Relevant: The keyword does not appear in the report; Or it appears in a way that is unrelated to the report’s topic
        Output format example:
            [4] The keyword “QUIC” is referenced multiple times in the report, particularly in the context of protocol evolution and RFC publication. While not the sole focus, it is clearly a major topic.
        """
keywords_judger = GPT("gpt-4o-2024-11-20", sprompt=keywords_judger_sprompt)

rules_judger_sprompt = """
        You are a scoring evaluator tasked with assessing the quality of a report generated by a deep research model. You will be provided with:
            1. A report text  
            2. An evaluation rule containing specific scoring criteria and allowed score values
        Your task is to:
            - Carefully read the report  
            - Evaluate it strictly against the given rule  
            - Assign a score based only on the score values defined in the rule
        Scoring instructions:
            - Only use the score values explicitly listed in the rule (e.g., 0, 1, 2)
            - Do not invent intermediate scores or alternative formats
            - Your output must begin with the score in square brackets [], followed by a one-sentence reason
        Output format example:
            [0] No citations were provided, which violates the requirement.
            [2] The report fully meets the requirement with clear and relevant details.
        Be objective and consistent. Focus on clarity, completeness, relevance, and adherence to the rule.
        """
rules_judger = GPT("gpt-4o-2024-11-20", sprompt=rules_judger_sprompt)

# The user prompts keep the exact whitespace layout of the original
# triple-quoted templates (newline + 12 spaces before each field).
_PROMPT_INDENT = " " * 12


def _judge_score(judger, uprompt, attempts=3):
    """Return the integer inside the first "[n]" of the judge's reply.

    The call is retried (with a 10 second pause) when the API call fails or
    the reply contains no bracketed score; the last error is re-raised once
    ``attempts`` tries are used up.
    """
    for attempt in range(attempts):
        try:
            reply = judger(uprompt)
            found = re.search(r"\[(\d+)\]", reply)
            if found is None:
                raise ValueError(f"No [score] found in judge reply: {reply!r}")
            return int(found.group(1))
        except Exception as e:
            print(f"Failure #{attempt+1}: {e}")
            if attempt == attempts - 1:
                raise
            time.sleep(10)


def _keyword_stats(keywords, report):
    """Return (occurrence counts, judged relevance scores) for ``keywords`` in ``report``."""
    counts = []
    relevance = []
    for keyword in keywords:
        counts.append(count_single_keyword(keyword, report))
        uprompt = f"\n{_PROMPT_INDENT}Report text: {report} \n{_PROMPT_INDENT}Keyword: {keyword}\n{_PROMPT_INDENT}"
        relevance.append(_judge_score(keywords_judger, uprompt))
    return counts, relevance


def _rubric_total(rules, report):
    """Return the sum of the judge's scores for ``report`` over all ``rules``."""
    total = 0
    for rule in rules:
        uprompt = f"\n{_PROMPT_INDENT}Report text: {report} \n{_PROMPT_INDENT}Rule: {rule}\n{_PROMPT_INDENT}"
        total += _judge_score(rules_judger, uprompt)
    return total


for log in log_data:
    uid = log["uid"]
    if uid in evaluated_uids:
        print(f"NOTE: Eval for UID {uid} already exists")
        continue
    qsr = log["qsr"]
    tsl = log["tsl"]
    fak = log["fak"]
    fdk = log["fdk"]
    pure_annotations = log["pure_annotations"]
    text = log["text"]
    pure_text = log["pure_text"]

    # URL Match Evaluation
    full_match, host_match = count_matching_urls(tsl, pure_annotations)
    # Host matches are reported net of the full-URL matches
    host_match = host_match - full_match
    print(full_match, host_match)

    # Keyword Match and Relevance Evaluation (on the citation-stripped text)
    keywords_fak_count, keywords_fak_relevance = _keyword_stats(fak, pure_text)
    keywords_fdk_count, keywords_fdk_relevance = _keyword_stats(fdk, pure_text)

    # Rubrics Scoring and Evaluation (on the full report text):
    # entry-specific rules (qsr) first, then the shared GRR list
    qsr_sum = _rubric_total(qsr, text)
    grr_sum = _rubric_total(grr, text)

    # Eval Entry
    eval_entry = {"uid": uid, "qsr_sum": qsr_sum, "grr_sum": grr_sum,
        "keywords_fak_count": keywords_fak_count, "keywords_fak_relevance": keywords_fak_relevance,
        "keywords_fdk_count": keywords_fdk_count, "keywords_fdk_relevance": keywords_fdk_relevance,
        "url_full_match": full_match, "url_host_match": host_match,
        "ref_url_num": len(tsl), "report_url_num": len(set(pure_annotations))}
    # Save Eval Entry (appended immediately so progress survives an interruption)
    with open(eval_file_path, 'a', encoding='utf-8') as outfile:
        outfile.write(json.dumps(eval_entry) + '\n')
    evaluated_uids.add(uid)
    print(f"Eval for UID {uid} has been saved.")

# Compute Core Metrics

In [None]:
# Load Eval
eval_data = []
with open(eval_file_path, 'r', encoding='utf-8') as file:
    for line in file:
        line = line.strip()
        # Skip blank lines; json.loads("") would raise
        if line:
            eval_data.append(json.loads(line))

quality_list = []
semantic_list = []
tboost_list = []
intergrated_list = []


def _mean_or_zero(values):
    """Return the arithmetic mean of ``values``, or 0.0 when it is empty."""
    return sum(values) / len(values) if values else 0.0


for entry in eval_data:
    qsr_sum = entry["qsr_sum"]
    grr_sum = entry["grr_sum"]
    keywords_fak_count = entry["keywords_fak_count"]
    keywords_fak_relevance = entry["keywords_fak_relevance"]
    keywords_fdk_count = entry["keywords_fdk_count"]
    keywords_fdk_relevance = entry["keywords_fdk_relevance"]
    url_full_match = entry["url_full_match"]
    url_host_match = entry["url_host_match"]
    ref_url_num = entry["ref_url_num"]
    report_url_num = entry["report_url_num"]

    # Count QUA: equal-weight average of the two normalized rubric totals.
    # 73 is the sum of the "Yes" scores in the GRR list; 30 is presumably the
    # maximum attainable QSR total - TODO confirm against the benchmark.
    quality = 0.5 * (qsr_sum / 30) + 0.5 * (grr_sum / 73)
    quality_list.append(quality)

    # Count SDR
    # Per keyword: occurrence factor (capped at 1) times relevance / 5.
    # fak saturates at 2 occurrences, fdk at 1. Scores are averaged over the
    # keywords actually present (identical to the former fixed "/ 5" when
    # there are five) instead of indexing a hard-coded range(5).
    fak_score = [
        min(count / 2, 1) * (relevance / 5)
        for count, relevance in zip(keywords_fak_count, keywords_fak_relevance)
    ]
    fdk_score = [
        min(count, 1) * (relevance / 5)
        for count, relevance in zip(keywords_fdk_count, keywords_fdk_relevance)
    ]
    fak_drift = 1 - _mean_or_zero(fak_score)
    fdk_drift = _mean_or_zero(fdk_score)
    semantic = 1 - (0.7 * fak_drift + 0.3 * fdk_drift)
    semantic_list.append(semantic)

    # Count TBO
    # An entry without reference URLs gets no full-match bonus instead of
    # raising ZeroDivisionError.
    full_ratio = url_full_match / ref_url_num if ref_url_num else 0.0
    host_ratio = url_host_match / (report_url_num + 1)
    tboost = 1 + 0.2 * (0.7 * full_ratio + 0.3 * host_ratio)
    tboost_list.append(tboost)

    # Count ITS: product of the three metrics above
    intergrated = quality * semantic * tboost
    intergrated_list.append(intergrated)

# Print Metrics (mean over all evaluated entries)
print(
    f"QUA:{np.mean(quality_list)}  SDR:{np.mean(semantic_list)}  TBO:{np.mean(tboost_list)}  ITS:{np.mean(intergrated_list)}"
)

# THE END