In [1]:
import json
from prettytable import PrettyTable
import json
import os
import random
import pandas as pd

# Read the jsonl file and convert it to a JSON list
def jsonl_to_json_list(jsonl_file_path):
    """Load a JSON Lines file into a list of parsed objects.

    Each non-blank line of the file is parsed as an independent JSON
    document. Blank or whitespace-only lines (for example an extra newline
    at the end of the file) are skipped instead of being handed to the
    parser, which would reject them.

    Args:
        jsonl_file_path: Path of the UTF-8 encoded ``.jsonl`` file to read.

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    json_list = []
    with open(jsonl_file_path, 'r', encoding='utf-8') as file:
        for line in file:
            stripped = line.strip()
            if not stripped:
                # An empty string is not a JSON document; json.loads('')
                # raises, so tolerate padding/trailing blank lines here.
                continue
            json_list.append(json.loads(stripped))  # Parse each line as JSON

    return json_list

# Save the JSON list to a file
def save_as_json(json_list, output_file_path):
    """Write ``json_list`` to ``output_file_path`` as a single JSON document.

    The file is opened for writing (replacing any existing content) with
    UTF-8 encoding, and the document is pretty-printed with a 4-space indent.

    Args:
        json_list: JSON-serializable object to store.
        output_file_path: Destination file path.
    """
    with open(output_file_path, mode='w', encoding='utf-8') as sink:
        sink.write(json.dumps(json_list, indent=4))

def save_as_jsonl(json_list, output_file_path):
    """Write every element of ``json_list`` as one JSON document per line.

    The file is opened for writing (replacing any existing content) with
    UTF-8 encoding; each serialized element is followed by a newline.

    Args:
        json_list: Iterable of JSON-serializable objects.
        output_file_path: Destination file path.
    """
    with open(output_file_path, mode='w', encoding='utf-8') as sink:
        sink.writelines(json.dumps(record) + '\n' for record in json_list)

In [8]:
import numpy as np

def load_json(file_path):
    """Parse the UTF-8 encoded JSON file at ``file_path`` and return its content.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The object produced by decoding the file's single JSON document.
    """
    with open(file_path, mode='r', encoding='utf-8') as source:
        return json.load(source)

def deduplicate_data(data):
    """Return the items of ``data`` with duplicate ``'realidx'`` values removed.

    For every distinct ``item['realidx']`` only the first item carrying it is
    kept; the relative order of the kept items matches their order in
    ``data``.

    Args:
        data: Iterable of mappings that each contain a ``'realidx'`` key.

    Returns:
        A new list holding the first item seen for each ``'realidx'``.
    """
    first_by_idx = {}
    for record in data:
        # setdefault stores the record only when the key is new, so later
        # duplicates are ignored; dicts preserve insertion order.
        first_by_idx.setdefault(record['realidx'], record)
    return list(first_by_idx.values())

def calculate_accuracy(data):
    """Return the fraction of items whose prediction equals the reference answer.

    An item counts as correct when ``item['predicted_answer']`` equals
    ``item['answer_idx']``.

    Args:
        data: Sized collection of mappings with ``'predicted_answer'`` and
            ``'answer_idx'`` keys.

    Returns:
        Correct count divided by ``len(data)``, or ``0`` when ``data`` is
        empty.
    """
    n_items = len(data)
    n_correct = sum(
        1 for record in data
        if record['predicted_answer'] == record['answer_idx']
    )
    if n_items > 0:
        return n_correct / n_items
    return 0

def calculate_cost_from_token_usage(data, model):
    """Return the average per-item cost derived from recorded token usage.

    Each item's cost is its prompt-token count times the model's prompt rate
    plus its completion-token count times the completion rate, with both
    rates expressed per 1,000,000 tokens. Models other than ``'gpt-4o-mini'``
    and ``'gpt-4o'`` contribute no cost, so the result is zero for them.

    Args:
        data: Sized collection of mappings; each has a ``'token_usage'``
            mapping with ``'prompt_tokens'`` and ``'completion_tokens'``.
        model: Model name used to select the rates.

    Returns:
        Total cost divided by ``len(data)``, or ``0`` when ``data`` is empty.
    """
    # (prompt rate, completion rate) per 1,000,000 tokens.
    rate_table = {
        'gpt-4o-mini': (0.15, 0.6),
        'gpt-4o': (2.5, 10),
    }
    rates = rate_table.get(model)

    running_cost = 0
    if rates is not None:
        prompt_rate, completion_rate = rates
        # Accumulate with an explicit loop (not sum()) so the floating-point
        # additions happen in plain left-to-right order.
        for record in data:
            usage = record['token_usage']
            running_cost += usage['prompt_tokens'] * prompt_rate / 1000000 + usage['completion_tokens'] * completion_rate / 1000000

    n_items = len(data)
    if n_items > 0:
        return running_cost / n_items
    return 0

def calculate_time_from_data(data):
    """Return the mean of ``item['time_elapsed']`` over ``data``.

    Args:
        data: Sized collection of mappings with a ``'time_elapsed'`` value.

    Returns:
        The sum of the elapsed values divided by ``len(data)``, or ``0`` when
        ``data`` is empty.
    """
    elapsed_sum = 0
    # Explicit accumulation keeps plain left-to-right floating-point addition.
    for record in data:
        elapsed_sum += record['time_elapsed']

    n_items = len(data)
    if n_items > 0:
        return elapsed_sum / n_items
    return 0

# Report script: for every (task, model, subtask, difficulty) combination,
# load up to three result files, compute per-run accuracy / cost / time /
# sample count, and print mean±std both as Markdown rows and as a PrettyTable.

# Task name -> list of subtask names; both are used to build result file paths.
tasks = {
    'medqa': ['test_hard'],
    'pubmedqa': ['test_hard'],
    'medmcqa': ['test_hard'],
    'medbullets': ['test_hard'],
    'mmlu': ['test_hard'],
    'mmlu-pro': ['test_hard'],
    # 'afrimedqa': ['test_hard'],
    'medexqa': ['test_hard'],
    'medxpertqa-u': ['test_hard'],
    'medxpertqa-r': ['test_hard'],
}
# Model names; also the only names calculate_cost_from_token_usage has rates for.
models = ['gpt-4o-mini', 'gpt-4o']
# Difficulty setting, used as the last component of the input file name.
difficulties = ['adaptive']

# Markdown table header; one Markdown row per combination is printed below.
# NOTE(review): the Markdown header calls the fourth column "Method" (rows print
# the literal "MDAgents"), while the PrettyTable column is "Difficulty" and holds
# the difficulty value — confirm the mismatch is intended.
print("| Model | Task | Subtask | Method | Accuracy (mean±std) | Cost per sample(USD) (mean±std) | Time per sample(s) (mean±std) | Total Number (mean±std) |")
print("|-------|------|---------|---------|---------------------|-------------------------------|-----------------------------|------------------------|")
table = PrettyTable()
table.field_names = [
    "Model", "Task", "Subtask", "Difficulty",
    "Accuracy (mean±std)", "Cost per sample(USD) (mean±std)",
    "Time per sample(s) (mean±std)", "Total Number (mean±std)"
]

# Running sum of (cost per sample * sample count) over every run that was processed.
total_cost = 0

for task in tasks:
    for model in models:
        for subtask in tasks[task]:
            for difficulty in difficulties:
                # Per-run statistics for this combination; one entry per processed run.
                accuracies = []
                costs = []
                times = []
                totals = []
                # Runs are read from ./output/run-0, run-1 and run-2.
                for run in range(3):
                    # NOTE(review): the try covers the copy and the metric computation
                    # as well as the load, so any exception in them is also reported as
                    # "Error loading file" and drops the run from the statistics.
                    try:
                        file_path = f'./output/run-{run}/{task}/{model}_{task}_{subtask}_{difficulty}.json'
                        data = load_json(file_path)
                        # Side effect: the loaded data is re-saved under ../../output with an
                        # "-mdagents" file name. The "deepseek-V3" rename cannot trigger with
                        # the current `models` list.
                        output_path = f'../../output/run-{run}/{task}/{model if model != "deepseek-V3" else "DeepSeek-V3"}-{task}-{subtask}-mdagents.json'
                        os.makedirs(os.path.dirname(output_path), exist_ok=True)
                        save_as_json(data, output_path)
                        # Metrics are computed on the de-duplicated items only.
                        deduplicated_data = deduplicate_data(data)
                        accuracy = calculate_accuracy(deduplicated_data)
                        total = len(deduplicated_data)
                        cost_per_sample = calculate_cost_from_token_usage(deduplicated_data, model)
                        time_per_sample = calculate_time_from_data(deduplicated_data)
                        accuracies.append(accuracy)
                        costs.append(cost_per_sample)
                        times.append(time_per_sample)
                        totals.append(total)
                        # cost_per_sample is an average, so multiply back by the sample count.
                        total_cost += cost_per_sample * total
                    except Exception as e:
                        print(f"Error loading file {file_path}: {e}")
                        # If a run is missing, skip it for stats
                        continue
                if len(accuracies) > 0:
                    # np.std is called with its defaults (population std, ddof=0).
                    acc_mean = np.mean(accuracies)
                    acc_std = np.std(accuracies)
                    cost_mean = np.mean(costs)
                    cost_std = np.std(costs)
                    time_mean = np.mean(times)
                    time_std = np.std(times)
                    total_mean = np.mean(totals)
                    total_std = np.std(totals)
                    # int() truncates the sample-count mean/std rather than rounding them.
                    print(f"| {model} | {task} | {subtask} | MDAgents | {acc_mean*100:.1f}%±{acc_std*100:.1f}% | {cost_mean:.6f}±{cost_std:.6f} | {time_mean:.2f}±{time_std:.2f} | {int(total_mean)}±{int(total_std)} |")
                    table.add_row([
                        model, task, subtask, difficulty,
                        f"{acc_mean*100:.1f}%±{acc_std*100:.1f}%",
                        f"{cost_mean:.6f}±{cost_std:.6f}",
                        f"{time_mean:.2f}±{time_std:.2f}",
                        f"{int(total_mean)}±{int(total_std)}"
                    ])
                else:
                    # No run could be processed for this combination.
                    print(f"| {model} | {task} | {subtask} | MDAgents | N/A | N/A | N/A | N/A |")
                    table.add_row([
                        model, task, subtask, difficulty,
                        "N/A", "N/A", "N/A", "N/A"
                    ])

print(table)
print(f"\nTotal cost of experiment: ${total_cost:.2f}")

| Model | Task | Subtask | Method | Accuracy (mean±std) | Cost per sample(USD) (mean±std) | Time per sample(s) (mean±std) | Total Number (mean±std) |
|-------|------|---------|---------|---------------------|-------------------------------|-----------------------------|------------------------|
| gpt-4o-mini | medqa | test_hard | MDAgents | 24.0%±0.8% | 0.014301±0.001193 | 97.59±7.72 | 100±0 |
| gpt-4o | medqa | test_hard | MDAgents | 37.6%±1.4% | 0.106478±0.016121 | 55.56±21.74 | 94±4 |
| gpt-4o-mini | pubmedqa | test_hard | MDAgents | 17.1%±4.5% | 0.021907±0.009691 | 135.64±6.30 | 99±0 |
| gpt-4o | pubmedqa | test_hard | MDAgents | 21.0%±2.1% | 0.126942±0.005292 | 105.89±39.31 | 90±6 |
| gpt-4o-mini | medmcqa | test_hard | MDAgents | 22.2%±1.0% | 0.011123±0.003088 | 54.02±4.63 | 97±1 |
| gpt-4o | medmcqa | test_hard | MDAgents | 27.4%±1.1% | 0.029544±0.011421 | 16.41±1.58 | 98±0 |
| gpt-4o-mini | medbullets | test_hard | MDAgents | 14.2%±1.4% | 0.010425±0.001843 | 90.65±3.94 | 89±0 |