In [None]:
import pandas as pd
import json
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# Paths of the numbered result files (1 through 5) to evaluate.
files = [f"outputs/agent_multi_results_{run}.jsonl" for run in range(1, 6)]

In [6]:
def compute_metrics(file_path):
    """Score one JSONL results file against its ground-truth labels.

    Every non-blank line of ``file_path`` is parsed as a JSON record. The
    ``ground_truth`` and ``judgement`` fields of each record must hold the
    string ``"Yes"`` or ``"No"``; ``"Yes"`` is encoded as the positive
    class (1) and ``"No"`` as the negative class (0).

    Args:
        file_path: Path to a UTF-8 encoded JSONL file.

    Returns:
        A tuple ``(precision, recall, f1, accuracy)``. Precision, recall and
        F1 are reported as 0 where they would otherwise be undefined
        (``zero_division=0``).

    Raises:
        KeyError: If no record provides ``ground_truth`` or ``judgement``
            (this includes a file with no records at all).
        ValueError: If either field contains anything other than
            ``"Yes"``/``"No"``, or a line is not valid JSON.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        # Blank lines (typically a trailing newline at end of file) are not
        # records; json.loads would reject them, so skip them.
        lines = [json.loads(line) for line in f if line.strip()]
    df = pd.DataFrame(lines)

    mapping = {"Yes": 1, "No": 0}
    encoded = {}
    for column in ("ground_truth", "judgement"):
        if column not in df.columns:
            raise KeyError(
                f"{file_path}: no '{column}' field found in the records"
            )
        values = df[column].map(mapping)
        # Labels outside the mapping become NaN. Report them here, with the
        # file and column, instead of letting the metric functions fail on
        # NaN input further down.
        invalid = df.loc[values.isna(), column]
        if not invalid.empty:
            raise ValueError(
                f"{file_path}: column '{column}' has {len(invalid)} value(s) "
                f"other than 'Yes'/'No', e.g. {invalid.head(5).tolist()}"
            )
        encoded[column] = values

    y_true = encoded["ground_truth"]
    y_pred = encoded["judgement"]

    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    accuracy = accuracy_score(y_true, y_pred)

    return precision, recall, f1, accuracy

In [7]:
metric_names = ["Precision", "Recall", "F1", "Accuracy"]

# Score every results file, printing its metrics as soon as they are known.
all_metrics = []
for file in files:
    scores = list(compute_metrics(file))
    all_metrics.append(scores)
    summary = ", ".join(
        f"{name}={value:.4f}" for name, value in zip(metric_names, scores)
    )
    print(f"{file}: {summary}")

# Column-wise mean of the per-file metrics.
avg_metrics = pd.DataFrame(all_metrics, columns=metric_names).mean()
print("\nAvg:")
print(", ".join(f"{name}={avg_metrics[name]:.4f}" for name in metric_names))

outputs/agent_multi_results_1.jsonl: Precision=0.6759, Recall=0.7604, F1=0.7157, Accuracy=0.7100
outputs/agent_multi_results_2.jsonl: Precision=0.6604, Recall=0.7527, F1=0.7035, Accuracy=0.7050
outputs/agent_multi_results_3.jsonl: Precision=0.6525, Recall=0.8105, F1=0.7230, Accuracy=0.7050
outputs/agent_multi_results_4.jsonl: Precision=0.6417, Recall=0.8370, F1=0.7264, Accuracy=0.7100
outputs/agent_multi_results_5.jsonl: Precision=0.6897, Recall=0.7407, F1=0.7143, Accuracy=0.6800

Avg:
Precision=0.6640, Recall=0.7803, F1=0.7166, Accuracy=0.7020
