In [1]:
import os
import json
import pandas as pd
from rouge_score import rouge_scorer
from bert_score import score as bert_score
from tqdm import tqdm

# Folder scanned for *_results.json files ("./" = the current working directory)
RESULTS_FOLDER = "./"
# Folder that receives the per-model and combined evaluation CSVs
OUTPUT_FOLDER = "auto_eval_results"
# Create the output folder up front; exist_ok=True keeps re-running this cell safe
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# Load all model result files
def load_model_outputs(folder):
    """Load every ``*_results.json`` file in ``folder`` into a dict keyed by model name.

    The model name is the file name with the ``_results.json`` suffix removed
    and, when present, a leading ``ollama_`` prefix removed. Only the prefix
    and suffix positions are stripped, so a name that merely contains
    ``ollama_`` in the middle is left intact.

    Args:
        folder: Directory to scan (not recursive).

    Returns:
        dict mapping model name -> parsed JSON content of the file, inserted
        in sorted file-name order so repeated runs process models identically.

    Raises:
        OSError: if ``folder`` cannot be listed or a file cannot be read.
        json.JSONDecodeError: if a matching file is not valid JSON.
    """
    suffix = "_results.json"
    prefix = "ollama_"
    data = {}
    # os.listdir() returns entries in arbitrary order; sort for reproducibility.
    for file in sorted(os.listdir(folder)):
        if not file.endswith(suffix):
            continue
        path = os.path.join(folder, file)
        # A directory that happens to match the suffix is not a result file.
        if not os.path.isfile(path):
            continue
        model_name = file[:-len(suffix)]
        if model_name.startswith(prefix):
            model_name = model_name[len(prefix):]
        # JSON is UTF-8; do not depend on the platform's default encoding.
        with open(path, encoding="utf-8") as f:
            data[model_name] = json.load(f)
    return data

# Compute BERTScore & ROUGE
def evaluate_model_outputs(data):
    """Score each model's responses against references with BERTScore and ROUGE-L.

    For every model, entries whose "response" or "reference" is missing, null
    or blank are skipped. The remaining pairs are scored, written to
    ``<OUTPUT_FOLDER>/<model>_auto_eval.csv`` (that model's rows only), and
    appended to the combined result. A model with no scorable pairs is
    reported and skipped without writing a CSV.

    Args:
        data: dict mapping model name -> iterable of entry dicts. Each entry
            is read for the keys "id", "condition", "response" and "reference".

    Returns:
        pandas.DataFrame with one row per scored entry across all models.
        When non-empty, its columns are id, condition, model, response,
        reference, bertscore_f1 and rougeL_f1.

    Raises:
        KeyError: if a scored entry lacks "id" or "condition".
    """
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    results = []

    for model, entries in tqdm(data.items(), desc="Evaluating models"):
        hyps = []
        refs = []
        # Rows for THIS model only; kept separate from `results` so the
        # per-model CSV does not pick up rows of previously processed models.
        model_rows = []

        for entry in entries:
            # `or ""` also covers keys that are present but null in the JSON.
            hyp = (entry.get("response") or "").strip()
            ref = (entry.get("reference") or "").strip()

            if not hyp or not ref:
                continue

            hyps.append(hyp)
            refs.append(ref)
            model_rows.append({
                "id": entry["id"],
                "condition": entry["condition"],
                "model": model,
                "response": hyp,
                "reference": ref,
            })

        # Nothing to score: avoid calling the scorers with empty input.
        if not model_rows:
            print(f"⚠️ Skipped {model}: no entries with both a response and a reference")
            continue

        # BERTScore (only the F1 tensor is used)
        _, _, F1 = bert_score(hyps, refs, lang="en", verbose=True)
        f1_scores = F1.tolist()

        # ROUGE-L; RougeScorer.score takes (target, prediction)
        rouge_scores = [scorer.score(ref, hyp)['rougeL'].fmeasure for hyp, ref in zip(hyps, refs)]

        # Attach both metrics to each row
        for row, bert_f1, rouge_f1 in zip(model_rows, f1_scores, rouge_scores):
            row["bertscore_f1"] = bert_f1
            row["rougeL_f1"] = rouge_f1

        # Save per-model CSV
        model_file = os.path.join(OUTPUT_FOLDER, f"{model}_auto_eval.csv")
        pd.DataFrame(model_rows).to_csv(model_file, index=False)
        print(f"✅ Saved: {model_file}")

        results.extend(model_rows)

    return pd.DataFrame(results)

# === Main ===
if __name__ == "__main__":
    # Pipeline: read every model's results, score them, then write the
    # combined table into the output folder.
    combined_csv = os.path.join(OUTPUT_FOLDER, "all_models_auto_eval.csv")

    data = load_model_outputs(RESULTS_FOLDER)
    all_results_df = evaluate_model_outputs(data)

    all_results_df.to_csv(combined_csv, index=False)
    print("🏁 Done. Combined results saved to 'all_models_auto_eval.csv'")


ModuleNotFoundError: No module named 'bert_score'

In [3]:
pip install bert_score

Collecting bert_score
  Using cached bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting torch>=1.0.0 (from bert_score)
  Using cached torch-2.7.1-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (29 kB)
Using cached bert_score-0.3.13-py3-none-any.whl (61 kB)
Using cached torch-2.7.1-cp311-cp311-manylinux_2_28_x86_64.whl (821.2 MB)
Installing collected packages: torch, bert_score
ERROR: Could not install packages due to an OSError: [Errno 13] Permiso denegado: '/opt/nlp-env/lib/python3.11/site-packages/functorch/_C.cpython-311-x86_64-linux-gnu.so'
Check the permissions.

   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0/2 [torch]
Note: you may need to restart the kernel to use updated packages.
