In [1]:
import os
import json
import pandas as pd
from rouge_score import rouge_scorer
from bert_score import score as bert_score
from tqdm import tqdm

# === Config ===
# Folder passed to load_model_outputs(); it is scanned for "*_results.json" files.
RESULTS_FOLDER = "./"
# Folder that receives the per-model and combined evaluation CSVs.
OUTPUT_FOLDER = "auto_eval_results"
# exist_ok=True keeps re-running this cell from failing once the folder exists.
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# === Load all model outputs ===
def load_model_outputs(folder):
    """Load every ``*_results.json`` file in ``folder``, keyed by model name.

    The model name is the file name with the ``"ollama_"`` marker and the
    ``"_results.json"`` suffix removed.

    Args:
        folder: Directory to scan (not recursive).

    Returns:
        dict mapping model name -> parsed JSON content of the matching file.
        Files are visited in sorted name order, so the dict order (and the
        order in which models are evaluated later) is reproducible.
    """
    data = {}
    # os.listdir() returns entries in arbitrary order; sort for stable runs.
    for file in sorted(os.listdir(folder)):
        if not file.endswith("_results.json"):
            continue
        model_name = file.replace("ollama_", "").replace("_results.json", "")
        # JSON text is UTF-8; do not rely on the platform's locale encoding
        # (e.g. cp1252 on Windows), which can garble non-ASCII responses.
        with open(os.path.join(folder, file), encoding="utf-8") as f:
            data[model_name] = json.load(f)
    return data

# === Evaluate with BERTScore and ROUGE ===
def evaluate_model_outputs(data):
    """Score each model's responses against references (BERTScore F1, ROUGE-L F1).

    Entries whose ``"reference"`` is missing or blank are skipped silently;
    a model with no usable entries is skipped with a message. For every
    evaluated model, a CSV holding only that model's rows is written to
    ``OUTPUT_FOLDER`` as ``<model>_auto_eval.csv``.

    Args:
        data: Mapping of model name -> list of entry dicts. Each scored entry
            must provide ``"id"`` and ``"condition"``; ``"response"`` and
            ``"reference"`` default to an empty string when absent.

    Returns:
        pandas.DataFrame with one row per scored entry across all models
        (id, condition, model, reference, candidate, bertscore_f1, rougeL_f1).
        The frame is empty when nothing could be scored.
    """
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    all_results = []

    for model, entries in tqdm(data.items(), desc="Evaluating models"):
        hyps = []
        refs = []
        meta = []

        for entry in entries:
            hyp = entry.get("response", "").strip()
            ref = entry.get("reference", "").strip()  # Must be added manually to the JSON or merged beforehand

            if not ref:
                continue

            refs.append(ref)
            hyps.append(hyp)
            meta.append({
                "id": entry["id"],
                "condition": entry["condition"],
                "model": model,
                "reference": ref,
                "candidate": hyp
            })

        if not refs:
            print(f"🚫 Skipping {model} — no matching references found.")
            continue

        # Compute BERTScore; only the F1 component is reported.
        _, _, F1 = bert_score(hyps, refs, lang="en", verbose=True)
        f1_scores = F1.tolist()

        # Compute ROUGE-L (RougeScorer.score takes the reference first).
        rouge_scores = [
            scorer.score(ref, hyp)['rougeL'].fmeasure
            for hyp, ref in zip(hyps, refs)
        ]

        # Attach both metrics to this model's rows.
        model_results = []
        for row, bert_f1, rouge_f1 in zip(meta, f1_scores, rouge_scores):
            row["bertscore_f1"] = bert_f1
            row["rougeL_f1"] = rouge_f1
            model_results.append(row)
        all_results.extend(model_results)

        # Save per-model CSV from this model's rows only. Building it from
        # all_results would also include every previously evaluated model.
        df = pd.DataFrame(model_results)
        model_file = os.path.join(OUTPUT_FOLDER, f"{model}_auto_eval.csv")
        df.to_csv(model_file, index=False)
        print(f"✅ Saved: {model_file}")

    return pd.DataFrame(all_results)

# === Main ===
if __name__ == "__main__":
    # Load every "*_results.json" file found in RESULTS_FOLDER.
    data = load_model_outputs(RESULTS_FOLDER)
    # Score all models; evaluate_model_outputs also writes the per-model CSVs.
    all_results_df = evaluate_model_outputs(data)
    # Persist the combined table covering every evaluated model.
    all_results_df.to_csv(os.path.join(OUTPUT_FOLDER, "all_models_auto_eval.csv"), index=False)
    print("🏁 Done. Combined results saved to 'all_models_auto_eval.csv'")


Evaluating models:   0%|          | 0/9 [00:00<?, ?it/s]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/17 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


computing greedy matching.


  0%|          | 0/10 [00:00<?, ?it/s]

done in 855.53 seconds, 0.72 sentences/sec


Evaluating models:  11%|█         | 1/9 [16:05<2:08:43, 965.44s/it]

✅ Saved: auto_eval_results\deepseek-r1-70b-alt_auto_eval.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/20 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/10 [00:00<?, ?it/s]

done in 790.13 seconds, 0.77 sentences/sec


Evaluating models:  22%|██▏       | 2/9 [30:17<1:44:49, 898.48s/it]

✅ Saved: auto_eval_results\gemma3-27b_auto_eval.csv
🚫 Skipping gpt4 — no matching references found.


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/17 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/10 [00:00<?, ?it/s]

done in 575.67 seconds, 1.06 sentences/sec


Evaluating models:  44%|████▍     | 4/9 [40:25<43:50, 526.02s/it]  

✅ Saved: auto_eval_results\llama4-400b_auto_eval.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/10 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/10 [00:00<?, ?it/s]

done in 450.49 seconds, 1.36 sentences/sec


Evaluating models:  56%|█████▌    | 5/9 [48:07<33:44, 506.07s/it]

✅ Saved: auto_eval_results\mistral-small3.1-latestllama4-400bgemma3-27bdeepseek-r1-70b-alt_auto_eval.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/19 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/10 [00:00<?, ?it/s]

done in 698.33 seconds, 0.88 sentences/sec


Evaluating models:  67%|██████▋   | 6/9 [1:00:35<29:00, 580.30s/it]

✅ Saved: auto_eval_results\mistral-small3.1-latest_auto_eval.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/29 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/15 [00:00<?, ?it/s]

done in 1112.59 seconds, 0.81 sentences/sec


Evaluating models:  78%|███████▊  | 7/9 [1:20:00<25:16, 758.43s/it]

✅ Saved: auto_eval_results\qwen2.5-latest_auto_eval.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/29 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/15 [00:00<?, ?it/s]

done in 1346.00 seconds, 0.67 sentences/sec


Evaluating models: 100%|██████████| 9/9 [1:44:34<00:00, 697.19s/it]

✅ Saved: auto_eval_results\qwen3-0.6b_auto_eval.csv
🚫 Skipping qwen3-latest — no matching references found.





🏁 Done. Combined results saved to 'all_models_auto_eval.csv'


In [8]:
import os
import json
import pandas as pd
from rouge_score import rouge_scorer
from bert_score import score as bert_score
from tqdm import tqdm

# === Config ===
# Folder passed to load_model_outputs(); it is scanned for "*_results.json" files.
RESULTS_FOLDER = "./new"
# Folder that receives the per-model and combined evaluation CSVs.
OUTPUT_FOLDER = "auto_eval_results"
# exist_ok=True keeps re-running this cell from failing once the folder exists.
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# === Load model outputs with ID adjustments ===
def load_model_outputs(folder):
    """Load every ``*_results.json`` file in ``folder``, keyed by model name.

    The model name is the file name with the ``"ollama_"`` marker and the
    ``"_results.json"`` suffix removed. For the model ``"qwen3-latest"`` only
    entries whose ``"id"`` is at least 97 are kept.

    Args:
        folder: Directory to scan (not recursive).

    Returns:
        dict mapping model name -> list of entries loaded from its file.
        Files are visited in sorted name order so the result is reproducible.
    """
    data = {}
    # os.listdir() returns entries in arbitrary order; sort for stable runs.
    for file in sorted(os.listdir(folder)):
        if not file.endswith("_results.json"):
            continue
        model_name = file.replace("ollama_", "").replace("_results.json", "")
        # JSON text is UTF-8; do not rely on the platform's locale encoding.
        with open(os.path.join(folder, file), encoding="utf-8") as f:
            entries = json.load(f)

        # For qwen3-latest, only keep IDs >= 97
        if model_name == "qwen3-latest":
            entries = [entry for entry in entries if entry["id"] >= 97]

        data[model_name] = entries
    return data

# === Evaluation function ===
def evaluate_model_outputs(data):
    """Score each model's responses against references (BERTScore F1, ROUGE-L F1).

    Entries with a missing or empty ``"reference"`` are reported and skipped;
    a model with no usable entries is skipped with a message. For every
    evaluated model, a CSV holding only that model's rows is written to
    ``OUTPUT_FOLDER`` as ``<model>_auto_eval.csv``.

    Args:
        data: Mapping of model name -> list of entry dicts. Every entry needs
            ``"id"``; scored entries also need ``"condition"``,
            ``"response"`` and ``"reference"``.

    Returns:
        pandas.DataFrame with one row per scored entry across all models
        (id, condition, model, reference, candidate, bertscore_f1, rougeL_f1).
        The frame is empty when nothing could be scored.
    """
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    all_results = []

    for model, entries in tqdm(data.items(), desc="Evaluating models"):
        hyps = []
        refs = []
        meta = []

        for entry in entries:
            # Skip if reference is missing
            if not entry.get("reference"):
                print(f"⚠️ Skipping entry (missing reference): ID {entry['id']}")
                continue

            hyp = entry["response"].strip()
            ref = entry["reference"].strip()

            refs.append(ref)
            hyps.append(hyp)
            meta.append({
                "id": entry["id"],
                "condition": entry["condition"],
                "model": model,
                "reference": ref,
                "candidate": hyp
            })

        if not refs:
            print(f"🚫 Skipping {model} — no valid references found.")
            continue

        # Compute BERTScore; only the F1 component is reported.
        _, _, F1 = bert_score(hyps, refs, lang="en", verbose=True)
        f1_scores = F1.tolist()

        # Compute ROUGE-L (RougeScorer.score takes the reference first).
        rouge_scores = [
            scorer.score(ref, hyp)['rougeL'].fmeasure
            for hyp, ref in zip(hyps, refs)
        ]

        # Attach both metrics to this model's rows.
        model_results = []
        for row, bert_f1, rouge_f1 in zip(meta, f1_scores, rouge_scores):
            row["bertscore_f1"] = bert_f1
            row["rougeL_f1"] = rouge_f1
            model_results.append(row)
        all_results.extend(model_results)

        # Save per-model CSV from this model's rows only. Building it from
        # all_results would also include every previously evaluated model.
        df = pd.DataFrame(model_results)
        model_file = os.path.join(OUTPUT_FOLDER, f"{model}_auto_eval.csv")
        df.to_csv(model_file, index=False)
        print(f"✅ Saved: {model_file}")

    return pd.DataFrame(all_results)

# === Main ===
if __name__ == "__main__":
    # Load every "*_results.json" file found in RESULTS_FOLDER.
    data = load_model_outputs(RESULTS_FOLDER)
    # Score all models; evaluate_model_outputs also writes the per-model CSVs.
    all_results_df = evaluate_model_outputs(data)
    # Persist the combined table covering every evaluated model.
    all_results_df.to_csv(os.path.join(OUTPUT_FOLDER, "all_models_auto_eval.csv"), index=False)
    print("🏁 Done. Combined results saved to 'all_models_auto_eval.csv'")

Evaluating models:   0%|          | 0/3 [00:00<?, ?it/s]

⚠️ Skipping entry (missing reference): ID 97
⚠️ Skipping entry (missing reference): ID 97
⚠️ Skipping entry (missing reference): ID 97
⚠️ Skipping entry (missing reference): ID 98
⚠️ Skipping entry (missing reference): ID 98
⚠️ Skipping entry (missing reference): ID 98
⚠️ Skipping entry (missing reference): ID 99
⚠️ Skipping entry (missing reference): ID 99
⚠️ Skipping entry (missing reference): ID 99
⚠️ Skipping entry (missing reference): ID 100
⚠️ Skipping entry (missing reference): ID 100
⚠️ Skipping entry (missing reference): ID 100
⚠️ Skipping entry (missing reference): ID 101
⚠️ Skipping entry (missing reference): ID 101
⚠️ Skipping entry (missing reference): ID 101
⚠️ Skipping entry (missing reference): ID 102
⚠️ Skipping entry (missing reference): ID 102
⚠️ Skipping entry (missing reference): ID 102
⚠️ Skipping entry (missing reference): ID 103
⚠️ Skipping entry (missing reference): ID 103
⚠️ Skipping entry (missing reference): ID 103
⚠️ Skipping entry (missing reference): ID 1

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/29 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


computing greedy matching.


  0%|          | 0/15 [00:00<?, ?it/s]

done in 831.13 seconds, 1.08 sentences/sec


Evaluating models: 100%|██████████| 3/3 [14:44<00:00, 294.91s/it]

✅ Saved: auto_eval_results\qwen3-0.6b_auto_eval.csv
⚠️ Skipping entry (missing reference): ID 97
⚠️ Skipping entry (missing reference): ID 97
⚠️ Skipping entry (missing reference): ID 97
⚠️ Skipping entry (missing reference): ID 98
⚠️ Skipping entry (missing reference): ID 98
⚠️ Skipping entry (missing reference): ID 98
⚠️ Skipping entry (missing reference): ID 99
⚠️ Skipping entry (missing reference): ID 99
⚠️ Skipping entry (missing reference): ID 99
⚠️ Skipping entry (missing reference): ID 100
⚠️ Skipping entry (missing reference): ID 100
⚠️ Skipping entry (missing reference): ID 100
⚠️ Skipping entry (missing reference): ID 101
⚠️ Skipping entry (missing reference): ID 101
⚠️ Skipping entry (missing reference): ID 101
⚠️ Skipping entry (missing reference): ID 102
⚠️ Skipping entry (missing reference): ID 102
⚠️ Skipping entry (missing reference): ID 102
⚠️ Skipping entry (missing reference): ID 103
⚠️ Skipping entry (missing reference): ID 103
⚠️ Skipping entry (missing reference)




In [None]:
def evaluate_model_outputs(data):
    """Compute BERTScore F1 and ROUGE-L F1 for every model's responses.

    Entries with a missing or empty ``"reference"`` are reported and skipped
    (repeated ids are otherwise kept); a model left with no usable entries is
    skipped with a message.

    Args:
        data: Mapping of model name -> list of entry dicts carrying ``"id"``,
            ``"condition"``, ``"response"`` and ``"reference"``.

    Returns:
        pandas.DataFrame with one row per scored entry across all models.
    """
    rouge = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    collected = []

    for model, entries in tqdm(data.items(), desc="Evaluating models"):
        rows = []
        for item in entries:
            # Only a missing reference disqualifies an entry.
            if not item.get("reference"):
                print(f"⚠️ Skipping entry (missing reference): ID {item['id']}")
                continue

            candidate = item["response"].strip()
            reference = item["reference"].strip()
            rows.append({
                "id": item["id"],
                "condition": item["condition"],  # distinguishes repeated ids
                "model": model,
                "reference": reference,
                "candidate": candidate,
            })

        # Nothing to score for this model.
        if len(rows) == 0:
            print(f"🚫 Skipping {model} — no valid references found.")
            continue

        candidates = [r["candidate"] for r in rows]
        references = [r["reference"] for r in rows]

        # BERTScore returns (precision, recall, F1); only F1 is kept.
        P, R, F1 = bert_score(candidates, references, lang="en", verbose=True)

        # ROUGE-L F-measure per (reference, candidate) pair.
        rouge_f1 = [
            rouge.score(reference, candidate)['rougeL'].fmeasure
            for candidate, reference in zip(candidates, references)
        ]

        # Attach both metrics to each row and add it to the combined list.
        for idx, (row, rouge_value) in enumerate(zip(rows, rouge_f1)):
            row["bertscore_f1"] = F1[idx].item()  # tensor element -> float
            row["rougeL_f1"] = rouge_value
            collected.append(row)

    return pd.DataFrame(collected)

In [17]:
import os
import json

REFERENCE_MODEL = "qwen3-latest"  # or "gpt-4"
RESULTS_FOLDER = "./"

# File holding the reference model's own outputs. It is the source of the
# references and must not be rewritten by the loop below.
reference_file = f"ollama_{REFERENCE_MODEL}_results.json"

# Load reference outputs (explicit UTF-8: JSON text must not depend on the
# platform's locale encoding).
with open(os.path.join(RESULTS_FOLDER, reference_file), encoding="utf-8") as f:
    reference_data = json.load(f)

# Build lookup: (id, condition) → reference_response
reference_lookup = {
    (entry["id"], entry["condition"]): entry["response"]
    for entry in reference_data
}

# Inject reference into other model outputs
for file in os.listdir(RESULTS_FOLDER):
    # Compare whole file names: a substring test would also skip any other
    # model whose file name merely contains REFERENCE_MODEL.
    if not file.endswith("_results.json") or file == reference_file:
        continue

    path = os.path.join(RESULTS_FOLDER, file)
    with open(path, encoding="utf-8") as f:
        data = json.load(f)

    # Add reference field; entries without a matching (id, condition) get ""
    # so the evaluation step can detect and skip them.
    for entry in data:
        key = (entry["id"], entry["condition"])
        entry["reference"] = reference_lookup.get(key, "")

    # Save modified file (overwrites the input file in place)
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2)

    print(f"✅ Updated: {file}")


✅ Updated: ollama_deepseek-r1-70b-alt_results.json
✅ Updated: ollama_gemma3-27b_results.json
✅ Updated: ollama_llama4-400b_results.json
✅ Updated: ollama_mistral-small3.1-latestllama4-400bgemma3-27bdeepseek-r1-70b-alt_results.json
✅ Updated: ollama_mistral-small3.1-latest_results.json
✅ Updated: ollama_qwen2.5-latest_results.json
✅ Updated: ollama_qwen3-0.6b_results.json


In [3]:
import os
import json
import pandas as pd
from rouge_score import rouge_scorer
from bert_score import score as bert_score
from tqdm import tqdm

# === Config ===
# Folder that load_and_merge_references() reads the model result JSON files from.
RESULTS_FOLDER = "./new"
# Folder that receives the per-model and combined evaluation CSVs.
OUTPUT_FOLDER = "auto_eval_results"
# exist_ok=True keeps re-running this cell from failing once the folder exists.
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

def load_and_merge_references():
    """Load the gpt4 and qwen3-latest outputs and attach references to them.

    References are taken from the ``"reference"`` field already stored in
    ``ollama_qwen3-0.6b_results.json`` and matched on ``(id, condition)``.
    Entries without a match get an empty ``"reference"``. For
    ``"qwen3-latest"`` only entries with ``id >= 97`` are kept.

    Returns:
        dict mapping model name -> list of entry dicts, each carrying a
        ``"reference"`` key.

    Raises:
        FileNotFoundError: If one of the expected JSON files is missing from
            ``RESULTS_FOLDER``.
    """
    # Load the file that carries the references (qwen3-0.6b). Explicit UTF-8:
    # JSON text must not depend on the platform's locale encoding.
    ref_path = os.path.join(RESULTS_FOLDER, "ollama_qwen3-0.6b_results.json")
    with open(ref_path, encoding="utf-8") as f:
        ref_data = json.load(f)

    # Create reference dictionary by (id, condition)
    ref_dict = {(entry["id"], entry["condition"]): entry["reference"]
                for entry in ref_data if "reference" in entry}

    # Models to evaluate and the files holding their outputs
    models = {
        "gpt4": "ollama_gpt4_results.json",
        "qwen3-latest": "ollama_qwen3-latest_results.json"
    }

    merged_data = {}
    for model_name, filename in models.items():
        with open(os.path.join(RESULTS_FOLDER, filename), encoding="utf-8") as f:
            data = json.load(f)

        # For qwen3-latest, only keep IDs >= 97
        if model_name == "qwen3-latest":
            data = [entry for entry in data if entry["id"] >= 97]

        # Add references from qwen3-0.6b; "" marks "no reference available"
        for entry in data:
            key = (entry["id"], entry["condition"])
            entry["reference"] = ref_dict.get(key, "")

        merged_data[model_name] = data

    return merged_data

def evaluate_model_outputs(data):
    """Score every model's responses with BERTScore F1 and ROUGE-L F1.

    Entries whose ``"reference"`` is missing or empty are reported and
    skipped; a model left with no usable entries is skipped with a message
    and gets no per-model frame.

    Args:
        data: Mapping of model name -> list of entry dicts carrying ``"id"``,
            ``"condition"``, ``"response"`` and ``"reference"``.

    Returns:
        Tuple ``(combined, per_model)``: a DataFrame with the rows of all
        scored models, and a dict mapping model name -> DataFrame holding
        only that model's rows.
    """
    rouge = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    combined_rows = []
    frames_by_model = {}

    for model, entries in tqdm(data.items(), desc="Evaluating models"):
        rows = []
        for item in entries:
            # An empty reference cannot be scored against.
            if not item.get("reference"):
                print(f"⚠️ Skipping entry (missing reference): ID {item['id']}")
                continue

            candidate = item["response"].strip()
            reference = item["reference"].strip()
            rows.append({
                "id": item["id"],
                "condition": item["condition"],
                "model": model,
                "reference": reference,
                "candidate": candidate,
            })

        if len(rows) == 0:
            print(f"🚫 Skipping {model} — no valid references found.")
            continue

        candidates = [r["candidate"] for r in rows]
        references = [r["reference"] for r in rows]

        # BERTScore returns (precision, recall, F1); only F1 is reported.
        P, R, F1 = bert_score(candidates, references, lang="en", verbose=True)
        bert_f1 = F1.tolist()

        # ROUGE-L F-measure per (reference, candidate) pair.
        rouge_f1 = [
            rouge.score(reference, candidate)['rougeL'].fmeasure
            for candidate, reference in zip(candidates, references)
        ]

        # Attach the metrics; the same row objects feed both outputs.
        for row, bert_value, rouge_value in zip(rows, bert_f1, rouge_f1):
            row["bertscore_f1"] = bert_value
            row["rougeL_f1"] = rouge_value

        combined_rows.extend(rows)
        frames_by_model[model] = pd.DataFrame(rows)

    return pd.DataFrame(combined_rows), frames_by_model

if __name__ == "__main__":
    # Load and merge reference data
    data = load_and_merge_references()
    
    # Evaluate and save results: the first return value is the combined table,
    # the second maps each model name to its own DataFrame.
    all_results_df, model_results_dict = evaluate_model_outputs(data)
    all_results_df.to_csv(os.path.join(OUTPUT_FOLDER, "all_models_auto_eval.csv"), index=False)
    
    # Save per-model results (one CSV per evaluated model)
    for model_name, model_df in model_results_dict.items():
        model_file = os.path.join(OUTPUT_FOLDER, f"{model_name}_auto_eval.csv")
        model_df.to_csv(model_file, index=False)
        print(f"✅ Saved: {model_file}")
    
    # Summary of the file naming scheme used above.
    print("🏁 Done. Results saved to:")
    print(f"- Combined: all_models_auto_eval.csv")
    print(f"- Per-model: [model_name]_auto_eval.csv")

Evaluating models:   0%|                                                                         | 0/2 [00:00<?, ?it/s]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/20 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


computing greedy matching.


  0%|          | 0/10 [00:00<?, ?it/s]

done in 292.82 seconds, 2.16 sentences/sec


Evaluating models:  50%|████████████████████████████████                                | 1/2 [05:01<05:01, 301.76s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/10 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/10 [00:00<?, ?it/s]

done in 232.59 seconds, 2.63 sentences/sec


Evaluating models: 100%|████████████████████████████████████████████████████████████████| 2/2 [22:43<00:00, 681.51s/it]


✅ Saved: auto_eval_results\gpt4_auto_eval.csv
✅ Saved: auto_eval_results\qwen3-latest_auto_eval.csv
🏁 Done. Results saved to:
- Combined: all_models_auto_eval.csv
- Per-model: [model_name]_auto_eval.csv


In [7]:
import os
import pandas as pd

# Folder containing the evaluation result CSV files
RESULTS_FOLDER = "./auto_eval_results/"

# The combined export repeats the rows of the per-model files. Reading it
# together with them would list every model more than once.
COMBINED_FILE = "all_models_auto_eval.csv"

# LLM as Judge Results - Example Data (Replace this with your actual data)
llm_judge_data = {
    'Model': ['qwen3-latest', 'deepseek-r1-70b-alt', 'gemma3-27b'],
    'Wins': [500, 450, 400],
    'Losses': [200, 250, 300],
    'Ties': [50, 30, 100]
}

llm_judge_df = pd.DataFrame(llm_judge_data)

# HTML Template for Leaderboard
html_content = """
<html>
<head>
    <title>Model Evaluation Leaderboard</title>
    <style>
        table {
            width: 100%;
            border-collapse: collapse;
        }
        table, th, td {
            border: 1px solid black;
        }
        th, td {
            padding: 8px;
            text-align: center;
        }
        th {
            background-color: #f2f2f2;
        }
        h2 {
            text-align: center;
        }
    </style>
</head>
<body>

<h2>Model Evaluation Leaderboard (BERTScore & ROUGE-L)</h2>

<table>
    <thead>
        <tr>
            <th>Model</th>
            <th>BERTScore F1</th>
            <th>ROUGE-L F1</th>
        </tr>
    </thead>
    <tbody>
"""

# Collect the evaluation CSVs. Prefer the per-model files; fall back to the
# combined export only when it is the sole file available.
eval_files = sorted(
    name for name in os.listdir(RESULTS_FOLDER) if name.endswith("_eval.csv")
)
per_model_files = [name for name in eval_files if name != COMBINED_FILE]
frames = [
    pd.read_csv(os.path.join(RESULTS_FOLDER, name))
    for name in (per_model_files or eval_files)
]

if frames:
    # A row that is identical in every column is the same scored entry
    # exported to more than one file; count it once so it does not skew the
    # mean. NOTE(review): this also collapses genuinely repeated, fully
    # identical entries within a single file, if any exist.
    scores = pd.concat(frames, ignore_index=True).drop_duplicates()

    # Calculate mean scores for BERTScore and ROUGE-L (one row per model)
    leaderboard = scores.groupby('model').agg({
        'bertscore_f1': 'mean',
        'rougeL_f1': 'mean'
    }).reset_index()

    # Add rows for BERTScore and ROUGE-L data
    for index, row in leaderboard.iterrows():
        html_content += f"""
            <tr>
                <td>{row['model']}</td>
                <td>{row['bertscore_f1']:.4f}</td>
                <td>{row['rougeL_f1']:.4f}</td>
            </tr>
            """

# Close the model evaluation leaderboard table
html_content += """
    </tbody>
</table>

<h2>LLM as a Judge - Win/Loss/Tie Results</h2>

<table>
    <thead>
        <tr>
            <th>Model</th>
            <th>Wins</th>
            <th>Losses</th>
            <th>Ties</th>
        </tr>
    </thead>
    <tbody>
"""

# Add rows for LLM as a Judge results
for index, row in llm_judge_df.iterrows():
    html_content += f"""
    <tr>
        <td>{row['Model']}</td>
        <td>{row['Wins']}</td>
        <td>{row['Losses']}</td>
        <td>{row['Ties']}</td>
    </tr>
    """

# Close the LLM as a Judge table
html_content += """
    </tbody>
</table>

</body>
</html>
"""

# Save the HTML content to a file (explicit UTF-8 so the output does not
# depend on the platform's locale encoding)
with open('leaderboard.html', 'w', encoding='utf-8') as file:
    file.write(html_content)

print("Leaderboard HTML file created successfully!")


Leaderboard HTML file created successfully!
