In [3]:
# ======================== Bechmarking of model ============================

# Run the evaluation on the ARC-Challenge dataset
from datasets import load_dataset
import ollama

def format_prompt(q, choices):
    letters = ["A", "B", "C", "D", "E", "F"]
    options = "\n".join([f"{l}. {c}" for l, c in zip(letters, choices)])
    return f"Question: {q}\nChoices:\n{options}\nAnswer (just the letter):"

def extract_letter_from_response(resp):
    for c in resp.strip().upper():
        if c in "ABCDEF":
            return c
    return None

def run_arc_evaluation(model="llama3.2:1b", subset="ARC-Challenge", n=100): # llama3.2:1b or llama3.2:3b or llama3-1b-spamgen
    dataset = load_dataset("ai2_arc", subset, split="train")
    correct = 0
    total = min(n, len(dataset))

    for i in range(total):
        row = dataset[i]
        q = row["question"]
        choices = row["choices"]["text"]
        correct_answer = row["answerKey"]

        prompt = format_prompt(q, choices)

        result = ollama.chat(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            options={"temperature": 0.7, "max_tokens": 10}
        )
        answer = extract_letter_from_response(result["message"]["content"])

        is_correct = answer == correct_answer
        if is_correct: correct += 1

        #print(f"\nQ{i+1}: {q}")
        #full_output = result["message"]["content"]
        #print(f"Full model output:\n{full_output}")  # <<< ici tu vois exactement ce que le modèle répond
        #print(f"Extracted Answer: {answer} | Expected: {correct_answer} | {'✅' if answer == correct_answer else '❌'}")

    acc = 100 * correct / total
    print(f"\n=== Model {model} Accuracy on {subset} ({total} questions): {acc:.2f}% ===")

run_arc_evaluation()



=== Model llama3.2:1b Accuracy on ARC-Challenge (100 questions): 42.00% ===


In [4]:
run_arc_evaluation(model="llama3-1b-spamgen")


=== Model llama3-1b-spamgen Accuracy on ARC-Challenge (100 questions): 46.00% ===


In [5]:
run_arc_evaluation(model="llama3.2:3b")


=== Model llama3.2:3b Accuracy on ARC-Challenge (100 questions): 70.00% ===


In [6]:
run_arc_evaluation(model="llama3.2:1b")


=== Model llama3.2:1b Accuracy on ARC-Challenge (100 questions): 41.00% ===


In [1]:
# ======================== Advanced Multi-Model Multi-Temperature Benchmark (with progress bar) ============================
from datasets import load_dataset
import ollama
import pandas as pd
from collections import defaultdict
from tqdm import tqdm

MODELS = ["llama3-1b-spamgen", "llama3.2:3b", "llama3.2:1b"]
TEMPERATURES = [0.0, 0.5, 0.9]
SUBSET = "ARC-Challenge"
N_QUESTIONS = 100

def format_prompt(q, choices):
    letters = ["A", "B", "C", "D", "E", "F"]
    options = "\n".join([f"{l}. {c}" for l, c in zip(letters, choices)])
    return f"Question: {q}\nChoices:\n{options}\nAnswer (just the letter):"

def extract_letter_from_response(resp):
    for c in resp.strip().upper():
        if c in "ABCDEF":
            return c
    return None

def run_full_benchmark(models=MODELS, temperatures=TEMPERATURES, subset=SUBSET, n=N_QUESTIONS):
    dataset = load_dataset("ai2_arc", subset, split="train")
    total = min(n, len(dataset))
    summary_stats = []

    for temp in temperatures:
        print(f"\n\n==== Running evaluation at temperature {temp:.1f} ====\n")
        results = defaultdict(list)

        for i in tqdm(range(total), desc=f"Temp {temp:.1f}"):
            q = dataset[i]["question"]
            choices = dataset[i]["choices"]["text"]
            correct_answer = dataset[i]["answerKey"]
            prompt = format_prompt(q, choices)

            row = {"question_id": i, "question": q, "correct": correct_answer}
            for model in models:
                response = ollama.chat(
                    model=model,
                    messages=[{"role": "user", "content": prompt}],
                    options={"temperature": temp, "max_tokens": 10}
                )
                answer = extract_letter_from_response(response["message"]["content"])
                row[model] = answer

            results["all"].append(row)

        df = pd.DataFrame(results["all"])
        for model in models:
            df[f"correct_{model}"] = df[model] == df["correct"]

        print("=== Accuracy per Model ===")
        acc_dict = {"temperature": temp}
        for model in models:
            acc = df[f"correct_{model}"].mean()
            acc_dict[f"acc_{model}"] = round(acc * 100, 2)
            print(f"{model}: {acc:.2%}")

        print("\n=== Agreement Between Models ===")
        for i in range(len(models)):
            for j in range(i+1, len(models)):
                m1, m2 = models[i], models[j]
                agreement = (df[m1] == df[m2]).mean()
                print(f"{m1} ↔ {m2}: {agreement:.2%}")
                acc_dict[f"agree_{m1}_{m2}"] = round(agreement * 100, 2)

        print("\n=== Shared Errors / Correct Statistics ===")
        all_wrong = (~df[f"correct_{models[0]}"] & ~df[f"correct_{models[1]}"] & ~df[f"correct_{models[2]}"]).sum()
        all_correct = (df[f"correct_{models[0]}"] & df[f"correct_{models[1]}"] & df[f"correct_{models[2]}"]).sum()
        partial_overlap = total - all_wrong - all_correct
        print(f"All models wrong: {all_wrong}/{total}")
        print(f"All models correct: {all_correct}/{total}")
        print(f"Partial disagreement: {partial_overlap}/{total}")
        acc_dict["all_correct"] = all_correct
        acc_dict["all_wrong"] = all_wrong
        acc_dict["partial_disagreement"] = partial_overlap

        filename = f"arc_eval_temp{temp:.1f}.csv"
        df.to_csv(filename, index=False)
        print(f"Saved detailed results to {filename}")

        summary_stats.append(acc_dict)

    summary_df = pd.DataFrame(summary_stats)
    print("\n\n==== Summary Table Across Temperatures ====")
    print(summary_df.to_string(index=False))
    summary_df.to_csv("arc_summary_all_temperatures.csv", index=False)
    print("Saved summary table to arc_summary_all_temperatures.csv")

    return summary_df

# Run all benchmarks
run_full_benchmark()


  from .autonotebook import tqdm as notebook_tqdm




==== Running evaluation at temperature 0.0 ====



Temp 0.0: 100%|██████████| 100/100 [13:34<00:00,  8.15s/it]


=== Accuracy per Model ===
llama3-1b-spamgen: 47.00%
llama3.2:3b: 72.00%
llama3.2:1b: 41.00%

=== Agreement Between Models ===
llama3-1b-spamgen ↔ llama3.2:3b: 49.00%
llama3-1b-spamgen ↔ llama3.2:1b: 38.00%
llama3.2:3b ↔ llama3.2:1b: 48.00%

=== Shared Errors / Correct Statistics ===
All models wrong: 16/100
All models correct: 24/100
Partial disagreement: 60/100
Saved detailed results to arc_eval_temp0.0.csv


==== Running evaluation at temperature 0.5 ====



Temp 0.5: 100%|██████████| 100/100 [13:26<00:00,  8.07s/it]


=== Accuracy per Model ===
llama3-1b-spamgen: 38.00%
llama3.2:3b: 74.00%
llama3.2:1b: 43.00%

=== Agreement Between Models ===
llama3-1b-spamgen ↔ llama3.2:3b: 43.00%
llama3-1b-spamgen ↔ llama3.2:1b: 34.00%
llama3.2:3b ↔ llama3.2:1b: 49.00%

=== Shared Errors / Correct Statistics ===
All models wrong: 16/100
All models correct: 21/100
Partial disagreement: 63/100
Saved detailed results to arc_eval_temp0.5.csv


==== Running evaluation at temperature 0.9 ====



Temp 0.9: 100%|██████████| 100/100 [13:10<00:00,  7.90s/it]

=== Accuracy per Model ===
llama3-1b-spamgen: 29.00%
llama3.2:3b: 73.00%
llama3.2:1b: 38.00%

=== Agreement Between Models ===
llama3-1b-spamgen ↔ llama3.2:3b: 36.00%
llama3-1b-spamgen ↔ llama3.2:1b: 32.00%
llama3.2:3b ↔ llama3.2:1b: 39.00%

=== Shared Errors / Correct Statistics ===
All models wrong: 14/100
All models correct: 11/100
Partial disagreement: 75/100
Saved detailed results to arc_eval_temp0.9.csv


==== Summary Table Across Temperatures ====
 temperature  acc_llama3-1b-spamgen  acc_llama3.2:3b  acc_llama3.2:1b  agree_llama3-1b-spamgen_llama3.2:3b  agree_llama3-1b-spamgen_llama3.2:1b  agree_llama3.2:3b_llama3.2:1b  all_correct  all_wrong  partial_disagreement
         0.0                   47.0             72.0             41.0                                 49.0                                 38.0                           48.0           24         16                    60
         0.5                   38.0             74.0             43.0                              




Unnamed: 0,temperature,acc_llama3-1b-spamgen,acc_llama3.2:3b,acc_llama3.2:1b,agree_llama3-1b-spamgen_llama3.2:3b,agree_llama3-1b-spamgen_llama3.2:1b,agree_llama3.2:3b_llama3.2:1b,all_correct,all_wrong,partial_disagreement
0,0.0,47.0,72.0,41.0,49.0,38.0,48.0,24,16,60
1,0.5,38.0,74.0,43.0,43.0,34.0,49.0,21,16,63
2,0.9,29.0,73.0,38.0,36.0,32.0,39.0,11,14,75


In [2]:
from datasets import load_dataset
import ollama
import numpy as np

MODELS = ["llama3-1b-spamgen", "llama3.2:1b"]
TEMPERATURE = 0.0
N_RUNS = 3
QUESTIONS_PER_RUN = 20

# Load dataset
dataset = load_dataset("ai2_arc", "ARC-Challenge", split="train")
total_questions = len(dataset)

accuracies = {model: [] for model in MODELS}

for run in range(N_RUNS):
    # Ensure different questions for each run
    indices = np.random.choice(total_questions, QUESTIONS_PER_RUN, replace=False)
    correct_counts = {model: 0 for model in MODELS}

    print(f"\nRun {run + 1}/{N_RUNS}")
    for idx in indices:
        question_data = dataset[int(idx)]
        q = question_data["question"]
        choices = question_data["choices"]["text"]
        correct_answer = question_data["answerKey"]
        prompt = f"Question: {q}\nChoices:\n" + "\n".join([f"{chr(65+i)}. {c}" for i, c in enumerate(choices)]) + "\nAnswer (just the letter):"

        for model in MODELS:
            response = ollama.chat(
                model=model,
                messages=[{"role": "user", "content": prompt}],
                options={"temperature": TEMPERATURE, "max_tokens": 10}
            )
            predicted_answer = next((c for c in response["message"]["content"].strip().upper() if c in "ABCDEF"), None)
            correct_counts[model] += (predicted_answer == correct_answer)

    # Calculate accuracy for this run
    for model in MODELS:
        accuracy = correct_counts[model] / QUESTIONS_PER_RUN
        accuracies[model].append(accuracy)
        print(f"{model} Accuracy (Run {run + 1}): {accuracy:.2%}")

# Final accuracy averaged over runs
print("\nAverage Accuracies Across Runs:")
for model in MODELS:
    avg_accuracy = np.mean(accuracies[model])
    print(f"{model}: {avg_accuracy:.2%}")


Run 1/3
llama3-1b-spamgen Accuracy (Run 1): 50.00%
llama3.2:1b Accuracy (Run 1): 15.00%

Run 2/3
llama3-1b-spamgen Accuracy (Run 2): 50.00%
llama3.2:1b Accuracy (Run 2): 35.00%

Run 3/3
llama3-1b-spamgen Accuracy (Run 3): 45.00%
llama3.2:1b Accuracy (Run 3): 35.00%

Average Accuracies Across Runs:
llama3-1b-spamgen: 48.33%
llama3.2:1b: 28.33%
