In [None]:
from google.colab import drive
drive.mount('/content/drive/')
!pip install transformers huggingface_hub
!pip install -q keras

In [None]:
from huggingface_hub import login
import json
import re
from pathlib import Path
import torch
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
import os

# Base folder under the '/content/drive/' mount point created by the setup cell;
# the evaluation loop's input and output paths all start with this prefix.
GoogleDrive= "/content/drive/MyDrive"

In [None]:
# Access token placeholder for the Hugging Face Hub.
# NOTE(review): nothing in this cell passes the token to the Hub (the imported
# `login` is never called here) -- confirm whether authentication is required.
Hugging_Face_token= "..." # put your hugging face token to be able to use this

# Identifier handed to `from_pretrained` for both the model and its tokenizer.
# NOTE(review): the id has no "<namespace>/" prefix, so it presumably refers to
# a local directory -- verify it resolves in the target environment.
MODEL_ID = "MetaMath-7B-V1.0"

# Load weights and tokenizer once and wrap them in a text-generation pipeline
# that the evaluation loop reuses for every question. The constant is referenced
# by its exact name (MODEL_ID); the previous `Model_ID` spelling raised NameError.
model = AutoModelForCausalLM.from_pretrained(MODEL_ID)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)


In [None]:
def true_answer(file_position: str) -> str:
    """Return the text of the file at ``file_position`` without surrounding whitespace.

    Args:
        file_position: Path of the UTF-8 text file to read.

    Returns:
        The file's full contents with leading and trailing whitespace stripped.
    """
    with open(file_position, "r", encoding="utf-8") as handle:
        return handle.read().strip()

In [None]:
# One-sentence definition for every evaluation metric, keyed by metric name.
# Insertion order matters: it is the order in which metrics are listed.
Metric_Definitions = {
    "Final Accuracy": "Exact-match accuracy: fraction of test cases where the generated solution exactly matches the reference solution.",
    "Validity Rate": "Proportion of generated code answers that compile without syntax errors and satisfy basic functional tests.",
    "Coherence Score": "Entity-grid coherence: measures logical flow by tracking entity recurrence across sentences.",
    "Robustness Consistency": "Stability of outputs under small prompt perturbations, measured by change in evaluation metrics.",
    "Calibration Confidence": "Alignment between model-predicted probabilities and actual correctness, e.g. via expected calibration error.",
    "Correctness": "Fraction of intermediate and final reasoning steps that are mathematically or logically valid.",
    "Efficiency": "Resource usage of the presented solution, typically reported via time complexity or empirical runtime.",
    "Clarity": "Readability and organization of the explanation, e.g. via readability scores like Flesch–Kincaid.",
    "Accuracy": "Standard classification accuracy: (TP + TN) / (TP + TN + FP + FN) over all test cases.",
    "Pass Rate": "pass@k metric: probability that at least one of the top-k generated samples passes all unit tests.",
    "Stepwise Correctness": "Proportion of individual chain-of-thought steps that are correct, as in REVEAL evaluations.",
    "Redundancy": "Degree to which the answer avoids unnecessary repetition or superfluous content.",
    "Comp Efficiency": "Computational efficiency of any algorithm presented, e.g. its Big-O time complexity.",
    "Similarity": "Semantic similarity (e.g. via BERTScore) between generated and reference answers.",
}

# Metric names in presentation order, taken from the definition table's keys.
Metrics = list(Metric_Definitions)

In [None]:
# Column labels of the evaluation table: "Hint 1" through "Hint 9".
Hint_Labels = [f"Hint {i}" for i in range(1, 10)]

def make_latex_table_template():
    """Build an empty LaTeX ``tabular`` with one row per metric and one column per hint.

    The first column holds the metric name (left-padded to 26 characters); the
    remaining cells are left blank so they can be filled in with scores.

    Returns:
        The LaTeX source as a string, starting and ending with a newline.
    """
    # One centred column per hint label, so the column spec can never drift
    # out of sync with the header and the number of cells in each row.
    columns = "|l|" + "c|" * len(Hint_Labels)
    header = " & ".join(Hint_Labels)
    # The blank cells are the same for every row; build them once.
    empty_cells = " & ".join("" for _ in Hint_Labels)
    body = "\n".join(f"{m:<26} & " + empty_cells + r" \\" for m in Metrics)
    return f"""
\\begin{{tabular}}{{{columns}}}
Metric                     & {header} \\\\ \\hline
{body}
\\end{{tabular}}
"""

In [None]:
def build_prompt(true_ans: str, generated: dict) -> str:
    """Assemble the evaluation prompt for one question.

    The prompt lists every metric with its definition, shows the reference
    answer in a fenced block, appends each generated answer under a heading
    named after its key, and ends with the empty LaTeX table to be filled in.

    Args:
        true_ans: Reference answer text.
        generated: Mapping from a hint label to a mapping that contains the
            key ``'generated_answer'``.

    Returns:
        The complete prompt string.
    """
    metric_lines = [f"- **{name}**: {Metric_Definitions[name]}" for name in Metrics]
    definitions = "\n".join(metric_lines)
    table_tpl = make_latex_table_template()

    # Collect the prompt as a list of pieces and join once at the end.
    pieces = [
        "You are given a reference answer and nine model-generated answers under different hints.\n",
        "Evaluate each generated answer *against* the reference on the following metrics, each as a real number in [0,1]:\n\n",
        f"{definitions}\n\n",
        "### Reference Answer:\n",
        "```\n",
        f"{true_ans}\n",
        "```\n\n",
        "### Model-Generated Answers:\n",
    ]
    # One "#### <hint>" section per generated answer, in the mapping's order.
    pieces.extend(
        f"\n#### {hint}\n{info['generated_answer']}\n"
        for hint, info in generated.items()
    )
    pieces.append("\nNow, fill in the following LaTeX table with numeric scores in [0,1] ")
    pieces.append("(no placeholders) for each hint and metric, and output *only* the LaTeX code:\n")
    pieces.append(f"{table_tpl}")
    return "".join(pieces)

In [None]:
# Folder layout on the mounted Drive, derived from the GoogleDrive base path so
# the prefix is defined in exactly one place.
PROMPT_DIR = f"{GoogleDrive}/ANT/Prompts Alg NT"
GENERATED_DIR = f"{PROMPT_DIR}/outputs for questions"
EVALUATION_DIR = f"{PROMPT_DIR}/Outputs for Questions Evaluation"

# Every evaluation file goes into the same folder, so create it once up front
# instead of on every iteration.
os.makedirs(EVALUATION_DIR, exist_ok=True)

# For each question: read the reference answer and the generated answers, ask
# the model to score them, and save the raw model output as a text file.
for j in range(1, 31): #change these to use the code
    true_path = f"{PROMPT_DIR}/Question {j}.txt"
    gen_path  = f"{GENERATED_DIR}/Question_{j}_outputs.json"
    out_path  = f"{EVALUATION_DIR}/Question_{j}_evaluation.txt"

    true_answer0 = true_answer(true_path)
    with open(gen_path, "r", encoding="utf-8") as f:
        generated_answer = json.load(f)

    prompt = build_prompt(true_answer0, generated_answer)
    # Decoding is deterministic (do_sample=False); the sampling-only arguments
    # temperature/top_p were dropped because they contradict that setting.
    # NOTE(review): max_length presumably bounds prompt + generated tokens
    # together, so a long prompt may leave little room for the table -- consider
    # max_new_tokens after confirming against the transformers docs.
    # NOTE(review): "generated_text" presumably still contains the prompt (the
    # pipeline's default) -- confirm whether the saved file should include it.
    result = pipe(prompt, max_length=2048, do_sample=False, eos_token_id=tokenizer.eos_token_id)[0]["generated_text"]

    with open(out_path, "w", encoding="utf-8") as f:
        f.write(result)

    print(f"Saved evaluation for Question {j} to {out_path}")
