In [None]:
import os
import openai
import time
import traceback
import json
from tqdm.auto import tqdm
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv())

In [None]:
# Configure the OpenAI client from environment variables.
openai.api_key = os.environ.get("OPENAI_API_KEY")

# Only override the endpoint when OPENAI_API_BASE is actually set: assigning
# None (or an empty string) would replace the library's default base URL and
# break every request.
_api_base = os.environ.get("OPENAI_API_BASE")
if _api_base:
    openai.api_base = _api_base

In [None]:
def prompt_api(model, eval_prompt, max_retries=10, retry_delay=3):
    """Send a single-turn chat request and return the completion, retrying on failure.

    Args:
        model: Model name passed to ``openai.ChatCompletion.create``.
        eval_prompt: Text sent as the only ``user`` message.
        max_retries: Maximum number of attempts before giving up (default 10).
        retry_delay: Seconds to wait between failed attempts (default 3).

    Returns:
        Whatever ``openai.ChatCompletion.create`` returns on success. If every
        attempt fails, a plain dict whose
        ``["choices"][0]["message"]["content"]`` is the string ``"ERROR"``, so
        callers can index the result the same way in both cases.
    """
    # Fallback returned when no attempt succeeds (also covers max_retries <= 0).
    chat_completion = {"choices": [{"message": {"content": "ERROR"}}]}
    for attempt in range(max_retries):
        try:
            chat_completion = openai.ChatCompletion.create(
                model=model,
                messages=[{"role": "user", "content": eval_prompt}],
                stream=False,
                # max_tokens=100,
            )
            break
        except Exception as exc:
            # Deliberately best-effort: log the failure and retry rather than
            # aborting a long evaluation run.
            print(traceback.format_exc())
            print(exc)
            if attempt != max_retries - 1:
                print(f"Retrying... (i = {attempt})")
                # Back off only when another attempt follows; sleeping after
                # the final failure would just delay the caller.
                time.sleep(retry_delay)
    return chat_completion

In [None]:
def get_eval_prompt(answer_doc):
    """Build the grading prompt for one answered question.

    ``answer_doc`` is indexed for ``"context"`` (an iterable of articles, each
    providing ``["metadata"]["title"]`` and ``["metadata"]["abstract"]``),
    ``"question"`` and ``"answer"``. The result is a single string: fixed
    grading instructions followed by the context, the question and the answer
    (the answer is introduced by the ``***`` marker).
    """
    # Two indented lines per article: a numbered title (from 1) and its abstract.
    article_lines = "".join(
        f"   Article {number}: {entry['metadata']['title']}\n"
        f"   Abstract: {entry['metadata']['abstract']}\n"
        for number, entry in enumerate(answer_doc["context"], start=1)
    )

    # Fixed instructions, one tuple item per prompt line.
    # NOTE(review): "replaces" below looks like a typo for "replaced"; it is
    # kept verbatim so generated prompts stay identical to earlier runs.
    instructions = (
        "Your task is to evaluate a student's response to a given exercise. In the exercise, the student is provided with some general context consisting of the titles and abstracts of medical articles.",
        "The student is furthermore asked a question, which he should answer correctly making use of the provided context.",
        "The exercise tests the student's abilities regarding grammar, reading comprehension and logical reasoning. The student's answer starts after the *** symbol.",
        "Please provide your general critical assessment about the answer provided by the student (the part after the *** symbol).",
        "Is it correct? Is it grammatically correct? Is it consistent with the given context?",
        'Furthermore, grade the student’s answer in terms of grammar, coherence, consistency with the context and whether it is correct or not. Use the following grade format: Grammar: #/10, Coherence: #/10, Context: #/10, Correctness: #/10, where the "#" should be replaces by a number between 0 (worst) and 10 (best).',
        "Remember to be very strict about the grading!",
    )
    exercise = (
        f"Context: {article_lines}",
        f"Question: {answer_doc['question']}",
        f"Answer: *** {answer_doc['answer']}",
    )
    return "\n".join(instructions + exercise)

In [None]:
# Grade every answer of one model several times with the evaluator model and
# store all gradings in evaluations_<model_id>_repeated.json.
eval_model = "gpt-4-1106-preview"
# model_id = "gpt-3.5-turbo"
model_id = "healio"
# Number of independent gradings requested per answer.
N_EVAL_REPEATS = 10

# Explicit UTF-8 so reading the JSON does not depend on the platform's
# default text encoding.
with open(f"eval_answers_{model_id}.json", "r", encoding="utf-8") as f:
    answers = json.load(f)

evaluations = []
for a in tqdm(answers):
    # The prompt depends only on the answer, so build it once per answer
    # instead of once per repetition.
    eval_prompt = get_eval_prompt(a)
    evals = [
        prompt_api(eval_model, eval_prompt)["choices"][0]["message"]["content"]
        for _ in range(N_EVAL_REPEATS)
    ]
    evaluations.append({
        "id": a["id"],
        "type": a["type"],
        "question": a["question"],
        "context": a["context"],
        "answer": a["answer"],
        "evaluation": evals
    })
with open(f"evaluations_{model_id}_repeated.json", "w", encoding="utf-8") as f:
    json.dump(evaluations, f, indent="    ")

In [None]:
# Grade every answer of one model several times with the evaluator model and
# store all gradings in evaluations_<model_id>_repeated.json.
eval_model = "gpt-4-1106-preview"
model_id = "gpt-3.5-turbo"
# model_id = "healio"
# Number of independent gradings requested per answer.
N_EVAL_REPEATS = 10

# Explicit UTF-8 so reading the JSON does not depend on the platform's
# default text encoding.
with open(f"eval_answers_{model_id}.json", "r", encoding="utf-8") as f:
    answers = json.load(f)

evaluations = []
for a in tqdm(answers):
    # The prompt depends only on the answer, so build it once per answer
    # instead of once per repetition.
    eval_prompt = get_eval_prompt(a)
    evals = [
        prompt_api(eval_model, eval_prompt)["choices"][0]["message"]["content"]
        for _ in range(N_EVAL_REPEATS)
    ]
    evaluations.append({
        "id": a["id"],
        "type": a["type"],
        "question": a["question"],
        "context": a["context"],
        "answer": a["answer"],
        "evaluation": evals
    })
with open(f"evaluations_{model_id}_repeated.json", "w", encoding="utf-8") as f:
    json.dump(evaluations, f, indent="    ")

In [None]:
def get_single_score(eval_answers, id):
    """Grade one answer once and return it together with its evaluation.

    Looks up ``eval_answers[id]``, builds its grading prompt with
    ``get_eval_prompt`` and sends it through ``prompt_api`` using the
    module-level ``eval_model``. The returned dict copies the answer's
    ``id``, ``type``, ``question``, ``context`` and ``answer`` fields and adds
    the evaluator's reply text under ``"evaluation"``.
    """
    doc = eval_answers[id]
    response = prompt_api(eval_model, get_eval_prompt(doc))

    # Copy the identifying fields first so the key order of the record is stable.
    record = {
        field: doc[field]
        for field in ("id", "type", "question", "context", "answer")
    }
    record["evaluation"] = response["choices"][0]["message"]["content"]
    return record

In [None]:
# Re-grade answers 0..59 one at a time. The evaluations file is re-read and
# re-written on every iteration, so progress is persisted after each answer.
# NOTE(review): 60 is hard-coded - presumably the number of answers; confirm
# it matches len(answers) and the length of the stored evaluations.
# The loop variable is `answer_idx` rather than `id` so the builtin id() is
# not left shadowed in the notebook's global namespace after this cell runs.
for answer_idx in tqdm(range(60)):
    # Explicit UTF-8 so file handling does not depend on the platform default.
    with open(f"evaluations_{model_id}.json", "r", encoding="utf-8") as f:
        evaluations = json.load(f)
    evaluations[answer_idx] = get_single_score(answers, answer_idx)
    with open(f"evaluations_{model_id}.json", "w", encoding="utf-8") as f:
        json.dump(evaluations, f, indent="    ")