In [None]:
# imports
import json
import numpy as np
import openai
import os
import time
import traceback
from transformers import AutoTokenizer, AutoModelForCausalLM
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv())

In [None]:
# general settings
# Move the working directory one level above the directory the kernel started in
# (presumably the project root, since later cells open "evaluation/..." relatively).
# The start directory is recorded only once, so re-running this cell lands in the
# same place instead of climbing one more level on every execution.
if "NOTEBOOK_START_DIR" not in globals():
    NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_START_DIR, os.pardir))
print(os.getcwd())

In [None]:
# model setup
# Tokenizer and causal language model are both loaded from the same pretrained
# checkpoint name, so it is spelled out once.
checkpoint = "microsoft/phi-1_5"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint)

In [None]:
# evaluator setup
# Credentials and endpoint are read from the environment (populated from .env by
# load_dotenv in the imports cell) and pushed into the openai module explicitly.
openai.api_key = os.environ.get("OPENAI_API_KEY")
# Only override the endpoint when one is actually configured: assigning None here
# would replace the library's built-in default base URL and break every request.
api_base_override = os.environ.get("OPENAI_API_BASE")
if api_base_override:
    openai.api_base = api_base_override
# Model used later to grade the generated answer.
evaluator_model = "gpt-4-1106-preview"

In [None]:
# get question
# Load the question list with a context manager so the file handle is closed,
# and with an explicit encoding so non-ASCII text decodes the same on every platform.
with open("evaluation/eval_questions.json", "r", encoding="utf-8") as questions_file:
    eval_questions = json.load(questions_file)
# Sample uniformly from the first 60 questions (the original sampling range), but
# never beyond the end of the list if the file holds fewer entries.
question_ind = np.random.choice(min(60, len(eval_questions)))
question_data = eval_questions[question_ind]
question = question_data["question"]
print(question)

In [None]:
# get context
# NOTE(review): `retriever_client` and `index_name` are not defined in any cell
# shown here; they must already exist in the kernel before this cell runs —
# confirm where they are created so the notebook survives Restart & Run All.
top_k = 3  # number of retrieved articles placed into the prompt context

# Full-text search over title and abstract; "title^2" weights title matches double.
query = {
    "size": 5,
    "query": {
        "multi_match": {
            "query": question,
            "fields": ["title^2", "abstract"]
        }
    }
}
retriever_response = retriever_client.search(
    body=query,
    index=index_name
)

# Keep only the top results; fewer may come back than requested.
top_hits = retriever_response["hits"]["hits"][:top_k]
titles = [hit["_source"]["title"] for hit in top_hits]
abstracts = [hit["_source"]["abstract"] for hit in top_hits]
if len(top_hits) < top_k:
    print(f"Warning: retriever returned only {len(top_hits)} of {top_k} expected articles.")

# Build the context from however many articles were found. With three hits the
# text is identical to the previous fixed three-article template; with fewer it
# no longer fails with an IndexError.
context = "\n" + "".join(
    f"    Article {rank}: {title}\n    {abstract}\n"
    for rank, (title, abstract) in enumerate(zip(titles, abstracts), start=1)
)
print(context)

In [None]:
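# # get answer with the locally loaded model (disabled: this whole cell is commented out; the chatgpt-3.5 cell below is used for testing instead)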
# model_prompt = f"""
# Please answer the following question using the context provided.
# Context: {context}
# Question: {question}
# Answer:
# """
# model_inputs = tokenizer(
#     model_prompt,
#     return_tensors="pt",
#     return_attention_mask=False
# )
# model_outputs = model.generate(**model_inputs, max_length=500)
# answer = tokenizer.batch_decode(model_outputs.flatten()[len(model_inputs["input_ids"].flatten()):])[0]
# print(answer)

In [None]:
# get answer from chatgpt-3.5 for testing
# Sends the question plus retrieved context to gpt-3.5-turbo through the
# module-level `openai.ChatCompletion` interface, retrying on any error.
model_prompt = f"""
Please answer the following question using the context provided.
Context: {context}
Question: {question}
Answer:
"""
max_attempts = 10
retry_delay_seconds = 3
# Reset first so a failed run cannot silently print an answer left over from an
# earlier execution of this cell.
answer = None
for attempt in range(max_attempts):
    try:
        chat_completion = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": model_prompt}],
            stream=False,
            # max_tokens=100,
        )
        if isinstance(chat_completion, dict):
            # not stream: the complete reply is in a single response object
            answer = chat_completion.choices[0].message.content
        else:
            # stream: concatenate the content of every chunk; chunks whose delta
            # carries no content contribute nothing
            answer = "".join(
                chunk["choices"][0]["delta"].get("content") or ""
                for chunk in chat_completion
            )
        break
    except Exception as exc:
        print(traceback.format_exc())
        print(exc)
        if attempt != max_attempts - 1:
            print(f"Retrying... (i = {attempt})")
            time.sleep(retry_delay_seconds)
else:
    # Every attempt failed: stop here with a clear message instead of hitting a
    # NameError (or printing a stale value) on the line below.
    raise RuntimeError(
        f"No answer received after {max_attempts} attempts; see the tracebacks above."
    )
print(answer)

In [None]:
# get evaluation
# Asks the evaluator model to assess and grade the answer produced above, given
# the same context and question. The prompt text is sent verbatim.
eval_prompt = f"""
        Your task is to evaluate a student's response to a given exercise. In the exercise, the student is provided with some general context consisting of 3 titles and abstracts of medical articles.
        The student is furthermore asked a question, which he should answer correctly making use of the provided context.
        The exercise tests the student's abilities regarding grammar, reading comprehension and logical reasoning. The student's answer starts after the *** symbol.
        Please provide your general assessment about the answer provided by the student (the part after the *** symbol).
        Is it correct? Is it grammatically correct? Is it consistent with the given context?
        Furthermore, grade the student’s answer in terms of grammar, coherence, consistency with the context and whether it is correct or not. Moreover, please provide your best guess of what the academic degree of the student might be, as reflected from the answer. Choose from possible 4 possible categories: A: no degree. B: bachelor's degree. C: master's degree. D: doctoral degree. Use the following grade format: Grammar: #/10, Coherence: #/10, Context: #/10, Correctness: #/10, where the "#" should be replaces by a number between 0 (worst) and 10 (best).
        Context: {context}
        Question: {question}
        Answer: *** {answer}
"""
max_attempts = 10
retry_delay_seconds = 3
# Reset first so a failed run cannot silently print an evaluation left over from
# an earlier execution of this cell.
eval_output = None
last_error = None
for attempt in range(max_attempts):
    try:
        chat_completion = openai.ChatCompletion.create(
            model=evaluator_model,
            messages=[{"role": "user", "content": eval_prompt}],
            stream=False,
            # max_tokens=100,
        )
        if isinstance(chat_completion, dict):
            # not stream: the complete reply is in a single response object
            eval_output = chat_completion.choices[0].message.content
        else:
            # stream: concatenate the content of every chunk; chunks whose delta
            # carries no content contribute nothing
            eval_output = "".join(
                chunk["choices"][0]["delta"].get("content") or ""
                for chunk in chat_completion
            )
        break
    except Exception as exc:
        # Retries stay quiet (only progress dots are shown), but the error is kept
        # so it can be reported if every attempt fails.
        last_error = exc
        if attempt == 0:
            print("Retrying.", end="")
        if attempt != max_attempts - 1:
            print(".", end="")
            time.sleep(retry_delay_seconds)
else:
    # Every attempt failed: finish the progress line and surface the last error
    # instead of hitting a NameError (or printing a stale value) below.
    print("")
    raise RuntimeError(
        f"No evaluation received after {max_attempts} attempts."
    ) from last_error
print("")
print(eval_output)