In [1]:
import os
import openai
import time
import traceback
import json
from tqdm.auto import tqdm
from dotenv import load_dotenv, find_dotenv
# Load settings from a .env file located by find_dotenv(); presumably this is what
# makes the OPENAI_* variables available to the os.environ lookups further down.
# The result is bound to "_" so it is discarded instead of being echoed by the cell.
_ = load_dotenv(find_dotenv())

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Configure the OpenAI client from environment variables.
openai.api_key = os.environ.get("OPENAI_API_KEY")

# Only override the endpoint when one is actually configured; assigning None would
# replace the library's own default base URL.
api_base = os.environ.get("OPENAI_API_BASE")
if api_base:
    openai.api_base = api_base

# Sanity check on the key. A missing variable yields None, which must be caught
# before the substring test (``"s" not in None`` raises TypeError).
# NOTE(review): the "s" test is presumably a loose check for an "sk-..." style key.
if not openai.api_key or "s" not in openai.api_key:
    print("KEY LOADING FAILED!")
    raise NameError("OPENAI_API_KEY is missing or does not look like an API key")

In [11]:
def _error_response():
    """Return a fresh placeholder shaped like a chat response whose content is "ERROR"."""
    return {"choices": [{"message": {"content": "ERROR"}}]}


def _chat_with_retry(create_fn, model, messages, label, max_retries, retry_delay):
    """Call ``create_fn`` up to ``max_retries`` times; return the response or None."""
    for attempt in range(max_retries):
        try:
            print(f"trying {label}")
            response = create_fn(model=model, stream=False, messages=messages)
        except Exception:
            # The traceback already ends with the exception message.
            print(traceback.format_exc())
            # Announce and wait only when another attempt will follow; sleeping
            # after the final failure just delays the caller.
            if attempt < max_retries - 1:
                print(f"Retrying... (i = {attempt})")
                time.sleep(retry_delay)
        else:
            print(f"done {label}")
            return response
    return None


def prompt_api(model, eval_prompt1, eval_prompt2, max_retries=10, retry_delay=3, create_fn=None):
    """Run the two-turn evaluation conversation against a chat model.

    Turn 1 sends ``eval_prompt1``. Turn 2 replays turn 1 together with the
    model's reply and then sends ``eval_prompt2``.

    Args:
        model: Model name forwarded to the chat-completion call.
        eval_prompt1: User prompt for the first turn.
        eval_prompt2: User prompt for the second turn.
        max_retries: Attempts per turn before giving up (default 10).
        retry_delay: Seconds to wait between failed attempts (default 3).
        create_fn: Callable used to create a completion; it is invoked with the
            keyword arguments ``model``, ``stream`` and ``messages``. Defaults to
            ``openai.ChatCompletion.create``.

    Returns:
        A tuple ``(eval_step1, eval_step2)``. Each element is either the object
        returned by ``create_fn`` or, when every attempt for that turn failed,
        the placeholder ``{"choices": [{"message": {"content": "ERROR"}}]}``.
        If turn 1 fails, turn 2 is not attempted and both are placeholders.
    """
    if create_fn is None:
        # Resolved lazily so a custom create_fn never touches the openai module.
        create_fn = openai.ChatCompletion.create

    print("starting eval 1")
    first_turn = [{"role": "user", "content": eval_prompt1}]
    eval_step1 = _chat_with_retry(create_fn, model, first_turn, "eval 1", max_retries, retry_delay)
    # Failure is signalled by None rather than by inspecting the content, so a
    # model that genuinely answers "ERROR" is not mistaken for a failed call.
    if eval_step1 is None:
        return _error_response(), _error_response()

    print("starting eval 2")
    # Mapping-style access is used throughout; it works for plain dicts and,
    # as the rest of this file relies on, for the objects the openai client returns.
    second_turn = [
        {"role": "user", "content": eval_prompt1},
        {"role": "assistant", "content": eval_step1["choices"][0]["message"]["content"]},
        {"role": "user", "content": eval_prompt2},
    ]
    eval_step2 = _chat_with_retry(create_fn, model, second_turn, "eval 2", max_retries, retry_delay)
    if eval_step2 is None:
        eval_step2 = _error_response()
    return eval_step1, eval_step2
def get_eval_prompts(answer, lang):
    """Build the two grading prompts for a single story completion.

    Args:
        answer: Mapping with a "prompt" entry (the prescribed story beginning)
            and a "completion" entry (the continuation to be graded).
        lang: Prompt language; only "en" is implemented.

    Returns:
        ``(eval_prompt1, eval_prompt2)``: the free-text assessment request that
        embeds the story, and the follow-up request for scores in a fixed format.

    Raises:
        NotImplementedError: for any ``lang`` other than "en".
    """
    if lang != "en":
        raise NotImplementedError

    story_start = answer["prompt"]
    story_rest = answer["completion"]

    # The trailing empty entry makes the joined prompt end with a newline.
    assessment_lines = (
        "In the following exercise, the student is given a beginning of a story. The student needs to complete it into a full story.",
        "The exercise tests the student´s language abilities and creativity. The symbol *** marks the separator between the",
        "prescribed beginning and the student’s completion:",
        f"{story_start}***{story_rest}",
        "Please provide your general assessment about the part written by the student (the one after the *** symbol).",
        "Is it gramatically correct? Is it consistent with the beginning of the story? Pay special attention to whether the",
        "student manages to complete the sentence which is split in the middle by the separator ***.",
        "",
    )
    grading_lines = (
        "Now, grade the student’s completion in terms of grammar, creativity, consistency with the story’s beginning and",
        "whether the plot makes sense. Moreover, please provide your best guess of what the age of the student might be,",
        "as reflected from the completion. Choose from possible age groups: A: 3 or under. B: 4-5. C: 6-7. D: 8-9. E:",
        "10-12. F: 13-16.",
        "Use the format below replacing # and X with your answer and do not write anything more than that:",
        "grammar: #/10, creativity: #/10, consistency: #/10, age: X",
    )
    return "\n".join(assessment_lines), "\n".join(grading_lines)
def get_single_eval(eval_model, answer, lang):
    """Evaluate one answer and store both evaluation texts on it.

    Builds the prompts with ``get_eval_prompts``, runs them through
    ``prompt_api`` and writes the two reply texts into ``answer`` under
    "eval_step1" and "eval_step2". ``answer`` is modified in place and returned.
    """
    first_prompt, second_prompt = get_eval_prompts(answer, lang)
    first_response, second_response = prompt_api(eval_model, first_prompt, second_prompt)
    labelled = (("eval_step1", first_response), ("eval_step2", second_response))
    for key, response in labelled:
        answer[key] = response["choices"][0]["message"]["content"]
    return answer
def get_evals(eval_model, lang, model_id, limit=3):
    """Evaluate stored answers and write the results to a JSON file.

    Reads ``answers_{lang}_{model_id}.json``, evaluates the answers with
    ``get_single_eval`` and writes the list of evaluated answers to
    ``evaluations_{lang}_{model_id}.json``.

    Args:
        eval_model: Model name forwarded to ``get_single_eval``.
        lang: Language code; used in both file names and forwarded to
            ``get_single_eval``.
        model_id: Identifier used in both file names.
        limit: Evaluate only the first ``limit`` answers (default 3, matching
            the previous hard-coded slice). Pass ``None`` to evaluate all.
    """
    # JSON text is UTF-8; be explicit so reading does not depend on the locale.
    with open(f"answers_{lang}_{model_id}.json", "r", encoding="utf-8") as f:
        answers = json.load(f)
    if limit is not None:
        answers = answers[:limit]

    evaluations = []
    for a in tqdm(answers):
        try:
            ev = get_single_eval(eval_model, a, lang)
        except Exception:
            # Best effort: keep going, but show what actually failed.
            print("error in get_single_eval")
            print(traceback.format_exc())
            ev = a
            ev["eval_step1"] = "ERROR: SOMETHING WENT WRONG"
            ev["eval_step2"] = "ERROR: SOMETHING WENT WRONG"
        evaluations.append(ev)

    # ensure_ascii=False emits non-ASCII characters verbatim, so the file must be
    # opened with an encoding that can represent them on every platform.
    with open(f"evaluations_{lang}_{model_id}.json", "w", encoding="utf-8") as f:
        json.dump(evaluations, f, indent="    ", ensure_ascii=False)
# Settings for a full evaluation run:
#   eval_model - model name handed to get_evals as the evaluator
#   lang       - "en" / "de"
#   model_id   - "gpt" / "ts" / "own"
eval_model, lang, model_id = "gpt-4-1106-preview", "en", "gpt"

# get_evals(eval_model, lang, model_id)

In [None]:
def get_single_eval(eval_model, answer, lang="en"):
    """Evaluate one answer and store both evaluation texts on it.

    Args:
        eval_model: Model name forwarded to ``prompt_api``.
        answer: Mapping describing the answer; it is passed to
            ``get_eval_prompts`` and modified in place.
        lang: Prompt language forwarded to ``get_eval_prompts``. Defaults to
            "en" so both two-argument and three-argument calls work.

    Returns:
        The same ``answer`` object with "eval_step1" and "eval_step2" set to the
        text of the first and second reply.
    """
    # get_eval_prompts requires the language; omitting it raised TypeError.
    eval_prompt1, eval_prompt2 = get_eval_prompts(answer, lang)
    eval_step1, eval_step2 = prompt_api(eval_model, eval_prompt1, eval_prompt2)
    answer["eval_step1"] = eval_step1["choices"][0]["message"]["content"]
    answer["eval_step2"] = eval_step2["choices"][0]["message"]["content"]
    return answer

In [12]:
# Manual walk-through of the two evaluation turns for a single answer.
# model = "gpt-4-1106-preview"
model = "gpt-4-0125-preview"
lang = "en"
model_id = "gpt"

# JSON text is UTF-8; be explicit so reading does not depend on the platform locale.
with open(f"answers_{lang}_{model_id}.json", "r", encoding="utf-8") as f:
    answers = json.load(f)

# Build both prompts for the first stored answer only.
eval_prompt1, eval_prompt2 = get_eval_prompts(answers[0], lang)

In [7]:
# First turn: send only the assessment prompt and show the raw response.
first_turn = [{"role": "user", "content": eval_prompt1}]
eval_step1 = openai.ChatCompletion.create(model=model, messages=first_turn, stream=False)
print(eval_step1)

{
  "id": "chatcmpl-95Y2pvWdgIo7pVAnzq5LpO0vSk3zf",
  "object": "chat.completion",
  "created": 1711109535,
  "model": "gpt-4-0125-preview",
  "choices": [
    {
      "index": 0,
      "message": {
        "role": "assistant",
        "content": "The completion provided by the student after the *** symbol is generally grammatically correct, well-structured, and consistent with the beginning of the story. The student successfully managed to complete the sentence split by the separator in a way that flows naturally and logically, transitioning smoothly from the preset beginning to their own continuation.\n\nIn terms of grammar, the passage holds up well with proper use of tenses, punctuation, and sentence structure that facilitates easy reading and comprehension. Importantly, the student maintains a coherent narrative pace and uses descriptive language that enhances the imagery and emotional depth of the story, enriching the reader's experience.\n\nThe consistency of the storyline is co

In [14]:
# Second turn: replay the first exchange, then ask for the scores.
assessment_text = eval_step1.choices[0].message.content
conversation = [
    {"role": "user", "content": eval_prompt1},
    {"role": "assistant", "content": assessment_text},
    {"role": "user", "content": eval_prompt2},
]
eval_step2 = openai.ChatCompletion.create(model=model, messages=conversation, stream=False)
print(eval_step2)

{
  "id": "chatcmpl-95YAmAZTlkMZbDHbvrDflnLedg1jM",
  "object": "chat.completion",
  "created": 1711110028,
  "model": "gpt-4-0125-preview",
  "choices": [
    {
      "index": 0,
      "message": {
        "role": "assistant",
        "content": "grammar: 9/10, creativity: 8/10, consistency: 10/10, age: E"
      },
      "logprobs": null,
      "finish_reason": "stop"
    }
  ],
  "usage": {
    "prompt_tokens": 938,
    "completion_tokens": 24,
    "total_tokens": 962
  },
  "system_fingerprint": "fp_b233daab0d"
}


In [15]:
# Show just the reply text of each turn, separated by a divider line.
assessment_text = eval_step1.choices[0].message.content
print(assessment_text)
print("----------")
grading_text = eval_step2.choices[0].message.content
print(grading_text)

The completion provided by the student after the *** symbol is generally grammatically correct, well-structured, and consistent with the beginning of the story. The student successfully managed to complete the sentence split by the separator in a way that flows naturally and logically, transitioning smoothly from the preset beginning to their own continuation.

In terms of grammar, the passage holds up well with proper use of tenses, punctuation, and sentence structure that facilitates easy reading and comprehension. Importantly, the student maintains a coherent narrative pace and uses descriptive language that enhances the imagery and emotional depth of the story, enriching the reader's experience.

The consistency of the storyline is commendable. The student adeptly picks up on the initial setup - a timid turtle named Timmy who encounters a fairy - and develops it into a full narrative arc that follows a logical and satisfying progression. It shifts from Timmy's initial hesitation an

In [None]:
# NOTE: superseded draft of get_evals, kept for reference only. The `def` line is
# commented out too: a function header followed only by comments is a syntax error,
# and an active definition here would shadow the working get_evals defined earlier.
# def get_evals(eval_model, lang, model_id,):
#     with open(f"answers_{lang}_{model_id}.json", "r") as f:
#         answers = json.load(f)

#     evaluations = []
#     for a in tqdm(answers):
#         eval_prompt1, eval_prompt2 = get_eval_prompts(a)
#         eval_step1, eval_step2 = prompt_api(eval_model, eval_prompt1, eval_prompt2)
#         a["eval_step1"] = eval_step1["choices"][0]["message"]["content"]
#         a["eval_step2"] = eval_step2["choices"][0]["message"]["content"]
#         evaluations.append(a)
#     with open(f"evaluations_{lang}_{model_id}.json", "w") as f:
#         json.dump(evaluations, f, indent="    ", ensure_ascii=False)

In [None]:
# # SCRAPS FOR LATER
# template = "Once there was a dude name Doug. Doug loved to eat"
# completion = " strawberry icecream while riding his horse through Town."
# prompt1 = f"""In the following exercise, the student is given a beginning of a story. The student needs to complete it into a full story.
# The exercise tests the student´s language abilities and creativity. The symbol *** marks the separator between the
# prescribed beginning and the student’s completion:
# {template}***{completion}
# Please provide your general assessment about the part written by the student (the one after the *** symbol).
# Is it gramatically correct? Is it consistent with the beginning of the story? Pay special attention to whether the
# student manages to complete the sentence which is split in the middle by the separator ***.
# """

# prompt2 = f"""Now, grade the student’s completion in terms of grammar, creativity, consistency with the story’s beginning and
# whether the plot makes sense. Moreover, please provide your best guess of what the age of the student might be,
# as reflected from the completion. Choose from possible age groups: A: 3 or under. B: 4-5. C: 6-7. D: 8-9. E:
# 10-12. F: 13-16."""

# p1 = openai.ChatCompletion.create(
#     model = "gpt-3.5-turbo",
#     stream=False,
#     messages = [{"role": "user", "content": prompt1}]
# )

# for s in p1.choices[0].message.content.split(". "):
#     print(s + ".")

# print("--------------------------------------------------")

# p2 = openai.ChatCompletion.create(
#     model = "gpt-3.5-turbo",
#     stream=False,
#     messages = [
#         {"role": "user", "content": prompt1},
#         {"role": "assistant", "content": p1.choices[0].message.content},
#         {"role": "user", "content": prompt2},
#     ]
# )

# for s in p2.choices[0].message.content.split(". "):
#     print(s + ".")

In [None]:
# story1 = "Write a short story about a flying Mango and his best friend a small log."
# story2 = "Continue above story incorporating a very fast Caterpillar."
# p1 = openai.ChatCompletion.create(
#     model = "gpt-3.5-turbo",
#     stream=False,
#     messages = [{"role": "user", "content": story1}]
# )

# for s in p1.choices[0].message.content.split(". "):
#     print(s + ".")

# print("--------------------------------------------------")

# p2 = openai.ChatCompletion.create(
#     model = "gpt-3.5-turbo",
#     stream=False,
#     messages = [
#         {"role": "user", "content": story1},
#         {"role": "assistant", "content": p1.choices[0].message.content},
#         {"role": "user", "content": story2},
#     ]
# )

# for s in p2.choices[0].message.content.split(". "):
#     print(s + ".")

In [None]:
# story1 = "Write a short story about a flying Mango and his best friend a small log."
# story2 = "Continue above story incorporating a very fast Caterpillar."
# p1 = openai.ChatCompletion.create(
#     model = "gpt-3.5-turbo",
#     stream=False,
#     messages = [{"role": "user", "content": story1}]
# )

# for s in p1.choices[0].message.content.split(". "):
#     print(s + ".")

# print("--------------------------------------------------")

# p2 = openai.ChatCompletion.create(
#     model = "gpt-3.5-turbo",
#     stream=False,
#     messages = [
#         {"role": "user", "content": story1},
#         {"role": "assistant", "content": p1.choices[0].message.content},
#         {"role": "user", "content": story2},
#     ]
# )

# for s in p2.choices[0].message.content.split(". "):
#     print(s + ".")

In [None]:
# eval_step1 = {"choices": [{"message": {"content": "ERROR"}}]}
# print(eval_step1["choices"][0]["message"]["content"] == "ERROR")