In [19]:
import json
import re
from pathlib import Path
from typing import Tuple

import numpy as np
import tyro
from openai import OpenAI
from tqdm import tqdm

from mops.utils import open_jsonl
from mops.constants import client, logger, openai_model
from mops.prompts import (
    COMPLETENESS_SCORE_PROMPT,
    FASCINATION_SCORE_PROMPT,
    ORIGINALITY_SCORE_PROMPT,
)

In [20]:
def get_response(
    client: OpenAI,
    content: str,
    model: str = openai_model,
    temperature: float = 0.6,
):
    """Send ``content`` as a single user message and return the reply text.

    Args:
        client: OpenAI client used to issue the chat-completion request.
        content: Text placed in the only ``user`` message of the request.
        model: Model name forwarded to the API; defaults to ``openai_model``.
        temperature: Sampling temperature forwarded to the API.

    Returns:
        The ``message.content`` string of the first returned choice.

    Raises:
        AssertionError: If the first choice's content is not a ``str``.
    """
    user_message = {"role": "user", "content": content}
    result = client.chat.completions.create(
        model=model,
        messages=[user_message],
        temperature=temperature,
    )
    # Only the first choice is inspected; any further choices are ignored.
    reply = result.choices[0].message.content
    assert isinstance(reply, str)
    return reply

In [21]:
# NOTE(review): this function is defined again in a later cell, which shadows
# this definition when the notebook is run top to bottom; consolidate to one.
def score_extraction(text: str) -> Tuple[str, str]:
    """Split a judge response into a score string and an explanation.

    The first non-blank line is searched for a standalone integer in the
    range 0-100; the second non-blank line is returned as the explanation.
    Any further lines are ignored.

    Args:
        text: Raw response text, expected as "<score>\\n<explanation>".

    Returns:
        ``(score, explanation)``. ``score`` is the matched digits, or ``""``
        when the first non-blank line holds no standalone 0-100 integer.
        ``explanation`` is ``""`` when there is no second non-blank line.
    """

    def _first_score(line: str) -> str:
        # Word boundaries on both sides prevent partial matches: "150" and
        # "007" yield no score instead of "15" or "0".
        match = re.search(r"\b((0|[1-9]|[1-9][0-9]|100))\b", line)
        return match.group() if match else ""

    # Strip every line before filtering so that "\r\n" endings and
    # whitespace-only separator lines are not mistaken for content.
    stripped_lines = (raw.strip() for raw in text.split("\n"))
    lines = [line for line in stripped_lines if line]

    score_line = lines[0] if lines else ""
    explanation = lines[1] if len(lines) > 1 else ""
    return _first_score(score_line), explanation

In [125]:
# NOTE(review): this redefines a function of the same name from an earlier
# cell; this later definition is the one in effect after Run All.
def score_extraction(text: str) -> Tuple[str, str]:
    """Split a judge response into a score string and an explanation.

    The first non-blank line is searched for a standalone integer in the
    range 0-100; the second non-blank line is returned as the explanation.
    Any further lines are ignored.

    Args:
        text: Raw response text, expected as "<score>\\n<explanation>".

    Returns:
        ``(score, explanation)``. ``score`` is the matched digits, or ``""``
        when the first non-blank line holds no standalone 0-100 integer.
        ``explanation`` is ``""`` when there is no second non-blank line.
    """

    def _first_score(line: str) -> str:
        # Word boundaries on both sides prevent partial matches: "150" and
        # "007" yield no score instead of "15" or "0".
        match = re.search(r"\b((0|[1-9]|[1-9][0-9]|100))\b", line)
        return match.group() if match else ""

    # Strip every line before filtering so that "\r\n" endings and
    # whitespace-only separator lines are not mistaken for content.
    stripped_lines = (raw.strip() for raw in text.split("\n"))
    lines = [line for line in stripped_lines if line]

    score_line = lines[0] if lines else ""
    explanation = lines[1] if len(lines) > 1 else ""
    return _first_score(score_line), explanation

In [128]:
def score(
    client: OpenAI,
    premise_path: Path,
    evaluation_dir: Path,
    method: str,
    prompt: str,
    metric: str,
    model: str,
):
    """Grade every not-yet-scored premise with an LLM judge and append the results.

    Premises are loaded from ``premise_path``; previously written results are
    loaded from ``evaluation_dir / f"{metric}_{model}.jsonl"``. Premises whose
    ``id`` already has a result for ``method`` are skipped, so an interrupted
    run can be resumed. Each new result is appended to the results file as
    soon as it is produced.

    Args:
        client: OpenAI client passed through to ``get_response``.
        premise_path: JSONL file of premises; each row is read for its
            ``"id"`` and ``"premise"`` keys.
        evaluation_dir: Directory that holds the results file.
        method: Label stored with each result and used to filter existing rows.
        prompt: Template formatted with ``premise=<premise text>``.
        metric: Metric name; used in the results file name and progress text.
        model: Judge model name; used in the results file name and the request.

    Returns:
        None. Results are written to disk as a side effect.
    """
    score_dict_path = evaluation_dir / f"{metric}_{model}.jsonl"

    all_premise_dicts = open_jsonl(premise_path)
    all_method_score_dicts = open_jsonl(score_dict_path, create_if_not_exists=True)

    logger.info(f"Load {method} premises from {premise_path}")
    logger.info(f"Save {metric} evaluation to {score_dict_path}")

    # The results file name does not include the method, so rows belonging to
    # other methods may be present and must be filtered out.
    score_dicts = [
        score_dict
        for score_dict in all_method_score_dicts
        if score_dict["method"] == method
    ]

    # A set gives constant-time membership checks in the filter below.
    # Assumes ids are hashable JSON scalars (str/int) -- TODO confirm.
    existing_ids = {score_dict["id"] for score_dict in score_dicts}
    existing_scores = [score_dict["score"] for score_dict in score_dicts]

    premise_dicts = [
        premise_dict
        for premise_dict in all_premise_dicts
        if premise_dict["id"] not in existing_ids
    ]

    def _progress_label() -> str:
        # np.mean([]) returns nan and emits a RuntimeWarning, which happens on
        # a first run with no cached results; show a placeholder instead.
        if existing_scores:
            mean_text = f"{np.mean(existing_scores):.3f}"
        else:
            mean_text = "n/a"
        return f"Evaluating {metric}: {mean_text}"

    # `initial` makes the bar start at the number of premises already scored.
    pbar = tqdm(
        premise_dicts,
        total=len(all_premise_dicts),
        initial=len(all_premise_dicts) - len(premise_dicts),
    )
    pbar.set_description(_progress_label())

    for premise_dict in pbar:
        premise_prompt = prompt.format(premise=premise_dict["premise"])
        # The judge is queried at temperature 0.0 rather than the helper's default.
        response = get_response(client, premise_prompt, model, temperature=0.0)

        # Local names avoid shadowing this function's own name, `score`.
        score_text, explanation = score_extraction(response)

        if score_text == "":
            logger.warning(
                f"No score detected!, id: {premise_dict['id']}, set to score:`0`"
            )
            score_text = "0"

        score_value = int(score_text)
        existing_scores.append(score_value)
        pbar.set_description(_progress_label())

        score_dict = dict(
            id=premise_dict["id"],
            method=method,
            score=score_value,
            explanation=explanation,
            premise=premise_dict["premise"],
        )

        # Append immediately so completed work survives an interruption.
        with open(score_dict_path, "a") as f:
            f.write(json.dumps(score_dict) + "\n")

### Evaluation: Fascination Score

In [129]:
# Run configuration: which metric is graded, whose premises, and the judge model.
metric: str = "fascination_score"
method: str = "mops"
model: str = "gpt-4-1106-preview"

# Input premises and the directory that receives the per-metric results file.
premise_path: Path = Path("../assets/premises/mops/moderate.jsonl")
evaluation_dir: Path = Path("../assets/premises/evaluation")

for banner in (
    f"Evaluation: {metric}",
    f"Method:     {method}",
    f"Model:      {model}",
):
    logger.info(banner)

score(
    client=client,
    premise_path=premise_path,
    evaluation_dir=evaluation_dir,
    method=method,
    prompt=FASCINATION_SCORE_PROMPT,
    metric=metric,
    model=model,
)

[32m06-09 14:46:34[0m [32mINFO    [0m [34mEvaluation: fascination_score[0m
[32m06-09 14:46:34[0m [32mINFO    [0m [34mMethod:     mops[0m
[32m06-09 14:46:34[0m [32mINFO    [0m [34mModel:      gpt-4-1106-preview[0m
[32m06-09 14:46:34[0m [32mINFO    [0m [34mLoad mops premises from ../assets/premises/mops/moderate.jsonl[0m
[32m06-09 14:46:34[0m [32mINFO    [0m [34mSave fascination_score evaluation to ../assets/premises/evaluation/fascination_score_gpt-4-1106-preview.jsonl[0m
Evaluating fascination_score: 75.662: 100%|█████████████████████████████████████████████████| 1000/1000 [00:00<?, ?it/s]


### Evaluation: Completeness Score

In [130]:
# Grade the "mops" premises on completeness with the judge model.
premise_path: Path = Path("../assets/premises/mops/moderate.jsonl")
evaluation_dir: Path = Path("../assets/premises/evaluation")
method: str = "mops"
model: str = "gpt-4-1106-preview"
metric: str = "completeness_score"

logger.info(f"Evaluation: {metric}")
logger.info(f"Method:     {method}")
logger.info(f"Model:      {model}")
# NOTE(review): `score` skips ids that already have a row in the results file.
# If cached rows were produced with a different prompt, remove them before
# re-running so they are graded again.
score(
    client,
    premise_path,
    evaluation_dir,
    method=method,
    # The prompt must match the metric: results are stored under
    # `completeness_score`, so the completeness prompt is required here
    # (previously the fascination prompt was passed by mistake).
    prompt=COMPLETENESS_SCORE_PROMPT,
    metric=metric,
    model=model,
)

[32m06-09 14:46:50[0m [32mINFO    [0m [34mEvaluation: completeness_score[0m
[32m06-09 14:46:50[0m [32mINFO    [0m [34mMethod:     mops[0m
[32m06-09 14:46:50[0m [32mINFO    [0m [34mModel:      gpt-4-1106-preview[0m
[32m06-09 14:46:50[0m [32mINFO    [0m [34mLoad mops premises from ../assets/premises/mops/moderate.jsonl[0m
[32m06-09 14:46:50[0m [32mINFO    [0m [34mSave completeness_score evaluation to ../assets/premises/evaluation/completeness_score_gpt-4-1106-preview.jsonl[0m
Evaluating completeness_score: 74.780: 100%|████████████████████████████████████████████████| 1000/1000 [00:00<?, ?it/s]


### Evaluation: Originality Score

In [131]:
# Grade the "mops" premises on originality with the judge model.
premise_path: Path = Path("../assets/premises/mops/moderate.jsonl")
evaluation_dir: Path = Path("../assets/premises/evaluation")
method: str = "mops"
model: str = "gpt-4-1106-preview"
metric: str = "originality_score"

logger.info(f"Evaluation: {metric}")
logger.info(f"Method:     {method}")
logger.info(f"Model:      {model}")
# NOTE(review): `score` skips ids that already have a row in the results file.
# If cached rows were produced with a different prompt, remove them before
# re-running so they are graded again.
score(
    client,
    premise_path,
    evaluation_dir,
    method=method,
    # The prompt must match the metric: results are stored under
    # `originality_score`, so the originality prompt is required here
    # (previously the fascination prompt was passed by mistake).
    prompt=ORIGINALITY_SCORE_PROMPT,
    metric=metric,
    model=model,
)

[32m06-09 14:46:59[0m [32mINFO    [0m [34mEvaluation: originality_score[0m
[32m06-09 14:46:59[0m [32mINFO    [0m [34mMethod:     mops[0m
[32m06-09 14:46:59[0m [32mINFO    [0m [34mModel:      gpt-4-1106-preview[0m
[32m06-09 14:46:59[0m [32mINFO    [0m [34mLoad mops premises from ../assets/premises/mops/moderate.jsonl[0m
[32m06-09 14:46:59[0m [32mINFO    [0m [34mSave originality_score evaluation to ../assets/premises/evaluation/originality_score_gpt-4-1106-preview.jsonl[0m
Evaluating originality_score: 60.013: 100%|█████████████████████████████████████████████████| 1000/1000 [00:00<?, ?it/s]
